{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 798, "global_step": 3190, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00031347962382445143, "grad_norm": 4.473039150238037, "learning_rate": 5e-10, "loss": 8.3121, "step": 1 }, { "epoch": 0.00031347962382445143, "eval_loss": 28.89470672607422, "eval_runtime": 20.3862, "eval_samples_per_second": 131.805, "eval_steps_per_second": 8.241, "step": 1 }, { "epoch": 0.0006269592476489029, "grad_norm": 3.657515287399292, "learning_rate": 1e-09, "loss": 7.0439, "step": 2 }, { "epoch": 0.0009404388714733542, "grad_norm": 2.806065559387207, "learning_rate": 1.5e-09, "loss": 6.6257, "step": 3 }, { "epoch": 0.0012539184952978057, "grad_norm": 2.810598611831665, "learning_rate": 2e-09, "loss": 6.5132, "step": 4 }, { "epoch": 0.001567398119122257, "grad_norm": 4.099343299865723, "learning_rate": 2.5e-09, "loss": 8.1055, "step": 5 }, { "epoch": 0.0018808777429467085, "grad_norm": 3.389047384262085, "learning_rate": 3e-09, "loss": 8.7388, "step": 6 }, { "epoch": 0.00219435736677116, "grad_norm": 3.4108972549438477, "learning_rate": 3.5000000000000003e-09, "loss": 7.8426, "step": 7 }, { "epoch": 0.0025078369905956114, "grad_norm": 3.3495447635650635, "learning_rate": 4e-09, "loss": 8.2811, "step": 8 }, { "epoch": 0.0028213166144200625, "grad_norm": 3.485499143600464, "learning_rate": 4.500000000000001e-09, "loss": 7.3836, "step": 9 }, { "epoch": 0.003134796238244514, "grad_norm": 4.053262233734131, "learning_rate": 5e-09, "loss": 9.1597, "step": 10 }, { "epoch": 0.0034482758620689655, "grad_norm": 4.419150352478027, "learning_rate": 5.5000000000000004e-09, "loss": 12.4238, "step": 11 }, { "epoch": 0.003761755485893417, "grad_norm": 3.277268409729004, "learning_rate": 6e-09, "loss": 8.4173, "step": 12 }, { "epoch": 0.0040752351097178684, "grad_norm": 3.8386521339416504, "learning_rate": 6.5e-09, "loss": 8.2201, "step": 13 }, { "epoch": 0.00438871473354232, "grad_norm": 4.788142681121826, "learning_rate": 7.000000000000001e-09, "loss": 11.7976, "step": 14 }, { "epoch": 0.004702194357366771, "grad_norm": 3.0357067584991455, "learning_rate": 7.500000000000001e-09, "loss": 8.5286, "step": 15 }, { "epoch": 0.005015673981191223, "grad_norm": 3.4486613273620605, "learning_rate": 8e-09, "loss": 7.839, "step": 16 }, { "epoch": 0.005329153605015674, "grad_norm": 4.829311370849609, "learning_rate": 8.5e-09, "loss": 9.6776, "step": 17 }, { "epoch": 0.005642633228840125, "grad_norm": 4.219234466552734, "learning_rate": 9.000000000000001e-09, "loss": 7.8809, "step": 18 }, { "epoch": 0.0059561128526645765, "grad_norm": 3.634462833404541, "learning_rate": 9.5e-09, "loss": 9.879, "step": 19 }, { "epoch": 0.006269592476489028, "grad_norm": 3.924381732940674, "learning_rate": 1e-08, "loss": 9.0212, "step": 20 }, { "epoch": 0.0065830721003134795, "grad_norm": 3.562445640563965, "learning_rate": 1.05e-08, "loss": 8.593, "step": 21 }, { "epoch": 0.006896551724137931, "grad_norm": 2.989351749420166, "learning_rate": 1.1000000000000001e-08, "loss": 6.3834, "step": 22 }, { "epoch": 0.007210031347962382, "grad_norm": 3.755312919616699, "learning_rate": 1.15e-08, "loss": 9.2207, "step": 23 }, { "epoch": 0.007523510971786834, "grad_norm": 3.771742582321167, "learning_rate": 1.2e-08, "loss": 9.2504, "step": 24 }, { "epoch": 0.007836990595611285, "grad_norm": 2.664695978164673, "learning_rate": 1.2500000000000001e-08, "loss": 7.0286, "step": 25 }, { "epoch": 0.008150470219435737, "grad_norm": 2.916489601135254, "learning_rate": 1.3e-08, "loss": 7.5708, "step": 26 }, { "epoch": 0.008463949843260187, "grad_norm": 3.4173834323883057, "learning_rate": 1.3500000000000002e-08, "loss": 7.0245, "step": 27 }, { "epoch": 0.00877742946708464, "grad_norm": 4.134421348571777, "learning_rate": 1.4000000000000001e-08, "loss": 8.218, "step": 28 }, { "epoch": 0.00909090909090909, "grad_norm": 2.478123188018799, "learning_rate": 1.45e-08, "loss": 6.6974, "step": 29 }, { "epoch": 0.009404388714733543, "grad_norm": 4.880210876464844, "learning_rate": 1.5000000000000002e-08, "loss": 13.0694, "step": 30 }, { "epoch": 0.009717868338557993, "grad_norm": 3.532501220703125, "learning_rate": 1.55e-08, "loss": 6.8898, "step": 31 }, { "epoch": 0.010031347962382446, "grad_norm": 3.8485097885131836, "learning_rate": 1.6e-08, "loss": 10.3871, "step": 32 }, { "epoch": 0.010344827586206896, "grad_norm": 4.742996692657471, "learning_rate": 1.6500000000000002e-08, "loss": 10.987, "step": 33 }, { "epoch": 0.010658307210031349, "grad_norm": 3.595039129257202, "learning_rate": 1.7e-08, "loss": 9.6221, "step": 34 }, { "epoch": 0.0109717868338558, "grad_norm": 3.5569448471069336, "learning_rate": 1.75e-08, "loss": 6.5841, "step": 35 }, { "epoch": 0.01128526645768025, "grad_norm": 4.763426303863525, "learning_rate": 1.8000000000000002e-08, "loss": 7.6671, "step": 36 }, { "epoch": 0.011598746081504702, "grad_norm": 3.6167099475860596, "learning_rate": 1.8500000000000004e-08, "loss": 10.4234, "step": 37 }, { "epoch": 0.011912225705329153, "grad_norm": 4.075372219085693, "learning_rate": 1.9e-08, "loss": 11.068, "step": 38 }, { "epoch": 0.012225705329153605, "grad_norm": 6.240333080291748, "learning_rate": 1.95e-08, "loss": 17.4708, "step": 39 }, { "epoch": 0.012539184952978056, "grad_norm": 3.7709567546844482, "learning_rate": 2e-08, "loss": 8.2594, "step": 40 }, { "epoch": 0.012852664576802508, "grad_norm": 2.885645866394043, "learning_rate": 2.0500000000000005e-08, "loss": 6.7495, "step": 41 }, { "epoch": 0.013166144200626959, "grad_norm": 3.3022165298461914, "learning_rate": 2.1e-08, "loss": 7.0996, "step": 42 }, { "epoch": 0.013479623824451411, "grad_norm": 3.8264548778533936, "learning_rate": 2.15e-08, "loss": 8.166, "step": 43 }, { "epoch": 0.013793103448275862, "grad_norm": 4.580362796783447, "learning_rate": 2.2000000000000002e-08, "loss": 14.0935, "step": 44 }, { "epoch": 0.014106583072100314, "grad_norm": 4.131213665008545, "learning_rate": 2.25e-08, "loss": 12.5754, "step": 45 }, { "epoch": 0.014420062695924765, "grad_norm": 3.5261685848236084, "learning_rate": 2.3e-08, "loss": 7.9894, "step": 46 }, { "epoch": 0.014733542319749215, "grad_norm": 3.0584185123443604, "learning_rate": 2.3500000000000002e-08, "loss": 6.4833, "step": 47 }, { "epoch": 0.015047021943573668, "grad_norm": 3.39809513092041, "learning_rate": 2.4e-08, "loss": 8.1705, "step": 48 }, { "epoch": 0.015360501567398118, "grad_norm": 4.006636619567871, "learning_rate": 2.45e-08, "loss": 8.5717, "step": 49 }, { "epoch": 0.01567398119122257, "grad_norm": 5.995611190795898, "learning_rate": 2.5000000000000002e-08, "loss": 8.4349, "step": 50 }, { "epoch": 0.01598746081504702, "grad_norm": 3.6572229862213135, "learning_rate": 2.5500000000000003e-08, "loss": 7.9315, "step": 51 }, { "epoch": 0.016300940438871474, "grad_norm": 3.6359832286834717, "learning_rate": 2.6e-08, "loss": 8.9688, "step": 52 }, { "epoch": 0.016614420062695926, "grad_norm": 2.8677127361297607, "learning_rate": 2.6500000000000002e-08, "loss": 6.3906, "step": 53 }, { "epoch": 0.016927899686520375, "grad_norm": 2.7862868309020996, "learning_rate": 2.7000000000000004e-08, "loss": 7.7946, "step": 54 }, { "epoch": 0.017241379310344827, "grad_norm": 3.8618948459625244, "learning_rate": 2.75e-08, "loss": 7.8249, "step": 55 }, { "epoch": 0.01755485893416928, "grad_norm": 3.185887336730957, "learning_rate": 2.8000000000000003e-08, "loss": 7.2935, "step": 56 }, { "epoch": 0.017868338557993732, "grad_norm": 2.7501015663146973, "learning_rate": 2.8500000000000004e-08, "loss": 6.9817, "step": 57 }, { "epoch": 0.01818181818181818, "grad_norm": 3.1869773864746094, "learning_rate": 2.9e-08, "loss": 7.383, "step": 58 }, { "epoch": 0.018495297805642633, "grad_norm": 5.212038993835449, "learning_rate": 2.9500000000000003e-08, "loss": 14.1096, "step": 59 }, { "epoch": 0.018808777429467086, "grad_norm": 3.6909945011138916, "learning_rate": 3.0000000000000004e-08, "loss": 9.0418, "step": 60 }, { "epoch": 0.019122257053291535, "grad_norm": 4.0268120765686035, "learning_rate": 3.05e-08, "loss": 11.0808, "step": 61 }, { "epoch": 0.019435736677115987, "grad_norm": 4.155067443847656, "learning_rate": 3.1e-08, "loss": 10.4174, "step": 62 }, { "epoch": 0.01974921630094044, "grad_norm": 4.077535629272461, "learning_rate": 3.1500000000000004e-08, "loss": 8.0036, "step": 63 }, { "epoch": 0.02006269592476489, "grad_norm": 3.3879809379577637, "learning_rate": 3.2e-08, "loss": 7.2178, "step": 64 }, { "epoch": 0.02037617554858934, "grad_norm": 4.345617771148682, "learning_rate": 3.25e-08, "loss": 11.3773, "step": 65 }, { "epoch": 0.020689655172413793, "grad_norm": 3.596099853515625, "learning_rate": 3.3000000000000004e-08, "loss": 7.7855, "step": 66 }, { "epoch": 0.021003134796238245, "grad_norm": 2.798985004425049, "learning_rate": 3.35e-08, "loss": 6.8671, "step": 67 }, { "epoch": 0.021316614420062698, "grad_norm": 3.921792507171631, "learning_rate": 3.4e-08, "loss": 11.5327, "step": 68 }, { "epoch": 0.021630094043887146, "grad_norm": 3.9658515453338623, "learning_rate": 3.4500000000000005e-08, "loss": 7.4312, "step": 69 }, { "epoch": 0.0219435736677116, "grad_norm": 3.8455910682678223, "learning_rate": 3.5e-08, "loss": 11.7665, "step": 70 }, { "epoch": 0.02225705329153605, "grad_norm": 3.4687187671661377, "learning_rate": 3.550000000000001e-08, "loss": 9.1295, "step": 71 }, { "epoch": 0.0225705329153605, "grad_norm": 3.2690117359161377, "learning_rate": 3.6000000000000005e-08, "loss": 8.421, "step": 72 }, { "epoch": 0.022884012539184952, "grad_norm": 4.495233535766602, "learning_rate": 3.65e-08, "loss": 8.9433, "step": 73 }, { "epoch": 0.023197492163009405, "grad_norm": 3.4736380577087402, "learning_rate": 3.700000000000001e-08, "loss": 7.5205, "step": 74 }, { "epoch": 0.023510971786833857, "grad_norm": 3.021930456161499, "learning_rate": 3.7500000000000005e-08, "loss": 6.0582, "step": 75 }, { "epoch": 0.023824451410658306, "grad_norm": 3.8124265670776367, "learning_rate": 3.8e-08, "loss": 9.6513, "step": 76 }, { "epoch": 0.02413793103448276, "grad_norm": 2.6047370433807373, "learning_rate": 3.850000000000001e-08, "loss": 6.0856, "step": 77 }, { "epoch": 0.02445141065830721, "grad_norm": 4.412445545196533, "learning_rate": 3.9e-08, "loss": 9.3832, "step": 78 }, { "epoch": 0.024764890282131663, "grad_norm": 4.0916547775268555, "learning_rate": 3.950000000000001e-08, "loss": 10.2509, "step": 79 }, { "epoch": 0.025078369905956112, "grad_norm": 3.16266131401062, "learning_rate": 4e-08, "loss": 8.3115, "step": 80 }, { "epoch": 0.025391849529780564, "grad_norm": 3.2230985164642334, "learning_rate": 4.05e-08, "loss": 8.4255, "step": 81 }, { "epoch": 0.025705329153605017, "grad_norm": 4.53424072265625, "learning_rate": 4.100000000000001e-08, "loss": 9.7329, "step": 82 }, { "epoch": 0.026018808777429465, "grad_norm": 2.889455795288086, "learning_rate": 4.15e-08, "loss": 7.6362, "step": 83 }, { "epoch": 0.026332288401253918, "grad_norm": 4.101494789123535, "learning_rate": 4.2e-08, "loss": 9.3998, "step": 84 }, { "epoch": 0.02664576802507837, "grad_norm": 3.166928291320801, "learning_rate": 4.2500000000000003e-08, "loss": 8.9123, "step": 85 }, { "epoch": 0.026959247648902823, "grad_norm": 4.550684452056885, "learning_rate": 4.3e-08, "loss": 10.9083, "step": 86 }, { "epoch": 0.02727272727272727, "grad_norm": 3.8120524883270264, "learning_rate": 4.35e-08, "loss": 11.0573, "step": 87 }, { "epoch": 0.027586206896551724, "grad_norm": 3.2697463035583496, "learning_rate": 4.4000000000000004e-08, "loss": 6.9374, "step": 88 }, { "epoch": 0.027899686520376176, "grad_norm": 4.755700588226318, "learning_rate": 4.45e-08, "loss": 9.1937, "step": 89 }, { "epoch": 0.02821316614420063, "grad_norm": 2.45084547996521, "learning_rate": 4.5e-08, "loss": 8.1863, "step": 90 }, { "epoch": 0.028526645768025077, "grad_norm": 4.488142490386963, "learning_rate": 4.5500000000000004e-08, "loss": 10.0932, "step": 91 }, { "epoch": 0.02884012539184953, "grad_norm": 2.933330774307251, "learning_rate": 4.6e-08, "loss": 6.199, "step": 92 }, { "epoch": 0.029153605015673982, "grad_norm": 3.5675182342529297, "learning_rate": 4.65e-08, "loss": 11.0667, "step": 93 }, { "epoch": 0.02946708463949843, "grad_norm": 3.300973653793335, "learning_rate": 4.7000000000000004e-08, "loss": 8.7929, "step": 94 }, { "epoch": 0.029780564263322883, "grad_norm": 4.447729587554932, "learning_rate": 4.75e-08, "loss": 13.4648, "step": 95 }, { "epoch": 0.030094043887147336, "grad_norm": 3.820695638656616, "learning_rate": 4.8e-08, "loss": 8.4362, "step": 96 }, { "epoch": 0.030407523510971788, "grad_norm": 3.2529191970825195, "learning_rate": 4.8500000000000004e-08, "loss": 7.8372, "step": 97 }, { "epoch": 0.030721003134796237, "grad_norm": 5.665775775909424, "learning_rate": 4.9e-08, "loss": 13.5665, "step": 98 }, { "epoch": 0.03103448275862069, "grad_norm": 3.9593966007232666, "learning_rate": 4.9500000000000006e-08, "loss": 8.0411, "step": 99 }, { "epoch": 0.03134796238244514, "grad_norm": 3.611478567123413, "learning_rate": 5.0000000000000004e-08, "loss": 7.8446, "step": 100 }, { "epoch": 0.031661442006269594, "grad_norm": 3.8838236331939697, "learning_rate": 5.05e-08, "loss": 8.1889, "step": 101 }, { "epoch": 0.03197492163009404, "grad_norm": 3.930360794067383, "learning_rate": 5.100000000000001e-08, "loss": 11.6122, "step": 102 }, { "epoch": 0.0322884012539185, "grad_norm": 3.1364479064941406, "learning_rate": 5.1500000000000005e-08, "loss": 7.8931, "step": 103 }, { "epoch": 0.03260188087774295, "grad_norm": 4.427796840667725, "learning_rate": 5.2e-08, "loss": 10.2633, "step": 104 }, { "epoch": 0.032915360501567396, "grad_norm": 4.472553730010986, "learning_rate": 5.250000000000001e-08, "loss": 9.7383, "step": 105 }, { "epoch": 0.03322884012539185, "grad_norm": 3.7383227348327637, "learning_rate": 5.3000000000000005e-08, "loss": 10.8243, "step": 106 }, { "epoch": 0.0335423197492163, "grad_norm": 4.171073913574219, "learning_rate": 5.35e-08, "loss": 9.0764, "step": 107 }, { "epoch": 0.03385579937304075, "grad_norm": 3.8834450244903564, "learning_rate": 5.400000000000001e-08, "loss": 11.7289, "step": 108 }, { "epoch": 0.034169278996865206, "grad_norm": 5.5527753829956055, "learning_rate": 5.4500000000000005e-08, "loss": 13.2347, "step": 109 }, { "epoch": 0.034482758620689655, "grad_norm": 3.0170059204101562, "learning_rate": 5.5e-08, "loss": 8.0964, "step": 110 }, { "epoch": 0.034796238244514104, "grad_norm": 4.954318046569824, "learning_rate": 5.550000000000001e-08, "loss": 9.1267, "step": 111 }, { "epoch": 0.03510971786833856, "grad_norm": 5.236219882965088, "learning_rate": 5.6000000000000005e-08, "loss": 13.9724, "step": 112 }, { "epoch": 0.03542319749216301, "grad_norm": 3.203052520751953, "learning_rate": 5.65e-08, "loss": 7.3345, "step": 113 }, { "epoch": 0.035736677115987464, "grad_norm": 3.730330228805542, "learning_rate": 5.700000000000001e-08, "loss": 11.3994, "step": 114 }, { "epoch": 0.03605015673981191, "grad_norm": 3.654137372970581, "learning_rate": 5.7500000000000005e-08, "loss": 7.6924, "step": 115 }, { "epoch": 0.03636363636363636, "grad_norm": 3.058237314224243, "learning_rate": 5.8e-08, "loss": 6.7775, "step": 116 }, { "epoch": 0.03667711598746082, "grad_norm": 3.9933547973632812, "learning_rate": 5.850000000000001e-08, "loss": 9.6667, "step": 117 }, { "epoch": 0.03699059561128527, "grad_norm": 4.147707462310791, "learning_rate": 5.9000000000000006e-08, "loss": 7.6471, "step": 118 }, { "epoch": 0.037304075235109715, "grad_norm": 4.296032428741455, "learning_rate": 5.950000000000001e-08, "loss": 13.4849, "step": 119 }, { "epoch": 0.03761755485893417, "grad_norm": 3.6536829471588135, "learning_rate": 6.000000000000001e-08, "loss": 10.2448, "step": 120 }, { "epoch": 0.03793103448275862, "grad_norm": 3.77724552154541, "learning_rate": 6.05e-08, "loss": 8.568, "step": 121 }, { "epoch": 0.03824451410658307, "grad_norm": 3.5914738178253174, "learning_rate": 6.1e-08, "loss": 7.9479, "step": 122 }, { "epoch": 0.038557993730407525, "grad_norm": 8.78665828704834, "learning_rate": 6.15e-08, "loss": 7.3475, "step": 123 }, { "epoch": 0.038871473354231974, "grad_norm": 3.715463399887085, "learning_rate": 6.2e-08, "loss": 8.8613, "step": 124 }, { "epoch": 0.03918495297805643, "grad_norm": 3.4635043144226074, "learning_rate": 6.250000000000001e-08, "loss": 7.8405, "step": 125 }, { "epoch": 0.03949843260188088, "grad_norm": 3.9361000061035156, "learning_rate": 6.300000000000001e-08, "loss": 7.8298, "step": 126 }, { "epoch": 0.03981191222570533, "grad_norm": 3.872082233428955, "learning_rate": 6.35e-08, "loss": 11.5712, "step": 127 }, { "epoch": 0.04012539184952978, "grad_norm": 3.6292521953582764, "learning_rate": 6.4e-08, "loss": 8.9746, "step": 128 }, { "epoch": 0.04043887147335423, "grad_norm": 3.303443193435669, "learning_rate": 6.45e-08, "loss": 7.3576, "step": 129 }, { "epoch": 0.04075235109717868, "grad_norm": 2.838657855987549, "learning_rate": 6.5e-08, "loss": 8.1253, "step": 130 }, { "epoch": 0.04106583072100314, "grad_norm": 3.875380039215088, "learning_rate": 6.550000000000001e-08, "loss": 9.9537, "step": 131 }, { "epoch": 0.041379310344827586, "grad_norm": 3.1287903785705566, "learning_rate": 6.600000000000001e-08, "loss": 8.871, "step": 132 }, { "epoch": 0.041692789968652035, "grad_norm": 3.414733648300171, "learning_rate": 6.65e-08, "loss": 6.58, "step": 133 }, { "epoch": 0.04200626959247649, "grad_norm": 3.1217563152313232, "learning_rate": 6.7e-08, "loss": 7.591, "step": 134 }, { "epoch": 0.04231974921630094, "grad_norm": 4.4469733238220215, "learning_rate": 6.75e-08, "loss": 9.0155, "step": 135 }, { "epoch": 0.042633228840125395, "grad_norm": 3.728848695755005, "learning_rate": 6.8e-08, "loss": 8.98, "step": 136 }, { "epoch": 0.042946708463949844, "grad_norm": 4.085875988006592, "learning_rate": 6.850000000000001e-08, "loss": 9.7549, "step": 137 }, { "epoch": 0.04326018808777429, "grad_norm": 2.9348063468933105, "learning_rate": 6.900000000000001e-08, "loss": 7.0279, "step": 138 }, { "epoch": 0.04357366771159875, "grad_norm": 3.2348380088806152, "learning_rate": 6.950000000000001e-08, "loss": 8.1917, "step": 139 }, { "epoch": 0.0438871473354232, "grad_norm": 3.731046438217163, "learning_rate": 7e-08, "loss": 8.1675, "step": 140 }, { "epoch": 0.044200626959247646, "grad_norm": 2.7796294689178467, "learning_rate": 7.05e-08, "loss": 7.1764, "step": 141 }, { "epoch": 0.0445141065830721, "grad_norm": 3.467015027999878, "learning_rate": 7.100000000000001e-08, "loss": 7.7669, "step": 142 }, { "epoch": 0.04482758620689655, "grad_norm": 4.046057224273682, "learning_rate": 7.150000000000001e-08, "loss": 11.0822, "step": 143 }, { "epoch": 0.045141065830721, "grad_norm": 4.335888385772705, "learning_rate": 7.200000000000001e-08, "loss": 9.0507, "step": 144 }, { "epoch": 0.045454545454545456, "grad_norm": 3.208477258682251, "learning_rate": 7.250000000000001e-08, "loss": 8.5856, "step": 145 }, { "epoch": 0.045768025078369905, "grad_norm": 4.974661350250244, "learning_rate": 7.3e-08, "loss": 13.4032, "step": 146 }, { "epoch": 0.04608150470219436, "grad_norm": 5.451123237609863, "learning_rate": 7.35e-08, "loss": 11.6684, "step": 147 }, { "epoch": 0.04639498432601881, "grad_norm": 5.630722999572754, "learning_rate": 7.400000000000001e-08, "loss": 14.3619, "step": 148 }, { "epoch": 0.04670846394984326, "grad_norm": 3.298954725265503, "learning_rate": 7.450000000000001e-08, "loss": 7.9049, "step": 149 }, { "epoch": 0.047021943573667714, "grad_norm": 6.209390640258789, "learning_rate": 7.500000000000001e-08, "loss": 12.7281, "step": 150 }, { "epoch": 0.04733542319749216, "grad_norm": 4.096133232116699, "learning_rate": 7.550000000000001e-08, "loss": 10.6689, "step": 151 }, { "epoch": 0.04764890282131661, "grad_norm": 3.8314168453216553, "learning_rate": 7.6e-08, "loss": 9.3038, "step": 152 }, { "epoch": 0.04796238244514107, "grad_norm": 4.284617900848389, "learning_rate": 7.65e-08, "loss": 11.3222, "step": 153 }, { "epoch": 0.04827586206896552, "grad_norm": 3.8246536254882812, "learning_rate": 7.700000000000001e-08, "loss": 9.8774, "step": 154 }, { "epoch": 0.048589341692789965, "grad_norm": 3.011291265487671, "learning_rate": 7.750000000000001e-08, "loss": 6.99, "step": 155 }, { "epoch": 0.04890282131661442, "grad_norm": 3.5155868530273438, "learning_rate": 7.8e-08, "loss": 10.0299, "step": 156 }, { "epoch": 0.04921630094043887, "grad_norm": 3.2512242794036865, "learning_rate": 7.85e-08, "loss": 7.9809, "step": 157 }, { "epoch": 0.049529780564263326, "grad_norm": 3.8495590686798096, "learning_rate": 7.900000000000002e-08, "loss": 8.4488, "step": 158 }, { "epoch": 0.049843260188087775, "grad_norm": 4.043474197387695, "learning_rate": 7.950000000000002e-08, "loss": 7.3039, "step": 159 }, { "epoch": 0.050156739811912224, "grad_norm": 3.638864755630493, "learning_rate": 8e-08, "loss": 12.0005, "step": 160 }, { "epoch": 0.05047021943573668, "grad_norm": 3.9693477153778076, "learning_rate": 8.05e-08, "loss": 5.8005, "step": 161 }, { "epoch": 0.05078369905956113, "grad_norm": 4.173606872558594, "learning_rate": 8.1e-08, "loss": 14.1363, "step": 162 }, { "epoch": 0.05109717868338558, "grad_norm": 3.1908795833587646, "learning_rate": 8.15e-08, "loss": 8.1196, "step": 163 }, { "epoch": 0.05141065830721003, "grad_norm": 2.548922061920166, "learning_rate": 8.200000000000002e-08, "loss": 6.7172, "step": 164 }, { "epoch": 0.05172413793103448, "grad_norm": 3.2565834522247314, "learning_rate": 8.25e-08, "loss": 7.6421, "step": 165 }, { "epoch": 0.05203761755485893, "grad_norm": 4.065040111541748, "learning_rate": 8.3e-08, "loss": 9.3676, "step": 166 }, { "epoch": 0.05235109717868339, "grad_norm": 3.38653826713562, "learning_rate": 8.35e-08, "loss": 8.983, "step": 167 }, { "epoch": 0.052664576802507836, "grad_norm": 3.8572115898132324, "learning_rate": 8.4e-08, "loss": 8.1122, "step": 168 }, { "epoch": 0.05297805642633229, "grad_norm": 3.2368323802948, "learning_rate": 8.45e-08, "loss": 8.4592, "step": 169 }, { "epoch": 0.05329153605015674, "grad_norm": 3.422175884246826, "learning_rate": 8.500000000000001e-08, "loss": 9.0227, "step": 170 }, { "epoch": 0.05360501567398119, "grad_norm": 4.226418972015381, "learning_rate": 8.55e-08, "loss": 10.3479, "step": 171 }, { "epoch": 0.053918495297805645, "grad_norm": 4.698319435119629, "learning_rate": 8.6e-08, "loss": 9.7526, "step": 172 }, { "epoch": 0.054231974921630094, "grad_norm": 3.448711395263672, "learning_rate": 8.65e-08, "loss": 9.1493, "step": 173 }, { "epoch": 0.05454545454545454, "grad_norm": 3.3972206115722656, "learning_rate": 8.7e-08, "loss": 6.7097, "step": 174 }, { "epoch": 0.054858934169279, "grad_norm": 3.6650757789611816, "learning_rate": 8.750000000000001e-08, "loss": 10.019, "step": 175 }, { "epoch": 0.05517241379310345, "grad_norm": 3.743401288986206, "learning_rate": 8.800000000000001e-08, "loss": 9.1552, "step": 176 }, { "epoch": 0.055485893416927896, "grad_norm": 3.6061248779296875, "learning_rate": 8.85e-08, "loss": 8.0651, "step": 177 }, { "epoch": 0.05579937304075235, "grad_norm": 3.2070910930633545, "learning_rate": 8.9e-08, "loss": 8.7052, "step": 178 }, { "epoch": 0.0561128526645768, "grad_norm": 4.579248428344727, "learning_rate": 8.95e-08, "loss": 8.3114, "step": 179 }, { "epoch": 0.05642633228840126, "grad_norm": 4.0575690269470215, "learning_rate": 9e-08, "loss": 8.8177, "step": 180 }, { "epoch": 0.056739811912225706, "grad_norm": 3.667318820953369, "learning_rate": 9.050000000000001e-08, "loss": 9.8964, "step": 181 }, { "epoch": 0.057053291536050155, "grad_norm": 4.42782735824585, "learning_rate": 9.100000000000001e-08, "loss": 10.0656, "step": 182 }, { "epoch": 0.05736677115987461, "grad_norm": 3.8123972415924072, "learning_rate": 9.15e-08, "loss": 10.2998, "step": 183 }, { "epoch": 0.05768025078369906, "grad_norm": 4.3586745262146, "learning_rate": 9.2e-08, "loss": 8.8101, "step": 184 }, { "epoch": 0.05799373040752351, "grad_norm": 3.785413980484009, "learning_rate": 9.25e-08, "loss": 9.7424, "step": 185 }, { "epoch": 0.058307210031347964, "grad_norm": 3.5556790828704834, "learning_rate": 9.3e-08, "loss": 10.6945, "step": 186 }, { "epoch": 0.05862068965517241, "grad_norm": 4.004053592681885, "learning_rate": 9.350000000000001e-08, "loss": 9.8506, "step": 187 }, { "epoch": 0.05893416927899686, "grad_norm": 5.22344970703125, "learning_rate": 9.400000000000001e-08, "loss": 12.796, "step": 188 }, { "epoch": 0.05924764890282132, "grad_norm": 3.7727713584899902, "learning_rate": 9.45e-08, "loss": 7.7093, "step": 189 }, { "epoch": 0.05956112852664577, "grad_norm": 3.474813461303711, "learning_rate": 9.5e-08, "loss": 8.4177, "step": 190 }, { "epoch": 0.05987460815047022, "grad_norm": 3.4020941257476807, "learning_rate": 9.55e-08, "loss": 10.3753, "step": 191 }, { "epoch": 0.06018808777429467, "grad_norm": 4.846778392791748, "learning_rate": 9.6e-08, "loss": 9.5903, "step": 192 }, { "epoch": 0.06050156739811912, "grad_norm": 4.567539691925049, "learning_rate": 9.650000000000001e-08, "loss": 9.7159, "step": 193 }, { "epoch": 0.060815047021943576, "grad_norm": 3.0759778022766113, "learning_rate": 9.700000000000001e-08, "loss": 7.0748, "step": 194 }, { "epoch": 0.061128526645768025, "grad_norm": 3.4420666694641113, "learning_rate": 9.75e-08, "loss": 7.8556, "step": 195 }, { "epoch": 0.061442006269592474, "grad_norm": 3.7455813884735107, "learning_rate": 9.8e-08, "loss": 7.4662, "step": 196 }, { "epoch": 0.06175548589341693, "grad_norm": 3.7451171875, "learning_rate": 9.85e-08, "loss": 8.2137, "step": 197 }, { "epoch": 0.06206896551724138, "grad_norm": 3.513235330581665, "learning_rate": 9.900000000000001e-08, "loss": 8.2557, "step": 198 }, { "epoch": 0.06238244514106583, "grad_norm": 4.297177314758301, "learning_rate": 9.950000000000001e-08, "loss": 10.0288, "step": 199 }, { "epoch": 0.06269592476489028, "grad_norm": 4.298759460449219, "learning_rate": 1.0000000000000001e-07, "loss": 10.5449, "step": 200 }, { "epoch": 0.06300940438871473, "grad_norm": 3.4075138568878174, "learning_rate": 1.005e-07, "loss": 7.7592, "step": 201 }, { "epoch": 0.06332288401253919, "grad_norm": 3.19218373298645, "learning_rate": 1.01e-07, "loss": 7.7692, "step": 202 }, { "epoch": 0.06363636363636363, "grad_norm": 4.200982093811035, "learning_rate": 1.015e-07, "loss": 8.7216, "step": 203 }, { "epoch": 0.06394984326018809, "grad_norm": 3.4024147987365723, "learning_rate": 1.0200000000000001e-07, "loss": 7.9235, "step": 204 }, { "epoch": 0.06426332288401254, "grad_norm": 3.751147747039795, "learning_rate": 1.0250000000000001e-07, "loss": 10.3086, "step": 205 }, { "epoch": 0.064576802507837, "grad_norm": 3.3769607543945312, "learning_rate": 1.0300000000000001e-07, "loss": 7.629, "step": 206 }, { "epoch": 0.06489028213166144, "grad_norm": 3.3052303791046143, "learning_rate": 1.0350000000000001e-07, "loss": 9.5893, "step": 207 }, { "epoch": 0.0652037617554859, "grad_norm": 4.127976894378662, "learning_rate": 1.04e-07, "loss": 12.7904, "step": 208 }, { "epoch": 0.06551724137931035, "grad_norm": 3.8568077087402344, "learning_rate": 1.045e-07, "loss": 9.6994, "step": 209 }, { "epoch": 0.06583072100313479, "grad_norm": 3.236760377883911, "learning_rate": 1.0500000000000001e-07, "loss": 8.0482, "step": 210 }, { "epoch": 0.06614420062695925, "grad_norm": 4.277583122253418, "learning_rate": 1.0550000000000001e-07, "loss": 8.1819, "step": 211 }, { "epoch": 0.0664576802507837, "grad_norm": 3.3786604404449463, "learning_rate": 1.0600000000000001e-07, "loss": 7.0545, "step": 212 }, { "epoch": 0.06677115987460815, "grad_norm": 4.007237911224365, "learning_rate": 1.0650000000000001e-07, "loss": 9.9794, "step": 213 }, { "epoch": 0.0670846394984326, "grad_norm": 4.616070747375488, "learning_rate": 1.07e-07, "loss": 11.1093, "step": 214 }, { "epoch": 0.06739811912225706, "grad_norm": 3.0655412673950195, "learning_rate": 1.075e-07, "loss": 9.5085, "step": 215 }, { "epoch": 0.0677115987460815, "grad_norm": 3.05678391456604, "learning_rate": 1.0800000000000001e-07, "loss": 6.3335, "step": 216 }, { "epoch": 0.06802507836990596, "grad_norm": 2.941502094268799, "learning_rate": 1.0850000000000001e-07, "loss": 7.2431, "step": 217 }, { "epoch": 0.06833855799373041, "grad_norm": 3.810181140899658, "learning_rate": 1.0900000000000001e-07, "loss": 9.8534, "step": 218 }, { "epoch": 0.06865203761755485, "grad_norm": 3.007319450378418, "learning_rate": 1.0950000000000001e-07, "loss": 9.2206, "step": 219 }, { "epoch": 0.06896551724137931, "grad_norm": 3.6502251625061035, "learning_rate": 1.1e-07, "loss": 7.9423, "step": 220 }, { "epoch": 0.06927899686520377, "grad_norm": 3.4484660625457764, "learning_rate": 1.1050000000000002e-07, "loss": 8.6399, "step": 221 }, { "epoch": 0.06959247648902821, "grad_norm": 3.3999218940734863, "learning_rate": 1.1100000000000001e-07, "loss": 8.1017, "step": 222 }, { "epoch": 0.06990595611285266, "grad_norm": 3.0518319606781006, "learning_rate": 1.1150000000000001e-07, "loss": 7.0595, "step": 223 }, { "epoch": 0.07021943573667712, "grad_norm": 2.940213918685913, "learning_rate": 1.1200000000000001e-07, "loss": 7.2927, "step": 224 }, { "epoch": 0.07053291536050156, "grad_norm": 4.038904190063477, "learning_rate": 1.1250000000000001e-07, "loss": 9.1408, "step": 225 }, { "epoch": 0.07084639498432602, "grad_norm": 2.5958571434020996, "learning_rate": 1.13e-07, "loss": 6.325, "step": 226 }, { "epoch": 0.07115987460815047, "grad_norm": 3.0129003524780273, "learning_rate": 1.1350000000000002e-07, "loss": 6.963, "step": 227 }, { "epoch": 0.07147335423197493, "grad_norm": 2.9724435806274414, "learning_rate": 1.1400000000000001e-07, "loss": 8.6995, "step": 228 }, { "epoch": 0.07178683385579937, "grad_norm": 3.7590036392211914, "learning_rate": 1.1450000000000001e-07, "loss": 7.9243, "step": 229 }, { "epoch": 0.07210031347962383, "grad_norm": 3.5400218963623047, "learning_rate": 1.1500000000000001e-07, "loss": 6.981, "step": 230 }, { "epoch": 0.07241379310344828, "grad_norm": 2.9733407497406006, "learning_rate": 1.1550000000000001e-07, "loss": 7.1957, "step": 231 }, { "epoch": 0.07272727272727272, "grad_norm": 3.4962241649627686, "learning_rate": 1.16e-07, "loss": 10.1103, "step": 232 }, { "epoch": 0.07304075235109718, "grad_norm": 3.234700918197632, "learning_rate": 1.1650000000000002e-07, "loss": 7.5018, "step": 233 }, { "epoch": 0.07335423197492164, "grad_norm": 3.2388176918029785, "learning_rate": 1.1700000000000002e-07, "loss": 9.0859, "step": 234 }, { "epoch": 0.07366771159874608, "grad_norm": 3.4157941341400146, "learning_rate": 1.1750000000000001e-07, "loss": 7.8399, "step": 235 }, { "epoch": 0.07398119122257053, "grad_norm": 3.5833077430725098, "learning_rate": 1.1800000000000001e-07, "loss": 7.5087, "step": 236 }, { "epoch": 0.07429467084639499, "grad_norm": 4.769215106964111, "learning_rate": 1.1850000000000001e-07, "loss": 8.7621, "step": 237 }, { "epoch": 0.07460815047021943, "grad_norm": 3.941755771636963, "learning_rate": 1.1900000000000002e-07, "loss": 13.078, "step": 238 }, { "epoch": 0.07492163009404389, "grad_norm": 3.798787832260132, "learning_rate": 1.195e-07, "loss": 9.7572, "step": 239 }, { "epoch": 0.07523510971786834, "grad_norm": 4.713004112243652, "learning_rate": 1.2000000000000002e-07, "loss": 13.3091, "step": 240 }, { "epoch": 0.07554858934169278, "grad_norm": 3.408093214035034, "learning_rate": 1.205e-07, "loss": 7.7499, "step": 241 }, { "epoch": 0.07586206896551724, "grad_norm": 4.062272071838379, "learning_rate": 1.21e-07, "loss": 9.7988, "step": 242 }, { "epoch": 0.0761755485893417, "grad_norm": 4.664130210876465, "learning_rate": 1.215e-07, "loss": 9.4779, "step": 243 }, { "epoch": 0.07648902821316614, "grad_norm": 3.5159950256347656, "learning_rate": 1.22e-07, "loss": 8.8292, "step": 244 }, { "epoch": 0.0768025078369906, "grad_norm": 3.664834499359131, "learning_rate": 1.2250000000000002e-07, "loss": 8.0994, "step": 245 }, { "epoch": 0.07711598746081505, "grad_norm": 4.765824794769287, "learning_rate": 1.23e-07, "loss": 8.5952, "step": 246 }, { "epoch": 0.07742946708463949, "grad_norm": 3.9765498638153076, "learning_rate": 1.2350000000000001e-07, "loss": 9.6514, "step": 247 }, { "epoch": 0.07774294670846395, "grad_norm": 3.6185436248779297, "learning_rate": 1.24e-07, "loss": 8.74, "step": 248 }, { "epoch": 0.0780564263322884, "grad_norm": 3.6925275325775146, "learning_rate": 1.245e-07, "loss": 7.0158, "step": 249 }, { "epoch": 0.07836990595611286, "grad_norm": 3.878559112548828, "learning_rate": 1.2500000000000002e-07, "loss": 8.7004, "step": 250 }, { "epoch": 0.0786833855799373, "grad_norm": 4.704588413238525, "learning_rate": 1.255e-07, "loss": 9.9536, "step": 251 }, { "epoch": 0.07899686520376176, "grad_norm": 4.152799129486084, "learning_rate": 1.2600000000000002e-07, "loss": 9.6934, "step": 252 }, { "epoch": 0.07931034482758621, "grad_norm": 3.1007423400878906, "learning_rate": 1.265e-07, "loss": 7.1812, "step": 253 }, { "epoch": 0.07962382445141065, "grad_norm": 3.872610092163086, "learning_rate": 1.27e-07, "loss": 8.3354, "step": 254 }, { "epoch": 0.07993730407523511, "grad_norm": 3.334554433822632, "learning_rate": 1.275e-07, "loss": 9.6324, "step": 255 }, { "epoch": 0.08025078369905957, "grad_norm": 3.413045883178711, "learning_rate": 1.28e-07, "loss": 8.7391, "step": 256 }, { "epoch": 0.08056426332288401, "grad_norm": 3.656658887863159, "learning_rate": 1.2850000000000002e-07, "loss": 10.2712, "step": 257 }, { "epoch": 0.08087774294670846, "grad_norm": 3.1734461784362793, "learning_rate": 1.29e-07, "loss": 7.0908, "step": 258 }, { "epoch": 0.08119122257053292, "grad_norm": 5.781704425811768, "learning_rate": 1.2950000000000001e-07, "loss": 8.8973, "step": 259 }, { "epoch": 0.08150470219435736, "grad_norm": 3.3216211795806885, "learning_rate": 1.3e-07, "loss": 7.9809, "step": 260 }, { "epoch": 0.08181818181818182, "grad_norm": 3.4130654335021973, "learning_rate": 1.305e-07, "loss": 10.6107, "step": 261 }, { "epoch": 0.08213166144200627, "grad_norm": 3.198608160018921, "learning_rate": 1.3100000000000002e-07, "loss": 7.9311, "step": 262 }, { "epoch": 0.08244514106583072, "grad_norm": 3.2649126052856445, "learning_rate": 1.315e-07, "loss": 6.9358, "step": 263 }, { "epoch": 0.08275862068965517, "grad_norm": 3.805680513381958, "learning_rate": 1.3200000000000002e-07, "loss": 8.7701, "step": 264 }, { "epoch": 0.08307210031347963, "grad_norm": 3.8234152793884277, "learning_rate": 1.325e-07, "loss": 7.1507, "step": 265 }, { "epoch": 0.08338557993730407, "grad_norm": 4.14029598236084, "learning_rate": 1.33e-07, "loss": 10.002, "step": 266 }, { "epoch": 0.08369905956112852, "grad_norm": 3.70548677444458, "learning_rate": 1.3350000000000002e-07, "loss": 7.0396, "step": 267 }, { "epoch": 0.08401253918495298, "grad_norm": 4.227056503295898, "learning_rate": 1.34e-07, "loss": 10.8817, "step": 268 }, { "epoch": 0.08432601880877742, "grad_norm": 3.6237289905548096, "learning_rate": 1.3450000000000002e-07, "loss": 8.0917, "step": 269 }, { "epoch": 0.08463949843260188, "grad_norm": 2.858159303665161, "learning_rate": 1.35e-07, "loss": 6.4492, "step": 270 }, { "epoch": 0.08495297805642633, "grad_norm": 3.7545058727264404, "learning_rate": 1.3550000000000002e-07, "loss": 7.2091, "step": 271 }, { "epoch": 0.08526645768025079, "grad_norm": 3.4545750617980957, "learning_rate": 1.36e-07, "loss": 6.7025, "step": 272 }, { "epoch": 0.08557993730407523, "grad_norm": 2.947838068008423, "learning_rate": 1.365e-07, "loss": 6.9652, "step": 273 }, { "epoch": 0.08589341692789969, "grad_norm": 4.7777605056762695, "learning_rate": 1.3700000000000002e-07, "loss": 10.056, "step": 274 }, { "epoch": 0.08620689655172414, "grad_norm": 2.519544839859009, "learning_rate": 1.375e-07, "loss": 6.0347, "step": 275 }, { "epoch": 0.08652037617554859, "grad_norm": 4.664981842041016, "learning_rate": 1.3800000000000002e-07, "loss": 10.88, "step": 276 }, { "epoch": 0.08683385579937304, "grad_norm": 4.797813892364502, "learning_rate": 1.385e-07, "loss": 10.9775, "step": 277 }, { "epoch": 0.0871473354231975, "grad_norm": 3.1882078647613525, "learning_rate": 1.3900000000000001e-07, "loss": 7.7702, "step": 278 }, { "epoch": 0.08746081504702194, "grad_norm": 4.286925792694092, "learning_rate": 1.3950000000000002e-07, "loss": 10.9283, "step": 279 }, { "epoch": 0.0877742946708464, "grad_norm": 3.3588335514068604, "learning_rate": 1.4e-07, "loss": 7.7342, "step": 280 }, { "epoch": 0.08808777429467085, "grad_norm": 2.775568962097168, "learning_rate": 1.4050000000000002e-07, "loss": 6.9301, "step": 281 }, { "epoch": 0.08840125391849529, "grad_norm": 4.123598575592041, "learning_rate": 1.41e-07, "loss": 10.0182, "step": 282 }, { "epoch": 0.08871473354231975, "grad_norm": 4.854518413543701, "learning_rate": 1.4150000000000002e-07, "loss": 12.1932, "step": 283 }, { "epoch": 0.0890282131661442, "grad_norm": 2.8263025283813477, "learning_rate": 1.4200000000000003e-07, "loss": 7.8395, "step": 284 }, { "epoch": 0.08934169278996865, "grad_norm": 3.2582967281341553, "learning_rate": 1.425e-07, "loss": 7.4587, "step": 285 }, { "epoch": 0.0896551724137931, "grad_norm": 3.1975438594818115, "learning_rate": 1.4300000000000002e-07, "loss": 8.037, "step": 286 }, { "epoch": 0.08996865203761756, "grad_norm": 3.0459468364715576, "learning_rate": 1.435e-07, "loss": 7.1786, "step": 287 }, { "epoch": 0.090282131661442, "grad_norm": 4.034944534301758, "learning_rate": 1.4400000000000002e-07, "loss": 8.2916, "step": 288 }, { "epoch": 0.09059561128526646, "grad_norm": 5.0129499435424805, "learning_rate": 1.445e-07, "loss": 11.0094, "step": 289 }, { "epoch": 0.09090909090909091, "grad_norm": 4.1220903396606445, "learning_rate": 1.4500000000000001e-07, "loss": 8.5799, "step": 290 }, { "epoch": 0.09122257053291537, "grad_norm": 3.6785085201263428, "learning_rate": 1.4550000000000003e-07, "loss": 9.5831, "step": 291 }, { "epoch": 0.09153605015673981, "grad_norm": 3.44356369972229, "learning_rate": 1.46e-07, "loss": 7.9535, "step": 292 }, { "epoch": 0.09184952978056427, "grad_norm": 4.070002555847168, "learning_rate": 1.4650000000000002e-07, "loss": 11.4279, "step": 293 }, { "epoch": 0.09216300940438872, "grad_norm": 3.0748281478881836, "learning_rate": 1.47e-07, "loss": 7.3921, "step": 294 }, { "epoch": 0.09247648902821316, "grad_norm": 2.973388671875, "learning_rate": 1.4750000000000002e-07, "loss": 7.6754, "step": 295 }, { "epoch": 0.09278996865203762, "grad_norm": 3.4642834663391113, "learning_rate": 1.4800000000000003e-07, "loss": 8.1933, "step": 296 }, { "epoch": 0.09310344827586207, "grad_norm": 4.090353012084961, "learning_rate": 1.485e-07, "loss": 7.926, "step": 297 }, { "epoch": 0.09341692789968652, "grad_norm": 3.8677382469177246, "learning_rate": 1.4900000000000002e-07, "loss": 7.5799, "step": 298 }, { "epoch": 0.09373040752351097, "grad_norm": 3.7805559635162354, "learning_rate": 1.495e-07, "loss": 9.6573, "step": 299 }, { "epoch": 0.09404388714733543, "grad_norm": 3.8729629516601562, "learning_rate": 1.5000000000000002e-07, "loss": 10.3939, "step": 300 }, { "epoch": 0.09435736677115987, "grad_norm": 4.191401481628418, "learning_rate": 1.505e-07, "loss": 11.4678, "step": 301 }, { "epoch": 0.09467084639498433, "grad_norm": 3.4422829151153564, "learning_rate": 1.5100000000000002e-07, "loss": 7.9499, "step": 302 }, { "epoch": 0.09498432601880878, "grad_norm": 3.8708252906799316, "learning_rate": 1.5150000000000003e-07, "loss": 9.9911, "step": 303 }, { "epoch": 0.09529780564263322, "grad_norm": 3.080570936203003, "learning_rate": 1.52e-07, "loss": 7.4803, "step": 304 }, { "epoch": 0.09561128526645768, "grad_norm": 3.9624032974243164, "learning_rate": 1.5250000000000002e-07, "loss": 10.5125, "step": 305 }, { "epoch": 0.09592476489028214, "grad_norm": 4.132358551025391, "learning_rate": 1.53e-07, "loss": 10.1554, "step": 306 }, { "epoch": 0.09623824451410658, "grad_norm": 3.146574020385742, "learning_rate": 1.5350000000000002e-07, "loss": 6.7796, "step": 307 }, { "epoch": 0.09655172413793103, "grad_norm": 3.1497974395751953, "learning_rate": 1.5400000000000003e-07, "loss": 8.9689, "step": 308 }, { "epoch": 0.09686520376175549, "grad_norm": 4.384578227996826, "learning_rate": 1.5450000000000001e-07, "loss": 12.2934, "step": 309 }, { "epoch": 0.09717868338557993, "grad_norm": 3.5674219131469727, "learning_rate": 1.5500000000000002e-07, "loss": 10.6616, "step": 310 }, { "epoch": 0.09749216300940439, "grad_norm": 4.463435649871826, "learning_rate": 1.555e-07, "loss": 7.4585, "step": 311 }, { "epoch": 0.09780564263322884, "grad_norm": 4.1657609939575195, "learning_rate": 1.56e-07, "loss": 8.5832, "step": 312 }, { "epoch": 0.0981191222570533, "grad_norm": 5.202960968017578, "learning_rate": 1.5650000000000003e-07, "loss": 12.9104, "step": 313 }, { "epoch": 0.09843260188087774, "grad_norm": 4.485203742980957, "learning_rate": 1.57e-07, "loss": 7.8665, "step": 314 }, { "epoch": 0.0987460815047022, "grad_norm": 3.092864751815796, "learning_rate": 1.575e-07, "loss": 8.4677, "step": 315 }, { "epoch": 0.09905956112852665, "grad_norm": 3.2465710639953613, "learning_rate": 1.5800000000000004e-07, "loss": 7.498, "step": 316 }, { "epoch": 0.0993730407523511, "grad_norm": 5.711244106292725, "learning_rate": 1.585e-07, "loss": 12.3313, "step": 317 }, { "epoch": 0.09968652037617555, "grad_norm": 3.7532050609588623, "learning_rate": 1.5900000000000003e-07, "loss": 10.8737, "step": 318 }, { "epoch": 0.1, "grad_norm": 4.172240257263184, "learning_rate": 1.595e-07, "loss": 8.8672, "step": 319 }, { "epoch": 0.10031347962382445, "grad_norm": 3.577533721923828, "learning_rate": 1.6e-07, "loss": 11.197, "step": 320 }, { "epoch": 0.1006269592476489, "grad_norm": 3.8616108894348145, "learning_rate": 1.605e-07, "loss": 9.6396, "step": 321 }, { "epoch": 0.10094043887147336, "grad_norm": 5.926366329193115, "learning_rate": 1.61e-07, "loss": 12.75, "step": 322 }, { "epoch": 0.1012539184952978, "grad_norm": 3.613895893096924, "learning_rate": 1.6150000000000004e-07, "loss": 7.4106, "step": 323 }, { "epoch": 0.10156739811912226, "grad_norm": 3.902071952819824, "learning_rate": 1.62e-07, "loss": 11.4143, "step": 324 }, { "epoch": 0.10188087774294671, "grad_norm": 2.8985440731048584, "learning_rate": 1.625e-07, "loss": 7.3166, "step": 325 }, { "epoch": 0.10219435736677115, "grad_norm": 3.545724391937256, "learning_rate": 1.63e-07, "loss": 7.8454, "step": 326 }, { "epoch": 0.10250783699059561, "grad_norm": 3.259948253631592, "learning_rate": 1.635e-07, "loss": 8.4287, "step": 327 }, { "epoch": 0.10282131661442007, "grad_norm": 3.207242012023926, "learning_rate": 1.6400000000000004e-07, "loss": 7.234, "step": 328 }, { "epoch": 0.10313479623824451, "grad_norm": 3.2320971488952637, "learning_rate": 1.645e-07, "loss": 7.4103, "step": 329 }, { "epoch": 0.10344827586206896, "grad_norm": 4.266717910766602, "learning_rate": 1.65e-07, "loss": 10.7557, "step": 330 }, { "epoch": 0.10376175548589342, "grad_norm": 4.703216075897217, "learning_rate": 1.655e-07, "loss": 13.6921, "step": 331 }, { "epoch": 0.10407523510971786, "grad_norm": 2.98171329498291, "learning_rate": 1.66e-07, "loss": 6.4217, "step": 332 }, { "epoch": 0.10438871473354232, "grad_norm": 4.266834735870361, "learning_rate": 1.6650000000000004e-07, "loss": 8.7814, "step": 333 }, { "epoch": 0.10470219435736677, "grad_norm": 3.13301420211792, "learning_rate": 1.67e-07, "loss": 7.5465, "step": 334 }, { "epoch": 0.10501567398119123, "grad_norm": 3.93803071975708, "learning_rate": 1.675e-07, "loss": 9.7471, "step": 335 }, { "epoch": 0.10532915360501567, "grad_norm": 3.124204397201538, "learning_rate": 1.68e-07, "loss": 8.2321, "step": 336 }, { "epoch": 0.10564263322884013, "grad_norm": 3.1418778896331787, "learning_rate": 1.685e-07, "loss": 8.8142, "step": 337 }, { "epoch": 0.10595611285266458, "grad_norm": 3.865973472595215, "learning_rate": 1.69e-07, "loss": 10.8355, "step": 338 }, { "epoch": 0.10626959247648902, "grad_norm": 3.5527279376983643, "learning_rate": 1.695e-07, "loss": 7.6143, "step": 339 }, { "epoch": 0.10658307210031348, "grad_norm": 3.353538751602173, "learning_rate": 1.7000000000000001e-07, "loss": 9.4709, "step": 340 }, { "epoch": 0.10689655172413794, "grad_norm": 3.7409756183624268, "learning_rate": 1.705e-07, "loss": 11.7136, "step": 341 }, { "epoch": 0.10721003134796238, "grad_norm": 4.518954277038574, "learning_rate": 1.71e-07, "loss": 8.8255, "step": 342 }, { "epoch": 0.10752351097178683, "grad_norm": 4.619668960571289, "learning_rate": 1.715e-07, "loss": 12.3065, "step": 343 }, { "epoch": 0.10783699059561129, "grad_norm": 3.260843276977539, "learning_rate": 1.72e-07, "loss": 9.3232, "step": 344 }, { "epoch": 0.10815047021943573, "grad_norm": 3.9291486740112305, "learning_rate": 1.7250000000000002e-07, "loss": 9.9018, "step": 345 }, { "epoch": 0.10846394984326019, "grad_norm": 4.316411972045898, "learning_rate": 1.73e-07, "loss": 8.0697, "step": 346 }, { "epoch": 0.10877742946708464, "grad_norm": 3.2413835525512695, "learning_rate": 1.735e-07, "loss": 8.5543, "step": 347 }, { "epoch": 0.10909090909090909, "grad_norm": 2.881052017211914, "learning_rate": 1.74e-07, "loss": 6.481, "step": 348 }, { "epoch": 0.10940438871473354, "grad_norm": 3.0879931449890137, "learning_rate": 1.745e-07, "loss": 9.835, "step": 349 }, { "epoch": 0.109717868338558, "grad_norm": 3.5517656803131104, "learning_rate": 1.7500000000000002e-07, "loss": 8.4705, "step": 350 }, { "epoch": 0.11003134796238244, "grad_norm": 3.764478921890259, "learning_rate": 1.755e-07, "loss": 9.458, "step": 351 }, { "epoch": 0.1103448275862069, "grad_norm": 5.203456878662109, "learning_rate": 1.7600000000000001e-07, "loss": 11.9955, "step": 352 }, { "epoch": 0.11065830721003135, "grad_norm": 4.887335777282715, "learning_rate": 1.765e-07, "loss": 11.2742, "step": 353 }, { "epoch": 0.11097178683385579, "grad_norm": 3.6964869499206543, "learning_rate": 1.77e-07, "loss": 8.1842, "step": 354 }, { "epoch": 0.11128526645768025, "grad_norm": 3.0974836349487305, "learning_rate": 1.775e-07, "loss": 7.7249, "step": 355 }, { "epoch": 0.1115987460815047, "grad_norm": 3.3595480918884277, "learning_rate": 1.78e-07, "loss": 7.27, "step": 356 }, { "epoch": 0.11191222570532916, "grad_norm": 3.602126359939575, "learning_rate": 1.7850000000000002e-07, "loss": 10.0521, "step": 357 }, { "epoch": 0.1122257053291536, "grad_norm": 3.0182366371154785, "learning_rate": 1.79e-07, "loss": 8.1576, "step": 358 }, { "epoch": 0.11253918495297806, "grad_norm": 7.666667461395264, "learning_rate": 1.795e-07, "loss": 17.8031, "step": 359 }, { "epoch": 0.11285266457680251, "grad_norm": 3.1000211238861084, "learning_rate": 1.8e-07, "loss": 8.0788, "step": 360 }, { "epoch": 0.11316614420062696, "grad_norm": 3.830988883972168, "learning_rate": 1.805e-07, "loss": 10.3451, "step": 361 }, { "epoch": 0.11347962382445141, "grad_norm": 3.502091407775879, "learning_rate": 1.8100000000000002e-07, "loss": 7.3287, "step": 362 }, { "epoch": 0.11379310344827587, "grad_norm": 3.8298943042755127, "learning_rate": 1.815e-07, "loss": 7.3115, "step": 363 }, { "epoch": 0.11410658307210031, "grad_norm": 3.5254364013671875, "learning_rate": 1.8200000000000002e-07, "loss": 10.1726, "step": 364 }, { "epoch": 0.11442006269592477, "grad_norm": 2.8689956665039062, "learning_rate": 1.825e-07, "loss": 5.9041, "step": 365 }, { "epoch": 0.11473354231974922, "grad_norm": 3.9380292892456055, "learning_rate": 1.83e-07, "loss": 8.7651, "step": 366 }, { "epoch": 0.11504702194357366, "grad_norm": 4.940433502197266, "learning_rate": 1.8350000000000002e-07, "loss": 13.3309, "step": 367 }, { "epoch": 0.11536050156739812, "grad_norm": 3.1874752044677734, "learning_rate": 1.84e-07, "loss": 7.9231, "step": 368 }, { "epoch": 0.11567398119122257, "grad_norm": 3.361844778060913, "learning_rate": 1.8450000000000002e-07, "loss": 8.2631, "step": 369 }, { "epoch": 0.11598746081504702, "grad_norm": 4.501951217651367, "learning_rate": 1.85e-07, "loss": 11.7963, "step": 370 }, { "epoch": 0.11630094043887147, "grad_norm": 3.1025617122650146, "learning_rate": 1.8550000000000001e-07, "loss": 7.3833, "step": 371 }, { "epoch": 0.11661442006269593, "grad_norm": 3.30342960357666, "learning_rate": 1.86e-07, "loss": 8.7881, "step": 372 }, { "epoch": 0.11692789968652037, "grad_norm": 3.470710515975952, "learning_rate": 1.865e-07, "loss": 7.7015, "step": 373 }, { "epoch": 0.11724137931034483, "grad_norm": 3.2457809448242188, "learning_rate": 1.8700000000000002e-07, "loss": 8.8045, "step": 374 }, { "epoch": 0.11755485893416928, "grad_norm": 3.0047662258148193, "learning_rate": 1.875e-07, "loss": 8.3349, "step": 375 }, { "epoch": 0.11786833855799372, "grad_norm": 3.3185179233551025, "learning_rate": 1.8800000000000002e-07, "loss": 6.7955, "step": 376 }, { "epoch": 0.11818181818181818, "grad_norm": 3.7260663509368896, "learning_rate": 1.885e-07, "loss": 8.8171, "step": 377 }, { "epoch": 0.11849529780564264, "grad_norm": 3.9109082221984863, "learning_rate": 1.89e-07, "loss": 9.8559, "step": 378 }, { "epoch": 0.11880877742946709, "grad_norm": 3.2869834899902344, "learning_rate": 1.8950000000000002e-07, "loss": 11.0736, "step": 379 }, { "epoch": 0.11912225705329153, "grad_norm": 4.131401538848877, "learning_rate": 1.9e-07, "loss": 11.9681, "step": 380 }, { "epoch": 0.11943573667711599, "grad_norm": 3.241637945175171, "learning_rate": 1.9050000000000002e-07, "loss": 9.0305, "step": 381 }, { "epoch": 0.11974921630094044, "grad_norm": 3.635667324066162, "learning_rate": 1.91e-07, "loss": 9.6321, "step": 382 }, { "epoch": 0.12006269592476489, "grad_norm": 2.901669979095459, "learning_rate": 1.9150000000000001e-07, "loss": 7.8995, "step": 383 }, { "epoch": 0.12037617554858934, "grad_norm": 3.7463796138763428, "learning_rate": 1.92e-07, "loss": 7.9248, "step": 384 }, { "epoch": 0.1206896551724138, "grad_norm": 3.811286211013794, "learning_rate": 1.925e-07, "loss": 7.6093, "step": 385 }, { "epoch": 0.12100313479623824, "grad_norm": 3.3971803188323975, "learning_rate": 1.9300000000000002e-07, "loss": 7.9361, "step": 386 }, { "epoch": 0.1213166144200627, "grad_norm": 3.552212953567505, "learning_rate": 1.935e-07, "loss": 9.3616, "step": 387 }, { "epoch": 0.12163009404388715, "grad_norm": 3.006312608718872, "learning_rate": 1.9400000000000002e-07, "loss": 8.1449, "step": 388 }, { "epoch": 0.1219435736677116, "grad_norm": 3.3143630027770996, "learning_rate": 1.945e-07, "loss": 8.2126, "step": 389 }, { "epoch": 0.12225705329153605, "grad_norm": 3.8813610076904297, "learning_rate": 1.95e-07, "loss": 10.1397, "step": 390 }, { "epoch": 0.1225705329153605, "grad_norm": 2.9321842193603516, "learning_rate": 1.9550000000000002e-07, "loss": 6.3588, "step": 391 }, { "epoch": 0.12288401253918495, "grad_norm": 3.752800941467285, "learning_rate": 1.96e-07, "loss": 8.3612, "step": 392 }, { "epoch": 0.1231974921630094, "grad_norm": 3.759476661682129, "learning_rate": 1.9650000000000002e-07, "loss": 8.3461, "step": 393 }, { "epoch": 0.12351097178683386, "grad_norm": 2.937373161315918, "learning_rate": 1.97e-07, "loss": 6.9189, "step": 394 }, { "epoch": 0.1238244514106583, "grad_norm": 3.327111005783081, "learning_rate": 1.9750000000000001e-07, "loss": 9.0745, "step": 395 }, { "epoch": 0.12413793103448276, "grad_norm": 2.807039976119995, "learning_rate": 1.9800000000000003e-07, "loss": 6.5768, "step": 396 }, { "epoch": 0.12445141065830721, "grad_norm": 4.174190044403076, "learning_rate": 1.985e-07, "loss": 14.798, "step": 397 }, { "epoch": 0.12476489028213165, "grad_norm": 3.8052163124084473, "learning_rate": 1.9900000000000002e-07, "loss": 7.4751, "step": 398 }, { "epoch": 0.1250783699059561, "grad_norm": 3.2749545574188232, "learning_rate": 1.995e-07, "loss": 6.8837, "step": 399 }, { "epoch": 0.12539184952978055, "grad_norm": 5.2682695388793945, "learning_rate": 2.0000000000000002e-07, "loss": 17.3949, "step": 400 }, { "epoch": 0.12570532915360502, "grad_norm": 3.6819639205932617, "learning_rate": 2.005e-07, "loss": 8.8539, "step": 401 }, { "epoch": 0.12601880877742946, "grad_norm": 3.480807304382324, "learning_rate": 2.01e-07, "loss": 7.9894, "step": 402 }, { "epoch": 0.1263322884012539, "grad_norm": 4.27021598815918, "learning_rate": 2.0150000000000002e-07, "loss": 10.9579, "step": 403 }, { "epoch": 0.12664576802507838, "grad_norm": 3.2545857429504395, "learning_rate": 2.02e-07, "loss": 7.1311, "step": 404 }, { "epoch": 0.12695924764890282, "grad_norm": 3.5351126194000244, "learning_rate": 2.0250000000000002e-07, "loss": 6.8761, "step": 405 }, { "epoch": 0.12727272727272726, "grad_norm": 3.6219284534454346, "learning_rate": 2.03e-07, "loss": 8.3192, "step": 406 }, { "epoch": 0.12758620689655173, "grad_norm": 3.1589441299438477, "learning_rate": 2.0350000000000002e-07, "loss": 7.9498, "step": 407 }, { "epoch": 0.12789968652037617, "grad_norm": 3.862013101577759, "learning_rate": 2.0400000000000003e-07, "loss": 12.2729, "step": 408 }, { "epoch": 0.1282131661442006, "grad_norm": 3.1884796619415283, "learning_rate": 2.045e-07, "loss": 6.5746, "step": 409 }, { "epoch": 0.12852664576802508, "grad_norm": 2.733370065689087, "learning_rate": 2.0500000000000002e-07, "loss": 6.8392, "step": 410 }, { "epoch": 0.12884012539184952, "grad_norm": 4.183924198150635, "learning_rate": 2.055e-07, "loss": 9.7881, "step": 411 }, { "epoch": 0.129153605015674, "grad_norm": 4.186039924621582, "learning_rate": 2.0600000000000002e-07, "loss": 7.9172, "step": 412 }, { "epoch": 0.12946708463949844, "grad_norm": 4.129965305328369, "learning_rate": 2.0650000000000003e-07, "loss": 8.3458, "step": 413 }, { "epoch": 0.12978056426332288, "grad_norm": 3.3639910221099854, "learning_rate": 2.0700000000000001e-07, "loss": 7.8052, "step": 414 }, { "epoch": 0.13009404388714735, "grad_norm": 3.8352103233337402, "learning_rate": 2.0750000000000003e-07, "loss": 10.0176, "step": 415 }, { "epoch": 0.1304075235109718, "grad_norm": 3.672574281692505, "learning_rate": 2.08e-07, "loss": 10.4668, "step": 416 }, { "epoch": 0.13072100313479623, "grad_norm": 2.9237465858459473, "learning_rate": 2.0850000000000002e-07, "loss": 5.7591, "step": 417 }, { "epoch": 0.1310344827586207, "grad_norm": 4.392944812774658, "learning_rate": 2.09e-07, "loss": 10.9199, "step": 418 }, { "epoch": 0.13134796238244514, "grad_norm": 4.115564823150635, "learning_rate": 2.0950000000000002e-07, "loss": 7.3922, "step": 419 }, { "epoch": 0.13166144200626959, "grad_norm": 3.255119562149048, "learning_rate": 2.1000000000000003e-07, "loss": 7.2558, "step": 420 }, { "epoch": 0.13197492163009406, "grad_norm": 2.9578206539154053, "learning_rate": 2.105e-07, "loss": 8.0258, "step": 421 }, { "epoch": 0.1322884012539185, "grad_norm": 3.8864405155181885, "learning_rate": 2.1100000000000002e-07, "loss": 9.2595, "step": 422 }, { "epoch": 0.13260188087774294, "grad_norm": 3.2227253913879395, "learning_rate": 2.115e-07, "loss": 7.2088, "step": 423 }, { "epoch": 0.1329153605015674, "grad_norm": 3.5899605751037598, "learning_rate": 2.1200000000000002e-07, "loss": 8.3448, "step": 424 }, { "epoch": 0.13322884012539185, "grad_norm": 3.725222110748291, "learning_rate": 2.1250000000000003e-07, "loss": 8.4943, "step": 425 }, { "epoch": 0.1335423197492163, "grad_norm": 4.701845645904541, "learning_rate": 2.1300000000000001e-07, "loss": 12.9665, "step": 426 }, { "epoch": 0.13385579937304076, "grad_norm": 3.239241123199463, "learning_rate": 2.1350000000000003e-07, "loss": 7.3724, "step": 427 }, { "epoch": 0.1341692789968652, "grad_norm": 2.888547420501709, "learning_rate": 2.14e-07, "loss": 7.2136, "step": 428 }, { "epoch": 0.13448275862068965, "grad_norm": 3.1289238929748535, "learning_rate": 2.1450000000000002e-07, "loss": 8.2064, "step": 429 }, { "epoch": 0.13479623824451412, "grad_norm": 3.992865800857544, "learning_rate": 2.15e-07, "loss": 8.4046, "step": 430 }, { "epoch": 0.13510971786833856, "grad_norm": 3.2021284103393555, "learning_rate": 2.1550000000000002e-07, "loss": 12.1033, "step": 431 }, { "epoch": 0.135423197492163, "grad_norm": 3.114281177520752, "learning_rate": 2.1600000000000003e-07, "loss": 7.0616, "step": 432 }, { "epoch": 0.13573667711598747, "grad_norm": 3.175734043121338, "learning_rate": 2.165e-07, "loss": 7.1838, "step": 433 }, { "epoch": 0.1360501567398119, "grad_norm": 3.4493837356567383, "learning_rate": 2.1700000000000002e-07, "loss": 7.9487, "step": 434 }, { "epoch": 0.13636363636363635, "grad_norm": 3.04085373878479, "learning_rate": 2.175e-07, "loss": 7.6261, "step": 435 }, { "epoch": 0.13667711598746082, "grad_norm": 4.71550989151001, "learning_rate": 2.1800000000000002e-07, "loss": 13.7869, "step": 436 }, { "epoch": 0.13699059561128527, "grad_norm": 3.4197592735290527, "learning_rate": 2.1850000000000003e-07, "loss": 7.7423, "step": 437 }, { "epoch": 0.1373040752351097, "grad_norm": 3.3364362716674805, "learning_rate": 2.1900000000000002e-07, "loss": 8.9017, "step": 438 }, { "epoch": 0.13761755485893418, "grad_norm": 3.6165454387664795, "learning_rate": 2.1950000000000003e-07, "loss": 7.4735, "step": 439 }, { "epoch": 0.13793103448275862, "grad_norm": 5.154653072357178, "learning_rate": 2.2e-07, "loss": 14.2502, "step": 440 }, { "epoch": 0.13824451410658306, "grad_norm": 3.3679981231689453, "learning_rate": 2.2050000000000002e-07, "loss": 7.7449, "step": 441 }, { "epoch": 0.13855799373040753, "grad_norm": 3.918351888656616, "learning_rate": 2.2100000000000003e-07, "loss": 10.1961, "step": 442 }, { "epoch": 0.13887147335423197, "grad_norm": 3.199449300765991, "learning_rate": 2.2150000000000002e-07, "loss": 7.5792, "step": 443 }, { "epoch": 0.13918495297805641, "grad_norm": 3.322082757949829, "learning_rate": 2.2200000000000003e-07, "loss": 7.9956, "step": 444 }, { "epoch": 0.13949843260188088, "grad_norm": 3.1004014015197754, "learning_rate": 2.2250000000000001e-07, "loss": 7.0897, "step": 445 }, { "epoch": 0.13981191222570533, "grad_norm": 3.9262382984161377, "learning_rate": 2.2300000000000002e-07, "loss": 9.4931, "step": 446 }, { "epoch": 0.14012539184952977, "grad_norm": 3.103273868560791, "learning_rate": 2.235e-07, "loss": 6.7228, "step": 447 }, { "epoch": 0.14043887147335424, "grad_norm": 3.1082265377044678, "learning_rate": 2.2400000000000002e-07, "loss": 8.3279, "step": 448 }, { "epoch": 0.14075235109717868, "grad_norm": 3.3036298751831055, "learning_rate": 2.2450000000000003e-07, "loss": 7.875, "step": 449 }, { "epoch": 0.14106583072100312, "grad_norm": 3.8810064792633057, "learning_rate": 2.2500000000000002e-07, "loss": 8.3544, "step": 450 }, { "epoch": 0.1413793103448276, "grad_norm": 4.209482192993164, "learning_rate": 2.2550000000000003e-07, "loss": 9.1295, "step": 451 }, { "epoch": 0.14169278996865203, "grad_norm": 3.419205665588379, "learning_rate": 2.26e-07, "loss": 6.9125, "step": 452 }, { "epoch": 0.1420062695924765, "grad_norm": 3.985367774963379, "learning_rate": 2.2650000000000002e-07, "loss": 11.5769, "step": 453 }, { "epoch": 0.14231974921630094, "grad_norm": 3.542846202850342, "learning_rate": 2.2700000000000003e-07, "loss": 8.6733, "step": 454 }, { "epoch": 0.1426332288401254, "grad_norm": 3.4622037410736084, "learning_rate": 2.2750000000000002e-07, "loss": 9.0828, "step": 455 }, { "epoch": 0.14294670846394986, "grad_norm": 3.1978418827056885, "learning_rate": 2.2800000000000003e-07, "loss": 6.9524, "step": 456 }, { "epoch": 0.1432601880877743, "grad_norm": 3.3328914642333984, "learning_rate": 2.2850000000000001e-07, "loss": 7.4014, "step": 457 }, { "epoch": 0.14357366771159874, "grad_norm": 3.5065202713012695, "learning_rate": 2.2900000000000003e-07, "loss": 9.0877, "step": 458 }, { "epoch": 0.1438871473354232, "grad_norm": 6.033443927764893, "learning_rate": 2.2950000000000004e-07, "loss": 15.9271, "step": 459 }, { "epoch": 0.14420062695924765, "grad_norm": 4.368375301361084, "learning_rate": 2.3000000000000002e-07, "loss": 11.6588, "step": 460 }, { "epoch": 0.1445141065830721, "grad_norm": 3.0983519554138184, "learning_rate": 2.3050000000000003e-07, "loss": 6.6645, "step": 461 }, { "epoch": 0.14482758620689656, "grad_norm": 3.8745956420898438, "learning_rate": 2.3100000000000002e-07, "loss": 10.457, "step": 462 }, { "epoch": 0.145141065830721, "grad_norm": 3.9520504474639893, "learning_rate": 2.3150000000000003e-07, "loss": 10.4561, "step": 463 }, { "epoch": 0.14545454545454545, "grad_norm": 8.04711627960205, "learning_rate": 2.32e-07, "loss": 19.9331, "step": 464 }, { "epoch": 0.14576802507836992, "grad_norm": 4.129817485809326, "learning_rate": 2.3250000000000002e-07, "loss": 8.8334, "step": 465 }, { "epoch": 0.14608150470219436, "grad_norm": 4.049325466156006, "learning_rate": 2.3300000000000003e-07, "loss": 8.772, "step": 466 }, { "epoch": 0.1463949843260188, "grad_norm": 4.55631160736084, "learning_rate": 2.3350000000000002e-07, "loss": 13.48, "step": 467 }, { "epoch": 0.14670846394984327, "grad_norm": 3.488584518432617, "learning_rate": 2.3400000000000003e-07, "loss": 10.4798, "step": 468 }, { "epoch": 0.1470219435736677, "grad_norm": 5.771771430969238, "learning_rate": 2.3450000000000002e-07, "loss": 11.7797, "step": 469 }, { "epoch": 0.14733542319749215, "grad_norm": 3.116539716720581, "learning_rate": 2.3500000000000003e-07, "loss": 6.873, "step": 470 }, { "epoch": 0.14764890282131662, "grad_norm": 2.576658248901367, "learning_rate": 2.3550000000000004e-07, "loss": 7.21, "step": 471 }, { "epoch": 0.14796238244514107, "grad_norm": 3.1541895866394043, "learning_rate": 2.3600000000000002e-07, "loss": 7.4172, "step": 472 }, { "epoch": 0.1482758620689655, "grad_norm": 4.149017810821533, "learning_rate": 2.3650000000000003e-07, "loss": 13.571, "step": 473 }, { "epoch": 0.14858934169278998, "grad_norm": 4.210467338562012, "learning_rate": 2.3700000000000002e-07, "loss": 11.6499, "step": 474 }, { "epoch": 0.14890282131661442, "grad_norm": 5.193135738372803, "learning_rate": 2.3750000000000003e-07, "loss": 17.2001, "step": 475 }, { "epoch": 0.14921630094043886, "grad_norm": 3.6134843826293945, "learning_rate": 2.3800000000000004e-07, "loss": 6.4639, "step": 476 }, { "epoch": 0.14952978056426333, "grad_norm": 5.032776355743408, "learning_rate": 2.385e-07, "loss": 13.8185, "step": 477 }, { "epoch": 0.14984326018808777, "grad_norm": 4.616049766540527, "learning_rate": 2.39e-07, "loss": 10.5861, "step": 478 }, { "epoch": 0.15015673981191222, "grad_norm": 3.1708860397338867, "learning_rate": 2.395e-07, "loss": 7.8613, "step": 479 }, { "epoch": 0.15047021943573669, "grad_norm": 4.432284355163574, "learning_rate": 2.4000000000000003e-07, "loss": 10.7543, "step": 480 }, { "epoch": 0.15078369905956113, "grad_norm": 3.439668655395508, "learning_rate": 2.405e-07, "loss": 7.4266, "step": 481 }, { "epoch": 0.15109717868338557, "grad_norm": 4.03348445892334, "learning_rate": 2.41e-07, "loss": 11.5268, "step": 482 }, { "epoch": 0.15141065830721004, "grad_norm": 3.4068636894226074, "learning_rate": 2.4150000000000004e-07, "loss": 7.1183, "step": 483 }, { "epoch": 0.15172413793103448, "grad_norm": 4.846498966217041, "learning_rate": 2.42e-07, "loss": 11.0191, "step": 484 }, { "epoch": 0.15203761755485892, "grad_norm": 5.300605773925781, "learning_rate": 2.425e-07, "loss": 9.6552, "step": 485 }, { "epoch": 0.1523510971786834, "grad_norm": 4.646756649017334, "learning_rate": 2.43e-07, "loss": 13.7772, "step": 486 }, { "epoch": 0.15266457680250783, "grad_norm": 3.7247793674468994, "learning_rate": 2.4350000000000003e-07, "loss": 8.6148, "step": 487 }, { "epoch": 0.15297805642633228, "grad_norm": 3.396324634552002, "learning_rate": 2.44e-07, "loss": 11.5777, "step": 488 }, { "epoch": 0.15329153605015675, "grad_norm": 3.7958171367645264, "learning_rate": 2.445e-07, "loss": 8.7151, "step": 489 }, { "epoch": 0.1536050156739812, "grad_norm": 3.312392473220825, "learning_rate": 2.4500000000000004e-07, "loss": 8.2649, "step": 490 }, { "epoch": 0.15391849529780563, "grad_norm": 2.4669861793518066, "learning_rate": 2.455e-07, "loss": 6.4529, "step": 491 }, { "epoch": 0.1542319749216301, "grad_norm": 5.108162879943848, "learning_rate": 2.46e-07, "loss": 14.7079, "step": 492 }, { "epoch": 0.15454545454545454, "grad_norm": 3.430584192276001, "learning_rate": 2.465e-07, "loss": 8.4653, "step": 493 }, { "epoch": 0.15485893416927898, "grad_norm": 3.0151519775390625, "learning_rate": 2.4700000000000003e-07, "loss": 6.9225, "step": 494 }, { "epoch": 0.15517241379310345, "grad_norm": 4.027904510498047, "learning_rate": 2.475e-07, "loss": 8.5251, "step": 495 }, { "epoch": 0.1554858934169279, "grad_norm": 3.500556707382202, "learning_rate": 2.48e-07, "loss": 9.9441, "step": 496 }, { "epoch": 0.15579937304075236, "grad_norm": 3.032302141189575, "learning_rate": 2.4850000000000003e-07, "loss": 6.9624, "step": 497 }, { "epoch": 0.1561128526645768, "grad_norm": 4.127997875213623, "learning_rate": 2.49e-07, "loss": 9.5575, "step": 498 }, { "epoch": 0.15642633228840125, "grad_norm": 3.3242340087890625, "learning_rate": 2.495e-07, "loss": 7.8737, "step": 499 }, { "epoch": 0.15673981191222572, "grad_norm": 3.514852285385132, "learning_rate": 2.5000000000000004e-07, "loss": 7.5601, "step": 500 }, { "epoch": 0.15705329153605016, "grad_norm": 3.9590742588043213, "learning_rate": 2.505e-07, "loss": 8.2909, "step": 501 }, { "epoch": 0.1573667711598746, "grad_norm": 3.0089352130889893, "learning_rate": 2.51e-07, "loss": 8.9848, "step": 502 }, { "epoch": 0.15768025078369907, "grad_norm": 4.294312477111816, "learning_rate": 2.515e-07, "loss": 10.3314, "step": 503 }, { "epoch": 0.1579937304075235, "grad_norm": 4.103808879852295, "learning_rate": 2.5200000000000003e-07, "loss": 9.262, "step": 504 }, { "epoch": 0.15830721003134796, "grad_norm": 3.4118666648864746, "learning_rate": 2.525e-07, "loss": 7.6306, "step": 505 }, { "epoch": 0.15862068965517243, "grad_norm": 3.144343137741089, "learning_rate": 2.53e-07, "loss": 9.7107, "step": 506 }, { "epoch": 0.15893416927899687, "grad_norm": 3.416949987411499, "learning_rate": 2.5350000000000004e-07, "loss": 7.1229, "step": 507 }, { "epoch": 0.1592476489028213, "grad_norm": 3.30198335647583, "learning_rate": 2.54e-07, "loss": 8.9435, "step": 508 }, { "epoch": 0.15956112852664578, "grad_norm": 3.7316057682037354, "learning_rate": 2.545e-07, "loss": 8.5075, "step": 509 }, { "epoch": 0.15987460815047022, "grad_norm": 3.468623638153076, "learning_rate": 2.55e-07, "loss": 8.5576, "step": 510 }, { "epoch": 0.16018808777429466, "grad_norm": 3.579921245574951, "learning_rate": 2.5550000000000003e-07, "loss": 8.5891, "step": 511 }, { "epoch": 0.16050156739811913, "grad_norm": 2.719144105911255, "learning_rate": 2.56e-07, "loss": 7.0097, "step": 512 }, { "epoch": 0.16081504702194357, "grad_norm": 3.503176689147949, "learning_rate": 2.565e-07, "loss": 9.2646, "step": 513 }, { "epoch": 0.16112852664576802, "grad_norm": 3.4942426681518555, "learning_rate": 2.5700000000000004e-07, "loss": 8.3605, "step": 514 }, { "epoch": 0.1614420062695925, "grad_norm": 3.6798880100250244, "learning_rate": 2.575e-07, "loss": 7.8945, "step": 515 }, { "epoch": 0.16175548589341693, "grad_norm": 2.953244924545288, "learning_rate": 2.58e-07, "loss": 6.319, "step": 516 }, { "epoch": 0.16206896551724137, "grad_norm": 4.936113357543945, "learning_rate": 2.5850000000000004e-07, "loss": 13.3042, "step": 517 }, { "epoch": 0.16238244514106584, "grad_norm": 4.359250545501709, "learning_rate": 2.5900000000000003e-07, "loss": 8.4494, "step": 518 }, { "epoch": 0.16269592476489028, "grad_norm": 3.1021082401275635, "learning_rate": 2.595e-07, "loss": 8.4413, "step": 519 }, { "epoch": 0.16300940438871472, "grad_norm": 3.71295166015625, "learning_rate": 2.6e-07, "loss": 9.4076, "step": 520 }, { "epoch": 0.1633228840125392, "grad_norm": 3.4278814792633057, "learning_rate": 2.6050000000000004e-07, "loss": 8.9707, "step": 521 }, { "epoch": 0.16363636363636364, "grad_norm": 4.640440464019775, "learning_rate": 2.61e-07, "loss": 12.3595, "step": 522 }, { "epoch": 0.16394984326018808, "grad_norm": 4.346306800842285, "learning_rate": 2.615e-07, "loss": 11.8391, "step": 523 }, { "epoch": 0.16426332288401255, "grad_norm": 3.444598436355591, "learning_rate": 2.6200000000000004e-07, "loss": 8.5561, "step": 524 }, { "epoch": 0.164576802507837, "grad_norm": 3.9495432376861572, "learning_rate": 2.6250000000000003e-07, "loss": 11.1607, "step": 525 }, { "epoch": 0.16489028213166143, "grad_norm": 3.858132839202881, "learning_rate": 2.63e-07, "loss": 8.4288, "step": 526 }, { "epoch": 0.1652037617554859, "grad_norm": 4.880698204040527, "learning_rate": 2.635e-07, "loss": 17.5724, "step": 527 }, { "epoch": 0.16551724137931034, "grad_norm": 5.0397162437438965, "learning_rate": 2.6400000000000003e-07, "loss": 10.3173, "step": 528 }, { "epoch": 0.16583072100313478, "grad_norm": 3.1536805629730225, "learning_rate": 2.645e-07, "loss": 7.8392, "step": 529 }, { "epoch": 0.16614420062695925, "grad_norm": 2.8486595153808594, "learning_rate": 2.65e-07, "loss": 7.3892, "step": 530 }, { "epoch": 0.1664576802507837, "grad_norm": 3.6035428047180176, "learning_rate": 2.6550000000000004e-07, "loss": 8.3855, "step": 531 }, { "epoch": 0.16677115987460814, "grad_norm": 3.0228993892669678, "learning_rate": 2.66e-07, "loss": 7.0978, "step": 532 }, { "epoch": 0.1670846394984326, "grad_norm": 3.3057663440704346, "learning_rate": 2.665e-07, "loss": 8.4415, "step": 533 }, { "epoch": 0.16739811912225705, "grad_norm": 3.630830764770508, "learning_rate": 2.6700000000000005e-07, "loss": 7.5833, "step": 534 }, { "epoch": 0.1677115987460815, "grad_norm": 4.190485954284668, "learning_rate": 2.6750000000000003e-07, "loss": 6.7527, "step": 535 }, { "epoch": 0.16802507836990596, "grad_norm": 4.3880486488342285, "learning_rate": 2.68e-07, "loss": 10.2591, "step": 536 }, { "epoch": 0.1683385579937304, "grad_norm": 3.767117738723755, "learning_rate": 2.685e-07, "loss": 6.5155, "step": 537 }, { "epoch": 0.16865203761755485, "grad_norm": 4.589762210845947, "learning_rate": 2.6900000000000004e-07, "loss": 11.3011, "step": 538 }, { "epoch": 0.16896551724137931, "grad_norm": 2.933931350708008, "learning_rate": 2.695e-07, "loss": 7.7405, "step": 539 }, { "epoch": 0.16927899686520376, "grad_norm": 4.077798843383789, "learning_rate": 2.7e-07, "loss": 7.7766, "step": 540 }, { "epoch": 0.16959247648902823, "grad_norm": 4.123392581939697, "learning_rate": 2.7050000000000005e-07, "loss": 8.437, "step": 541 }, { "epoch": 0.16990595611285267, "grad_norm": 3.4657938480377197, "learning_rate": 2.7100000000000003e-07, "loss": 8.0518, "step": 542 }, { "epoch": 0.1702194357366771, "grad_norm": 3.4541122913360596, "learning_rate": 2.715e-07, "loss": 8.7644, "step": 543 }, { "epoch": 0.17053291536050158, "grad_norm": 4.604305267333984, "learning_rate": 2.72e-07, "loss": 10.5831, "step": 544 }, { "epoch": 0.17084639498432602, "grad_norm": 3.2504711151123047, "learning_rate": 2.7250000000000004e-07, "loss": 8.0843, "step": 545 }, { "epoch": 0.17115987460815046, "grad_norm": 3.421630382537842, "learning_rate": 2.73e-07, "loss": 7.3776, "step": 546 }, { "epoch": 0.17147335423197493, "grad_norm": 3.5182442665100098, "learning_rate": 2.735e-07, "loss": 7.4899, "step": 547 }, { "epoch": 0.17178683385579938, "grad_norm": 3.437476396560669, "learning_rate": 2.7400000000000004e-07, "loss": 8.6969, "step": 548 }, { "epoch": 0.17210031347962382, "grad_norm": 3.5565860271453857, "learning_rate": 2.7450000000000003e-07, "loss": 7.6131, "step": 549 }, { "epoch": 0.1724137931034483, "grad_norm": 4.052206516265869, "learning_rate": 2.75e-07, "loss": 10.5389, "step": 550 }, { "epoch": 0.17272727272727273, "grad_norm": 4.916564464569092, "learning_rate": 2.7550000000000005e-07, "loss": 13.2927, "step": 551 }, { "epoch": 0.17304075235109717, "grad_norm": 2.8071060180664062, "learning_rate": 2.7600000000000004e-07, "loss": 7.181, "step": 552 }, { "epoch": 0.17335423197492164, "grad_norm": 3.762843370437622, "learning_rate": 2.765e-07, "loss": 9.2195, "step": 553 }, { "epoch": 0.17366771159874608, "grad_norm": 4.453097820281982, "learning_rate": 2.77e-07, "loss": 11.1995, "step": 554 }, { "epoch": 0.17398119122257052, "grad_norm": 3.8368940353393555, "learning_rate": 2.7750000000000004e-07, "loss": 10.7022, "step": 555 }, { "epoch": 0.174294670846395, "grad_norm": 3.128065586090088, "learning_rate": 2.7800000000000003e-07, "loss": 7.396, "step": 556 }, { "epoch": 0.17460815047021944, "grad_norm": 3.999136447906494, "learning_rate": 2.785e-07, "loss": 9.3729, "step": 557 }, { "epoch": 0.17492163009404388, "grad_norm": 3.374950408935547, "learning_rate": 2.7900000000000005e-07, "loss": 6.9749, "step": 558 }, { "epoch": 0.17523510971786835, "grad_norm": 3.5031139850616455, "learning_rate": 2.7950000000000003e-07, "loss": 8.5784, "step": 559 }, { "epoch": 0.1755485893416928, "grad_norm": 3.912729263305664, "learning_rate": 2.8e-07, "loss": 8.9176, "step": 560 }, { "epoch": 0.17586206896551723, "grad_norm": 4.987295627593994, "learning_rate": 2.805e-07, "loss": 10.5289, "step": 561 }, { "epoch": 0.1761755485893417, "grad_norm": 5.127312183380127, "learning_rate": 2.8100000000000004e-07, "loss": 12.005, "step": 562 }, { "epoch": 0.17648902821316614, "grad_norm": 2.887765884399414, "learning_rate": 2.815e-07, "loss": 8.8117, "step": 563 }, { "epoch": 0.17680250783699059, "grad_norm": 3.11470890045166, "learning_rate": 2.82e-07, "loss": 9.1879, "step": 564 }, { "epoch": 0.17711598746081506, "grad_norm": 3.491626739501953, "learning_rate": 2.8250000000000005e-07, "loss": 8.4029, "step": 565 }, { "epoch": 0.1774294670846395, "grad_norm": 3.3320202827453613, "learning_rate": 2.8300000000000003e-07, "loss": 7.258, "step": 566 }, { "epoch": 0.17774294670846394, "grad_norm": 3.7208235263824463, "learning_rate": 2.835e-07, "loss": 9.4842, "step": 567 }, { "epoch": 0.1780564263322884, "grad_norm": 3.597604274749756, "learning_rate": 2.8400000000000005e-07, "loss": 8.5819, "step": 568 }, { "epoch": 0.17836990595611285, "grad_norm": 3.1107091903686523, "learning_rate": 2.8450000000000004e-07, "loss": 7.8507, "step": 569 }, { "epoch": 0.1786833855799373, "grad_norm": 2.587881565093994, "learning_rate": 2.85e-07, "loss": 7.2284, "step": 570 }, { "epoch": 0.17899686520376176, "grad_norm": 2.9738922119140625, "learning_rate": 2.855e-07, "loss": 8.5117, "step": 571 }, { "epoch": 0.1793103448275862, "grad_norm": 3.6646111011505127, "learning_rate": 2.8600000000000005e-07, "loss": 8.4424, "step": 572 }, { "epoch": 0.17962382445141065, "grad_norm": 3.749894380569458, "learning_rate": 2.8650000000000003e-07, "loss": 8.9965, "step": 573 }, { "epoch": 0.17993730407523512, "grad_norm": 2.9281930923461914, "learning_rate": 2.87e-07, "loss": 6.6291, "step": 574 }, { "epoch": 0.18025078369905956, "grad_norm": 5.141971111297607, "learning_rate": 2.8750000000000005e-07, "loss": 11.6456, "step": 575 }, { "epoch": 0.180564263322884, "grad_norm": 3.058166980743408, "learning_rate": 2.8800000000000004e-07, "loss": 7.7736, "step": 576 }, { "epoch": 0.18087774294670847, "grad_norm": 3.9422664642333984, "learning_rate": 2.885e-07, "loss": 11.6602, "step": 577 }, { "epoch": 0.1811912225705329, "grad_norm": 4.253895282745361, "learning_rate": 2.89e-07, "loss": 9.2913, "step": 578 }, { "epoch": 0.18150470219435735, "grad_norm": 3.428746461868286, "learning_rate": 2.8950000000000004e-07, "loss": 7.2803, "step": 579 }, { "epoch": 0.18181818181818182, "grad_norm": 2.983686685562134, "learning_rate": 2.9000000000000003e-07, "loss": 7.1757, "step": 580 }, { "epoch": 0.18213166144200627, "grad_norm": 3.508849859237671, "learning_rate": 2.905e-07, "loss": 8.3138, "step": 581 }, { "epoch": 0.18244514106583073, "grad_norm": 3.7815003395080566, "learning_rate": 2.9100000000000005e-07, "loss": 6.8592, "step": 582 }, { "epoch": 0.18275862068965518, "grad_norm": 3.492091417312622, "learning_rate": 2.9150000000000004e-07, "loss": 9.769, "step": 583 }, { "epoch": 0.18307210031347962, "grad_norm": 3.9170730113983154, "learning_rate": 2.92e-07, "loss": 6.7468, "step": 584 }, { "epoch": 0.1833855799373041, "grad_norm": 3.778157949447632, "learning_rate": 2.9250000000000006e-07, "loss": 7.6754, "step": 585 }, { "epoch": 0.18369905956112853, "grad_norm": 3.811721086502075, "learning_rate": 2.9300000000000004e-07, "loss": 9.9387, "step": 586 }, { "epoch": 0.18401253918495297, "grad_norm": 4.163631916046143, "learning_rate": 2.9350000000000003e-07, "loss": 8.2871, "step": 587 }, { "epoch": 0.18432601880877744, "grad_norm": 3.4269614219665527, "learning_rate": 2.94e-07, "loss": 7.114, "step": 588 }, { "epoch": 0.18463949843260188, "grad_norm": 3.990867853164673, "learning_rate": 2.9450000000000005e-07, "loss": 9.8128, "step": 589 }, { "epoch": 0.18495297805642633, "grad_norm": 3.4142584800720215, "learning_rate": 2.9500000000000003e-07, "loss": 7.7801, "step": 590 }, { "epoch": 0.1852664576802508, "grad_norm": 3.636627674102783, "learning_rate": 2.955e-07, "loss": 10.9509, "step": 591 }, { "epoch": 0.18557993730407524, "grad_norm": 4.59920072555542, "learning_rate": 2.9600000000000006e-07, "loss": 10.6065, "step": 592 }, { "epoch": 0.18589341692789968, "grad_norm": 3.2188518047332764, "learning_rate": 2.9650000000000004e-07, "loss": 7.1557, "step": 593 }, { "epoch": 0.18620689655172415, "grad_norm": 3.7860870361328125, "learning_rate": 2.97e-07, "loss": 7.7989, "step": 594 }, { "epoch": 0.1865203761755486, "grad_norm": 4.25967264175415, "learning_rate": 2.975e-07, "loss": 12.088, "step": 595 }, { "epoch": 0.18683385579937303, "grad_norm": 2.7574448585510254, "learning_rate": 2.9800000000000005e-07, "loss": 5.9382, "step": 596 }, { "epoch": 0.1871473354231975, "grad_norm": 3.17261004447937, "learning_rate": 2.9850000000000003e-07, "loss": 7.0805, "step": 597 }, { "epoch": 0.18746081504702194, "grad_norm": 4.3467793464660645, "learning_rate": 2.99e-07, "loss": 11.9778, "step": 598 }, { "epoch": 0.1877742946708464, "grad_norm": 4.424931526184082, "learning_rate": 2.9950000000000005e-07, "loss": 8.564, "step": 599 }, { "epoch": 0.18808777429467086, "grad_norm": 3.811779737472534, "learning_rate": 3.0000000000000004e-07, "loss": 7.9092, "step": 600 }, { "epoch": 0.1884012539184953, "grad_norm": 4.103071689605713, "learning_rate": 3.005e-07, "loss": 9.164, "step": 601 }, { "epoch": 0.18871473354231974, "grad_norm": 3.3931884765625, "learning_rate": 3.01e-07, "loss": 7.6855, "step": 602 }, { "epoch": 0.1890282131661442, "grad_norm": 3.917874574661255, "learning_rate": 3.0150000000000005e-07, "loss": 9.6698, "step": 603 }, { "epoch": 0.18934169278996865, "grad_norm": 4.343447685241699, "learning_rate": 3.0200000000000003e-07, "loss": 8.4255, "step": 604 }, { "epoch": 0.1896551724137931, "grad_norm": 4.0894927978515625, "learning_rate": 3.025e-07, "loss": 11.4714, "step": 605 }, { "epoch": 0.18996865203761756, "grad_norm": 4.6894073486328125, "learning_rate": 3.0300000000000005e-07, "loss": 10.4595, "step": 606 }, { "epoch": 0.190282131661442, "grad_norm": 2.9865052700042725, "learning_rate": 3.035e-07, "loss": 6.7574, "step": 607 }, { "epoch": 0.19059561128526645, "grad_norm": 3.2500112056732178, "learning_rate": 3.04e-07, "loss": 7.7867, "step": 608 }, { "epoch": 0.19090909090909092, "grad_norm": 3.837984323501587, "learning_rate": 3.0450000000000006e-07, "loss": 8.9823, "step": 609 }, { "epoch": 0.19122257053291536, "grad_norm": 5.894450664520264, "learning_rate": 3.0500000000000004e-07, "loss": 15.5915, "step": 610 }, { "epoch": 0.1915360501567398, "grad_norm": 3.60793137550354, "learning_rate": 3.0550000000000003e-07, "loss": 7.3988, "step": 611 }, { "epoch": 0.19184952978056427, "grad_norm": 3.7032573223114014, "learning_rate": 3.06e-07, "loss": 7.1858, "step": 612 }, { "epoch": 0.1921630094043887, "grad_norm": 3.515413522720337, "learning_rate": 3.0650000000000005e-07, "loss": 8.9439, "step": 613 }, { "epoch": 0.19247648902821315, "grad_norm": 4.349193096160889, "learning_rate": 3.0700000000000004e-07, "loss": 10.2601, "step": 614 }, { "epoch": 0.19278996865203762, "grad_norm": 3.940768241882324, "learning_rate": 3.075e-07, "loss": 10.1987, "step": 615 }, { "epoch": 0.19310344827586207, "grad_norm": 2.720353841781616, "learning_rate": 3.0800000000000006e-07, "loss": 6.7377, "step": 616 }, { "epoch": 0.1934169278996865, "grad_norm": 3.8752973079681396, "learning_rate": 3.085e-07, "loss": 8.5383, "step": 617 }, { "epoch": 0.19373040752351098, "grad_norm": 3.49699330329895, "learning_rate": 3.0900000000000003e-07, "loss": 8.1884, "step": 618 }, { "epoch": 0.19404388714733542, "grad_norm": 3.857430934906006, "learning_rate": 3.095e-07, "loss": 8.9309, "step": 619 }, { "epoch": 0.19435736677115986, "grad_norm": 3.196040153503418, "learning_rate": 3.1000000000000005e-07, "loss": 7.5568, "step": 620 }, { "epoch": 0.19467084639498433, "grad_norm": 3.7832367420196533, "learning_rate": 3.1050000000000003e-07, "loss": 9.6145, "step": 621 }, { "epoch": 0.19498432601880877, "grad_norm": 3.3154568672180176, "learning_rate": 3.11e-07, "loss": 7.3691, "step": 622 }, { "epoch": 0.19529780564263322, "grad_norm": 5.462655067443848, "learning_rate": 3.1150000000000006e-07, "loss": 12.7679, "step": 623 }, { "epoch": 0.19561128526645769, "grad_norm": 4.170969009399414, "learning_rate": 3.12e-07, "loss": 11.6538, "step": 624 }, { "epoch": 0.19592476489028213, "grad_norm": 5.751269817352295, "learning_rate": 3.125e-07, "loss": 13.9954, "step": 625 }, { "epoch": 0.1962382445141066, "grad_norm": 4.067015171051025, "learning_rate": 3.1300000000000006e-07, "loss": 6.5656, "step": 626 }, { "epoch": 0.19655172413793104, "grad_norm": 3.084688186645508, "learning_rate": 3.1350000000000005e-07, "loss": 8.2132, "step": 627 }, { "epoch": 0.19686520376175548, "grad_norm": 2.9886057376861572, "learning_rate": 3.14e-07, "loss": 6.7756, "step": 628 }, { "epoch": 0.19717868338557995, "grad_norm": 3.593235969543457, "learning_rate": 3.145e-07, "loss": 8.3709, "step": 629 }, { "epoch": 0.1974921630094044, "grad_norm": 3.363783597946167, "learning_rate": 3.15e-07, "loss": 10.0533, "step": 630 }, { "epoch": 0.19780564263322883, "grad_norm": 4.221756935119629, "learning_rate": 3.1550000000000004e-07, "loss": 11.3121, "step": 631 }, { "epoch": 0.1981191222570533, "grad_norm": 4.736835956573486, "learning_rate": 3.160000000000001e-07, "loss": 12.493, "step": 632 }, { "epoch": 0.19843260188087775, "grad_norm": 3.1203339099884033, "learning_rate": 3.165e-07, "loss": 7.1578, "step": 633 }, { "epoch": 0.1987460815047022, "grad_norm": 3.5580482482910156, "learning_rate": 3.17e-07, "loss": 7.742, "step": 634 }, { "epoch": 0.19905956112852666, "grad_norm": 3.4418790340423584, "learning_rate": 3.1750000000000003e-07, "loss": 9.445, "step": 635 }, { "epoch": 0.1993730407523511, "grad_norm": 3.2233057022094727, "learning_rate": 3.1800000000000007e-07, "loss": 8.102, "step": 636 }, { "epoch": 0.19968652037617554, "grad_norm": 3.335604190826416, "learning_rate": 3.1850000000000005e-07, "loss": 6.6202, "step": 637 }, { "epoch": 0.2, "grad_norm": 2.8534433841705322, "learning_rate": 3.19e-07, "loss": 7.2855, "step": 638 }, { "epoch": 0.20031347962382445, "grad_norm": 3.358614683151245, "learning_rate": 3.195e-07, "loss": 9.9256, "step": 639 }, { "epoch": 0.2006269592476489, "grad_norm": 3.0653884410858154, "learning_rate": 3.2e-07, "loss": 6.773, "step": 640 }, { "epoch": 0.20094043887147336, "grad_norm": 3.5670981407165527, "learning_rate": 3.2050000000000004e-07, "loss": 10.0116, "step": 641 }, { "epoch": 0.2012539184952978, "grad_norm": 3.6908740997314453, "learning_rate": 3.21e-07, "loss": 9.244, "step": 642 }, { "epoch": 0.20156739811912225, "grad_norm": 4.45837926864624, "learning_rate": 3.215e-07, "loss": 7.975, "step": 643 }, { "epoch": 0.20188087774294672, "grad_norm": 3.217262029647827, "learning_rate": 3.22e-07, "loss": 7.2521, "step": 644 }, { "epoch": 0.20219435736677116, "grad_norm": 3.4965760707855225, "learning_rate": 3.2250000000000004e-07, "loss": 7.2706, "step": 645 }, { "epoch": 0.2025078369905956, "grad_norm": 3.6670267581939697, "learning_rate": 3.2300000000000007e-07, "loss": 7.8659, "step": 646 }, { "epoch": 0.20282131661442007, "grad_norm": 3.4865596294403076, "learning_rate": 3.235e-07, "loss": 8.019, "step": 647 }, { "epoch": 0.2031347962382445, "grad_norm": 3.0378615856170654, "learning_rate": 3.24e-07, "loss": 8.2733, "step": 648 }, { "epoch": 0.20344827586206896, "grad_norm": 4.101233005523682, "learning_rate": 3.2450000000000003e-07, "loss": 8.759, "step": 649 }, { "epoch": 0.20376175548589343, "grad_norm": 3.027646064758301, "learning_rate": 3.25e-07, "loss": 7.4447, "step": 650 }, { "epoch": 0.20407523510971787, "grad_norm": 4.034641742706299, "learning_rate": 3.2550000000000005e-07, "loss": 10.331, "step": 651 }, { "epoch": 0.2043887147335423, "grad_norm": 3.4955785274505615, "learning_rate": 3.26e-07, "loss": 8.214, "step": 652 }, { "epoch": 0.20470219435736678, "grad_norm": 3.91501784324646, "learning_rate": 3.265e-07, "loss": 8.5537, "step": 653 }, { "epoch": 0.20501567398119122, "grad_norm": 3.4765992164611816, "learning_rate": 3.27e-07, "loss": 8.4838, "step": 654 }, { "epoch": 0.20532915360501566, "grad_norm": 2.9636855125427246, "learning_rate": 3.2750000000000004e-07, "loss": 7.3184, "step": 655 }, { "epoch": 0.20564263322884013, "grad_norm": 3.7056450843811035, "learning_rate": 3.280000000000001e-07, "loss": 8.578, "step": 656 }, { "epoch": 0.20595611285266457, "grad_norm": 5.734414100646973, "learning_rate": 3.285e-07, "loss": 19.8354, "step": 657 }, { "epoch": 0.20626959247648902, "grad_norm": 4.2713303565979, "learning_rate": 3.29e-07, "loss": 12.1838, "step": 658 }, { "epoch": 0.2065830721003135, "grad_norm": 3.7173306941986084, "learning_rate": 3.2950000000000003e-07, "loss": 8.0242, "step": 659 }, { "epoch": 0.20689655172413793, "grad_norm": 3.0381829738616943, "learning_rate": 3.3e-07, "loss": 7.6178, "step": 660 }, { "epoch": 0.20721003134796237, "grad_norm": 3.788667678833008, "learning_rate": 3.3050000000000005e-07, "loss": 9.3861, "step": 661 }, { "epoch": 0.20752351097178684, "grad_norm": 4.688927173614502, "learning_rate": 3.31e-07, "loss": 9.3514, "step": 662 }, { "epoch": 0.20783699059561128, "grad_norm": 4.714510440826416, "learning_rate": 3.315e-07, "loss": 11.6471, "step": 663 }, { "epoch": 0.20815047021943572, "grad_norm": 3.319627046585083, "learning_rate": 3.32e-07, "loss": 6.9164, "step": 664 }, { "epoch": 0.2084639498432602, "grad_norm": 4.469924449920654, "learning_rate": 3.3250000000000005e-07, "loss": 10.5332, "step": 665 }, { "epoch": 0.20877742946708464, "grad_norm": 3.59578537940979, "learning_rate": 3.330000000000001e-07, "loss": 7.8451, "step": 666 }, { "epoch": 0.20909090909090908, "grad_norm": 3.3554725646972656, "learning_rate": 3.335e-07, "loss": 8.2171, "step": 667 }, { "epoch": 0.20940438871473355, "grad_norm": 3.555884599685669, "learning_rate": 3.34e-07, "loss": 9.0251, "step": 668 }, { "epoch": 0.209717868338558, "grad_norm": 3.4159555435180664, "learning_rate": 3.3450000000000004e-07, "loss": 8.1913, "step": 669 }, { "epoch": 0.21003134796238246, "grad_norm": 3.6287314891815186, "learning_rate": 3.35e-07, "loss": 7.8455, "step": 670 }, { "epoch": 0.2103448275862069, "grad_norm": 5.647161960601807, "learning_rate": 3.3550000000000006e-07, "loss": 14.64, "step": 671 }, { "epoch": 0.21065830721003134, "grad_norm": 3.4719080924987793, "learning_rate": 3.36e-07, "loss": 8.0084, "step": 672 }, { "epoch": 0.2109717868338558, "grad_norm": 3.0406198501586914, "learning_rate": 3.3650000000000003e-07, "loss": 8.0089, "step": 673 }, { "epoch": 0.21128526645768025, "grad_norm": 3.4766266345977783, "learning_rate": 3.37e-07, "loss": 8.2851, "step": 674 }, { "epoch": 0.2115987460815047, "grad_norm": 4.269450664520264, "learning_rate": 3.3750000000000005e-07, "loss": 8.4858, "step": 675 }, { "epoch": 0.21191222570532917, "grad_norm": 4.416221618652344, "learning_rate": 3.38e-07, "loss": 8.1825, "step": 676 }, { "epoch": 0.2122257053291536, "grad_norm": 3.905118465423584, "learning_rate": 3.385e-07, "loss": 8.2382, "step": 677 }, { "epoch": 0.21253918495297805, "grad_norm": 3.801635980606079, "learning_rate": 3.39e-07, "loss": 8.5147, "step": 678 }, { "epoch": 0.21285266457680252, "grad_norm": 3.1189794540405273, "learning_rate": 3.3950000000000004e-07, "loss": 8.2367, "step": 679 }, { "epoch": 0.21316614420062696, "grad_norm": 3.4691970348358154, "learning_rate": 3.4000000000000003e-07, "loss": 8.0238, "step": 680 }, { "epoch": 0.2134796238244514, "grad_norm": 2.9724643230438232, "learning_rate": 3.405e-07, "loss": 7.9417, "step": 681 }, { "epoch": 0.21379310344827587, "grad_norm": 3.1940643787384033, "learning_rate": 3.41e-07, "loss": 8.5166, "step": 682 }, { "epoch": 0.21410658307210031, "grad_norm": 3.5346059799194336, "learning_rate": 3.4150000000000003e-07, "loss": 7.2445, "step": 683 }, { "epoch": 0.21442006269592476, "grad_norm": 3.558433771133423, "learning_rate": 3.42e-07, "loss": 6.4688, "step": 684 }, { "epoch": 0.21473354231974923, "grad_norm": 3.95771861076355, "learning_rate": 3.4250000000000006e-07, "loss": 9.9655, "step": 685 }, { "epoch": 0.21504702194357367, "grad_norm": 3.4017562866210938, "learning_rate": 3.43e-07, "loss": 8.1479, "step": 686 }, { "epoch": 0.2153605015673981, "grad_norm": 3.312255859375, "learning_rate": 3.435e-07, "loss": 7.3946, "step": 687 }, { "epoch": 0.21567398119122258, "grad_norm": 3.0924856662750244, "learning_rate": 3.44e-07, "loss": 7.1558, "step": 688 }, { "epoch": 0.21598746081504702, "grad_norm": 4.0401482582092285, "learning_rate": 3.4450000000000005e-07, "loss": 8.9632, "step": 689 }, { "epoch": 0.21630094043887146, "grad_norm": 3.2518508434295654, "learning_rate": 3.4500000000000003e-07, "loss": 7.0506, "step": 690 }, { "epoch": 0.21661442006269593, "grad_norm": 3.050063133239746, "learning_rate": 3.455e-07, "loss": 7.7591, "step": 691 }, { "epoch": 0.21692789968652038, "grad_norm": 4.7669997215271, "learning_rate": 3.46e-07, "loss": 9.7782, "step": 692 }, { "epoch": 0.21724137931034482, "grad_norm": 3.25658917427063, "learning_rate": 3.4650000000000004e-07, "loss": 7.9233, "step": 693 }, { "epoch": 0.2175548589341693, "grad_norm": 2.796995162963867, "learning_rate": 3.47e-07, "loss": 6.1858, "step": 694 }, { "epoch": 0.21786833855799373, "grad_norm": 3.696262836456299, "learning_rate": 3.4750000000000006e-07, "loss": 8.2826, "step": 695 }, { "epoch": 0.21818181818181817, "grad_norm": 4.308631896972656, "learning_rate": 3.48e-07, "loss": 12.5166, "step": 696 }, { "epoch": 0.21849529780564264, "grad_norm": 2.7071919441223145, "learning_rate": 3.4850000000000003e-07, "loss": 6.3807, "step": 697 }, { "epoch": 0.21880877742946708, "grad_norm": 4.019406795501709, "learning_rate": 3.49e-07, "loss": 11.1741, "step": 698 }, { "epoch": 0.21912225705329152, "grad_norm": 4.2864580154418945, "learning_rate": 3.4950000000000005e-07, "loss": 9.779, "step": 699 }, { "epoch": 0.219435736677116, "grad_norm": 3.9045979976654053, "learning_rate": 3.5000000000000004e-07, "loss": 6.9983, "step": 700 }, { "epoch": 0.21974921630094044, "grad_norm": 3.1046054363250732, "learning_rate": 3.505e-07, "loss": 7.6675, "step": 701 }, { "epoch": 0.22006269592476488, "grad_norm": 3.561466693878174, "learning_rate": 3.51e-07, "loss": 5.8858, "step": 702 }, { "epoch": 0.22037617554858935, "grad_norm": 3.2833735942840576, "learning_rate": 3.5150000000000004e-07, "loss": 6.8125, "step": 703 }, { "epoch": 0.2206896551724138, "grad_norm": 5.263795375823975, "learning_rate": 3.5200000000000003e-07, "loss": 11.6207, "step": 704 }, { "epoch": 0.22100313479623823, "grad_norm": 3.4245877265930176, "learning_rate": 3.525e-07, "loss": 8.0098, "step": 705 }, { "epoch": 0.2213166144200627, "grad_norm": 3.0912983417510986, "learning_rate": 3.53e-07, "loss": 7.7124, "step": 706 }, { "epoch": 0.22163009404388714, "grad_norm": 2.501897096633911, "learning_rate": 3.5350000000000004e-07, "loss": 6.3126, "step": 707 }, { "epoch": 0.22194357366771159, "grad_norm": 2.732363224029541, "learning_rate": 3.54e-07, "loss": 8.4596, "step": 708 }, { "epoch": 0.22225705329153606, "grad_norm": 2.866844892501831, "learning_rate": 3.5450000000000006e-07, "loss": 7.4127, "step": 709 }, { "epoch": 0.2225705329153605, "grad_norm": 3.562100410461426, "learning_rate": 3.55e-07, "loss": 10.6676, "step": 710 }, { "epoch": 0.22288401253918494, "grad_norm": 2.7351932525634766, "learning_rate": 3.5550000000000003e-07, "loss": 6.7216, "step": 711 }, { "epoch": 0.2231974921630094, "grad_norm": 2.929007053375244, "learning_rate": 3.56e-07, "loss": 7.5504, "step": 712 }, { "epoch": 0.22351097178683385, "grad_norm": 2.9847233295440674, "learning_rate": 3.5650000000000005e-07, "loss": 8.5679, "step": 713 }, { "epoch": 0.22382445141065832, "grad_norm": 3.290311336517334, "learning_rate": 3.5700000000000003e-07, "loss": 8.5614, "step": 714 }, { "epoch": 0.22413793103448276, "grad_norm": 3.176608085632324, "learning_rate": 3.575e-07, "loss": 7.1585, "step": 715 }, { "epoch": 0.2244514106583072, "grad_norm": 5.026858806610107, "learning_rate": 3.58e-07, "loss": 15.6788, "step": 716 }, { "epoch": 0.22476489028213167, "grad_norm": 2.588078260421753, "learning_rate": 3.5850000000000004e-07, "loss": 6.1721, "step": 717 }, { "epoch": 0.22507836990595612, "grad_norm": 3.75956654548645, "learning_rate": 3.59e-07, "loss": 9.8582, "step": 718 }, { "epoch": 0.22539184952978056, "grad_norm": 3.7098045349121094, "learning_rate": 3.5950000000000006e-07, "loss": 7.7692, "step": 719 }, { "epoch": 0.22570532915360503, "grad_norm": 3.2847020626068115, "learning_rate": 3.6e-07, "loss": 7.2862, "step": 720 }, { "epoch": 0.22601880877742947, "grad_norm": 4.180566310882568, "learning_rate": 3.6050000000000003e-07, "loss": 8.2584, "step": 721 }, { "epoch": 0.2263322884012539, "grad_norm": 3.886276960372925, "learning_rate": 3.61e-07, "loss": 8.167, "step": 722 }, { "epoch": 0.22664576802507838, "grad_norm": 3.5251870155334473, "learning_rate": 3.6150000000000005e-07, "loss": 8.091, "step": 723 }, { "epoch": 0.22695924764890282, "grad_norm": 3.8147459030151367, "learning_rate": 3.6200000000000004e-07, "loss": 8.1781, "step": 724 }, { "epoch": 0.22727272727272727, "grad_norm": 4.299187660217285, "learning_rate": 3.625e-07, "loss": 10.4068, "step": 725 }, { "epoch": 0.22758620689655173, "grad_norm": 4.2207255363464355, "learning_rate": 3.63e-07, "loss": 12.8631, "step": 726 }, { "epoch": 0.22789968652037618, "grad_norm": 4.004030704498291, "learning_rate": 3.6350000000000005e-07, "loss": 8.9691, "step": 727 }, { "epoch": 0.22821316614420062, "grad_norm": 4.197298526763916, "learning_rate": 3.6400000000000003e-07, "loss": 8.0809, "step": 728 }, { "epoch": 0.2285266457680251, "grad_norm": 4.978548049926758, "learning_rate": 3.6450000000000007e-07, "loss": 11.308, "step": 729 }, { "epoch": 0.22884012539184953, "grad_norm": 3.4799234867095947, "learning_rate": 3.65e-07, "loss": 8.5875, "step": 730 }, { "epoch": 0.22915360501567397, "grad_norm": 3.4249789714813232, "learning_rate": 3.6550000000000004e-07, "loss": 9.2064, "step": 731 }, { "epoch": 0.22946708463949844, "grad_norm": 2.496309995651245, "learning_rate": 3.66e-07, "loss": 6.6208, "step": 732 }, { "epoch": 0.22978056426332288, "grad_norm": 4.72249698638916, "learning_rate": 3.6650000000000006e-07, "loss": 7.5853, "step": 733 }, { "epoch": 0.23009404388714733, "grad_norm": 3.36556339263916, "learning_rate": 3.6700000000000004e-07, "loss": 6.5957, "step": 734 }, { "epoch": 0.2304075235109718, "grad_norm": 5.186474800109863, "learning_rate": 3.6750000000000003e-07, "loss": 19.8754, "step": 735 }, { "epoch": 0.23072100313479624, "grad_norm": 3.4509902000427246, "learning_rate": 3.68e-07, "loss": 8.3037, "step": 736 }, { "epoch": 0.23103448275862068, "grad_norm": 3.9138832092285156, "learning_rate": 3.6850000000000005e-07, "loss": 10.7141, "step": 737 }, { "epoch": 0.23134796238244515, "grad_norm": 4.275444030761719, "learning_rate": 3.6900000000000004e-07, "loss": 12.6663, "step": 738 }, { "epoch": 0.2316614420062696, "grad_norm": 2.733919858932495, "learning_rate": 3.695e-07, "loss": 7.8312, "step": 739 }, { "epoch": 0.23197492163009403, "grad_norm": 3.960728645324707, "learning_rate": 3.7e-07, "loss": 7.1807, "step": 740 }, { "epoch": 0.2322884012539185, "grad_norm": 3.9499926567077637, "learning_rate": 3.7050000000000004e-07, "loss": 8.1952, "step": 741 }, { "epoch": 0.23260188087774294, "grad_norm": 3.199276924133301, "learning_rate": 3.7100000000000003e-07, "loss": 8.8693, "step": 742 }, { "epoch": 0.2329153605015674, "grad_norm": 3.465644121170044, "learning_rate": 3.7150000000000006e-07, "loss": 8.3512, "step": 743 }, { "epoch": 0.23322884012539186, "grad_norm": 3.957305431365967, "learning_rate": 3.72e-07, "loss": 12.9835, "step": 744 }, { "epoch": 0.2335423197492163, "grad_norm": 4.241058349609375, "learning_rate": 3.7250000000000003e-07, "loss": 11.0323, "step": 745 }, { "epoch": 0.23385579937304074, "grad_norm": 3.187009572982788, "learning_rate": 3.73e-07, "loss": 8.9532, "step": 746 }, { "epoch": 0.2341692789968652, "grad_norm": 3.3454670906066895, "learning_rate": 3.7350000000000006e-07, "loss": 8.9809, "step": 747 }, { "epoch": 0.23448275862068965, "grad_norm": 4.55251407623291, "learning_rate": 3.7400000000000004e-07, "loss": 8.4751, "step": 748 }, { "epoch": 0.2347962382445141, "grad_norm": 3.5339887142181396, "learning_rate": 3.745e-07, "loss": 10.128, "step": 749 }, { "epoch": 0.23510971786833856, "grad_norm": 3.775538206100464, "learning_rate": 3.75e-07, "loss": 8.0116, "step": 750 }, { "epoch": 0.235423197492163, "grad_norm": 3.8628170490264893, "learning_rate": 3.7550000000000005e-07, "loss": 8.7356, "step": 751 }, { "epoch": 0.23573667711598745, "grad_norm": 3.2559432983398438, "learning_rate": 3.7600000000000003e-07, "loss": 7.3864, "step": 752 }, { "epoch": 0.23605015673981192, "grad_norm": 3.832037925720215, "learning_rate": 3.7650000000000007e-07, "loss": 10.6071, "step": 753 }, { "epoch": 0.23636363636363636, "grad_norm": 3.030069589614868, "learning_rate": 3.77e-07, "loss": 8.176, "step": 754 }, { "epoch": 0.23667711598746083, "grad_norm": 3.3893072605133057, "learning_rate": 3.7750000000000004e-07, "loss": 7.696, "step": 755 }, { "epoch": 0.23699059561128527, "grad_norm": 3.5850815773010254, "learning_rate": 3.78e-07, "loss": 8.6539, "step": 756 }, { "epoch": 0.2373040752351097, "grad_norm": 3.6070199012756348, "learning_rate": 3.7850000000000006e-07, "loss": 9.2962, "step": 757 }, { "epoch": 0.23761755485893418, "grad_norm": 3.547919273376465, "learning_rate": 3.7900000000000005e-07, "loss": 10.0205, "step": 758 }, { "epoch": 0.23793103448275862, "grad_norm": 3.6676182746887207, "learning_rate": 3.7950000000000003e-07, "loss": 7.0754, "step": 759 }, { "epoch": 0.23824451410658307, "grad_norm": 4.715800762176514, "learning_rate": 3.8e-07, "loss": 13.3314, "step": 760 }, { "epoch": 0.23855799373040754, "grad_norm": 3.9477193355560303, "learning_rate": 3.8050000000000005e-07, "loss": 8.7171, "step": 761 }, { "epoch": 0.23887147335423198, "grad_norm": 4.012876987457275, "learning_rate": 3.8100000000000004e-07, "loss": 8.9844, "step": 762 }, { "epoch": 0.23918495297805642, "grad_norm": 3.585956335067749, "learning_rate": 3.815000000000001e-07, "loss": 8.8991, "step": 763 }, { "epoch": 0.2394984326018809, "grad_norm": 3.68974232673645, "learning_rate": 3.82e-07, "loss": 8.506, "step": 764 }, { "epoch": 0.23981191222570533, "grad_norm": 3.4120147228240967, "learning_rate": 3.8250000000000004e-07, "loss": 7.4377, "step": 765 }, { "epoch": 0.24012539184952977, "grad_norm": 2.9373185634613037, "learning_rate": 3.8300000000000003e-07, "loss": 6.9149, "step": 766 }, { "epoch": 0.24043887147335424, "grad_norm": 6.263192176818848, "learning_rate": 3.8350000000000007e-07, "loss": 15.8049, "step": 767 }, { "epoch": 0.24075235109717869, "grad_norm": 3.48296856880188, "learning_rate": 3.84e-07, "loss": 7.886, "step": 768 }, { "epoch": 0.24106583072100313, "grad_norm": 3.5808095932006836, "learning_rate": 3.8450000000000004e-07, "loss": 8.8138, "step": 769 }, { "epoch": 0.2413793103448276, "grad_norm": 4.42182731628418, "learning_rate": 3.85e-07, "loss": 11.1607, "step": 770 }, { "epoch": 0.24169278996865204, "grad_norm": 3.0944645404815674, "learning_rate": 3.8550000000000006e-07, "loss": 8.216, "step": 771 }, { "epoch": 0.24200626959247648, "grad_norm": 4.562041282653809, "learning_rate": 3.8600000000000004e-07, "loss": 10.2914, "step": 772 }, { "epoch": 0.24231974921630095, "grad_norm": 3.654736042022705, "learning_rate": 3.865e-07, "loss": 7.9714, "step": 773 }, { "epoch": 0.2426332288401254, "grad_norm": 3.496178150177002, "learning_rate": 3.87e-07, "loss": 8.5865, "step": 774 }, { "epoch": 0.24294670846394983, "grad_norm": 3.801335096359253, "learning_rate": 3.8750000000000005e-07, "loss": 8.9361, "step": 775 }, { "epoch": 0.2432601880877743, "grad_norm": 4.17310094833374, "learning_rate": 3.8800000000000003e-07, "loss": 7.906, "step": 776 }, { "epoch": 0.24357366771159875, "grad_norm": 3.4103686809539795, "learning_rate": 3.8850000000000007e-07, "loss": 8.7571, "step": 777 }, { "epoch": 0.2438871473354232, "grad_norm": 3.6416525840759277, "learning_rate": 3.89e-07, "loss": 7.8949, "step": 778 }, { "epoch": 0.24420062695924766, "grad_norm": 4.429583549499512, "learning_rate": 3.8950000000000004e-07, "loss": 7.3726, "step": 779 }, { "epoch": 0.2445141065830721, "grad_norm": 3.5960888862609863, "learning_rate": 3.9e-07, "loss": 8.3664, "step": 780 }, { "epoch": 0.24482758620689654, "grad_norm": 6.068627834320068, "learning_rate": 3.9050000000000006e-07, "loss": 14.7055, "step": 781 }, { "epoch": 0.245141065830721, "grad_norm": 4.344872951507568, "learning_rate": 3.9100000000000005e-07, "loss": 10.0272, "step": 782 }, { "epoch": 0.24545454545454545, "grad_norm": 3.6392626762390137, "learning_rate": 3.915e-07, "loss": 6.46, "step": 783 }, { "epoch": 0.2457680250783699, "grad_norm": 3.415555238723755, "learning_rate": 3.92e-07, "loss": 8.4829, "step": 784 }, { "epoch": 0.24608150470219436, "grad_norm": 4.941524982452393, "learning_rate": 3.9250000000000005e-07, "loss": 11.3501, "step": 785 }, { "epoch": 0.2463949843260188, "grad_norm": 3.191941738128662, "learning_rate": 3.9300000000000004e-07, "loss": 8.494, "step": 786 }, { "epoch": 0.24670846394984325, "grad_norm": 3.4646642208099365, "learning_rate": 3.935000000000001e-07, "loss": 7.9202, "step": 787 }, { "epoch": 0.24702194357366772, "grad_norm": 3.47352933883667, "learning_rate": 3.94e-07, "loss": 10.1235, "step": 788 }, { "epoch": 0.24733542319749216, "grad_norm": 4.042746543884277, "learning_rate": 3.9450000000000005e-07, "loss": 7.8208, "step": 789 }, { "epoch": 0.2476489028213166, "grad_norm": 3.284029006958008, "learning_rate": 3.9500000000000003e-07, "loss": 7.8496, "step": 790 }, { "epoch": 0.24796238244514107, "grad_norm": 4.137977600097656, "learning_rate": 3.9550000000000007e-07, "loss": 11.9702, "step": 791 }, { "epoch": 0.2482758620689655, "grad_norm": 3.6068592071533203, "learning_rate": 3.9600000000000005e-07, "loss": 6.9573, "step": 792 }, { "epoch": 0.24858934169278996, "grad_norm": 3.264119863510132, "learning_rate": 3.965e-07, "loss": 7.5779, "step": 793 }, { "epoch": 0.24890282131661443, "grad_norm": 3.505993366241455, "learning_rate": 3.97e-07, "loss": 7.7447, "step": 794 }, { "epoch": 0.24921630094043887, "grad_norm": 3.9648635387420654, "learning_rate": 3.9750000000000006e-07, "loss": 11.2637, "step": 795 }, { "epoch": 0.2495297805642633, "grad_norm": 3.1940486431121826, "learning_rate": 3.9800000000000004e-07, "loss": 8.9283, "step": 796 }, { "epoch": 0.24984326018808778, "grad_norm": 3.5564422607421875, "learning_rate": 3.985e-07, "loss": 7.2672, "step": 797 }, { "epoch": 0.2501567398119122, "grad_norm": 3.626217842102051, "learning_rate": 3.99e-07, "loss": 8.51, "step": 798 }, { "epoch": 0.2501567398119122, "eval_loss": 28.84930992126465, "eval_runtime": 20.5251, "eval_samples_per_second": 130.913, "eval_steps_per_second": 8.185, "step": 798 }, { "epoch": 0.25047021943573666, "grad_norm": 3.097006320953369, "learning_rate": 3.9950000000000005e-07, "loss": 7.4537, "step": 799 }, { "epoch": 0.2507836990595611, "grad_norm": 3.5833940505981445, "learning_rate": 4.0000000000000003e-07, "loss": 10.0463, "step": 800 }, { "epoch": 0.2510971786833856, "grad_norm": 4.196984767913818, "learning_rate": 4.0050000000000007e-07, "loss": 7.771, "step": 801 }, { "epoch": 0.25141065830721004, "grad_norm": 4.240922927856445, "learning_rate": 4.01e-07, "loss": 10.3631, "step": 802 }, { "epoch": 0.2517241379310345, "grad_norm": 3.425544023513794, "learning_rate": 4.015e-07, "loss": 6.2936, "step": 803 }, { "epoch": 0.25203761755485893, "grad_norm": 3.8338463306427, "learning_rate": 4.02e-07, "loss": 11.4034, "step": 804 }, { "epoch": 0.25235109717868337, "grad_norm": 3.4138238430023193, "learning_rate": 4.0250000000000006e-07, "loss": 6.9629, "step": 805 }, { "epoch": 0.2526645768025078, "grad_norm": 3.1681156158447266, "learning_rate": 4.0300000000000005e-07, "loss": 7.616, "step": 806 }, { "epoch": 0.2529780564263323, "grad_norm": 13.125308990478516, "learning_rate": 4.035e-07, "loss": 7.871, "step": 807 }, { "epoch": 0.25329153605015675, "grad_norm": 5.784317493438721, "learning_rate": 4.04e-07, "loss": 20.8958, "step": 808 }, { "epoch": 0.2536050156739812, "grad_norm": 3.021087646484375, "learning_rate": 4.0450000000000006e-07, "loss": 7.3132, "step": 809 }, { "epoch": 0.25391849529780564, "grad_norm": 2.8129184246063232, "learning_rate": 4.0500000000000004e-07, "loss": 7.4504, "step": 810 }, { "epoch": 0.2542319749216301, "grad_norm": 3.036388874053955, "learning_rate": 4.055000000000001e-07, "loss": 6.5679, "step": 811 }, { "epoch": 0.2545454545454545, "grad_norm": 3.112868547439575, "learning_rate": 4.06e-07, "loss": 8.2039, "step": 812 }, { "epoch": 0.254858934169279, "grad_norm": 3.948190212249756, "learning_rate": 4.065e-07, "loss": 9.2969, "step": 813 }, { "epoch": 0.25517241379310346, "grad_norm": 3.823413133621216, "learning_rate": 4.0700000000000003e-07, "loss": 8.2135, "step": 814 }, { "epoch": 0.2554858934169279, "grad_norm": 3.8010671138763428, "learning_rate": 4.0750000000000007e-07, "loss": 8.7767, "step": 815 }, { "epoch": 0.25579937304075234, "grad_norm": 2.9464848041534424, "learning_rate": 4.0800000000000005e-07, "loss": 6.3003, "step": 816 }, { "epoch": 0.2561128526645768, "grad_norm": 3.2114903926849365, "learning_rate": 4.085e-07, "loss": 8.0724, "step": 817 }, { "epoch": 0.2564263322884012, "grad_norm": 3.254835844039917, "learning_rate": 4.09e-07, "loss": 7.6632, "step": 818 }, { "epoch": 0.2567398119122257, "grad_norm": 4.548694133758545, "learning_rate": 4.0950000000000006e-07, "loss": 12.1637, "step": 819 }, { "epoch": 0.25705329153605017, "grad_norm": 4.109635829925537, "learning_rate": 4.1000000000000004e-07, "loss": 9.4519, "step": 820 }, { "epoch": 0.2573667711598746, "grad_norm": 4.340848922729492, "learning_rate": 4.105000000000001e-07, "loss": 8.5242, "step": 821 }, { "epoch": 0.25768025078369905, "grad_norm": 3.260594367980957, "learning_rate": 4.11e-07, "loss": 7.83, "step": 822 }, { "epoch": 0.2579937304075235, "grad_norm": 5.5108771324157715, "learning_rate": 4.115e-07, "loss": 10.5884, "step": 823 }, { "epoch": 0.258307210031348, "grad_norm": 3.490274429321289, "learning_rate": 4.1200000000000004e-07, "loss": 8.7728, "step": 824 }, { "epoch": 0.25862068965517243, "grad_norm": 6.578031539916992, "learning_rate": 4.125000000000001e-07, "loss": 10.9723, "step": 825 }, { "epoch": 0.2589341692789969, "grad_norm": 2.774808406829834, "learning_rate": 4.1300000000000006e-07, "loss": 6.9211, "step": 826 }, { "epoch": 0.2592476489028213, "grad_norm": 5.915481090545654, "learning_rate": 4.135e-07, "loss": 9.1981, "step": 827 }, { "epoch": 0.25956112852664576, "grad_norm": 2.8038599491119385, "learning_rate": 4.1400000000000003e-07, "loss": 6.1127, "step": 828 }, { "epoch": 0.2598746081504702, "grad_norm": 3.501262903213501, "learning_rate": 4.1450000000000007e-07, "loss": 6.5746, "step": 829 }, { "epoch": 0.2601880877742947, "grad_norm": 3.5788586139678955, "learning_rate": 4.1500000000000005e-07, "loss": 8.6381, "step": 830 }, { "epoch": 0.26050156739811914, "grad_norm": 5.020801544189453, "learning_rate": 4.155e-07, "loss": 16.7279, "step": 831 }, { "epoch": 0.2608150470219436, "grad_norm": 3.9327170848846436, "learning_rate": 4.16e-07, "loss": 12.7287, "step": 832 }, { "epoch": 0.261128526645768, "grad_norm": 3.2840352058410645, "learning_rate": 4.165e-07, "loss": 6.839, "step": 833 }, { "epoch": 0.26144200626959246, "grad_norm": 3.750833511352539, "learning_rate": 4.1700000000000004e-07, "loss": 11.1242, "step": 834 }, { "epoch": 0.2617554858934169, "grad_norm": 3.46420955657959, "learning_rate": 4.175000000000001e-07, "loss": 9.7763, "step": 835 }, { "epoch": 0.2620689655172414, "grad_norm": 3.476485252380371, "learning_rate": 4.18e-07, "loss": 7.1691, "step": 836 }, { "epoch": 0.26238244514106585, "grad_norm": 4.025050163269043, "learning_rate": 4.185e-07, "loss": 9.9421, "step": 837 }, { "epoch": 0.2626959247648903, "grad_norm": 3.0116124153137207, "learning_rate": 4.1900000000000003e-07, "loss": 6.3215, "step": 838 }, { "epoch": 0.26300940438871473, "grad_norm": 4.668792247772217, "learning_rate": 4.1950000000000007e-07, "loss": 11.0969, "step": 839 }, { "epoch": 0.26332288401253917, "grad_norm": 2.963440418243408, "learning_rate": 4.2000000000000006e-07, "loss": 7.2545, "step": 840 }, { "epoch": 0.2636363636363636, "grad_norm": 4.21551513671875, "learning_rate": 4.205e-07, "loss": 8.2629, "step": 841 }, { "epoch": 0.2639498432601881, "grad_norm": 3.958061456680298, "learning_rate": 4.21e-07, "loss": 9.7021, "step": 842 }, { "epoch": 0.26426332288401255, "grad_norm": 2.9558305740356445, "learning_rate": 4.215e-07, "loss": 7.3453, "step": 843 }, { "epoch": 0.264576802507837, "grad_norm": 4.2135233879089355, "learning_rate": 4.2200000000000005e-07, "loss": 8.955, "step": 844 }, { "epoch": 0.26489028213166144, "grad_norm": 3.562573194503784, "learning_rate": 4.225000000000001e-07, "loss": 7.2757, "step": 845 }, { "epoch": 0.2652037617554859, "grad_norm": 3.2057554721832275, "learning_rate": 4.23e-07, "loss": 8.2123, "step": 846 }, { "epoch": 0.2655172413793103, "grad_norm": 3.901542901992798, "learning_rate": 4.235e-07, "loss": 9.1358, "step": 847 }, { "epoch": 0.2658307210031348, "grad_norm": 3.1347365379333496, "learning_rate": 4.2400000000000004e-07, "loss": 6.5317, "step": 848 }, { "epoch": 0.26614420062695926, "grad_norm": 5.042209625244141, "learning_rate": 4.245000000000001e-07, "loss": 14.4307, "step": 849 }, { "epoch": 0.2664576802507837, "grad_norm": 3.5455801486968994, "learning_rate": 4.2500000000000006e-07, "loss": 8.8807, "step": 850 }, { "epoch": 0.26677115987460814, "grad_norm": 3.4751904010772705, "learning_rate": 4.255e-07, "loss": 8.8506, "step": 851 }, { "epoch": 0.2670846394984326, "grad_norm": 3.6751580238342285, "learning_rate": 4.2600000000000003e-07, "loss": 10.4928, "step": 852 }, { "epoch": 0.267398119122257, "grad_norm": 4.02175760269165, "learning_rate": 4.265e-07, "loss": 6.9068, "step": 853 }, { "epoch": 0.2677115987460815, "grad_norm": 7.32761287689209, "learning_rate": 4.2700000000000005e-07, "loss": 17.0699, "step": 854 }, { "epoch": 0.26802507836990597, "grad_norm": 3.607837200164795, "learning_rate": 4.275000000000001e-07, "loss": 8.021, "step": 855 }, { "epoch": 0.2683385579937304, "grad_norm": 2.8955557346343994, "learning_rate": 4.28e-07, "loss": 8.8195, "step": 856 }, { "epoch": 0.26865203761755485, "grad_norm": 3.2017621994018555, "learning_rate": 4.285e-07, "loss": 7.1978, "step": 857 }, { "epoch": 0.2689655172413793, "grad_norm": 3.7214231491088867, "learning_rate": 4.2900000000000004e-07, "loss": 7.2386, "step": 858 }, { "epoch": 0.26927899686520373, "grad_norm": 3.3874855041503906, "learning_rate": 4.295000000000001e-07, "loss": 7.6419, "step": 859 }, { "epoch": 0.26959247648902823, "grad_norm": 4.008450508117676, "learning_rate": 4.3e-07, "loss": 8.0012, "step": 860 }, { "epoch": 0.2699059561128527, "grad_norm": 3.3975911140441895, "learning_rate": 4.305e-07, "loss": 7.3739, "step": 861 }, { "epoch": 0.2702194357366771, "grad_norm": 3.656843423843384, "learning_rate": 4.3100000000000003e-07, "loss": 8.8982, "step": 862 }, { "epoch": 0.27053291536050156, "grad_norm": 3.2853686809539795, "learning_rate": 4.315e-07, "loss": 7.6732, "step": 863 }, { "epoch": 0.270846394984326, "grad_norm": 4.232872009277344, "learning_rate": 4.3200000000000006e-07, "loss": 10.4912, "step": 864 }, { "epoch": 0.2711598746081505, "grad_norm": 3.6398162841796875, "learning_rate": 4.325e-07, "loss": 9.274, "step": 865 }, { "epoch": 0.27147335423197494, "grad_norm": 2.7679269313812256, "learning_rate": 4.33e-07, "loss": 7.0839, "step": 866 }, { "epoch": 0.2717868338557994, "grad_norm": 3.5400402545928955, "learning_rate": 4.335e-07, "loss": 8.8406, "step": 867 }, { "epoch": 0.2721003134796238, "grad_norm": 3.0687062740325928, "learning_rate": 4.3400000000000005e-07, "loss": 7.36, "step": 868 }, { "epoch": 0.27241379310344827, "grad_norm": 3.573641061782837, "learning_rate": 4.345000000000001e-07, "loss": 8.0488, "step": 869 }, { "epoch": 0.2727272727272727, "grad_norm": 4.390239238739014, "learning_rate": 4.35e-07, "loss": 12.9096, "step": 870 }, { "epoch": 0.2730407523510972, "grad_norm": 3.314115524291992, "learning_rate": 4.355e-07, "loss": 7.1066, "step": 871 }, { "epoch": 0.27335423197492165, "grad_norm": 5.46345853805542, "learning_rate": 4.3600000000000004e-07, "loss": 11.487, "step": 872 }, { "epoch": 0.2736677115987461, "grad_norm": 5.023271083831787, "learning_rate": 4.365e-07, "loss": 10.0573, "step": 873 }, { "epoch": 0.27398119122257053, "grad_norm": 2.962111234664917, "learning_rate": 4.3700000000000006e-07, "loss": 8.829, "step": 874 }, { "epoch": 0.274294670846395, "grad_norm": 3.1559054851531982, "learning_rate": 4.375e-07, "loss": 7.1875, "step": 875 }, { "epoch": 0.2746081504702194, "grad_norm": 3.6950178146362305, "learning_rate": 4.3800000000000003e-07, "loss": 8.4211, "step": 876 }, { "epoch": 0.2749216300940439, "grad_norm": 4.661724090576172, "learning_rate": 4.385e-07, "loss": 11.424, "step": 877 }, { "epoch": 0.27523510971786835, "grad_norm": 3.7680742740631104, "learning_rate": 4.3900000000000005e-07, "loss": 9.1259, "step": 878 }, { "epoch": 0.2755485893416928, "grad_norm": 3.409186601638794, "learning_rate": 4.395000000000001e-07, "loss": 6.5314, "step": 879 }, { "epoch": 0.27586206896551724, "grad_norm": 3.2190394401550293, "learning_rate": 4.4e-07, "loss": 6.4504, "step": 880 }, { "epoch": 0.2761755485893417, "grad_norm": 4.012719631195068, "learning_rate": 4.405e-07, "loss": 9.4085, "step": 881 }, { "epoch": 0.2764890282131661, "grad_norm": 4.595925807952881, "learning_rate": 4.4100000000000004e-07, "loss": 13.1328, "step": 882 }, { "epoch": 0.2768025078369906, "grad_norm": 4.065830230712891, "learning_rate": 4.4150000000000003e-07, "loss": 8.3007, "step": 883 }, { "epoch": 0.27711598746081506, "grad_norm": 4.229946613311768, "learning_rate": 4.4200000000000007e-07, "loss": 11.1843, "step": 884 }, { "epoch": 0.2774294670846395, "grad_norm": 3.8973495960235596, "learning_rate": 4.425e-07, "loss": 7.978, "step": 885 }, { "epoch": 0.27774294670846394, "grad_norm": 2.952505111694336, "learning_rate": 4.4300000000000004e-07, "loss": 8.5853, "step": 886 }, { "epoch": 0.2780564263322884, "grad_norm": 2.839816093444824, "learning_rate": 4.435e-07, "loss": 6.6168, "step": 887 }, { "epoch": 0.27836990595611283, "grad_norm": 4.309621334075928, "learning_rate": 4.4400000000000006e-07, "loss": 10.9452, "step": 888 }, { "epoch": 0.2786833855799373, "grad_norm": 5.065432071685791, "learning_rate": 4.445000000000001e-07, "loss": 9.2203, "step": 889 }, { "epoch": 0.27899686520376177, "grad_norm": 4.246471881866455, "learning_rate": 4.4500000000000003e-07, "loss": 7.9399, "step": 890 }, { "epoch": 0.2793103448275862, "grad_norm": 5.130437850952148, "learning_rate": 4.455e-07, "loss": 11.7158, "step": 891 }, { "epoch": 0.27962382445141065, "grad_norm": 3.858509063720703, "learning_rate": 4.4600000000000005e-07, "loss": 9.7248, "step": 892 }, { "epoch": 0.2799373040752351, "grad_norm": 3.6608774662017822, "learning_rate": 4.4650000000000003e-07, "loss": 12.7871, "step": 893 }, { "epoch": 0.28025078369905954, "grad_norm": 4.863644123077393, "learning_rate": 4.47e-07, "loss": 9.9666, "step": 894 }, { "epoch": 0.28056426332288403, "grad_norm": 3.754490613937378, "learning_rate": 4.475e-07, "loss": 7.9063, "step": 895 }, { "epoch": 0.2808777429467085, "grad_norm": 3.4950146675109863, "learning_rate": 4.4800000000000004e-07, "loss": 8.867, "step": 896 }, { "epoch": 0.2811912225705329, "grad_norm": 3.564337968826294, "learning_rate": 4.485e-07, "loss": 7.4725, "step": 897 }, { "epoch": 0.28150470219435736, "grad_norm": 2.9661481380462646, "learning_rate": 4.4900000000000006e-07, "loss": 6.9609, "step": 898 }, { "epoch": 0.2818181818181818, "grad_norm": 3.3152127265930176, "learning_rate": 4.495e-07, "loss": 7.2334, "step": 899 }, { "epoch": 0.28213166144200624, "grad_norm": 3.4572389125823975, "learning_rate": 4.5000000000000003e-07, "loss": 7.8584, "step": 900 }, { "epoch": 0.28244514106583074, "grad_norm": 3.333373546600342, "learning_rate": 4.505e-07, "loss": 6.6077, "step": 901 }, { "epoch": 0.2827586206896552, "grad_norm": 4.405636310577393, "learning_rate": 4.5100000000000005e-07, "loss": 11.1708, "step": 902 }, { "epoch": 0.2830721003134796, "grad_norm": 3.0989999771118164, "learning_rate": 4.5150000000000004e-07, "loss": 7.3427, "step": 903 }, { "epoch": 0.28338557993730407, "grad_norm": 3.1780450344085693, "learning_rate": 4.52e-07, "loss": 7.8806, "step": 904 }, { "epoch": 0.2836990595611285, "grad_norm": 3.923069953918457, "learning_rate": 4.525e-07, "loss": 9.7537, "step": 905 }, { "epoch": 0.284012539184953, "grad_norm": 4.220728874206543, "learning_rate": 4.5300000000000005e-07, "loss": 9.528, "step": 906 }, { "epoch": 0.28432601880877745, "grad_norm": 3.3002758026123047, "learning_rate": 4.5350000000000003e-07, "loss": 6.0225, "step": 907 }, { "epoch": 0.2846394984326019, "grad_norm": 4.327773571014404, "learning_rate": 4.5400000000000007e-07, "loss": 8.0762, "step": 908 }, { "epoch": 0.28495297805642633, "grad_norm": 3.128321409225464, "learning_rate": 4.545e-07, "loss": 8.3285, "step": 909 }, { "epoch": 0.2852664576802508, "grad_norm": 3.2514123916625977, "learning_rate": 4.5500000000000004e-07, "loss": 8.2311, "step": 910 }, { "epoch": 0.2855799373040752, "grad_norm": 5.544764518737793, "learning_rate": 4.555e-07, "loss": 11.3571, "step": 911 }, { "epoch": 0.2858934169278997, "grad_norm": 5.904582977294922, "learning_rate": 4.5600000000000006e-07, "loss": 18.3895, "step": 912 }, { "epoch": 0.28620689655172415, "grad_norm": 3.38208270072937, "learning_rate": 4.5650000000000004e-07, "loss": 9.4236, "step": 913 }, { "epoch": 0.2865203761755486, "grad_norm": 3.433018445968628, "learning_rate": 4.5700000000000003e-07, "loss": 11.2537, "step": 914 }, { "epoch": 0.28683385579937304, "grad_norm": 4.306755542755127, "learning_rate": 4.575e-07, "loss": 11.701, "step": 915 }, { "epoch": 0.2871473354231975, "grad_norm": 3.296084403991699, "learning_rate": 4.5800000000000005e-07, "loss": 6.5184, "step": 916 }, { "epoch": 0.2874608150470219, "grad_norm": 4.437682628631592, "learning_rate": 4.5850000000000004e-07, "loss": 8.7946, "step": 917 }, { "epoch": 0.2877742946708464, "grad_norm": 4.028353214263916, "learning_rate": 4.5900000000000007e-07, "loss": 9.0222, "step": 918 }, { "epoch": 0.28808777429467086, "grad_norm": 2.981740951538086, "learning_rate": 4.595e-07, "loss": 6.3495, "step": 919 }, { "epoch": 0.2884012539184953, "grad_norm": 3.5462889671325684, "learning_rate": 4.6000000000000004e-07, "loss": 9.6387, "step": 920 }, { "epoch": 0.28871473354231975, "grad_norm": 3.0416839122772217, "learning_rate": 4.6050000000000003e-07, "loss": 5.743, "step": 921 }, { "epoch": 0.2890282131661442, "grad_norm": 2.538724660873413, "learning_rate": 4.6100000000000006e-07, "loss": 6.5413, "step": 922 }, { "epoch": 0.28934169278996863, "grad_norm": 3.508538007736206, "learning_rate": 4.615e-07, "loss": 9.6875, "step": 923 }, { "epoch": 0.2896551724137931, "grad_norm": 3.4180221557617188, "learning_rate": 4.6200000000000003e-07, "loss": 7.9855, "step": 924 }, { "epoch": 0.28996865203761757, "grad_norm": 3.5554192066192627, "learning_rate": 4.625e-07, "loss": 7.1513, "step": 925 }, { "epoch": 0.290282131661442, "grad_norm": 3.860269784927368, "learning_rate": 4.6300000000000006e-07, "loss": 9.0043, "step": 926 }, { "epoch": 0.29059561128526645, "grad_norm": 3.593224287033081, "learning_rate": 4.6350000000000004e-07, "loss": 8.9827, "step": 927 }, { "epoch": 0.2909090909090909, "grad_norm": 4.743045330047607, "learning_rate": 4.64e-07, "loss": 12.2761, "step": 928 }, { "epoch": 0.29122257053291534, "grad_norm": 3.506476640701294, "learning_rate": 4.645e-07, "loss": 9.6425, "step": 929 }, { "epoch": 0.29153605015673983, "grad_norm": 3.420703887939453, "learning_rate": 4.6500000000000005e-07, "loss": 7.238, "step": 930 }, { "epoch": 0.2918495297805643, "grad_norm": 8.075421333312988, "learning_rate": 4.6550000000000003e-07, "loss": 19.2958, "step": 931 }, { "epoch": 0.2921630094043887, "grad_norm": 4.11147928237915, "learning_rate": 4.6600000000000007e-07, "loss": 7.9719, "step": 932 }, { "epoch": 0.29247648902821316, "grad_norm": 4.024860382080078, "learning_rate": 4.665e-07, "loss": 8.5926, "step": 933 }, { "epoch": 0.2927899686520376, "grad_norm": 3.3219480514526367, "learning_rate": 4.6700000000000004e-07, "loss": 8.4054, "step": 934 }, { "epoch": 0.29310344827586204, "grad_norm": 3.3816304206848145, "learning_rate": 4.675e-07, "loss": 6.9335, "step": 935 }, { "epoch": 0.29341692789968654, "grad_norm": 3.370476484298706, "learning_rate": 4.6800000000000006e-07, "loss": 8.5852, "step": 936 }, { "epoch": 0.293730407523511, "grad_norm": 3.3092455863952637, "learning_rate": 4.6850000000000005e-07, "loss": 8.164, "step": 937 }, { "epoch": 0.2940438871473354, "grad_norm": 3.3833985328674316, "learning_rate": 4.6900000000000003e-07, "loss": 7.7537, "step": 938 }, { "epoch": 0.29435736677115987, "grad_norm": 4.879644870758057, "learning_rate": 4.695e-07, "loss": 9.2243, "step": 939 }, { "epoch": 0.2946708463949843, "grad_norm": 3.931847333908081, "learning_rate": 4.7000000000000005e-07, "loss": 9.3823, "step": 940 }, { "epoch": 0.29498432601880875, "grad_norm": 3.1454243659973145, "learning_rate": 4.7050000000000004e-07, "loss": 6.3691, "step": 941 }, { "epoch": 0.29529780564263325, "grad_norm": 3.438483953475952, "learning_rate": 4.710000000000001e-07, "loss": 7.0824, "step": 942 }, { "epoch": 0.2956112852664577, "grad_norm": 3.97963809967041, "learning_rate": 4.715e-07, "loss": 9.5606, "step": 943 }, { "epoch": 0.29592476489028213, "grad_norm": 5.063857078552246, "learning_rate": 4.7200000000000004e-07, "loss": 12.8541, "step": 944 }, { "epoch": 0.2962382445141066, "grad_norm": 4.647433280944824, "learning_rate": 4.7250000000000003e-07, "loss": 11.1892, "step": 945 }, { "epoch": 0.296551724137931, "grad_norm": 4.020174026489258, "learning_rate": 4.7300000000000007e-07, "loss": 9.2035, "step": 946 }, { "epoch": 0.29686520376175546, "grad_norm": 3.719456434249878, "learning_rate": 4.7350000000000005e-07, "loss": 7.5178, "step": 947 }, { "epoch": 0.29717868338557996, "grad_norm": 3.1977577209472656, "learning_rate": 4.7400000000000004e-07, "loss": 6.5933, "step": 948 }, { "epoch": 0.2974921630094044, "grad_norm": 2.9651682376861572, "learning_rate": 4.745e-07, "loss": 6.2385, "step": 949 }, { "epoch": 0.29780564263322884, "grad_norm": 4.075620174407959, "learning_rate": 4.7500000000000006e-07, "loss": 13.3818, "step": 950 }, { "epoch": 0.2981191222570533, "grad_norm": 4.676611423492432, "learning_rate": 4.7550000000000004e-07, "loss": 13.5246, "step": 951 }, { "epoch": 0.2984326018808777, "grad_norm": 4.3568291664123535, "learning_rate": 4.760000000000001e-07, "loss": 10.122, "step": 952 }, { "epoch": 0.2987460815047022, "grad_norm": 3.4638936519622803, "learning_rate": 4.765e-07, "loss": 9.9878, "step": 953 }, { "epoch": 0.29905956112852666, "grad_norm": 3.4983904361724854, "learning_rate": 4.77e-07, "loss": 7.968, "step": 954 }, { "epoch": 0.2993730407523511, "grad_norm": 2.9241857528686523, "learning_rate": 4.775000000000001e-07, "loss": 6.1837, "step": 955 }, { "epoch": 0.29968652037617555, "grad_norm": 5.027895927429199, "learning_rate": 4.78e-07, "loss": 10.7868, "step": 956 }, { "epoch": 0.3, "grad_norm": 3.0963079929351807, "learning_rate": 4.785000000000001e-07, "loss": 9.2105, "step": 957 }, { "epoch": 0.30031347962382443, "grad_norm": 3.367241144180298, "learning_rate": 4.79e-07, "loss": 8.6549, "step": 958 }, { "epoch": 0.30062695924764893, "grad_norm": 3.1908528804779053, "learning_rate": 4.795e-07, "loss": 9.3335, "step": 959 }, { "epoch": 0.30094043887147337, "grad_norm": 3.797089099884033, "learning_rate": 4.800000000000001e-07, "loss": 10.5798, "step": 960 }, { "epoch": 0.3012539184952978, "grad_norm": 3.816810131072998, "learning_rate": 4.805000000000001e-07, "loss": 8.424, "step": 961 }, { "epoch": 0.30156739811912225, "grad_norm": 2.98754620552063, "learning_rate": 4.81e-07, "loss": 6.592, "step": 962 }, { "epoch": 0.3018808777429467, "grad_norm": 7.078648090362549, "learning_rate": 4.815000000000001e-07, "loss": 20.1471, "step": 963 }, { "epoch": 0.30219435736677114, "grad_norm": 3.7833642959594727, "learning_rate": 4.82e-07, "loss": 8.8326, "step": 964 }, { "epoch": 0.30250783699059564, "grad_norm": 2.790283203125, "learning_rate": 4.825e-07, "loss": 6.6206, "step": 965 }, { "epoch": 0.3028213166144201, "grad_norm": 4.206464767456055, "learning_rate": 4.830000000000001e-07, "loss": 10.1851, "step": 966 }, { "epoch": 0.3031347962382445, "grad_norm": 4.659297943115234, "learning_rate": 4.835e-07, "loss": 11.0335, "step": 967 }, { "epoch": 0.30344827586206896, "grad_norm": 4.047898769378662, "learning_rate": 4.84e-07, "loss": 10.0668, "step": 968 }, { "epoch": 0.3037617554858934, "grad_norm": 4.826545715332031, "learning_rate": 4.845000000000001e-07, "loss": 9.4059, "step": 969 }, { "epoch": 0.30407523510971785, "grad_norm": 3.3105831146240234, "learning_rate": 4.85e-07, "loss": 9.7796, "step": 970 }, { "epoch": 0.30438871473354234, "grad_norm": 3.3956053256988525, "learning_rate": 4.855e-07, "loss": 8.4835, "step": 971 }, { "epoch": 0.3047021943573668, "grad_norm": 3.9917337894439697, "learning_rate": 4.86e-07, "loss": 12.5467, "step": 972 }, { "epoch": 0.3050156739811912, "grad_norm": 2.869931221008301, "learning_rate": 4.865e-07, "loss": 7.641, "step": 973 }, { "epoch": 0.30532915360501567, "grad_norm": 4.201666355133057, "learning_rate": 4.870000000000001e-07, "loss": 8.2918, "step": 974 }, { "epoch": 0.3056426332288401, "grad_norm": 3.3518385887145996, "learning_rate": 4.875000000000001e-07, "loss": 10.0857, "step": 975 }, { "epoch": 0.30595611285266455, "grad_norm": 3.7927677631378174, "learning_rate": 4.88e-07, "loss": 8.8694, "step": 976 }, { "epoch": 0.30626959247648905, "grad_norm": 2.8095505237579346, "learning_rate": 4.885000000000001e-07, "loss": 6.6404, "step": 977 }, { "epoch": 0.3065830721003135, "grad_norm": 3.649327516555786, "learning_rate": 4.89e-07, "loss": 8.3927, "step": 978 }, { "epoch": 0.30689655172413793, "grad_norm": 3.178901195526123, "learning_rate": 4.895e-07, "loss": 7.3435, "step": 979 }, { "epoch": 0.3072100313479624, "grad_norm": 3.161583185195923, "learning_rate": 4.900000000000001e-07, "loss": 8.2137, "step": 980 }, { "epoch": 0.3075235109717868, "grad_norm": 3.098173141479492, "learning_rate": 4.905000000000001e-07, "loss": 7.279, "step": 981 }, { "epoch": 0.30783699059561126, "grad_norm": 2.761258363723755, "learning_rate": 4.91e-07, "loss": 6.147, "step": 982 }, { "epoch": 0.30815047021943576, "grad_norm": 2.6040921211242676, "learning_rate": 4.915000000000001e-07, "loss": 6.242, "step": 983 }, { "epoch": 0.3084639498432602, "grad_norm": 3.172119617462158, "learning_rate": 4.92e-07, "loss": 6.6438, "step": 984 }, { "epoch": 0.30877742946708464, "grad_norm": 3.180616617202759, "learning_rate": 4.925e-07, "loss": 7.9243, "step": 985 }, { "epoch": 0.3090909090909091, "grad_norm": 5.080252170562744, "learning_rate": 4.93e-07, "loss": 13.2643, "step": 986 }, { "epoch": 0.3094043887147335, "grad_norm": 3.6815905570983887, "learning_rate": 4.935e-07, "loss": 10.5387, "step": 987 }, { "epoch": 0.30971786833855797, "grad_norm": 5.263372898101807, "learning_rate": 4.940000000000001e-07, "loss": 16.638, "step": 988 }, { "epoch": 0.31003134796238246, "grad_norm": 2.8743481636047363, "learning_rate": 4.945000000000001e-07, "loss": 9.2944, "step": 989 }, { "epoch": 0.3103448275862069, "grad_norm": 5.4028096199035645, "learning_rate": 4.95e-07, "loss": 13.0038, "step": 990 }, { "epoch": 0.31065830721003135, "grad_norm": 3.379136323928833, "learning_rate": 4.955e-07, "loss": 7.5712, "step": 991 }, { "epoch": 0.3109717868338558, "grad_norm": 3.4020957946777344, "learning_rate": 4.96e-07, "loss": 8.871, "step": 992 }, { "epoch": 0.31128526645768023, "grad_norm": 4.882453918457031, "learning_rate": 4.965e-07, "loss": 12.0715, "step": 993 }, { "epoch": 0.31159874608150473, "grad_norm": 5.079671859741211, "learning_rate": 4.970000000000001e-07, "loss": 14.1268, "step": 994 }, { "epoch": 0.31191222570532917, "grad_norm": 4.953401565551758, "learning_rate": 4.975000000000001e-07, "loss": 9.5667, "step": 995 }, { "epoch": 0.3122257053291536, "grad_norm": 3.9187986850738525, "learning_rate": 4.98e-07, "loss": 8.5076, "step": 996 }, { "epoch": 0.31253918495297806, "grad_norm": 3.059863805770874, "learning_rate": 4.985000000000001e-07, "loss": 6.8895, "step": 997 }, { "epoch": 0.3128526645768025, "grad_norm": 3.27205228805542, "learning_rate": 4.99e-07, "loss": 7.4944, "step": 998 }, { "epoch": 0.31316614420062694, "grad_norm": 4.175319194793701, "learning_rate": 4.995e-07, "loss": 10.0083, "step": 999 }, { "epoch": 0.31347962382445144, "grad_norm": 4.775247097015381, "learning_rate": 5.000000000000001e-07, "loss": 10.5517, "step": 1000 }, { "epoch": 0.3137931034482759, "grad_norm": 4.07278299331665, "learning_rate": 5.005e-07, "loss": 10.0079, "step": 1001 }, { "epoch": 0.3141065830721003, "grad_norm": 3.5604565143585205, "learning_rate": 5.01e-07, "loss": 9.4462, "step": 1002 }, { "epoch": 0.31442006269592476, "grad_norm": 4.0364990234375, "learning_rate": 5.015000000000001e-07, "loss": 10.3931, "step": 1003 }, { "epoch": 0.3147335423197492, "grad_norm": 4.032431125640869, "learning_rate": 5.02e-07, "loss": 8.9061, "step": 1004 }, { "epoch": 0.31504702194357365, "grad_norm": 3.2184982299804688, "learning_rate": 5.025000000000001e-07, "loss": 7.8915, "step": 1005 }, { "epoch": 0.31536050156739814, "grad_norm": 3.669436454772949, "learning_rate": 5.03e-07, "loss": 8.4085, "step": 1006 }, { "epoch": 0.3156739811912226, "grad_norm": 3.7609059810638428, "learning_rate": 5.035e-07, "loss": 10.8993, "step": 1007 }, { "epoch": 0.315987460815047, "grad_norm": 3.2515149116516113, "learning_rate": 5.040000000000001e-07, "loss": 10.0338, "step": 1008 }, { "epoch": 0.31630094043887147, "grad_norm": 4.648274898529053, "learning_rate": 5.045000000000001e-07, "loss": 11.952, "step": 1009 }, { "epoch": 0.3166144200626959, "grad_norm": 3.8987960815429688, "learning_rate": 5.05e-07, "loss": 8.06, "step": 1010 }, { "epoch": 0.31692789968652035, "grad_norm": 3.405416965484619, "learning_rate": 5.055e-07, "loss": 6.5823, "step": 1011 }, { "epoch": 0.31724137931034485, "grad_norm": 4.502613067626953, "learning_rate": 5.06e-07, "loss": 13.0741, "step": 1012 }, { "epoch": 0.3175548589341693, "grad_norm": 3.1743547916412354, "learning_rate": 5.065e-07, "loss": 6.6114, "step": 1013 }, { "epoch": 0.31786833855799373, "grad_norm": 3.0800437927246094, "learning_rate": 5.070000000000001e-07, "loss": 7.7446, "step": 1014 }, { "epoch": 0.3181818181818182, "grad_norm": 4.586092472076416, "learning_rate": 5.075000000000001e-07, "loss": 12.149, "step": 1015 }, { "epoch": 0.3184952978056426, "grad_norm": 4.496054649353027, "learning_rate": 5.08e-07, "loss": 9.6337, "step": 1016 }, { "epoch": 0.31880877742946706, "grad_norm": 2.7630536556243896, "learning_rate": 5.085000000000001e-07, "loss": 6.5648, "step": 1017 }, { "epoch": 0.31912225705329156, "grad_norm": 4.153172016143799, "learning_rate": 5.09e-07, "loss": 8.3805, "step": 1018 }, { "epoch": 0.319435736677116, "grad_norm": 3.8220574855804443, "learning_rate": 5.095000000000001e-07, "loss": 8.8453, "step": 1019 }, { "epoch": 0.31974921630094044, "grad_norm": 3.7293951511383057, "learning_rate": 5.1e-07, "loss": 9.2166, "step": 1020 }, { "epoch": 0.3200626959247649, "grad_norm": 3.5755627155303955, "learning_rate": 5.105e-07, "loss": 8.9008, "step": 1021 }, { "epoch": 0.3203761755485893, "grad_norm": 3.9862473011016846, "learning_rate": 5.110000000000001e-07, "loss": 11.2926, "step": 1022 }, { "epoch": 0.32068965517241377, "grad_norm": 3.1247386932373047, "learning_rate": 5.115000000000001e-07, "loss": 7.0044, "step": 1023 }, { "epoch": 0.32100313479623827, "grad_norm": 3.4025156497955322, "learning_rate": 5.12e-07, "loss": 8.6599, "step": 1024 }, { "epoch": 0.3213166144200627, "grad_norm": 3.8938844203948975, "learning_rate": 5.125e-07, "loss": 7.4781, "step": 1025 }, { "epoch": 0.32163009404388715, "grad_norm": 3.694965124130249, "learning_rate": 5.13e-07, "loss": 8.2075, "step": 1026 }, { "epoch": 0.3219435736677116, "grad_norm": 3.9770612716674805, "learning_rate": 5.135e-07, "loss": 6.7056, "step": 1027 }, { "epoch": 0.32225705329153603, "grad_norm": 3.3346900939941406, "learning_rate": 5.140000000000001e-07, "loss": 6.8695, "step": 1028 }, { "epoch": 0.3225705329153605, "grad_norm": 4.177748680114746, "learning_rate": 5.145000000000001e-07, "loss": 11.471, "step": 1029 }, { "epoch": 0.322884012539185, "grad_norm": 2.610774517059326, "learning_rate": 5.15e-07, "loss": 7.1106, "step": 1030 }, { "epoch": 0.3231974921630094, "grad_norm": 4.45445442199707, "learning_rate": 5.155e-07, "loss": 9.852, "step": 1031 }, { "epoch": 0.32351097178683386, "grad_norm": 3.7908337116241455, "learning_rate": 5.16e-07, "loss": 7.8356, "step": 1032 }, { "epoch": 0.3238244514106583, "grad_norm": 4.410386562347412, "learning_rate": 5.165e-07, "loss": 9.0971, "step": 1033 }, { "epoch": 0.32413793103448274, "grad_norm": 3.542137384414673, "learning_rate": 5.170000000000001e-07, "loss": 11.263, "step": 1034 }, { "epoch": 0.32445141065830724, "grad_norm": 4.1727681159973145, "learning_rate": 5.175e-07, "loss": 13.1233, "step": 1035 }, { "epoch": 0.3247648902821317, "grad_norm": 3.6058053970336914, "learning_rate": 5.180000000000001e-07, "loss": 9.9525, "step": 1036 }, { "epoch": 0.3250783699059561, "grad_norm": 4.465059757232666, "learning_rate": 5.185000000000001e-07, "loss": 9.9752, "step": 1037 }, { "epoch": 0.32539184952978056, "grad_norm": 4.481200695037842, "learning_rate": 5.19e-07, "loss": 9.631, "step": 1038 }, { "epoch": 0.325705329153605, "grad_norm": 4.167693138122559, "learning_rate": 5.195000000000001e-07, "loss": 10.6723, "step": 1039 }, { "epoch": 0.32601880877742945, "grad_norm": 4.743021011352539, "learning_rate": 5.2e-07, "loss": 10.3528, "step": 1040 }, { "epoch": 0.32633228840125394, "grad_norm": 3.419243812561035, "learning_rate": 5.205e-07, "loss": 6.2905, "step": 1041 }, { "epoch": 0.3266457680250784, "grad_norm": 3.0460777282714844, "learning_rate": 5.210000000000001e-07, "loss": 6.7207, "step": 1042 }, { "epoch": 0.32695924764890283, "grad_norm": 3.9700310230255127, "learning_rate": 5.215000000000001e-07, "loss": 8.2999, "step": 1043 }, { "epoch": 0.32727272727272727, "grad_norm": 3.008124351501465, "learning_rate": 5.22e-07, "loss": 7.1742, "step": 1044 }, { "epoch": 0.3275862068965517, "grad_norm": 3.2137064933776855, "learning_rate": 5.225e-07, "loss": 6.8929, "step": 1045 }, { "epoch": 0.32789968652037615, "grad_norm": 4.896831035614014, "learning_rate": 5.23e-07, "loss": 11.7279, "step": 1046 }, { "epoch": 0.32821316614420065, "grad_norm": 3.5546038150787354, "learning_rate": 5.235e-07, "loss": 7.8732, "step": 1047 }, { "epoch": 0.3285266457680251, "grad_norm": 3.5171022415161133, "learning_rate": 5.240000000000001e-07, "loss": 8.6413, "step": 1048 }, { "epoch": 0.32884012539184954, "grad_norm": 3.2662153244018555, "learning_rate": 5.245e-07, "loss": 8.4025, "step": 1049 }, { "epoch": 0.329153605015674, "grad_norm": 4.3712005615234375, "learning_rate": 5.250000000000001e-07, "loss": 8.6959, "step": 1050 }, { "epoch": 0.3294670846394984, "grad_norm": 3.437635660171509, "learning_rate": 5.255e-07, "loss": 9.4805, "step": 1051 }, { "epoch": 0.32978056426332286, "grad_norm": 4.1790876388549805, "learning_rate": 5.26e-07, "loss": 8.4143, "step": 1052 }, { "epoch": 0.33009404388714736, "grad_norm": 3.861347198486328, "learning_rate": 5.265000000000001e-07, "loss": 7.995, "step": 1053 }, { "epoch": 0.3304075235109718, "grad_norm": 4.775915622711182, "learning_rate": 5.27e-07, "loss": 13.2288, "step": 1054 }, { "epoch": 0.33072100313479624, "grad_norm": 3.08349347114563, "learning_rate": 5.275e-07, "loss": 8.4152, "step": 1055 }, { "epoch": 0.3310344827586207, "grad_norm": 4.555473804473877, "learning_rate": 5.280000000000001e-07, "loss": 9.4847, "step": 1056 }, { "epoch": 0.3313479623824451, "grad_norm": 4.14658260345459, "learning_rate": 5.285000000000001e-07, "loss": 9.6713, "step": 1057 }, { "epoch": 0.33166144200626957, "grad_norm": 3.729527711868286, "learning_rate": 5.29e-07, "loss": 9.5404, "step": 1058 }, { "epoch": 0.33197492163009407, "grad_norm": 4.110138893127441, "learning_rate": 5.295e-07, "loss": 7.4485, "step": 1059 }, { "epoch": 0.3322884012539185, "grad_norm": 3.4189460277557373, "learning_rate": 5.3e-07, "loss": 9.1945, "step": 1060 }, { "epoch": 0.33260188087774295, "grad_norm": 4.435842990875244, "learning_rate": 5.305e-07, "loss": 9.3644, "step": 1061 }, { "epoch": 0.3329153605015674, "grad_norm": 3.913818836212158, "learning_rate": 5.310000000000001e-07, "loss": 10.439, "step": 1062 }, { "epoch": 0.33322884012539183, "grad_norm": 5.079927921295166, "learning_rate": 5.315000000000001e-07, "loss": 11.0371, "step": 1063 }, { "epoch": 0.3335423197492163, "grad_norm": 3.028818368911743, "learning_rate": 5.32e-07, "loss": 6.7478, "step": 1064 }, { "epoch": 0.3338557993730408, "grad_norm": 2.9255435466766357, "learning_rate": 5.325e-07, "loss": 6.9333, "step": 1065 }, { "epoch": 0.3341692789968652, "grad_norm": 3.4844298362731934, "learning_rate": 5.33e-07, "loss": 8.3449, "step": 1066 }, { "epoch": 0.33448275862068966, "grad_norm": 4.424320697784424, "learning_rate": 5.335000000000001e-07, "loss": 11.034, "step": 1067 }, { "epoch": 0.3347962382445141, "grad_norm": 4.413206577301025, "learning_rate": 5.340000000000001e-07, "loss": 9.8029, "step": 1068 }, { "epoch": 0.33510971786833854, "grad_norm": 3.4747884273529053, "learning_rate": 5.345e-07, "loss": 8.9154, "step": 1069 }, { "epoch": 0.335423197492163, "grad_norm": 3.6665916442871094, "learning_rate": 5.350000000000001e-07, "loss": 7.1011, "step": 1070 }, { "epoch": 0.3357366771159875, "grad_norm": 2.8998830318450928, "learning_rate": 5.355e-07, "loss": 7.7268, "step": 1071 }, { "epoch": 0.3360501567398119, "grad_norm": 3.9192614555358887, "learning_rate": 5.36e-07, "loss": 10.9592, "step": 1072 }, { "epoch": 0.33636363636363636, "grad_norm": 2.7648117542266846, "learning_rate": 5.365000000000001e-07, "loss": 7.8875, "step": 1073 }, { "epoch": 0.3366771159874608, "grad_norm": 4.6224751472473145, "learning_rate": 5.37e-07, "loss": 10.7205, "step": 1074 }, { "epoch": 0.33699059561128525, "grad_norm": 3.7434327602386475, "learning_rate": 5.375e-07, "loss": 9.4757, "step": 1075 }, { "epoch": 0.3373040752351097, "grad_norm": 2.9543545246124268, "learning_rate": 5.380000000000001e-07, "loss": 6.5248, "step": 1076 }, { "epoch": 0.3376175548589342, "grad_norm": 4.7394185066223145, "learning_rate": 5.385000000000001e-07, "loss": 16.7804, "step": 1077 }, { "epoch": 0.33793103448275863, "grad_norm": 3.574186325073242, "learning_rate": 5.39e-07, "loss": 7.1065, "step": 1078 }, { "epoch": 0.33824451410658307, "grad_norm": 2.6284937858581543, "learning_rate": 5.395e-07, "loss": 6.8801, "step": 1079 }, { "epoch": 0.3385579937304075, "grad_norm": 3.36057710647583, "learning_rate": 5.4e-07, "loss": 10.2646, "step": 1080 }, { "epoch": 0.33887147335423196, "grad_norm": 4.6945624351501465, "learning_rate": 5.405000000000001e-07, "loss": 12.4643, "step": 1081 }, { "epoch": 0.33918495297805645, "grad_norm": 3.1167168617248535, "learning_rate": 5.410000000000001e-07, "loss": 6.7228, "step": 1082 }, { "epoch": 0.3394984326018809, "grad_norm": 3.513232469558716, "learning_rate": 5.415e-07, "loss": 9.1163, "step": 1083 }, { "epoch": 0.33981191222570534, "grad_norm": 5.232317924499512, "learning_rate": 5.420000000000001e-07, "loss": 11.4712, "step": 1084 }, { "epoch": 0.3401253918495298, "grad_norm": 3.7332653999328613, "learning_rate": 5.425e-07, "loss": 7.6378, "step": 1085 }, { "epoch": 0.3404388714733542, "grad_norm": 4.457612037658691, "learning_rate": 5.43e-07, "loss": 11.9674, "step": 1086 }, { "epoch": 0.34075235109717866, "grad_norm": 3.196615695953369, "learning_rate": 5.435000000000001e-07, "loss": 8.66, "step": 1087 }, { "epoch": 0.34106583072100316, "grad_norm": 4.207455635070801, "learning_rate": 5.44e-07, "loss": 8.5547, "step": 1088 }, { "epoch": 0.3413793103448276, "grad_norm": 3.803297519683838, "learning_rate": 5.445e-07, "loss": 8.1143, "step": 1089 }, { "epoch": 0.34169278996865204, "grad_norm": 3.9146108627319336, "learning_rate": 5.450000000000001e-07, "loss": 8.9048, "step": 1090 }, { "epoch": 0.3420062695924765, "grad_norm": 3.0338478088378906, "learning_rate": 5.455e-07, "loss": 7.1311, "step": 1091 }, { "epoch": 0.34231974921630093, "grad_norm": 3.301464557647705, "learning_rate": 5.46e-07, "loss": 7.2672, "step": 1092 }, { "epoch": 0.34263322884012537, "grad_norm": 3.3662900924682617, "learning_rate": 5.465e-07, "loss": 9.0924, "step": 1093 }, { "epoch": 0.34294670846394987, "grad_norm": 3.946822166442871, "learning_rate": 5.47e-07, "loss": 10.3786, "step": 1094 }, { "epoch": 0.3432601880877743, "grad_norm": 3.5544939041137695, "learning_rate": 5.475e-07, "loss": 7.2466, "step": 1095 }, { "epoch": 0.34357366771159875, "grad_norm": 3.7304489612579346, "learning_rate": 5.480000000000001e-07, "loss": 9.4879, "step": 1096 }, { "epoch": 0.3438871473354232, "grad_norm": 4.137592315673828, "learning_rate": 5.485e-07, "loss": 8.2506, "step": 1097 }, { "epoch": 0.34420062695924764, "grad_norm": 3.8458902835845947, "learning_rate": 5.490000000000001e-07, "loss": 10.3391, "step": 1098 }, { "epoch": 0.3445141065830721, "grad_norm": 3.944636106491089, "learning_rate": 5.495e-07, "loss": 10.2381, "step": 1099 }, { "epoch": 0.3448275862068966, "grad_norm": 2.935556411743164, "learning_rate": 5.5e-07, "loss": 6.7504, "step": 1100 }, { "epoch": 0.345141065830721, "grad_norm": 2.839891195297241, "learning_rate": 5.505000000000001e-07, "loss": 6.2424, "step": 1101 }, { "epoch": 0.34545454545454546, "grad_norm": 3.0521023273468018, "learning_rate": 5.510000000000001e-07, "loss": 8.1019, "step": 1102 }, { "epoch": 0.3457680250783699, "grad_norm": 2.7549774646759033, "learning_rate": 5.515e-07, "loss": 6.9908, "step": 1103 }, { "epoch": 0.34608150470219434, "grad_norm": 2.460770845413208, "learning_rate": 5.520000000000001e-07, "loss": 6.4588, "step": 1104 }, { "epoch": 0.3463949843260188, "grad_norm": 3.0610549449920654, "learning_rate": 5.525e-07, "loss": 7.1453, "step": 1105 }, { "epoch": 0.3467084639498433, "grad_norm": 3.7821202278137207, "learning_rate": 5.53e-07, "loss": 8.4413, "step": 1106 }, { "epoch": 0.3470219435736677, "grad_norm": 3.272524356842041, "learning_rate": 5.535000000000001e-07, "loss": 7.5479, "step": 1107 }, { "epoch": 0.34733542319749217, "grad_norm": 4.5854082107543945, "learning_rate": 5.54e-07, "loss": 12.7431, "step": 1108 }, { "epoch": 0.3476489028213166, "grad_norm": 3.382383346557617, "learning_rate": 5.545e-07, "loss": 8.0938, "step": 1109 }, { "epoch": 0.34796238244514105, "grad_norm": 4.590867042541504, "learning_rate": 5.550000000000001e-07, "loss": 9.5862, "step": 1110 }, { "epoch": 0.3482758620689655, "grad_norm": 3.3720743656158447, "learning_rate": 5.555e-07, "loss": 7.8215, "step": 1111 }, { "epoch": 0.34858934169279, "grad_norm": 3.0238683223724365, "learning_rate": 5.560000000000001e-07, "loss": 8.9804, "step": 1112 }, { "epoch": 0.34890282131661443, "grad_norm": 3.9639229774475098, "learning_rate": 5.565e-07, "loss": 9.2357, "step": 1113 }, { "epoch": 0.3492163009404389, "grad_norm": 2.6964738368988037, "learning_rate": 5.57e-07, "loss": 7.0609, "step": 1114 }, { "epoch": 0.3495297805642633, "grad_norm": 4.804142475128174, "learning_rate": 5.575000000000001e-07, "loss": 13.9845, "step": 1115 }, { "epoch": 0.34984326018808776, "grad_norm": 4.389916896820068, "learning_rate": 5.580000000000001e-07, "loss": 7.7403, "step": 1116 }, { "epoch": 0.3501567398119122, "grad_norm": 4.200212478637695, "learning_rate": 5.585e-07, "loss": 9.193, "step": 1117 }, { "epoch": 0.3504702194357367, "grad_norm": 4.732844829559326, "learning_rate": 5.590000000000001e-07, "loss": 13.0463, "step": 1118 }, { "epoch": 0.35078369905956114, "grad_norm": 2.878286123275757, "learning_rate": 5.595e-07, "loss": 6.3315, "step": 1119 }, { "epoch": 0.3510971786833856, "grad_norm": 4.683932781219482, "learning_rate": 5.6e-07, "loss": 10.1464, "step": 1120 }, { "epoch": 0.35141065830721, "grad_norm": 3.83774995803833, "learning_rate": 5.605000000000001e-07, "loss": 8.5081, "step": 1121 }, { "epoch": 0.35172413793103446, "grad_norm": 3.030034303665161, "learning_rate": 5.61e-07, "loss": 6.6782, "step": 1122 }, { "epoch": 0.35203761755485896, "grad_norm": 3.7640645503997803, "learning_rate": 5.615e-07, "loss": 7.6323, "step": 1123 }, { "epoch": 0.3523510971786834, "grad_norm": 3.636522054672241, "learning_rate": 5.620000000000001e-07, "loss": 8.9132, "step": 1124 }, { "epoch": 0.35266457680250785, "grad_norm": 5.256652355194092, "learning_rate": 5.625e-07, "loss": 13.0476, "step": 1125 }, { "epoch": 0.3529780564263323, "grad_norm": 3.7114644050598145, "learning_rate": 5.63e-07, "loss": 7.7855, "step": 1126 }, { "epoch": 0.35329153605015673, "grad_norm": 4.469037055969238, "learning_rate": 5.635e-07, "loss": 10.8488, "step": 1127 }, { "epoch": 0.35360501567398117, "grad_norm": 4.428511619567871, "learning_rate": 5.64e-07, "loss": 10.8417, "step": 1128 }, { "epoch": 0.35391849529780567, "grad_norm": 3.272402048110962, "learning_rate": 5.645000000000001e-07, "loss": 7.0908, "step": 1129 }, { "epoch": 0.3542319749216301, "grad_norm": 4.514960765838623, "learning_rate": 5.650000000000001e-07, "loss": 11.1969, "step": 1130 }, { "epoch": 0.35454545454545455, "grad_norm": 3.0566494464874268, "learning_rate": 5.655e-07, "loss": 6.9586, "step": 1131 }, { "epoch": 0.354858934169279, "grad_norm": 4.796217441558838, "learning_rate": 5.660000000000001e-07, "loss": 12.4509, "step": 1132 }, { "epoch": 0.35517241379310344, "grad_norm": 3.8200011253356934, "learning_rate": 5.665e-07, "loss": 7.6372, "step": 1133 }, { "epoch": 0.3554858934169279, "grad_norm": 4.99943733215332, "learning_rate": 5.67e-07, "loss": 16.3218, "step": 1134 }, { "epoch": 0.3557993730407524, "grad_norm": 4.074048042297363, "learning_rate": 5.675000000000001e-07, "loss": 11.7632, "step": 1135 }, { "epoch": 0.3561128526645768, "grad_norm": 3.519336700439453, "learning_rate": 5.680000000000001e-07, "loss": 12.5035, "step": 1136 }, { "epoch": 0.35642633228840126, "grad_norm": 3.5221669673919678, "learning_rate": 5.685e-07, "loss": 8.2566, "step": 1137 }, { "epoch": 0.3567398119122257, "grad_norm": 3.0210797786712646, "learning_rate": 5.690000000000001e-07, "loss": 7.6131, "step": 1138 }, { "epoch": 0.35705329153605014, "grad_norm": 3.848649740219116, "learning_rate": 5.695e-07, "loss": 9.2893, "step": 1139 }, { "epoch": 0.3573667711598746, "grad_norm": 3.4947142601013184, "learning_rate": 5.7e-07, "loss": 10.0174, "step": 1140 }, { "epoch": 0.3576802507836991, "grad_norm": 4.418102741241455, "learning_rate": 5.705e-07, "loss": 10.2442, "step": 1141 }, { "epoch": 0.3579937304075235, "grad_norm": 3.0057661533355713, "learning_rate": 5.71e-07, "loss": 6.8693, "step": 1142 }, { "epoch": 0.35830721003134797, "grad_norm": 5.070820331573486, "learning_rate": 5.715000000000001e-07, "loss": 12.646, "step": 1143 }, { "epoch": 0.3586206896551724, "grad_norm": 3.685112714767456, "learning_rate": 5.720000000000001e-07, "loss": 8.9385, "step": 1144 }, { "epoch": 0.35893416927899685, "grad_norm": 3.591017007827759, "learning_rate": 5.725e-07, "loss": 7.2102, "step": 1145 }, { "epoch": 0.3592476489028213, "grad_norm": 3.66312575340271, "learning_rate": 5.730000000000001e-07, "loss": 8.0567, "step": 1146 }, { "epoch": 0.3595611285266458, "grad_norm": 3.2532527446746826, "learning_rate": 5.735e-07, "loss": 7.5575, "step": 1147 }, { "epoch": 0.35987460815047023, "grad_norm": 2.452608108520508, "learning_rate": 5.74e-07, "loss": 6.1812, "step": 1148 }, { "epoch": 0.3601880877742947, "grad_norm": 4.427051067352295, "learning_rate": 5.745000000000001e-07, "loss": 9.7562, "step": 1149 }, { "epoch": 0.3605015673981191, "grad_norm": 4.048511981964111, "learning_rate": 5.750000000000001e-07, "loss": 7.2065, "step": 1150 }, { "epoch": 0.36081504702194356, "grad_norm": 4.313746929168701, "learning_rate": 5.755e-07, "loss": 12.0608, "step": 1151 }, { "epoch": 0.361128526645768, "grad_norm": 3.703238010406494, "learning_rate": 5.760000000000001e-07, "loss": 8.8626, "step": 1152 }, { "epoch": 0.3614420062695925, "grad_norm": 3.321089744567871, "learning_rate": 5.765e-07, "loss": 7.7751, "step": 1153 }, { "epoch": 0.36175548589341694, "grad_norm": 3.596169948577881, "learning_rate": 5.77e-07, "loss": 7.2633, "step": 1154 }, { "epoch": 0.3620689655172414, "grad_norm": 3.9182851314544678, "learning_rate": 5.775000000000001e-07, "loss": 8.9107, "step": 1155 }, { "epoch": 0.3623824451410658, "grad_norm": 2.809251070022583, "learning_rate": 5.78e-07, "loss": 7.5999, "step": 1156 }, { "epoch": 0.36269592476489027, "grad_norm": 3.9834253787994385, "learning_rate": 5.785e-07, "loss": 10.8149, "step": 1157 }, { "epoch": 0.3630094043887147, "grad_norm": 3.9162814617156982, "learning_rate": 5.790000000000001e-07, "loss": 9.0022, "step": 1158 }, { "epoch": 0.3633228840125392, "grad_norm": 3.7181437015533447, "learning_rate": 5.795e-07, "loss": 12.1042, "step": 1159 }, { "epoch": 0.36363636363636365, "grad_norm": 3.6745083332061768, "learning_rate": 5.800000000000001e-07, "loss": 10.9387, "step": 1160 }, { "epoch": 0.3639498432601881, "grad_norm": 3.6251866817474365, "learning_rate": 5.805e-07, "loss": 9.3794, "step": 1161 }, { "epoch": 0.36426332288401253, "grad_norm": 3.9675116539001465, "learning_rate": 5.81e-07, "loss": 7.4843, "step": 1162 }, { "epoch": 0.364576802507837, "grad_norm": 4.0218915939331055, "learning_rate": 5.815000000000001e-07, "loss": 11.3793, "step": 1163 }, { "epoch": 0.36489028213166147, "grad_norm": 4.011270999908447, "learning_rate": 5.820000000000001e-07, "loss": 8.5104, "step": 1164 }, { "epoch": 0.3652037617554859, "grad_norm": 3.468784809112549, "learning_rate": 5.825e-07, "loss": 7.195, "step": 1165 }, { "epoch": 0.36551724137931035, "grad_norm": 5.364189624786377, "learning_rate": 5.830000000000001e-07, "loss": 19.2203, "step": 1166 }, { "epoch": 0.3658307210031348, "grad_norm": 4.526871681213379, "learning_rate": 5.835e-07, "loss": 11.444, "step": 1167 }, { "epoch": 0.36614420062695924, "grad_norm": 3.9016737937927246, "learning_rate": 5.84e-07, "loss": 11.455, "step": 1168 }, { "epoch": 0.3664576802507837, "grad_norm": 3.691763401031494, "learning_rate": 5.845000000000001e-07, "loss": 8.6588, "step": 1169 }, { "epoch": 0.3667711598746082, "grad_norm": 3.842839241027832, "learning_rate": 5.850000000000001e-07, "loss": 8.317, "step": 1170 }, { "epoch": 0.3670846394984326, "grad_norm": 3.8796255588531494, "learning_rate": 5.855e-07, "loss": 9.903, "step": 1171 }, { "epoch": 0.36739811912225706, "grad_norm": 4.093709468841553, "learning_rate": 5.860000000000001e-07, "loss": 9.6562, "step": 1172 }, { "epoch": 0.3677115987460815, "grad_norm": 4.154341220855713, "learning_rate": 5.865e-07, "loss": 9.7709, "step": 1173 }, { "epoch": 0.36802507836990594, "grad_norm": 5.882456302642822, "learning_rate": 5.870000000000001e-07, "loss": 16.238, "step": 1174 }, { "epoch": 0.3683385579937304, "grad_norm": 4.124493598937988, "learning_rate": 5.875e-07, "loss": 9.4565, "step": 1175 }, { "epoch": 0.3686520376175549, "grad_norm": 3.4049675464630127, "learning_rate": 5.88e-07, "loss": 6.7959, "step": 1176 }, { "epoch": 0.3689655172413793, "grad_norm": 3.196589708328247, "learning_rate": 5.885000000000001e-07, "loss": 6.6816, "step": 1177 }, { "epoch": 0.36927899686520377, "grad_norm": 4.004443645477295, "learning_rate": 5.890000000000001e-07, "loss": 9.0686, "step": 1178 }, { "epoch": 0.3695924764890282, "grad_norm": 3.7830114364624023, "learning_rate": 5.895e-07, "loss": 8.4215, "step": 1179 }, { "epoch": 0.36990595611285265, "grad_norm": 16.382652282714844, "learning_rate": 5.900000000000001e-07, "loss": 8.5894, "step": 1180 }, { "epoch": 0.3702194357366771, "grad_norm": 4.150466442108154, "learning_rate": 5.905e-07, "loss": 9.9268, "step": 1181 }, { "epoch": 0.3705329153605016, "grad_norm": 4.039095401763916, "learning_rate": 5.91e-07, "loss": 7.6783, "step": 1182 }, { "epoch": 0.37084639498432603, "grad_norm": 4.113837242126465, "learning_rate": 5.915000000000001e-07, "loss": 10.9473, "step": 1183 }, { "epoch": 0.3711598746081505, "grad_norm": 4.3714447021484375, "learning_rate": 5.920000000000001e-07, "loss": 8.6596, "step": 1184 }, { "epoch": 0.3714733542319749, "grad_norm": 4.696350574493408, "learning_rate": 5.925e-07, "loss": 11.0915, "step": 1185 }, { "epoch": 0.37178683385579936, "grad_norm": 3.4268805980682373, "learning_rate": 5.930000000000001e-07, "loss": 8.7898, "step": 1186 }, { "epoch": 0.3721003134796238, "grad_norm": 3.7892448902130127, "learning_rate": 5.935e-07, "loss": 7.9897, "step": 1187 }, { "epoch": 0.3724137931034483, "grad_norm": 3.5174129009246826, "learning_rate": 5.94e-07, "loss": 8.9607, "step": 1188 }, { "epoch": 0.37272727272727274, "grad_norm": 3.5492613315582275, "learning_rate": 5.945000000000001e-07, "loss": 8.2248, "step": 1189 }, { "epoch": 0.3730407523510972, "grad_norm": 3.0557780265808105, "learning_rate": 5.95e-07, "loss": 7.4525, "step": 1190 }, { "epoch": 0.3733542319749216, "grad_norm": 3.610521078109741, "learning_rate": 5.955000000000001e-07, "loss": 8.6173, "step": 1191 }, { "epoch": 0.37366771159874607, "grad_norm": 5.024316310882568, "learning_rate": 5.960000000000001e-07, "loss": 9.0102, "step": 1192 }, { "epoch": 0.3739811912225705, "grad_norm": 3.3854024410247803, "learning_rate": 5.965e-07, "loss": 9.3159, "step": 1193 }, { "epoch": 0.374294670846395, "grad_norm": 3.692943572998047, "learning_rate": 5.970000000000001e-07, "loss": 6.6683, "step": 1194 }, { "epoch": 0.37460815047021945, "grad_norm": 4.656247138977051, "learning_rate": 5.975e-07, "loss": 9.6465, "step": 1195 }, { "epoch": 0.3749216300940439, "grad_norm": 2.7945048809051514, "learning_rate": 5.98e-07, "loss": 7.6828, "step": 1196 }, { "epoch": 0.37523510971786833, "grad_norm": 3.6382293701171875, "learning_rate": 5.985000000000001e-07, "loss": 8.2837, "step": 1197 }, { "epoch": 0.3755485893416928, "grad_norm": 4.916844844818115, "learning_rate": 5.990000000000001e-07, "loss": 10.6478, "step": 1198 }, { "epoch": 0.3758620689655172, "grad_norm": 2.6404612064361572, "learning_rate": 5.995e-07, "loss": 5.8349, "step": 1199 }, { "epoch": 0.3761755485893417, "grad_norm": 3.963083505630493, "learning_rate": 6.000000000000001e-07, "loss": 7.5912, "step": 1200 }, { "epoch": 0.37648902821316615, "grad_norm": 3.9304614067077637, "learning_rate": 6.005e-07, "loss": 9.4547, "step": 1201 }, { "epoch": 0.3768025078369906, "grad_norm": 3.4377453327178955, "learning_rate": 6.01e-07, "loss": 9.7902, "step": 1202 }, { "epoch": 0.37711598746081504, "grad_norm": 3.773444414138794, "learning_rate": 6.015000000000001e-07, "loss": 9.5372, "step": 1203 }, { "epoch": 0.3774294670846395, "grad_norm": 3.075242280960083, "learning_rate": 6.02e-07, "loss": 8.3831, "step": 1204 }, { "epoch": 0.3777429467084639, "grad_norm": 3.2014951705932617, "learning_rate": 6.025000000000001e-07, "loss": 7.9814, "step": 1205 }, { "epoch": 0.3780564263322884, "grad_norm": 3.752372980117798, "learning_rate": 6.030000000000001e-07, "loss": 8.109, "step": 1206 }, { "epoch": 0.37836990595611286, "grad_norm": 4.027518272399902, "learning_rate": 6.035e-07, "loss": 7.6091, "step": 1207 }, { "epoch": 0.3786833855799373, "grad_norm": 3.438729763031006, "learning_rate": 6.040000000000001e-07, "loss": 6.8037, "step": 1208 }, { "epoch": 0.37899686520376175, "grad_norm": 4.082209587097168, "learning_rate": 6.045e-07, "loss": 9.9197, "step": 1209 }, { "epoch": 0.3793103448275862, "grad_norm": 5.536364555358887, "learning_rate": 6.05e-07, "loss": 13.404, "step": 1210 }, { "epoch": 0.3796238244514107, "grad_norm": 3.8817527294158936, "learning_rate": 6.055000000000001e-07, "loss": 11.2098, "step": 1211 }, { "epoch": 0.3799373040752351, "grad_norm": 3.6680006980895996, "learning_rate": 6.060000000000001e-07, "loss": 8.0359, "step": 1212 }, { "epoch": 0.38025078369905957, "grad_norm": 4.336513519287109, "learning_rate": 6.065e-07, "loss": 10.526, "step": 1213 }, { "epoch": 0.380564263322884, "grad_norm": 3.7376954555511475, "learning_rate": 6.07e-07, "loss": 9.7812, "step": 1214 }, { "epoch": 0.38087774294670845, "grad_norm": 4.394288539886475, "learning_rate": 6.075e-07, "loss": 10.5658, "step": 1215 }, { "epoch": 0.3811912225705329, "grad_norm": 3.9178755283355713, "learning_rate": 6.08e-07, "loss": 9.1735, "step": 1216 }, { "epoch": 0.3815047021943574, "grad_norm": 3.385538339614868, "learning_rate": 6.085000000000001e-07, "loss": 8.2474, "step": 1217 }, { "epoch": 0.38181818181818183, "grad_norm": 3.3162765502929688, "learning_rate": 6.090000000000001e-07, "loss": 7.6934, "step": 1218 }, { "epoch": 0.3821316614420063, "grad_norm": 4.181975364685059, "learning_rate": 6.095e-07, "loss": 9.205, "step": 1219 }, { "epoch": 0.3824451410658307, "grad_norm": 3.707218647003174, "learning_rate": 6.100000000000001e-07, "loss": 7.6344, "step": 1220 }, { "epoch": 0.38275862068965516, "grad_norm": 3.0025179386138916, "learning_rate": 6.105e-07, "loss": 8.5384, "step": 1221 }, { "epoch": 0.3830721003134796, "grad_norm": 3.2136359214782715, "learning_rate": 6.110000000000001e-07, "loss": 7.3391, "step": 1222 }, { "epoch": 0.3833855799373041, "grad_norm": 3.87955904006958, "learning_rate": 6.115000000000001e-07, "loss": 9.051, "step": 1223 }, { "epoch": 0.38369905956112854, "grad_norm": 2.9862728118896484, "learning_rate": 6.12e-07, "loss": 6.7873, "step": 1224 }, { "epoch": 0.384012539184953, "grad_norm": 4.336016654968262, "learning_rate": 6.125000000000001e-07, "loss": 8.1016, "step": 1225 }, { "epoch": 0.3843260188087774, "grad_norm": 3.270927906036377, "learning_rate": 6.130000000000001e-07, "loss": 9.3506, "step": 1226 }, { "epoch": 0.38463949843260187, "grad_norm": 4.06575870513916, "learning_rate": 6.135e-07, "loss": 8.5931, "step": 1227 }, { "epoch": 0.3849529780564263, "grad_norm": 3.6191561222076416, "learning_rate": 6.140000000000001e-07, "loss": 8.4115, "step": 1228 }, { "epoch": 0.3852664576802508, "grad_norm": 2.8012547492980957, "learning_rate": 6.145e-07, "loss": 7.0199, "step": 1229 }, { "epoch": 0.38557993730407525, "grad_norm": 4.054346084594727, "learning_rate": 6.15e-07, "loss": 9.9699, "step": 1230 }, { "epoch": 0.3858934169278997, "grad_norm": 4.556372165679932, "learning_rate": 6.155000000000001e-07, "loss": 14.4437, "step": 1231 }, { "epoch": 0.38620689655172413, "grad_norm": 4.824500560760498, "learning_rate": 6.160000000000001e-07, "loss": 10.5535, "step": 1232 }, { "epoch": 0.3865203761755486, "grad_norm": 4.935982704162598, "learning_rate": 6.165e-07, "loss": 8.3582, "step": 1233 }, { "epoch": 0.386833855799373, "grad_norm": 8.828557014465332, "learning_rate": 6.17e-07, "loss": 19.3321, "step": 1234 }, { "epoch": 0.3871473354231975, "grad_norm": 2.967073678970337, "learning_rate": 6.175e-07, "loss": 6.7905, "step": 1235 }, { "epoch": 0.38746081504702196, "grad_norm": 4.4314799308776855, "learning_rate": 6.180000000000001e-07, "loss": 10.2915, "step": 1236 }, { "epoch": 0.3877742946708464, "grad_norm": 3.622016191482544, "learning_rate": 6.185000000000001e-07, "loss": 9.6284, "step": 1237 }, { "epoch": 0.38808777429467084, "grad_norm": 3.8630599975585938, "learning_rate": 6.19e-07, "loss": 7.4754, "step": 1238 }, { "epoch": 0.3884012539184953, "grad_norm": 3.7421834468841553, "learning_rate": 6.195000000000001e-07, "loss": 8.5903, "step": 1239 }, { "epoch": 0.3887147335423197, "grad_norm": 3.313210964202881, "learning_rate": 6.200000000000001e-07, "loss": 9.3886, "step": 1240 }, { "epoch": 0.3890282131661442, "grad_norm": 3.029578924179077, "learning_rate": 6.205e-07, "loss": 7.3684, "step": 1241 }, { "epoch": 0.38934169278996866, "grad_norm": 4.0817670822143555, "learning_rate": 6.210000000000001e-07, "loss": 8.6238, "step": 1242 }, { "epoch": 0.3896551724137931, "grad_norm": 3.1163382530212402, "learning_rate": 6.215e-07, "loss": 7.8326, "step": 1243 }, { "epoch": 0.38996865203761755, "grad_norm": 3.418872833251953, "learning_rate": 6.22e-07, "loss": 9.6686, "step": 1244 }, { "epoch": 0.390282131661442, "grad_norm": 3.265856981277466, "learning_rate": 6.225000000000001e-07, "loss": 6.8915, "step": 1245 }, { "epoch": 0.39059561128526643, "grad_norm": 2.5802524089813232, "learning_rate": 6.230000000000001e-07, "loss": 5.6296, "step": 1246 }, { "epoch": 0.39090909090909093, "grad_norm": 3.2665584087371826, "learning_rate": 6.235e-07, "loss": 7.1075, "step": 1247 }, { "epoch": 0.39122257053291537, "grad_norm": 4.159616470336914, "learning_rate": 6.24e-07, "loss": 11.453, "step": 1248 }, { "epoch": 0.3915360501567398, "grad_norm": 4.0924482345581055, "learning_rate": 6.245e-07, "loss": 9.2337, "step": 1249 }, { "epoch": 0.39184952978056425, "grad_norm": 3.432142972946167, "learning_rate": 6.25e-07, "loss": 8.3582, "step": 1250 }, { "epoch": 0.3921630094043887, "grad_norm": 3.7760117053985596, "learning_rate": 6.255e-07, "loss": 10.0285, "step": 1251 }, { "epoch": 0.3924764890282132, "grad_norm": 3.153862237930298, "learning_rate": 6.260000000000001e-07, "loss": 7.4688, "step": 1252 }, { "epoch": 0.39278996865203764, "grad_norm": 3.4726884365081787, "learning_rate": 6.265000000000001e-07, "loss": 9.7888, "step": 1253 }, { "epoch": 0.3931034482758621, "grad_norm": 2.944378614425659, "learning_rate": 6.270000000000001e-07, "loss": 7.6955, "step": 1254 }, { "epoch": 0.3934169278996865, "grad_norm": 3.5531530380249023, "learning_rate": 6.275e-07, "loss": 6.9592, "step": 1255 }, { "epoch": 0.39373040752351096, "grad_norm": 4.495529651641846, "learning_rate": 6.28e-07, "loss": 13.8164, "step": 1256 }, { "epoch": 0.3940438871473354, "grad_norm": 3.5309019088745117, "learning_rate": 6.285000000000001e-07, "loss": 8.0689, "step": 1257 }, { "epoch": 0.3943573667711599, "grad_norm": 5.7128682136535645, "learning_rate": 6.29e-07, "loss": 14.1, "step": 1258 }, { "epoch": 0.39467084639498434, "grad_norm": 2.9314987659454346, "learning_rate": 6.295000000000001e-07, "loss": 6.5813, "step": 1259 }, { "epoch": 0.3949843260188088, "grad_norm": 4.196994781494141, "learning_rate": 6.3e-07, "loss": 11.5749, "step": 1260 }, { "epoch": 0.3952978056426332, "grad_norm": 3.2020435333251953, "learning_rate": 6.305e-07, "loss": 7.9098, "step": 1261 }, { "epoch": 0.39561128526645767, "grad_norm": 4.530463218688965, "learning_rate": 6.310000000000001e-07, "loss": 13.4221, "step": 1262 }, { "epoch": 0.3959247648902821, "grad_norm": 3.823436975479126, "learning_rate": 6.315e-07, "loss": 10.8347, "step": 1263 }, { "epoch": 0.3962382445141066, "grad_norm": 4.340320110321045, "learning_rate": 6.320000000000002e-07, "loss": 10.6129, "step": 1264 }, { "epoch": 0.39655172413793105, "grad_norm": 4.320498466491699, "learning_rate": 6.325000000000001e-07, "loss": 9.6761, "step": 1265 }, { "epoch": 0.3968652037617555, "grad_norm": 3.8908495903015137, "learning_rate": 6.33e-07, "loss": 9.3212, "step": 1266 }, { "epoch": 0.39717868338557993, "grad_norm": 4.79443359375, "learning_rate": 6.335000000000001e-07, "loss": 8.4957, "step": 1267 }, { "epoch": 0.3974921630094044, "grad_norm": 3.5879569053649902, "learning_rate": 6.34e-07, "loss": 7.9517, "step": 1268 }, { "epoch": 0.3978056426332288, "grad_norm": 3.9859533309936523, "learning_rate": 6.345000000000001e-07, "loss": 10.1546, "step": 1269 }, { "epoch": 0.3981191222570533, "grad_norm": 4.457554817199707, "learning_rate": 6.350000000000001e-07, "loss": 11.0071, "step": 1270 }, { "epoch": 0.39843260188087776, "grad_norm": 3.4207284450531006, "learning_rate": 6.355e-07, "loss": 7.5192, "step": 1271 }, { "epoch": 0.3987460815047022, "grad_norm": 2.9599740505218506, "learning_rate": 6.360000000000001e-07, "loss": 8.1417, "step": 1272 }, { "epoch": 0.39905956112852664, "grad_norm": 5.587130069732666, "learning_rate": 6.365000000000001e-07, "loss": 13.4308, "step": 1273 }, { "epoch": 0.3993730407523511, "grad_norm": 4.257179260253906, "learning_rate": 6.370000000000001e-07, "loss": 9.8754, "step": 1274 }, { "epoch": 0.3996865203761755, "grad_norm": 3.8205981254577637, "learning_rate": 6.375e-07, "loss": 7.3584, "step": 1275 }, { "epoch": 0.4, "grad_norm": 4.358394145965576, "learning_rate": 6.38e-07, "loss": 8.0118, "step": 1276 }, { "epoch": 0.40031347962382446, "grad_norm": 3.71220326423645, "learning_rate": 6.385000000000001e-07, "loss": 10.123, "step": 1277 }, { "epoch": 0.4006269592476489, "grad_norm": 3.896798610687256, "learning_rate": 6.39e-07, "loss": 7.8256, "step": 1278 }, { "epoch": 0.40094043887147335, "grad_norm": 3.1705808639526367, "learning_rate": 6.395000000000001e-07, "loss": 7.985, "step": 1279 }, { "epoch": 0.4012539184952978, "grad_norm": 5.2914838790893555, "learning_rate": 6.4e-07, "loss": 10.3964, "step": 1280 }, { "epoch": 0.40156739811912223, "grad_norm": 3.0464425086975098, "learning_rate": 6.405e-07, "loss": 6.6271, "step": 1281 }, { "epoch": 0.40188087774294673, "grad_norm": 4.177611827850342, "learning_rate": 6.410000000000001e-07, "loss": 12.3688, "step": 1282 }, { "epoch": 0.40219435736677117, "grad_norm": 3.6782820224761963, "learning_rate": 6.415e-07, "loss": 7.3459, "step": 1283 }, { "epoch": 0.4025078369905956, "grad_norm": 3.16139817237854, "learning_rate": 6.42e-07, "loss": 7.4459, "step": 1284 }, { "epoch": 0.40282131661442006, "grad_norm": 3.282700538635254, "learning_rate": 6.425000000000001e-07, "loss": 6.6329, "step": 1285 }, { "epoch": 0.4031347962382445, "grad_norm": 3.4450936317443848, "learning_rate": 6.43e-07, "loss": 9.4665, "step": 1286 }, { "epoch": 0.40344827586206894, "grad_norm": 2.994030475616455, "learning_rate": 6.435000000000001e-07, "loss": 6.7493, "step": 1287 }, { "epoch": 0.40376175548589344, "grad_norm": 4.3425822257995605, "learning_rate": 6.44e-07, "loss": 9.9379, "step": 1288 }, { "epoch": 0.4040752351097179, "grad_norm": 4.892250061035156, "learning_rate": 6.445e-07, "loss": 11.3928, "step": 1289 }, { "epoch": 0.4043887147335423, "grad_norm": 3.246832847595215, "learning_rate": 6.450000000000001e-07, "loss": 7.554, "step": 1290 }, { "epoch": 0.40470219435736676, "grad_norm": 4.307924270629883, "learning_rate": 6.455e-07, "loss": 11.633, "step": 1291 }, { "epoch": 0.4050156739811912, "grad_norm": 3.2229695320129395, "learning_rate": 6.460000000000001e-07, "loss": 6.5894, "step": 1292 }, { "epoch": 0.40532915360501565, "grad_norm": 4.491090774536133, "learning_rate": 6.465000000000001e-07, "loss": 12.8065, "step": 1293 }, { "epoch": 0.40564263322884014, "grad_norm": 3.604726552963257, "learning_rate": 6.47e-07, "loss": 7.5955, "step": 1294 }, { "epoch": 0.4059561128526646, "grad_norm": 2.8530771732330322, "learning_rate": 6.475e-07, "loss": 6.6698, "step": 1295 }, { "epoch": 0.406269592476489, "grad_norm": 3.601217746734619, "learning_rate": 6.48e-07, "loss": 7.8026, "step": 1296 }, { "epoch": 0.40658307210031347, "grad_norm": 3.475271701812744, "learning_rate": 6.485000000000001e-07, "loss": 7.1792, "step": 1297 }, { "epoch": 0.4068965517241379, "grad_norm": 3.568582057952881, "learning_rate": 6.490000000000001e-07, "loss": 8.6444, "step": 1298 }, { "epoch": 0.4072100313479624, "grad_norm": 3.6762585639953613, "learning_rate": 6.495e-07, "loss": 7.4771, "step": 1299 }, { "epoch": 0.40752351097178685, "grad_norm": 4.190446376800537, "learning_rate": 6.5e-07, "loss": 12.6181, "step": 1300 }, { "epoch": 0.4078369905956113, "grad_norm": 3.1675896644592285, "learning_rate": 6.505000000000001e-07, "loss": 6.2409, "step": 1301 }, { "epoch": 0.40815047021943573, "grad_norm": 4.8222856521606445, "learning_rate": 6.510000000000001e-07, "loss": 11.7033, "step": 1302 }, { "epoch": 0.4084639498432602, "grad_norm": 4.360724925994873, "learning_rate": 6.515e-07, "loss": 13.757, "step": 1303 }, { "epoch": 0.4087774294670846, "grad_norm": 3.622544288635254, "learning_rate": 6.52e-07, "loss": 9.4418, "step": 1304 }, { "epoch": 0.4090909090909091, "grad_norm": 3.5663022994995117, "learning_rate": 6.525000000000001e-07, "loss": 7.6926, "step": 1305 }, { "epoch": 0.40940438871473356, "grad_norm": 4.033201694488525, "learning_rate": 6.53e-07, "loss": 8.5123, "step": 1306 }, { "epoch": 0.409717868338558, "grad_norm": 3.480999708175659, "learning_rate": 6.535000000000001e-07, "loss": 7.2778, "step": 1307 }, { "epoch": 0.41003134796238244, "grad_norm": 3.1945245265960693, "learning_rate": 6.54e-07, "loss": 7.371, "step": 1308 }, { "epoch": 0.4103448275862069, "grad_norm": 3.4284608364105225, "learning_rate": 6.545e-07, "loss": 7.7181, "step": 1309 }, { "epoch": 0.4106583072100313, "grad_norm": 6.255939483642578, "learning_rate": 6.550000000000001e-07, "loss": 13.9958, "step": 1310 }, { "epoch": 0.4109717868338558, "grad_norm": 3.8257150650024414, "learning_rate": 6.555e-07, "loss": 9.3615, "step": 1311 }, { "epoch": 0.41128526645768027, "grad_norm": 5.354909420013428, "learning_rate": 6.560000000000002e-07, "loss": 10.8434, "step": 1312 }, { "epoch": 0.4115987460815047, "grad_norm": 4.636758804321289, "learning_rate": 6.565000000000001e-07, "loss": 10.2746, "step": 1313 }, { "epoch": 0.41191222570532915, "grad_norm": 4.180166721343994, "learning_rate": 6.57e-07, "loss": 10.871, "step": 1314 }, { "epoch": 0.4122257053291536, "grad_norm": 4.635682106018066, "learning_rate": 6.575000000000001e-07, "loss": 9.849, "step": 1315 }, { "epoch": 0.41253918495297803, "grad_norm": 3.4844233989715576, "learning_rate": 6.58e-07, "loss": 10.2331, "step": 1316 }, { "epoch": 0.41285266457680253, "grad_norm": 2.8132760524749756, "learning_rate": 6.585000000000001e-07, "loss": 6.6116, "step": 1317 }, { "epoch": 0.413166144200627, "grad_norm": 3.6411001682281494, "learning_rate": 6.590000000000001e-07, "loss": 9.5894, "step": 1318 }, { "epoch": 0.4134796238244514, "grad_norm": 3.4417355060577393, "learning_rate": 6.595e-07, "loss": 8.5706, "step": 1319 }, { "epoch": 0.41379310344827586, "grad_norm": 4.843432903289795, "learning_rate": 6.6e-07, "loss": 8.9815, "step": 1320 }, { "epoch": 0.4141065830721003, "grad_norm": 4.353475093841553, "learning_rate": 6.605000000000001e-07, "loss": 7.6671, "step": 1321 }, { "epoch": 0.41442006269592474, "grad_norm": 3.1126198768615723, "learning_rate": 6.610000000000001e-07, "loss": 7.3491, "step": 1322 }, { "epoch": 0.41473354231974924, "grad_norm": 2.952798366546631, "learning_rate": 6.615e-07, "loss": 6.6309, "step": 1323 }, { "epoch": 0.4150470219435737, "grad_norm": 3.8614158630371094, "learning_rate": 6.62e-07, "loss": 7.1688, "step": 1324 }, { "epoch": 0.4153605015673981, "grad_norm": 3.264158248901367, "learning_rate": 6.625000000000001e-07, "loss": 7.4095, "step": 1325 }, { "epoch": 0.41567398119122256, "grad_norm": 4.060593605041504, "learning_rate": 6.63e-07, "loss": 11.2259, "step": 1326 }, { "epoch": 0.415987460815047, "grad_norm": 3.954174757003784, "learning_rate": 6.635000000000001e-07, "loss": 9.4012, "step": 1327 }, { "epoch": 0.41630094043887145, "grad_norm": 3.335627555847168, "learning_rate": 6.64e-07, "loss": 7.6803, "step": 1328 }, { "epoch": 0.41661442006269594, "grad_norm": 3.3831787109375, "learning_rate": 6.645000000000001e-07, "loss": 8.848, "step": 1329 }, { "epoch": 0.4169278996865204, "grad_norm": 4.064548492431641, "learning_rate": 6.650000000000001e-07, "loss": 7.7122, "step": 1330 }, { "epoch": 0.41724137931034483, "grad_norm": 3.576258659362793, "learning_rate": 6.655e-07, "loss": 9.0345, "step": 1331 }, { "epoch": 0.41755485893416927, "grad_norm": 4.037924766540527, "learning_rate": 6.660000000000002e-07, "loss": 10.0141, "step": 1332 }, { "epoch": 0.4178683385579937, "grad_norm": 4.519690990447998, "learning_rate": 6.665000000000001e-07, "loss": 7.9251, "step": 1333 }, { "epoch": 0.41818181818181815, "grad_norm": 3.73274827003479, "learning_rate": 6.67e-07, "loss": 9.2519, "step": 1334 }, { "epoch": 0.41849529780564265, "grad_norm": 3.267875909805298, "learning_rate": 6.675000000000001e-07, "loss": 6.2191, "step": 1335 }, { "epoch": 0.4188087774294671, "grad_norm": 3.341024160385132, "learning_rate": 6.68e-07, "loss": 7.4184, "step": 1336 }, { "epoch": 0.41912225705329154, "grad_norm": 2.8991432189941406, "learning_rate": 6.685000000000001e-07, "loss": 7.4803, "step": 1337 }, { "epoch": 0.419435736677116, "grad_norm": 4.8145928382873535, "learning_rate": 6.690000000000001e-07, "loss": 10.2224, "step": 1338 }, { "epoch": 0.4197492163009404, "grad_norm": 3.761445999145508, "learning_rate": 6.695e-07, "loss": 8.4562, "step": 1339 }, { "epoch": 0.4200626959247649, "grad_norm": 3.892420768737793, "learning_rate": 6.7e-07, "loss": 9.1571, "step": 1340 }, { "epoch": 0.42037617554858936, "grad_norm": 5.118911266326904, "learning_rate": 6.705000000000001e-07, "loss": 12.1094, "step": 1341 }, { "epoch": 0.4206896551724138, "grad_norm": 3.2898852825164795, "learning_rate": 6.710000000000001e-07, "loss": 7.4492, "step": 1342 }, { "epoch": 0.42100313479623824, "grad_norm": 2.7074368000030518, "learning_rate": 6.715e-07, "loss": 6.0427, "step": 1343 }, { "epoch": 0.4213166144200627, "grad_norm": 4.675681114196777, "learning_rate": 6.72e-07, "loss": 13.983, "step": 1344 }, { "epoch": 0.4216300940438871, "grad_norm": 3.761418104171753, "learning_rate": 6.725000000000001e-07, "loss": 7.1159, "step": 1345 }, { "epoch": 0.4219435736677116, "grad_norm": 3.7398788928985596, "learning_rate": 6.730000000000001e-07, "loss": 8.9778, "step": 1346 }, { "epoch": 0.42225705329153607, "grad_norm": 4.614981651306152, "learning_rate": 6.735e-07, "loss": 11.8006, "step": 1347 }, { "epoch": 0.4225705329153605, "grad_norm": 4.014955043792725, "learning_rate": 6.74e-07, "loss": 7.9856, "step": 1348 }, { "epoch": 0.42288401253918495, "grad_norm": 4.2952470779418945, "learning_rate": 6.745000000000001e-07, "loss": 8.6141, "step": 1349 }, { "epoch": 0.4231974921630094, "grad_norm": 4.254025459289551, "learning_rate": 6.750000000000001e-07, "loss": 13.2271, "step": 1350 }, { "epoch": 0.42351097178683383, "grad_norm": 3.196575403213501, "learning_rate": 6.755e-07, "loss": 6.6306, "step": 1351 }, { "epoch": 0.42382445141065833, "grad_norm": 4.765541076660156, "learning_rate": 6.76e-07, "loss": 12.8829, "step": 1352 }, { "epoch": 0.4241379310344828, "grad_norm": 4.769258499145508, "learning_rate": 6.765000000000001e-07, "loss": 10.9679, "step": 1353 }, { "epoch": 0.4244514106583072, "grad_norm": 3.533536195755005, "learning_rate": 6.77e-07, "loss": 6.7036, "step": 1354 }, { "epoch": 0.42476489028213166, "grad_norm": 4.482390403747559, "learning_rate": 6.775000000000001e-07, "loss": 9.1064, "step": 1355 }, { "epoch": 0.4250783699059561, "grad_norm": 3.877317428588867, "learning_rate": 6.78e-07, "loss": 9.4161, "step": 1356 }, { "epoch": 0.42539184952978054, "grad_norm": 4.770163059234619, "learning_rate": 6.784999999999999e-07, "loss": 13.1058, "step": 1357 }, { "epoch": 0.42570532915360504, "grad_norm": 3.1010990142822266, "learning_rate": 6.790000000000001e-07, "loss": 7.6001, "step": 1358 }, { "epoch": 0.4260188087774295, "grad_norm": 4.240891456604004, "learning_rate": 6.795e-07, "loss": 10.6859, "step": 1359 }, { "epoch": 0.4263322884012539, "grad_norm": 3.357872486114502, "learning_rate": 6.800000000000001e-07, "loss": 9.1908, "step": 1360 }, { "epoch": 0.42664576802507836, "grad_norm": 4.015371322631836, "learning_rate": 6.805000000000001e-07, "loss": 7.1454, "step": 1361 }, { "epoch": 0.4269592476489028, "grad_norm": 3.248347520828247, "learning_rate": 6.81e-07, "loss": 7.6464, "step": 1362 }, { "epoch": 0.42727272727272725, "grad_norm": 3.975553512573242, "learning_rate": 6.815000000000001e-07, "loss": 9.1993, "step": 1363 }, { "epoch": 0.42758620689655175, "grad_norm": 5.927193641662598, "learning_rate": 6.82e-07, "loss": 9.7591, "step": 1364 }, { "epoch": 0.4278996865203762, "grad_norm": 3.059760808944702, "learning_rate": 6.825000000000001e-07, "loss": 6.5011, "step": 1365 }, { "epoch": 0.42821316614420063, "grad_norm": 3.896256923675537, "learning_rate": 6.830000000000001e-07, "loss": 7.2387, "step": 1366 }, { "epoch": 0.42852664576802507, "grad_norm": 3.3756370544433594, "learning_rate": 6.835e-07, "loss": 7.5067, "step": 1367 }, { "epoch": 0.4288401253918495, "grad_norm": 4.085315704345703, "learning_rate": 6.84e-07, "loss": 7.0529, "step": 1368 }, { "epoch": 0.42915360501567396, "grad_norm": 2.654603958129883, "learning_rate": 6.845000000000001e-07, "loss": 7.7151, "step": 1369 }, { "epoch": 0.42946708463949845, "grad_norm": 4.527529716491699, "learning_rate": 6.850000000000001e-07, "loss": 10.333, "step": 1370 }, { "epoch": 0.4297805642633229, "grad_norm": 3.930635690689087, "learning_rate": 6.855e-07, "loss": 9.8118, "step": 1371 }, { "epoch": 0.43009404388714734, "grad_norm": 4.306950092315674, "learning_rate": 6.86e-07, "loss": 9.8894, "step": 1372 }, { "epoch": 0.4304075235109718, "grad_norm": 3.4206173419952393, "learning_rate": 6.865000000000001e-07, "loss": 8.8873, "step": 1373 }, { "epoch": 0.4307210031347962, "grad_norm": 4.430827617645264, "learning_rate": 6.87e-07, "loss": 9.3459, "step": 1374 }, { "epoch": 0.43103448275862066, "grad_norm": 3.1537222862243652, "learning_rate": 6.875000000000001e-07, "loss": 7.5838, "step": 1375 }, { "epoch": 0.43134796238244516, "grad_norm": 4.20504093170166, "learning_rate": 6.88e-07, "loss": 12.7391, "step": 1376 }, { "epoch": 0.4316614420062696, "grad_norm": 3.653794050216675, "learning_rate": 6.885e-07, "loss": 8.7142, "step": 1377 }, { "epoch": 0.43197492163009404, "grad_norm": 4.225579261779785, "learning_rate": 6.890000000000001e-07, "loss": 10.9947, "step": 1378 }, { "epoch": 0.4322884012539185, "grad_norm": 3.8407626152038574, "learning_rate": 6.895e-07, "loss": 9.6915, "step": 1379 }, { "epoch": 0.43260188087774293, "grad_norm": 3.124218463897705, "learning_rate": 6.900000000000001e-07, "loss": 6.9618, "step": 1380 }, { "epoch": 0.4329153605015674, "grad_norm": 3.4793970584869385, "learning_rate": 6.905000000000001e-07, "loss": 10.846, "step": 1381 }, { "epoch": 0.43322884012539187, "grad_norm": 3.0507242679595947, "learning_rate": 6.91e-07, "loss": 8.2012, "step": 1382 }, { "epoch": 0.4335423197492163, "grad_norm": 3.7710156440734863, "learning_rate": 6.915000000000001e-07, "loss": 9.654, "step": 1383 }, { "epoch": 0.43385579937304075, "grad_norm": 3.3001532554626465, "learning_rate": 6.92e-07, "loss": 6.7221, "step": 1384 }, { "epoch": 0.4341692789968652, "grad_norm": 4.092366695404053, "learning_rate": 6.925000000000001e-07, "loss": 10.6304, "step": 1385 }, { "epoch": 0.43448275862068964, "grad_norm": 3.2531421184539795, "learning_rate": 6.930000000000001e-07, "loss": 7.9767, "step": 1386 }, { "epoch": 0.43479623824451413, "grad_norm": 3.08742094039917, "learning_rate": 6.935e-07, "loss": 7.9412, "step": 1387 }, { "epoch": 0.4351097178683386, "grad_norm": 4.16629695892334, "learning_rate": 6.94e-07, "loss": 7.9362, "step": 1388 }, { "epoch": 0.435423197492163, "grad_norm": 3.6127755641937256, "learning_rate": 6.945000000000001e-07, "loss": 8.0281, "step": 1389 }, { "epoch": 0.43573667711598746, "grad_norm": 3.736285924911499, "learning_rate": 6.950000000000001e-07, "loss": 9.2018, "step": 1390 }, { "epoch": 0.4360501567398119, "grad_norm": 5.327370643615723, "learning_rate": 6.955000000000001e-07, "loss": 10.193, "step": 1391 }, { "epoch": 0.43636363636363634, "grad_norm": 3.426323413848877, "learning_rate": 6.96e-07, "loss": 8.9392, "step": 1392 }, { "epoch": 0.43667711598746084, "grad_norm": 4.5006937980651855, "learning_rate": 6.965000000000001e-07, "loss": 11.3513, "step": 1393 }, { "epoch": 0.4369905956112853, "grad_norm": 3.4478158950805664, "learning_rate": 6.970000000000001e-07, "loss": 8.5876, "step": 1394 }, { "epoch": 0.4373040752351097, "grad_norm": 3.8687758445739746, "learning_rate": 6.975000000000001e-07, "loss": 8.731, "step": 1395 }, { "epoch": 0.43761755485893417, "grad_norm": 3.383852958679199, "learning_rate": 6.98e-07, "loss": 7.7361, "step": 1396 }, { "epoch": 0.4379310344827586, "grad_norm": 3.9315686225891113, "learning_rate": 6.985e-07, "loss": 9.827, "step": 1397 }, { "epoch": 0.43824451410658305, "grad_norm": 3.924696683883667, "learning_rate": 6.990000000000001e-07, "loss": 7.6312, "step": 1398 }, { "epoch": 0.43855799373040755, "grad_norm": 3.3615076541900635, "learning_rate": 6.995e-07, "loss": 7.7659, "step": 1399 }, { "epoch": 0.438871473354232, "grad_norm": 3.5409042835235596, "learning_rate": 7.000000000000001e-07, "loss": 6.6861, "step": 1400 }, { "epoch": 0.43918495297805643, "grad_norm": 3.4962658882141113, "learning_rate": 7.005000000000001e-07, "loss": 8.9106, "step": 1401 }, { "epoch": 0.4394984326018809, "grad_norm": 4.335007667541504, "learning_rate": 7.01e-07, "loss": 7.0246, "step": 1402 }, { "epoch": 0.4398119122257053, "grad_norm": 3.592299222946167, "learning_rate": 7.015000000000001e-07, "loss": 9.6732, "step": 1403 }, { "epoch": 0.44012539184952976, "grad_norm": 5.375201225280762, "learning_rate": 7.02e-07, "loss": 8.865, "step": 1404 }, { "epoch": 0.44043887147335425, "grad_norm": 5.488976955413818, "learning_rate": 7.025000000000002e-07, "loss": 14.8031, "step": 1405 }, { "epoch": 0.4407523510971787, "grad_norm": 3.3288614749908447, "learning_rate": 7.030000000000001e-07, "loss": 8.8429, "step": 1406 }, { "epoch": 0.44106583072100314, "grad_norm": 3.75602388381958, "learning_rate": 7.035e-07, "loss": 7.5947, "step": 1407 }, { "epoch": 0.4413793103448276, "grad_norm": 2.7701900005340576, "learning_rate": 7.040000000000001e-07, "loss": 7.4473, "step": 1408 }, { "epoch": 0.441692789968652, "grad_norm": 4.249903678894043, "learning_rate": 7.045000000000001e-07, "loss": 8.4978, "step": 1409 }, { "epoch": 0.44200626959247646, "grad_norm": 3.3370988368988037, "learning_rate": 7.05e-07, "loss": 6.707, "step": 1410 }, { "epoch": 0.44231974921630096, "grad_norm": 5.536358833312988, "learning_rate": 7.055000000000001e-07, "loss": 11.3998, "step": 1411 }, { "epoch": 0.4426332288401254, "grad_norm": 4.11740779876709, "learning_rate": 7.06e-07, "loss": 8.3269, "step": 1412 }, { "epoch": 0.44294670846394985, "grad_norm": 4.711095809936523, "learning_rate": 7.065000000000001e-07, "loss": 12.2608, "step": 1413 }, { "epoch": 0.4432601880877743, "grad_norm": 3.0188283920288086, "learning_rate": 7.070000000000001e-07, "loss": 7.8679, "step": 1414 }, { "epoch": 0.44357366771159873, "grad_norm": 3.6578001976013184, "learning_rate": 7.075e-07, "loss": 7.9547, "step": 1415 }, { "epoch": 0.44388714733542317, "grad_norm": 3.710144519805908, "learning_rate": 7.08e-07, "loss": 9.1333, "step": 1416 }, { "epoch": 0.44420062695924767, "grad_norm": 3.580639600753784, "learning_rate": 7.085e-07, "loss": 8.6172, "step": 1417 }, { "epoch": 0.4445141065830721, "grad_norm": 3.4201769828796387, "learning_rate": 7.090000000000001e-07, "loss": 7.38, "step": 1418 }, { "epoch": 0.44482758620689655, "grad_norm": 3.8317136764526367, "learning_rate": 7.095e-07, "loss": 7.1807, "step": 1419 }, { "epoch": 0.445141065830721, "grad_norm": 6.48163366317749, "learning_rate": 7.1e-07, "loss": 15.1615, "step": 1420 }, { "epoch": 0.44545454545454544, "grad_norm": 3.3979268074035645, "learning_rate": 7.105000000000001e-07, "loss": 6.6834, "step": 1421 }, { "epoch": 0.4457680250783699, "grad_norm": 3.5724081993103027, "learning_rate": 7.110000000000001e-07, "loss": 8.6715, "step": 1422 }, { "epoch": 0.4460815047021944, "grad_norm": 3.738381862640381, "learning_rate": 7.115000000000001e-07, "loss": 7.0441, "step": 1423 }, { "epoch": 0.4463949843260188, "grad_norm": 4.122878551483154, "learning_rate": 7.12e-07, "loss": 7.2443, "step": 1424 }, { "epoch": 0.44670846394984326, "grad_norm": 3.6147820949554443, "learning_rate": 7.125e-07, "loss": 7.8883, "step": 1425 }, { "epoch": 0.4470219435736677, "grad_norm": 3.86806058883667, "learning_rate": 7.130000000000001e-07, "loss": 8.0316, "step": 1426 }, { "epoch": 0.44733542319749214, "grad_norm": 4.159463882446289, "learning_rate": 7.135e-07, "loss": 7.2277, "step": 1427 }, { "epoch": 0.44764890282131664, "grad_norm": 2.5668580532073975, "learning_rate": 7.140000000000001e-07, "loss": 6.9613, "step": 1428 }, { "epoch": 0.4479623824451411, "grad_norm": 4.1336469650268555, "learning_rate": 7.145000000000001e-07, "loss": 9.4702, "step": 1429 }, { "epoch": 0.4482758620689655, "grad_norm": 6.645508289337158, "learning_rate": 7.15e-07, "loss": 15.8057, "step": 1430 }, { "epoch": 0.44858934169278997, "grad_norm": 4.979160785675049, "learning_rate": 7.155000000000001e-07, "loss": 15.3406, "step": 1431 }, { "epoch": 0.4489028213166144, "grad_norm": 3.6304969787597656, "learning_rate": 7.16e-07, "loss": 8.6424, "step": 1432 }, { "epoch": 0.44921630094043885, "grad_norm": 4.989882469177246, "learning_rate": 7.165000000000001e-07, "loss": 11.6095, "step": 1433 }, { "epoch": 0.44952978056426335, "grad_norm": 3.8582489490509033, "learning_rate": 7.170000000000001e-07, "loss": 8.5874, "step": 1434 }, { "epoch": 0.4498432601880878, "grad_norm": 4.933996200561523, "learning_rate": 7.175e-07, "loss": 11.0086, "step": 1435 }, { "epoch": 0.45015673981191223, "grad_norm": 4.105413913726807, "learning_rate": 7.18e-07, "loss": 10.1414, "step": 1436 }, { "epoch": 0.4504702194357367, "grad_norm": 4.856784820556641, "learning_rate": 7.185e-07, "loss": 10.7219, "step": 1437 }, { "epoch": 0.4507836990595611, "grad_norm": 3.4442543983459473, "learning_rate": 7.190000000000001e-07, "loss": 6.6924, "step": 1438 }, { "epoch": 0.45109717868338556, "grad_norm": 3.018460988998413, "learning_rate": 7.195000000000001e-07, "loss": 8.6876, "step": 1439 }, { "epoch": 0.45141065830721006, "grad_norm": 3.129737377166748, "learning_rate": 7.2e-07, "loss": 7.0111, "step": 1440 }, { "epoch": 0.4517241379310345, "grad_norm": 4.270967483520508, "learning_rate": 7.205000000000001e-07, "loss": 9.3915, "step": 1441 }, { "epoch": 0.45203761755485894, "grad_norm": 3.9212779998779297, "learning_rate": 7.210000000000001e-07, "loss": 10.3279, "step": 1442 }, { "epoch": 0.4523510971786834, "grad_norm": 3.4876608848571777, "learning_rate": 7.215000000000001e-07, "loss": 6.606, "step": 1443 }, { "epoch": 0.4526645768025078, "grad_norm": 3.4680469036102295, "learning_rate": 7.22e-07, "loss": 7.3311, "step": 1444 }, { "epoch": 0.45297805642633227, "grad_norm": 3.6926703453063965, "learning_rate": 7.225e-07, "loss": 7.5257, "step": 1445 }, { "epoch": 0.45329153605015676, "grad_norm": 6.416301727294922, "learning_rate": 7.230000000000001e-07, "loss": 14.6459, "step": 1446 }, { "epoch": 0.4536050156739812, "grad_norm": 4.126363277435303, "learning_rate": 7.235e-07, "loss": 8.369, "step": 1447 }, { "epoch": 0.45391849529780565, "grad_norm": 3.9787609577178955, "learning_rate": 7.240000000000001e-07, "loss": 8.115, "step": 1448 }, { "epoch": 0.4542319749216301, "grad_norm": 3.5446219444274902, "learning_rate": 7.245000000000001e-07, "loss": 7.3203, "step": 1449 }, { "epoch": 0.45454545454545453, "grad_norm": 3.7280256748199463, "learning_rate": 7.25e-07, "loss": 9.8608, "step": 1450 }, { "epoch": 0.454858934169279, "grad_norm": 3.1473164558410645, "learning_rate": 7.255000000000001e-07, "loss": 7.5801, "step": 1451 }, { "epoch": 0.45517241379310347, "grad_norm": 3.243011951446533, "learning_rate": 7.26e-07, "loss": 6.5251, "step": 1452 }, { "epoch": 0.4554858934169279, "grad_norm": 3.912731170654297, "learning_rate": 7.265000000000002e-07, "loss": 9.8108, "step": 1453 }, { "epoch": 0.45579937304075235, "grad_norm": 3.3148555755615234, "learning_rate": 7.270000000000001e-07, "loss": 9.7591, "step": 1454 }, { "epoch": 0.4561128526645768, "grad_norm": 3.5249342918395996, "learning_rate": 7.275e-07, "loss": 7.1523, "step": 1455 }, { "epoch": 0.45642633228840124, "grad_norm": 3.985668420791626, "learning_rate": 7.280000000000001e-07, "loss": 7.9672, "step": 1456 }, { "epoch": 0.4567398119122257, "grad_norm": 3.4252846240997314, "learning_rate": 7.285e-07, "loss": 8.2703, "step": 1457 }, { "epoch": 0.4570532915360502, "grad_norm": 5.43139123916626, "learning_rate": 7.290000000000001e-07, "loss": 9.2351, "step": 1458 }, { "epoch": 0.4573667711598746, "grad_norm": 5.333076000213623, "learning_rate": 7.295000000000001e-07, "loss": 13.8034, "step": 1459 }, { "epoch": 0.45768025078369906, "grad_norm": 3.3597097396850586, "learning_rate": 7.3e-07, "loss": 11.3477, "step": 1460 }, { "epoch": 0.4579937304075235, "grad_norm": 2.962592601776123, "learning_rate": 7.305000000000001e-07, "loss": 8.5858, "step": 1461 }, { "epoch": 0.45830721003134794, "grad_norm": 2.7471611499786377, "learning_rate": 7.310000000000001e-07, "loss": 6.0348, "step": 1462 }, { "epoch": 0.4586206896551724, "grad_norm": 4.322995662689209, "learning_rate": 7.315000000000001e-07, "loss": 9.9356, "step": 1463 }, { "epoch": 0.4589341692789969, "grad_norm": 3.186292886734009, "learning_rate": 7.32e-07, "loss": 8.0499, "step": 1464 }, { "epoch": 0.4592476489028213, "grad_norm": 2.747281789779663, "learning_rate": 7.325e-07, "loss": 6.545, "step": 1465 }, { "epoch": 0.45956112852664577, "grad_norm": 4.15464448928833, "learning_rate": 7.330000000000001e-07, "loss": 8.2943, "step": 1466 }, { "epoch": 0.4598746081504702, "grad_norm": 3.4695398807525635, "learning_rate": 7.335e-07, "loss": 7.7409, "step": 1467 }, { "epoch": 0.46018808777429465, "grad_norm": 3.8150601387023926, "learning_rate": 7.340000000000001e-07, "loss": 9.3436, "step": 1468 }, { "epoch": 0.46050156739811915, "grad_norm": 3.744868278503418, "learning_rate": 7.345000000000001e-07, "loss": 9.7803, "step": 1469 }, { "epoch": 0.4608150470219436, "grad_norm": 3.8790037631988525, "learning_rate": 7.350000000000001e-07, "loss": 8.0119, "step": 1470 }, { "epoch": 0.46112852664576803, "grad_norm": 3.8017265796661377, "learning_rate": 7.355000000000001e-07, "loss": 7.3153, "step": 1471 }, { "epoch": 0.4614420062695925, "grad_norm": 4.845677375793457, "learning_rate": 7.36e-07, "loss": 12.615, "step": 1472 }, { "epoch": 0.4617554858934169, "grad_norm": 4.192208766937256, "learning_rate": 7.365e-07, "loss": 7.3787, "step": 1473 }, { "epoch": 0.46206896551724136, "grad_norm": 4.240219593048096, "learning_rate": 7.370000000000001e-07, "loss": 8.5848, "step": 1474 }, { "epoch": 0.46238244514106586, "grad_norm": 4.029890537261963, "learning_rate": 7.375e-07, "loss": 9.3911, "step": 1475 }, { "epoch": 0.4626959247648903, "grad_norm": 3.331756591796875, "learning_rate": 7.380000000000001e-07, "loss": 6.3178, "step": 1476 }, { "epoch": 0.46300940438871474, "grad_norm": 3.7580671310424805, "learning_rate": 7.385e-07, "loss": 9.2023, "step": 1477 }, { "epoch": 0.4633228840125392, "grad_norm": 4.716964244842529, "learning_rate": 7.39e-07, "loss": 11.6383, "step": 1478 }, { "epoch": 0.4636363636363636, "grad_norm": 3.466538906097412, "learning_rate": 7.395000000000001e-07, "loss": 8.556, "step": 1479 }, { "epoch": 0.46394984326018807, "grad_norm": 3.618819236755371, "learning_rate": 7.4e-07, "loss": 8.6228, "step": 1480 }, { "epoch": 0.46426332288401256, "grad_norm": 3.9370975494384766, "learning_rate": 7.405000000000002e-07, "loss": 8.9446, "step": 1481 }, { "epoch": 0.464576802507837, "grad_norm": 3.3497865200042725, "learning_rate": 7.410000000000001e-07, "loss": 8.4896, "step": 1482 }, { "epoch": 0.46489028213166145, "grad_norm": 3.9014110565185547, "learning_rate": 7.415e-07, "loss": 9.6106, "step": 1483 }, { "epoch": 0.4652037617554859, "grad_norm": 3.2826168537139893, "learning_rate": 7.420000000000001e-07, "loss": 7.3699, "step": 1484 }, { "epoch": 0.46551724137931033, "grad_norm": 4.081614971160889, "learning_rate": 7.425e-07, "loss": 8.7689, "step": 1485 }, { "epoch": 0.4658307210031348, "grad_norm": 3.3145084381103516, "learning_rate": 7.430000000000001e-07, "loss": 7.7831, "step": 1486 }, { "epoch": 0.46614420062695927, "grad_norm": 5.257850170135498, "learning_rate": 7.435000000000001e-07, "loss": 14.4335, "step": 1487 }, { "epoch": 0.4664576802507837, "grad_norm": 3.631011486053467, "learning_rate": 7.44e-07, "loss": 8.0316, "step": 1488 }, { "epoch": 0.46677115987460815, "grad_norm": 3.9170570373535156, "learning_rate": 7.445000000000001e-07, "loss": 7.9331, "step": 1489 }, { "epoch": 0.4670846394984326, "grad_norm": 3.710536003112793, "learning_rate": 7.450000000000001e-07, "loss": 10.0156, "step": 1490 }, { "epoch": 0.46739811912225704, "grad_norm": 3.666468858718872, "learning_rate": 7.455000000000001e-07, "loss": 7.2938, "step": 1491 }, { "epoch": 0.4677115987460815, "grad_norm": 4.270216941833496, "learning_rate": 7.46e-07, "loss": 10.1365, "step": 1492 }, { "epoch": 0.468025078369906, "grad_norm": 4.941109657287598, "learning_rate": 7.465e-07, "loss": 10.8911, "step": 1493 }, { "epoch": 0.4683385579937304, "grad_norm": 3.449632167816162, "learning_rate": 7.470000000000001e-07, "loss": 8.6447, "step": 1494 }, { "epoch": 0.46865203761755486, "grad_norm": 2.9346587657928467, "learning_rate": 7.475e-07, "loss": 6.7992, "step": 1495 }, { "epoch": 0.4689655172413793, "grad_norm": 2.9172539710998535, "learning_rate": 7.480000000000001e-07, "loss": 7.0059, "step": 1496 }, { "epoch": 0.46927899686520375, "grad_norm": 3.648991346359253, "learning_rate": 7.485e-07, "loss": 8.9688, "step": 1497 }, { "epoch": 0.4695924764890282, "grad_norm": 3.0724036693573, "learning_rate": 7.49e-07, "loss": 7.4737, "step": 1498 }, { "epoch": 0.4699059561128527, "grad_norm": 3.7592673301696777, "learning_rate": 7.495000000000001e-07, "loss": 9.0526, "step": 1499 }, { "epoch": 0.4702194357366771, "grad_norm": 3.5673487186431885, "learning_rate": 7.5e-07, "loss": 6.967, "step": 1500 }, { "epoch": 0.47053291536050157, "grad_norm": 4.531203269958496, "learning_rate": 7.505000000000002e-07, "loss": 8.6275, "step": 1501 }, { "epoch": 0.470846394984326, "grad_norm": 4.227149486541748, "learning_rate": 7.510000000000001e-07, "loss": 10.3291, "step": 1502 }, { "epoch": 0.47115987460815045, "grad_norm": 3.532550096511841, "learning_rate": 7.515e-07, "loss": 9.2704, "step": 1503 }, { "epoch": 0.4714733542319749, "grad_norm": 3.5547232627868652, "learning_rate": 7.520000000000001e-07, "loss": 8.9358, "step": 1504 }, { "epoch": 0.4717868338557994, "grad_norm": 2.87225604057312, "learning_rate": 7.525e-07, "loss": 7.2077, "step": 1505 }, { "epoch": 0.47210031347962383, "grad_norm": 3.5084965229034424, "learning_rate": 7.530000000000001e-07, "loss": 7.6485, "step": 1506 }, { "epoch": 0.4724137931034483, "grad_norm": 4.405672550201416, "learning_rate": 7.535000000000001e-07, "loss": 9.7914, "step": 1507 }, { "epoch": 0.4727272727272727, "grad_norm": 3.6262218952178955, "learning_rate": 7.54e-07, "loss": 5.8048, "step": 1508 }, { "epoch": 0.47304075235109716, "grad_norm": 4.285321235656738, "learning_rate": 7.545000000000001e-07, "loss": 8.9892, "step": 1509 }, { "epoch": 0.47335423197492166, "grad_norm": 3.221437692642212, "learning_rate": 7.550000000000001e-07, "loss": 8.5896, "step": 1510 }, { "epoch": 0.4736677115987461, "grad_norm": 3.349771022796631, "learning_rate": 7.555000000000001e-07, "loss": 7.2775, "step": 1511 }, { "epoch": 0.47398119122257054, "grad_norm": 4.374917984008789, "learning_rate": 7.56e-07, "loss": 10.1504, "step": 1512 }, { "epoch": 0.474294670846395, "grad_norm": 4.312491416931152, "learning_rate": 7.565e-07, "loss": 9.6675, "step": 1513 }, { "epoch": 0.4746081504702194, "grad_norm": 4.160287380218506, "learning_rate": 7.570000000000001e-07, "loss": 7.7446, "step": 1514 }, { "epoch": 0.47492163009404387, "grad_norm": 4.5663909912109375, "learning_rate": 7.575000000000001e-07, "loss": 10.4192, "step": 1515 }, { "epoch": 0.47523510971786836, "grad_norm": 3.4202966690063477, "learning_rate": 7.580000000000001e-07, "loss": 7.1886, "step": 1516 }, { "epoch": 0.4755485893416928, "grad_norm": 4.065593719482422, "learning_rate": 7.585e-07, "loss": 8.5182, "step": 1517 }, { "epoch": 0.47586206896551725, "grad_norm": 4.027133464813232, "learning_rate": 7.590000000000001e-07, "loss": 8.4011, "step": 1518 }, { "epoch": 0.4761755485893417, "grad_norm": 3.671494722366333, "learning_rate": 7.595000000000001e-07, "loss": 7.6953, "step": 1519 }, { "epoch": 0.47648902821316613, "grad_norm": 4.113628387451172, "learning_rate": 7.6e-07, "loss": 9.0673, "step": 1520 }, { "epoch": 0.4768025078369906, "grad_norm": 4.1456756591796875, "learning_rate": 7.605000000000002e-07, "loss": 8.4153, "step": 1521 }, { "epoch": 0.47711598746081507, "grad_norm": 4.083249568939209, "learning_rate": 7.610000000000001e-07, "loss": 9.7206, "step": 1522 }, { "epoch": 0.4774294670846395, "grad_norm": 4.154261589050293, "learning_rate": 7.615e-07, "loss": 8.6472, "step": 1523 }, { "epoch": 0.47774294670846396, "grad_norm": 4.028384208679199, "learning_rate": 7.620000000000001e-07, "loss": 7.9002, "step": 1524 }, { "epoch": 0.4780564263322884, "grad_norm": 4.162931442260742, "learning_rate": 7.625e-07, "loss": 9.9488, "step": 1525 }, { "epoch": 0.47836990595611284, "grad_norm": 3.495849132537842, "learning_rate": 7.630000000000001e-07, "loss": 9.5747, "step": 1526 }, { "epoch": 0.4786833855799373, "grad_norm": 3.8134756088256836, "learning_rate": 7.635000000000001e-07, "loss": 8.6538, "step": 1527 }, { "epoch": 0.4789968652037618, "grad_norm": 3.9398064613342285, "learning_rate": 7.64e-07, "loss": 8.3008, "step": 1528 }, { "epoch": 0.4793103448275862, "grad_norm": 3.440537691116333, "learning_rate": 7.645000000000002e-07, "loss": 6.1765, "step": 1529 }, { "epoch": 0.47962382445141066, "grad_norm": 4.416642665863037, "learning_rate": 7.650000000000001e-07, "loss": 9.6123, "step": 1530 }, { "epoch": 0.4799373040752351, "grad_norm": 3.9015026092529297, "learning_rate": 7.655000000000001e-07, "loss": 8.8604, "step": 1531 }, { "epoch": 0.48025078369905955, "grad_norm": 3.634904623031616, "learning_rate": 7.660000000000001e-07, "loss": 8.6111, "step": 1532 }, { "epoch": 0.480564263322884, "grad_norm": 3.1789402961730957, "learning_rate": 7.665e-07, "loss": 8.7108, "step": 1533 }, { "epoch": 0.4808777429467085, "grad_norm": 3.556788206100464, "learning_rate": 7.670000000000001e-07, "loss": 7.9619, "step": 1534 }, { "epoch": 0.48119122257053293, "grad_norm": 3.4481894969940186, "learning_rate": 7.675000000000001e-07, "loss": 8.4755, "step": 1535 }, { "epoch": 0.48150470219435737, "grad_norm": 3.0415053367614746, "learning_rate": 7.68e-07, "loss": 6.9378, "step": 1536 }, { "epoch": 0.4818181818181818, "grad_norm": 3.7473058700561523, "learning_rate": 7.685e-07, "loss": 10.5121, "step": 1537 }, { "epoch": 0.48213166144200625, "grad_norm": 4.390431880950928, "learning_rate": 7.690000000000001e-07, "loss": 8.8402, "step": 1538 }, { "epoch": 0.4824451410658307, "grad_norm": 3.8549649715423584, "learning_rate": 7.695000000000001e-07, "loss": 8.73, "step": 1539 }, { "epoch": 0.4827586206896552, "grad_norm": 5.701721668243408, "learning_rate": 7.7e-07, "loss": 9.7011, "step": 1540 }, { "epoch": 0.48307210031347964, "grad_norm": 3.738424777984619, "learning_rate": 7.705e-07, "loss": 8.1286, "step": 1541 }, { "epoch": 0.4833855799373041, "grad_norm": 3.1514458656311035, "learning_rate": 7.710000000000001e-07, "loss": 7.7808, "step": 1542 }, { "epoch": 0.4836990595611285, "grad_norm": 2.9332237243652344, "learning_rate": 7.715e-07, "loss": 7.3622, "step": 1543 }, { "epoch": 0.48401253918495296, "grad_norm": 3.4852969646453857, "learning_rate": 7.720000000000001e-07, "loss": 7.6669, "step": 1544 }, { "epoch": 0.4843260188087774, "grad_norm": 7.102890968322754, "learning_rate": 7.725e-07, "loss": 7.0981, "step": 1545 }, { "epoch": 0.4846394984326019, "grad_norm": 3.0128979682922363, "learning_rate": 7.73e-07, "loss": 6.5679, "step": 1546 }, { "epoch": 0.48495297805642634, "grad_norm": 3.7915170192718506, "learning_rate": 7.735000000000001e-07, "loss": 7.2064, "step": 1547 }, { "epoch": 0.4852664576802508, "grad_norm": 2.735111713409424, "learning_rate": 7.74e-07, "loss": 6.365, "step": 1548 }, { "epoch": 0.4855799373040752, "grad_norm": 3.264190435409546, "learning_rate": 7.745000000000002e-07, "loss": 8.0042, "step": 1549 }, { "epoch": 0.48589341692789967, "grad_norm": 4.327232837677002, "learning_rate": 7.750000000000001e-07, "loss": 9.7193, "step": 1550 }, { "epoch": 0.4862068965517241, "grad_norm": 3.012519359588623, "learning_rate": 7.755e-07, "loss": 6.3156, "step": 1551 }, { "epoch": 0.4865203761755486, "grad_norm": 4.424871921539307, "learning_rate": 7.760000000000001e-07, "loss": 8.8488, "step": 1552 }, { "epoch": 0.48683385579937305, "grad_norm": 3.7431857585906982, "learning_rate": 7.765e-07, "loss": 8.0868, "step": 1553 }, { "epoch": 0.4871473354231975, "grad_norm": 3.6024186611175537, "learning_rate": 7.770000000000001e-07, "loss": 10.3315, "step": 1554 }, { "epoch": 0.48746081504702193, "grad_norm": 3.3676459789276123, "learning_rate": 7.775000000000001e-07, "loss": 6.5772, "step": 1555 }, { "epoch": 0.4877742946708464, "grad_norm": 4.312803745269775, "learning_rate": 7.78e-07, "loss": 8.3249, "step": 1556 }, { "epoch": 0.4880877742946709, "grad_norm": 4.562367916107178, "learning_rate": 7.785e-07, "loss": 7.975, "step": 1557 }, { "epoch": 0.4884012539184953, "grad_norm": 3.697195053100586, "learning_rate": 7.790000000000001e-07, "loss": 7.3428, "step": 1558 }, { "epoch": 0.48871473354231976, "grad_norm": 4.236298561096191, "learning_rate": 7.795000000000001e-07, "loss": 10.2044, "step": 1559 }, { "epoch": 0.4890282131661442, "grad_norm": 3.0572011470794678, "learning_rate": 7.8e-07, "loss": 6.726, "step": 1560 }, { "epoch": 0.48934169278996864, "grad_norm": 3.513315439224243, "learning_rate": 7.805e-07, "loss": 7.963, "step": 1561 }, { "epoch": 0.4896551724137931, "grad_norm": 4.3329548835754395, "learning_rate": 7.810000000000001e-07, "loss": 7.9515, "step": 1562 }, { "epoch": 0.4899686520376176, "grad_norm": 4.270005226135254, "learning_rate": 7.815000000000001e-07, "loss": 10.5315, "step": 1563 }, { "epoch": 0.490282131661442, "grad_norm": 4.627399921417236, "learning_rate": 7.820000000000001e-07, "loss": 7.8852, "step": 1564 }, { "epoch": 0.49059561128526646, "grad_norm": 3.8905036449432373, "learning_rate": 7.825e-07, "loss": 8.9447, "step": 1565 }, { "epoch": 0.4909090909090909, "grad_norm": 4.1380181312561035, "learning_rate": 7.83e-07, "loss": 9.1815, "step": 1566 }, { "epoch": 0.49122257053291535, "grad_norm": 3.6653265953063965, "learning_rate": 7.835000000000001e-07, "loss": 8.7808, "step": 1567 }, { "epoch": 0.4915360501567398, "grad_norm": 3.8347866535186768, "learning_rate": 7.84e-07, "loss": 8.8488, "step": 1568 }, { "epoch": 0.4918495297805643, "grad_norm": 3.9519078731536865, "learning_rate": 7.845000000000001e-07, "loss": 8.6361, "step": 1569 }, { "epoch": 0.49216300940438873, "grad_norm": 4.5167036056518555, "learning_rate": 7.850000000000001e-07, "loss": 12.1671, "step": 1570 }, { "epoch": 0.49247648902821317, "grad_norm": 4.719188690185547, "learning_rate": 7.855e-07, "loss": 8.6138, "step": 1571 }, { "epoch": 0.4927899686520376, "grad_norm": 4.4409284591674805, "learning_rate": 7.860000000000001e-07, "loss": 9.6423, "step": 1572 }, { "epoch": 0.49310344827586206, "grad_norm": 2.9862163066864014, "learning_rate": 7.865e-07, "loss": 6.1838, "step": 1573 }, { "epoch": 0.4934169278996865, "grad_norm": 4.9992146492004395, "learning_rate": 7.870000000000002e-07, "loss": 11.4322, "step": 1574 }, { "epoch": 0.493730407523511, "grad_norm": 4.074015140533447, "learning_rate": 7.875000000000001e-07, "loss": 9.6542, "step": 1575 }, { "epoch": 0.49404388714733544, "grad_norm": 3.1027674674987793, "learning_rate": 7.88e-07, "loss": 6.8902, "step": 1576 }, { "epoch": 0.4943573667711599, "grad_norm": 4.539718151092529, "learning_rate": 7.885e-07, "loss": 11.3049, "step": 1577 }, { "epoch": 0.4946708463949843, "grad_norm": 4.0925140380859375, "learning_rate": 7.890000000000001e-07, "loss": 8.0069, "step": 1578 }, { "epoch": 0.49498432601880876, "grad_norm": 4.172740936279297, "learning_rate": 7.895000000000001e-07, "loss": 7.8851, "step": 1579 }, { "epoch": 0.4952978056426332, "grad_norm": 3.7345142364501953, "learning_rate": 7.900000000000001e-07, "loss": 9.2792, "step": 1580 }, { "epoch": 0.4956112852664577, "grad_norm": 3.628817081451416, "learning_rate": 7.905e-07, "loss": 6.9987, "step": 1581 }, { "epoch": 0.49592476489028214, "grad_norm": 4.242038249969482, "learning_rate": 7.910000000000001e-07, "loss": 9.6288, "step": 1582 }, { "epoch": 0.4962382445141066, "grad_norm": 5.484152793884277, "learning_rate": 7.915000000000001e-07, "loss": 15.2138, "step": 1583 }, { "epoch": 0.496551724137931, "grad_norm": 4.404213905334473, "learning_rate": 7.920000000000001e-07, "loss": 8.7437, "step": 1584 }, { "epoch": 0.49686520376175547, "grad_norm": 3.855804920196533, "learning_rate": 7.925e-07, "loss": 8.6907, "step": 1585 }, { "epoch": 0.4971786833855799, "grad_norm": 6.219444274902344, "learning_rate": 7.93e-07, "loss": 10.0004, "step": 1586 }, { "epoch": 0.4974921630094044, "grad_norm": 5.004199981689453, "learning_rate": 7.935000000000001e-07, "loss": 12.7213, "step": 1587 }, { "epoch": 0.49780564263322885, "grad_norm": 4.671052932739258, "learning_rate": 7.94e-07, "loss": 11.9351, "step": 1588 }, { "epoch": 0.4981191222570533, "grad_norm": 4.2331438064575195, "learning_rate": 7.945000000000001e-07, "loss": 9.8539, "step": 1589 }, { "epoch": 0.49843260188087773, "grad_norm": 3.8226304054260254, "learning_rate": 7.950000000000001e-07, "loss": 8.1552, "step": 1590 }, { "epoch": 0.4987460815047022, "grad_norm": 4.827603340148926, "learning_rate": 7.955e-07, "loss": 11.9278, "step": 1591 }, { "epoch": 0.4990595611285266, "grad_norm": 4.008607864379883, "learning_rate": 7.960000000000001e-07, "loss": 9.1185, "step": 1592 }, { "epoch": 0.4993730407523511, "grad_norm": 3.332667112350464, "learning_rate": 7.965e-07, "loss": 7.4391, "step": 1593 }, { "epoch": 0.49968652037617556, "grad_norm": 4.7476677894592285, "learning_rate": 7.97e-07, "loss": 7.911, "step": 1594 }, { "epoch": 0.5, "grad_norm": 3.6339681148529053, "learning_rate": 7.975000000000001e-07, "loss": 8.406, "step": 1595 }, { "epoch": 0.5003134796238244, "grad_norm": 3.5920093059539795, "learning_rate": 7.98e-07, "loss": 7.1252, "step": 1596 }, { "epoch": 0.5003134796238244, "eval_loss": 28.69378662109375, "eval_runtime": 20.7573, "eval_samples_per_second": 129.449, "eval_steps_per_second": 8.094, "step": 1596 }, { "epoch": 0.5006269592476489, "grad_norm": 3.7933077812194824, "learning_rate": 7.985000000000001e-07, "loss": 7.7692, "step": 1597 }, { "epoch": 0.5009404388714733, "grad_norm": 3.132864475250244, "learning_rate": 7.990000000000001e-07, "loss": 7.6955, "step": 1598 }, { "epoch": 0.5012539184952978, "grad_norm": 3.2161154747009277, "learning_rate": 7.995e-07, "loss": 7.817, "step": 1599 }, { "epoch": 0.5015673981191222, "grad_norm": 3.270275115966797, "learning_rate": 8.000000000000001e-07, "loss": 8.0105, "step": 1600 }, { "epoch": 0.5018808777429468, "grad_norm": 3.4156415462493896, "learning_rate": 8.005e-07, "loss": 7.2844, "step": 1601 }, { "epoch": 0.5021943573667712, "grad_norm": 3.0608019828796387, "learning_rate": 8.010000000000001e-07, "loss": 7.7339, "step": 1602 }, { "epoch": 0.5025078369905956, "grad_norm": 3.434061050415039, "learning_rate": 8.015000000000001e-07, "loss": 8.1699, "step": 1603 }, { "epoch": 0.5028213166144201, "grad_norm": 3.004152536392212, "learning_rate": 8.02e-07, "loss": 6.8241, "step": 1604 }, { "epoch": 0.5031347962382445, "grad_norm": 3.3219809532165527, "learning_rate": 8.025e-07, "loss": 7.6745, "step": 1605 }, { "epoch": 0.503448275862069, "grad_norm": 4.926275730133057, "learning_rate": 8.03e-07, "loss": 11.5191, "step": 1606 }, { "epoch": 0.5037617554858934, "grad_norm": 3.7799763679504395, "learning_rate": 8.035000000000001e-07, "loss": 8.9354, "step": 1607 }, { "epoch": 0.5040752351097179, "grad_norm": 3.3203251361846924, "learning_rate": 8.04e-07, "loss": 7.2962, "step": 1608 }, { "epoch": 0.5043887147335423, "grad_norm": 4.6373419761657715, "learning_rate": 8.045e-07, "loss": 11.3006, "step": 1609 }, { "epoch": 0.5047021943573667, "grad_norm": 4.2450103759765625, "learning_rate": 8.050000000000001e-07, "loss": 7.3383, "step": 1610 }, { "epoch": 0.5050156739811912, "grad_norm": 3.985623359680176, "learning_rate": 8.055000000000001e-07, "loss": 8.2701, "step": 1611 }, { "epoch": 0.5053291536050156, "grad_norm": 3.601292848587036, "learning_rate": 8.060000000000001e-07, "loss": 8.3857, "step": 1612 }, { "epoch": 0.5056426332288402, "grad_norm": 5.563617706298828, "learning_rate": 8.065e-07, "loss": 14.2597, "step": 1613 }, { "epoch": 0.5059561128526646, "grad_norm": 4.677281856536865, "learning_rate": 8.07e-07, "loss": 9.532, "step": 1614 }, { "epoch": 0.5062695924764891, "grad_norm": 3.749307632446289, "learning_rate": 8.075000000000001e-07, "loss": 6.9486, "step": 1615 }, { "epoch": 0.5065830721003135, "grad_norm": 4.544729232788086, "learning_rate": 8.08e-07, "loss": 8.8941, "step": 1616 }, { "epoch": 0.506896551724138, "grad_norm": 3.591256856918335, "learning_rate": 8.085000000000001e-07, "loss": 8.471, "step": 1617 }, { "epoch": 0.5072100313479624, "grad_norm": 5.5906548500061035, "learning_rate": 8.090000000000001e-07, "loss": 10.5195, "step": 1618 }, { "epoch": 0.5075235109717868, "grad_norm": 4.689754009246826, "learning_rate": 8.095e-07, "loss": 8.9176, "step": 1619 }, { "epoch": 0.5078369905956113, "grad_norm": 4.00483512878418, "learning_rate": 8.100000000000001e-07, "loss": 8.361, "step": 1620 }, { "epoch": 0.5081504702194357, "grad_norm": 3.282796621322632, "learning_rate": 8.105e-07, "loss": 7.4275, "step": 1621 }, { "epoch": 0.5084639498432602, "grad_norm": 4.235138416290283, "learning_rate": 8.110000000000002e-07, "loss": 9.8603, "step": 1622 }, { "epoch": 0.5087774294670846, "grad_norm": 4.118296146392822, "learning_rate": 8.115000000000001e-07, "loss": 8.8349, "step": 1623 }, { "epoch": 0.509090909090909, "grad_norm": 3.592233180999756, "learning_rate": 8.12e-07, "loss": 9.0909, "step": 1624 }, { "epoch": 0.5094043887147336, "grad_norm": 3.8250372409820557, "learning_rate": 8.125000000000001e-07, "loss": 6.3841, "step": 1625 }, { "epoch": 0.509717868338558, "grad_norm": 3.962688446044922, "learning_rate": 8.13e-07, "loss": 7.9938, "step": 1626 }, { "epoch": 0.5100313479623825, "grad_norm": 4.621908664703369, "learning_rate": 8.135000000000001e-07, "loss": 9.8957, "step": 1627 }, { "epoch": 0.5103448275862069, "grad_norm": 3.104255199432373, "learning_rate": 8.140000000000001e-07, "loss": 6.7121, "step": 1628 }, { "epoch": 0.5106583072100314, "grad_norm": 4.97641134262085, "learning_rate": 8.145e-07, "loss": 6.1975, "step": 1629 }, { "epoch": 0.5109717868338558, "grad_norm": 4.67917013168335, "learning_rate": 8.150000000000001e-07, "loss": 13.774, "step": 1630 }, { "epoch": 0.5112852664576802, "grad_norm": 3.4128177165985107, "learning_rate": 8.155000000000001e-07, "loss": 8.6059, "step": 1631 }, { "epoch": 0.5115987460815047, "grad_norm": 4.750978469848633, "learning_rate": 8.160000000000001e-07, "loss": 11.319, "step": 1632 }, { "epoch": 0.5119122257053291, "grad_norm": 3.3610668182373047, "learning_rate": 8.165e-07, "loss": 6.3177, "step": 1633 }, { "epoch": 0.5122257053291536, "grad_norm": 4.7112956047058105, "learning_rate": 8.17e-07, "loss": 10.9001, "step": 1634 }, { "epoch": 0.512539184952978, "grad_norm": 4.46808385848999, "learning_rate": 8.175000000000001e-07, "loss": 12.3748, "step": 1635 }, { "epoch": 0.5128526645768025, "grad_norm": 3.6378445625305176, "learning_rate": 8.18e-07, "loss": 9.1623, "step": 1636 }, { "epoch": 0.513166144200627, "grad_norm": 4.917815685272217, "learning_rate": 8.185000000000001e-07, "loss": 11.5192, "step": 1637 }, { "epoch": 0.5134796238244514, "grad_norm": 3.1884193420410156, "learning_rate": 8.190000000000001e-07, "loss": 6.7961, "step": 1638 }, { "epoch": 0.5137931034482759, "grad_norm": 2.6258697509765625, "learning_rate": 8.195e-07, "loss": 6.3646, "step": 1639 }, { "epoch": 0.5141065830721003, "grad_norm": 4.169460296630859, "learning_rate": 8.200000000000001e-07, "loss": 9.467, "step": 1640 }, { "epoch": 0.5144200626959248, "grad_norm": 5.105269432067871, "learning_rate": 8.205e-07, "loss": 13.4161, "step": 1641 }, { "epoch": 0.5147335423197492, "grad_norm": 3.549649238586426, "learning_rate": 8.210000000000002e-07, "loss": 7.6424, "step": 1642 }, { "epoch": 0.5150470219435737, "grad_norm": 5.451201915740967, "learning_rate": 8.215000000000001e-07, "loss": 10.404, "step": 1643 }, { "epoch": 0.5153605015673981, "grad_norm": 4.793301582336426, "learning_rate": 8.22e-07, "loss": 10.6138, "step": 1644 }, { "epoch": 0.5156739811912225, "grad_norm": 4.120952129364014, "learning_rate": 8.225000000000001e-07, "loss": 9.8365, "step": 1645 }, { "epoch": 0.515987460815047, "grad_norm": 5.643743991851807, "learning_rate": 8.23e-07, "loss": 13.1447, "step": 1646 }, { "epoch": 0.5163009404388714, "grad_norm": 3.094728469848633, "learning_rate": 8.235000000000001e-07, "loss": 6.3482, "step": 1647 }, { "epoch": 0.516614420062696, "grad_norm": 5.247632026672363, "learning_rate": 8.240000000000001e-07, "loss": 12.6728, "step": 1648 }, { "epoch": 0.5169278996865204, "grad_norm": 4.007214069366455, "learning_rate": 8.245e-07, "loss": 9.3752, "step": 1649 }, { "epoch": 0.5172413793103449, "grad_norm": 3.778515577316284, "learning_rate": 8.250000000000001e-07, "loss": 8.7075, "step": 1650 }, { "epoch": 0.5175548589341693, "grad_norm": 2.9957754611968994, "learning_rate": 8.255000000000001e-07, "loss": 6.3369, "step": 1651 }, { "epoch": 0.5178683385579937, "grad_norm": 3.4570438861846924, "learning_rate": 8.260000000000001e-07, "loss": 7.8577, "step": 1652 }, { "epoch": 0.5181818181818182, "grad_norm": 3.390144109725952, "learning_rate": 8.265e-07, "loss": 8.0684, "step": 1653 }, { "epoch": 0.5184952978056426, "grad_norm": 3.808957815170288, "learning_rate": 8.27e-07, "loss": 8.0672, "step": 1654 }, { "epoch": 0.5188087774294671, "grad_norm": 3.636155843734741, "learning_rate": 8.275000000000001e-07, "loss": 9.3789, "step": 1655 }, { "epoch": 0.5191222570532915, "grad_norm": 4.033257007598877, "learning_rate": 8.280000000000001e-07, "loss": 8.0115, "step": 1656 }, { "epoch": 0.519435736677116, "grad_norm": 4.070474624633789, "learning_rate": 8.285e-07, "loss": 8.4374, "step": 1657 }, { "epoch": 0.5197492163009404, "grad_norm": 3.276840925216675, "learning_rate": 8.290000000000001e-07, "loss": 6.5463, "step": 1658 }, { "epoch": 0.5200626959247648, "grad_norm": 3.784796714782715, "learning_rate": 8.295000000000001e-07, "loss": 9.6518, "step": 1659 }, { "epoch": 0.5203761755485894, "grad_norm": 3.181030035018921, "learning_rate": 8.300000000000001e-07, "loss": 7.1291, "step": 1660 }, { "epoch": 0.5206896551724138, "grad_norm": 6.969316005706787, "learning_rate": 8.305e-07, "loss": 13.7243, "step": 1661 }, { "epoch": 0.5210031347962383, "grad_norm": 3.7494585514068604, "learning_rate": 8.31e-07, "loss": 7.9837, "step": 1662 }, { "epoch": 0.5213166144200627, "grad_norm": 4.4347991943359375, "learning_rate": 8.315000000000001e-07, "loss": 9.8557, "step": 1663 }, { "epoch": 0.5216300940438872, "grad_norm": 3.5605807304382324, "learning_rate": 8.32e-07, "loss": 8.4987, "step": 1664 }, { "epoch": 0.5219435736677116, "grad_norm": 3.945876359939575, "learning_rate": 8.325000000000001e-07, "loss": 9.7023, "step": 1665 }, { "epoch": 0.522257053291536, "grad_norm": 3.840888500213623, "learning_rate": 8.33e-07, "loss": 10.5396, "step": 1666 }, { "epoch": 0.5225705329153605, "grad_norm": 5.052422523498535, "learning_rate": 8.335e-07, "loss": 10.9094, "step": 1667 }, { "epoch": 0.5228840125391849, "grad_norm": 3.726701259613037, "learning_rate": 8.340000000000001e-07, "loss": 8.6335, "step": 1668 }, { "epoch": 0.5231974921630094, "grad_norm": 3.632563591003418, "learning_rate": 8.345e-07, "loss": 7.3737, "step": 1669 }, { "epoch": 0.5235109717868338, "grad_norm": 5.3117499351501465, "learning_rate": 8.350000000000002e-07, "loss": 9.7193, "step": 1670 }, { "epoch": 0.5238244514106583, "grad_norm": 4.223608493804932, "learning_rate": 8.355000000000001e-07, "loss": 8.8684, "step": 1671 }, { "epoch": 0.5241379310344828, "grad_norm": 4.2938151359558105, "learning_rate": 8.36e-07, "loss": 7.9856, "step": 1672 }, { "epoch": 0.5244514106583072, "grad_norm": 3.227950096130371, "learning_rate": 8.365000000000001e-07, "loss": 8.0511, "step": 1673 }, { "epoch": 0.5247648902821317, "grad_norm": 3.6287143230438232, "learning_rate": 8.37e-07, "loss": 6.8761, "step": 1674 }, { "epoch": 0.5250783699059561, "grad_norm": 3.5546576976776123, "learning_rate": 8.375000000000001e-07, "loss": 8.3474, "step": 1675 }, { "epoch": 0.5253918495297806, "grad_norm": 4.312641143798828, "learning_rate": 8.380000000000001e-07, "loss": 8.7992, "step": 1676 }, { "epoch": 0.525705329153605, "grad_norm": 4.029053688049316, "learning_rate": 8.385e-07, "loss": 8.269, "step": 1677 }, { "epoch": 0.5260188087774295, "grad_norm": 4.90485143661499, "learning_rate": 8.390000000000001e-07, "loss": 10.3717, "step": 1678 }, { "epoch": 0.5263322884012539, "grad_norm": 3.8778631687164307, "learning_rate": 8.395000000000001e-07, "loss": 11.3539, "step": 1679 }, { "epoch": 0.5266457680250783, "grad_norm": 4.429601669311523, "learning_rate": 8.400000000000001e-07, "loss": 10.0634, "step": 1680 }, { "epoch": 0.5269592476489028, "grad_norm": 5.119757175445557, "learning_rate": 8.405e-07, "loss": 10.9194, "step": 1681 }, { "epoch": 0.5272727272727272, "grad_norm": 8.258536338806152, "learning_rate": 8.41e-07, "loss": 22.845, "step": 1682 }, { "epoch": 0.5275862068965518, "grad_norm": 4.698957920074463, "learning_rate": 8.415000000000001e-07, "loss": 8.8281, "step": 1683 }, { "epoch": 0.5278996865203762, "grad_norm": 3.764852285385132, "learning_rate": 8.42e-07, "loss": 7.234, "step": 1684 }, { "epoch": 0.5282131661442007, "grad_norm": 6.866782188415527, "learning_rate": 8.425000000000001e-07, "loss": 9.7779, "step": 1685 }, { "epoch": 0.5285266457680251, "grad_norm": 4.264431476593018, "learning_rate": 8.43e-07, "loss": 10.7163, "step": 1686 }, { "epoch": 0.5288401253918495, "grad_norm": 5.4701995849609375, "learning_rate": 8.435000000000001e-07, "loss": 11.002, "step": 1687 }, { "epoch": 0.529153605015674, "grad_norm": 4.967133522033691, "learning_rate": 8.440000000000001e-07, "loss": 9.6692, "step": 1688 }, { "epoch": 0.5294670846394984, "grad_norm": 3.6275758743286133, "learning_rate": 8.445e-07, "loss": 6.5804, "step": 1689 }, { "epoch": 0.5297805642633229, "grad_norm": 4.6618170738220215, "learning_rate": 8.450000000000002e-07, "loss": 8.9379, "step": 1690 }, { "epoch": 0.5300940438871473, "grad_norm": 3.9030559062957764, "learning_rate": 8.455000000000001e-07, "loss": 7.8233, "step": 1691 }, { "epoch": 0.5304075235109718, "grad_norm": 3.4137344360351562, "learning_rate": 8.46e-07, "loss": 7.6667, "step": 1692 }, { "epoch": 0.5307210031347962, "grad_norm": 3.2711963653564453, "learning_rate": 8.465000000000001e-07, "loss": 7.6183, "step": 1693 }, { "epoch": 0.5310344827586206, "grad_norm": 5.124555587768555, "learning_rate": 8.47e-07, "loss": 16.8092, "step": 1694 }, { "epoch": 0.5313479623824452, "grad_norm": 3.800910711288452, "learning_rate": 8.475000000000001e-07, "loss": 8.8713, "step": 1695 }, { "epoch": 0.5316614420062696, "grad_norm": 4.457700252532959, "learning_rate": 8.480000000000001e-07, "loss": 8.7624, "step": 1696 }, { "epoch": 0.5319749216300941, "grad_norm": 3.4324142932891846, "learning_rate": 8.485e-07, "loss": 7.641, "step": 1697 }, { "epoch": 0.5322884012539185, "grad_norm": 3.523561477661133, "learning_rate": 8.490000000000002e-07, "loss": 7.0024, "step": 1698 }, { "epoch": 0.532601880877743, "grad_norm": 3.4999146461486816, "learning_rate": 8.495000000000001e-07, "loss": 8.2928, "step": 1699 }, { "epoch": 0.5329153605015674, "grad_norm": 4.3573222160339355, "learning_rate": 8.500000000000001e-07, "loss": 8.0377, "step": 1700 }, { "epoch": 0.5332288401253918, "grad_norm": 3.5811939239501953, "learning_rate": 8.505e-07, "loss": 7.2941, "step": 1701 }, { "epoch": 0.5335423197492163, "grad_norm": 3.876803398132324, "learning_rate": 8.51e-07, "loss": 9.4216, "step": 1702 }, { "epoch": 0.5338557993730407, "grad_norm": 2.872718572616577, "learning_rate": 8.515000000000001e-07, "loss": 6.6356, "step": 1703 }, { "epoch": 0.5341692789968652, "grad_norm": 4.38058614730835, "learning_rate": 8.520000000000001e-07, "loss": 9.6664, "step": 1704 }, { "epoch": 0.5344827586206896, "grad_norm": 4.397758960723877, "learning_rate": 8.525000000000001e-07, "loss": 10.8439, "step": 1705 }, { "epoch": 0.534796238244514, "grad_norm": 3.576199769973755, "learning_rate": 8.53e-07, "loss": 7.8423, "step": 1706 }, { "epoch": 0.5351097178683386, "grad_norm": 3.0761783123016357, "learning_rate": 8.535000000000001e-07, "loss": 7.3114, "step": 1707 }, { "epoch": 0.535423197492163, "grad_norm": 4.053654670715332, "learning_rate": 8.540000000000001e-07, "loss": 8.3691, "step": 1708 }, { "epoch": 0.5357366771159875, "grad_norm": 4.436123847961426, "learning_rate": 8.545e-07, "loss": 12.1063, "step": 1709 }, { "epoch": 0.5360501567398119, "grad_norm": 5.484961986541748, "learning_rate": 8.550000000000002e-07, "loss": 10.3859, "step": 1710 }, { "epoch": 0.5363636363636364, "grad_norm": 4.26839542388916, "learning_rate": 8.555000000000001e-07, "loss": 8.8806, "step": 1711 }, { "epoch": 0.5366771159874608, "grad_norm": 5.425708770751953, "learning_rate": 8.56e-07, "loss": 11.4141, "step": 1712 }, { "epoch": 0.5369905956112853, "grad_norm": 4.29015588760376, "learning_rate": 8.565000000000001e-07, "loss": 9.4032, "step": 1713 }, { "epoch": 0.5373040752351097, "grad_norm": 3.599637508392334, "learning_rate": 8.57e-07, "loss": 8.3692, "step": 1714 }, { "epoch": 0.5376175548589341, "grad_norm": 4.9885358810424805, "learning_rate": 8.575000000000002e-07, "loss": 9.0797, "step": 1715 }, { "epoch": 0.5379310344827586, "grad_norm": 3.783560037612915, "learning_rate": 8.580000000000001e-07, "loss": 7.1796, "step": 1716 }, { "epoch": 0.538244514106583, "grad_norm": 8.088088035583496, "learning_rate": 8.585e-07, "loss": 19.7025, "step": 1717 }, { "epoch": 0.5385579937304075, "grad_norm": 2.911607503890991, "learning_rate": 8.590000000000002e-07, "loss": 6.2524, "step": 1718 }, { "epoch": 0.538871473354232, "grad_norm": 3.7300970554351807, "learning_rate": 8.595000000000001e-07, "loss": 8.4442, "step": 1719 }, { "epoch": 0.5391849529780565, "grad_norm": 3.8973944187164307, "learning_rate": 8.6e-07, "loss": 9.406, "step": 1720 }, { "epoch": 0.5394984326018809, "grad_norm": 4.2937397956848145, "learning_rate": 8.605000000000001e-07, "loss": 6.9903, "step": 1721 }, { "epoch": 0.5398119122257053, "grad_norm": 4.311274528503418, "learning_rate": 8.61e-07, "loss": 7.9766, "step": 1722 }, { "epoch": 0.5401253918495298, "grad_norm": 3.7893688678741455, "learning_rate": 8.615000000000001e-07, "loss": 8.0692, "step": 1723 }, { "epoch": 0.5404388714733542, "grad_norm": 3.4368412494659424, "learning_rate": 8.620000000000001e-07, "loss": 7.6627, "step": 1724 }, { "epoch": 0.5407523510971787, "grad_norm": 4.653461933135986, "learning_rate": 8.625e-07, "loss": 8.2418, "step": 1725 }, { "epoch": 0.5410658307210031, "grad_norm": 3.554748058319092, "learning_rate": 8.63e-07, "loss": 7.7693, "step": 1726 }, { "epoch": 0.5413793103448276, "grad_norm": 3.596076726913452, "learning_rate": 8.635000000000001e-07, "loss": 11.7324, "step": 1727 }, { "epoch": 0.541692789968652, "grad_norm": 5.29031229019165, "learning_rate": 8.640000000000001e-07, "loss": 10.1403, "step": 1728 }, { "epoch": 0.5420062695924764, "grad_norm": 4.6957807540893555, "learning_rate": 8.645e-07, "loss": 10.0583, "step": 1729 }, { "epoch": 0.542319749216301, "grad_norm": 3.0998942852020264, "learning_rate": 8.65e-07, "loss": 7.5078, "step": 1730 }, { "epoch": 0.5426332288401254, "grad_norm": 6.108078479766846, "learning_rate": 8.655000000000001e-07, "loss": 15.5995, "step": 1731 }, { "epoch": 0.5429467084639499, "grad_norm": 4.639706611633301, "learning_rate": 8.66e-07, "loss": 8.0792, "step": 1732 }, { "epoch": 0.5432601880877743, "grad_norm": 3.6627628803253174, "learning_rate": 8.665000000000001e-07, "loss": 8.4311, "step": 1733 }, { "epoch": 0.5435736677115988, "grad_norm": 3.9771621227264404, "learning_rate": 8.67e-07, "loss": 8.8372, "step": 1734 }, { "epoch": 0.5438871473354232, "grad_norm": 3.9450392723083496, "learning_rate": 8.675000000000001e-07, "loss": 10.2992, "step": 1735 }, { "epoch": 0.5442006269592476, "grad_norm": 3.019969940185547, "learning_rate": 8.680000000000001e-07, "loss": 7.7473, "step": 1736 }, { "epoch": 0.5445141065830721, "grad_norm": 5.00993013381958, "learning_rate": 8.685e-07, "loss": 10.7672, "step": 1737 }, { "epoch": 0.5448275862068965, "grad_norm": 4.663172245025635, "learning_rate": 8.690000000000002e-07, "loss": 8.9194, "step": 1738 }, { "epoch": 0.545141065830721, "grad_norm": 5.089305400848389, "learning_rate": 8.695000000000001e-07, "loss": 10.1471, "step": 1739 }, { "epoch": 0.5454545454545454, "grad_norm": 8.746878623962402, "learning_rate": 8.7e-07, "loss": 11.3008, "step": 1740 }, { "epoch": 0.5457680250783699, "grad_norm": 3.6730334758758545, "learning_rate": 8.705000000000001e-07, "loss": 9.8861, "step": 1741 }, { "epoch": 0.5460815047021944, "grad_norm": 3.33270263671875, "learning_rate": 8.71e-07, "loss": 8.7378, "step": 1742 }, { "epoch": 0.5463949843260189, "grad_norm": 3.6083526611328125, "learning_rate": 8.715000000000001e-07, "loss": 7.0979, "step": 1743 }, { "epoch": 0.5467084639498433, "grad_norm": 4.226643085479736, "learning_rate": 8.720000000000001e-07, "loss": 10.3873, "step": 1744 }, { "epoch": 0.5470219435736677, "grad_norm": 5.096066951751709, "learning_rate": 8.725e-07, "loss": 10.6816, "step": 1745 }, { "epoch": 0.5473354231974922, "grad_norm": 5.229349613189697, "learning_rate": 8.73e-07, "loss": 11.0695, "step": 1746 }, { "epoch": 0.5476489028213166, "grad_norm": 3.8301353454589844, "learning_rate": 8.735000000000001e-07, "loss": 8.6425, "step": 1747 }, { "epoch": 0.5479623824451411, "grad_norm": 3.517941951751709, "learning_rate": 8.740000000000001e-07, "loss": 6.2092, "step": 1748 }, { "epoch": 0.5482758620689655, "grad_norm": 5.241222858428955, "learning_rate": 8.745000000000001e-07, "loss": 11.7764, "step": 1749 }, { "epoch": 0.54858934169279, "grad_norm": 5.1557793617248535, "learning_rate": 8.75e-07, "loss": 13.8192, "step": 1750 }, { "epoch": 0.5489028213166144, "grad_norm": 4.522706508636475, "learning_rate": 8.755000000000001e-07, "loss": 9.2798, "step": 1751 }, { "epoch": 0.5492163009404388, "grad_norm": 5.382571697235107, "learning_rate": 8.760000000000001e-07, "loss": 11.1607, "step": 1752 }, { "epoch": 0.5495297805642633, "grad_norm": 3.164684295654297, "learning_rate": 8.765000000000001e-07, "loss": 6.6911, "step": 1753 }, { "epoch": 0.5498432601880878, "grad_norm": 3.7033884525299072, "learning_rate": 8.77e-07, "loss": 7.5505, "step": 1754 }, { "epoch": 0.5501567398119123, "grad_norm": 4.4705424308776855, "learning_rate": 8.775000000000001e-07, "loss": 10.6266, "step": 1755 }, { "epoch": 0.5504702194357367, "grad_norm": 3.576754331588745, "learning_rate": 8.780000000000001e-07, "loss": 7.6148, "step": 1756 }, { "epoch": 0.5507836990595611, "grad_norm": 6.662974834442139, "learning_rate": 8.785e-07, "loss": 15.6583, "step": 1757 }, { "epoch": 0.5510971786833856, "grad_norm": 4.8018364906311035, "learning_rate": 8.790000000000002e-07, "loss": 7.7867, "step": 1758 }, { "epoch": 0.55141065830721, "grad_norm": 3.5706562995910645, "learning_rate": 8.795000000000001e-07, "loss": 7.1267, "step": 1759 }, { "epoch": 0.5517241379310345, "grad_norm": 3.8161635398864746, "learning_rate": 8.8e-07, "loss": 8.0858, "step": 1760 }, { "epoch": 0.5520376175548589, "grad_norm": 4.2101898193359375, "learning_rate": 8.805000000000001e-07, "loss": 8.4719, "step": 1761 }, { "epoch": 0.5523510971786834, "grad_norm": 3.5980849266052246, "learning_rate": 8.81e-07, "loss": 8.5086, "step": 1762 }, { "epoch": 0.5526645768025078, "grad_norm": 4.349874973297119, "learning_rate": 8.815000000000002e-07, "loss": 9.6007, "step": 1763 }, { "epoch": 0.5529780564263322, "grad_norm": 4.534544467926025, "learning_rate": 8.820000000000001e-07, "loss": 9.1542, "step": 1764 }, { "epoch": 0.5532915360501567, "grad_norm": 5.753028392791748, "learning_rate": 8.825e-07, "loss": 13.3121, "step": 1765 }, { "epoch": 0.5536050156739812, "grad_norm": 4.646644592285156, "learning_rate": 8.830000000000001e-07, "loss": 10.1693, "step": 1766 }, { "epoch": 0.5539184952978057, "grad_norm": 4.152806758880615, "learning_rate": 8.835000000000001e-07, "loss": 9.3472, "step": 1767 }, { "epoch": 0.5542319749216301, "grad_norm": 4.156120300292969, "learning_rate": 8.840000000000001e-07, "loss": 8.9546, "step": 1768 }, { "epoch": 0.5545454545454546, "grad_norm": 4.695617198944092, "learning_rate": 8.845000000000001e-07, "loss": 9.1984, "step": 1769 }, { "epoch": 0.554858934169279, "grad_norm": 3.514662742614746, "learning_rate": 8.85e-07, "loss": 6.9892, "step": 1770 }, { "epoch": 0.5551724137931034, "grad_norm": 6.180057048797607, "learning_rate": 8.855000000000001e-07, "loss": 15.8813, "step": 1771 }, { "epoch": 0.5554858934169279, "grad_norm": 3.279160261154175, "learning_rate": 8.860000000000001e-07, "loss": 6.86, "step": 1772 }, { "epoch": 0.5557993730407523, "grad_norm": 4.369133472442627, "learning_rate": 8.865000000000001e-07, "loss": 9.133, "step": 1773 }, { "epoch": 0.5561128526645768, "grad_norm": 3.11153507232666, "learning_rate": 8.87e-07, "loss": 7.517, "step": 1774 }, { "epoch": 0.5564263322884012, "grad_norm": 4.086194038391113, "learning_rate": 8.875000000000001e-07, "loss": 8.7146, "step": 1775 }, { "epoch": 0.5567398119122257, "grad_norm": 4.669196605682373, "learning_rate": 8.880000000000001e-07, "loss": 9.4316, "step": 1776 }, { "epoch": 0.5570532915360502, "grad_norm": 4.278200626373291, "learning_rate": 8.885e-07, "loss": 9.8683, "step": 1777 }, { "epoch": 0.5573667711598747, "grad_norm": 6.216943740844727, "learning_rate": 8.890000000000002e-07, "loss": 12.5523, "step": 1778 }, { "epoch": 0.5576802507836991, "grad_norm": 4.141219139099121, "learning_rate": 8.895000000000001e-07, "loss": 9.5758, "step": 1779 }, { "epoch": 0.5579937304075235, "grad_norm": 5.539915561676025, "learning_rate": 8.900000000000001e-07, "loss": 13.5515, "step": 1780 }, { "epoch": 0.558307210031348, "grad_norm": 4.460906505584717, "learning_rate": 8.905000000000001e-07, "loss": 9.7846, "step": 1781 }, { "epoch": 0.5586206896551724, "grad_norm": 3.381852388381958, "learning_rate": 8.91e-07, "loss": 7.3451, "step": 1782 }, { "epoch": 0.5589341692789969, "grad_norm": 4.935987949371338, "learning_rate": 8.915e-07, "loss": 9.0696, "step": 1783 }, { "epoch": 0.5592476489028213, "grad_norm": 4.209127426147461, "learning_rate": 8.920000000000001e-07, "loss": 8.1537, "step": 1784 }, { "epoch": 0.5595611285266457, "grad_norm": 3.22871994972229, "learning_rate": 8.925e-07, "loss": 7.8978, "step": 1785 }, { "epoch": 0.5598746081504702, "grad_norm": 4.8778510093688965, "learning_rate": 8.930000000000001e-07, "loss": 9.9476, "step": 1786 }, { "epoch": 0.5601880877742946, "grad_norm": 4.3461384773254395, "learning_rate": 8.935000000000001e-07, "loss": 8.4605, "step": 1787 }, { "epoch": 0.5605015673981191, "grad_norm": 4.09140157699585, "learning_rate": 8.94e-07, "loss": 8.6371, "step": 1788 }, { "epoch": 0.5608150470219436, "grad_norm": 3.154021739959717, "learning_rate": 8.945000000000001e-07, "loss": 6.6153, "step": 1789 }, { "epoch": 0.5611285266457681, "grad_norm": 5.04860782623291, "learning_rate": 8.95e-07, "loss": 10.1237, "step": 1790 }, { "epoch": 0.5614420062695925, "grad_norm": 3.98972487449646, "learning_rate": 8.955000000000002e-07, "loss": 7.8599, "step": 1791 }, { "epoch": 0.561755485893417, "grad_norm": 3.527597665786743, "learning_rate": 8.960000000000001e-07, "loss": 8.2753, "step": 1792 }, { "epoch": 0.5620689655172414, "grad_norm": 6.0412139892578125, "learning_rate": 8.965e-07, "loss": 13.1168, "step": 1793 }, { "epoch": 0.5623824451410658, "grad_norm": 4.94082498550415, "learning_rate": 8.97e-07, "loss": 10.4559, "step": 1794 }, { "epoch": 0.5626959247648903, "grad_norm": 4.965950012207031, "learning_rate": 8.975000000000001e-07, "loss": 10.237, "step": 1795 }, { "epoch": 0.5630094043887147, "grad_norm": 4.55895471572876, "learning_rate": 8.980000000000001e-07, "loss": 10.7508, "step": 1796 }, { "epoch": 0.5633228840125392, "grad_norm": 4.183038234710693, "learning_rate": 8.985000000000001e-07, "loss": 7.4745, "step": 1797 }, { "epoch": 0.5636363636363636, "grad_norm": 3.6337876319885254, "learning_rate": 8.99e-07, "loss": 6.8318, "step": 1798 }, { "epoch": 0.563949843260188, "grad_norm": 5.689560890197754, "learning_rate": 8.995000000000001e-07, "loss": 12.2413, "step": 1799 }, { "epoch": 0.5642633228840125, "grad_norm": 3.8097832202911377, "learning_rate": 9.000000000000001e-07, "loss": 7.0029, "step": 1800 }, { "epoch": 0.564576802507837, "grad_norm": 4.039278030395508, "learning_rate": 9.005000000000001e-07, "loss": 9.2906, "step": 1801 }, { "epoch": 0.5648902821316615, "grad_norm": 3.7648391723632812, "learning_rate": 9.01e-07, "loss": 7.9313, "step": 1802 }, { "epoch": 0.5652037617554859, "grad_norm": 4.256327152252197, "learning_rate": 9.015e-07, "loss": 6.9056, "step": 1803 }, { "epoch": 0.5655172413793104, "grad_norm": 4.904327869415283, "learning_rate": 9.020000000000001e-07, "loss": 9.3173, "step": 1804 }, { "epoch": 0.5658307210031348, "grad_norm": 3.761366367340088, "learning_rate": 9.025e-07, "loss": 10.9158, "step": 1805 }, { "epoch": 0.5661442006269592, "grad_norm": 4.512961387634277, "learning_rate": 9.030000000000001e-07, "loss": 9.3172, "step": 1806 }, { "epoch": 0.5664576802507837, "grad_norm": 4.454610824584961, "learning_rate": 9.035000000000001e-07, "loss": 9.806, "step": 1807 }, { "epoch": 0.5667711598746081, "grad_norm": 3.878605842590332, "learning_rate": 9.04e-07, "loss": 7.8319, "step": 1808 }, { "epoch": 0.5670846394984326, "grad_norm": 3.3651275634765625, "learning_rate": 9.045000000000001e-07, "loss": 7.7877, "step": 1809 }, { "epoch": 0.567398119122257, "grad_norm": 4.225704669952393, "learning_rate": 9.05e-07, "loss": 8.8007, "step": 1810 }, { "epoch": 0.5677115987460815, "grad_norm": 4.0525617599487305, "learning_rate": 9.055000000000002e-07, "loss": 8.9639, "step": 1811 }, { "epoch": 0.568025078369906, "grad_norm": 3.8242380619049072, "learning_rate": 9.060000000000001e-07, "loss": 8.4361, "step": 1812 }, { "epoch": 0.5683385579937305, "grad_norm": 6.60628080368042, "learning_rate": 9.065e-07, "loss": 9.6387, "step": 1813 }, { "epoch": 0.5686520376175549, "grad_norm": 4.6100616455078125, "learning_rate": 9.070000000000001e-07, "loss": 8.7753, "step": 1814 }, { "epoch": 0.5689655172413793, "grad_norm": 3.9851086139678955, "learning_rate": 9.075000000000001e-07, "loss": 7.7566, "step": 1815 }, { "epoch": 0.5692789968652038, "grad_norm": 5.08043098449707, "learning_rate": 9.080000000000001e-07, "loss": 12.0445, "step": 1816 }, { "epoch": 0.5695924764890282, "grad_norm": 8.151008605957031, "learning_rate": 9.085000000000001e-07, "loss": 12.3116, "step": 1817 }, { "epoch": 0.5699059561128527, "grad_norm": 5.475290775299072, "learning_rate": 9.09e-07, "loss": 12.5451, "step": 1818 }, { "epoch": 0.5702194357366771, "grad_norm": 3.203784227371216, "learning_rate": 9.095000000000001e-07, "loss": 6.5738, "step": 1819 }, { "epoch": 0.5705329153605015, "grad_norm": 3.322894334793091, "learning_rate": 9.100000000000001e-07, "loss": 7.036, "step": 1820 }, { "epoch": 0.570846394984326, "grad_norm": 5.2033772468566895, "learning_rate": 9.105000000000001e-07, "loss": 11.3802, "step": 1821 }, { "epoch": 0.5711598746081504, "grad_norm": 3.5129075050354004, "learning_rate": 9.11e-07, "loss": 5.7609, "step": 1822 }, { "epoch": 0.5714733542319749, "grad_norm": 3.822200059890747, "learning_rate": 9.115e-07, "loss": 8.4694, "step": 1823 }, { "epoch": 0.5717868338557994, "grad_norm": 5.213927268981934, "learning_rate": 9.120000000000001e-07, "loss": 10.8033, "step": 1824 }, { "epoch": 0.5721003134796239, "grad_norm": 3.970292806625366, "learning_rate": 9.125e-07, "loss": 8.1297, "step": 1825 }, { "epoch": 0.5724137931034483, "grad_norm": 3.255862236022949, "learning_rate": 9.130000000000001e-07, "loss": 6.6128, "step": 1826 }, { "epoch": 0.5727272727272728, "grad_norm": 5.0875139236450195, "learning_rate": 9.135000000000001e-07, "loss": 10.7206, "step": 1827 }, { "epoch": 0.5730407523510972, "grad_norm": 4.447558403015137, "learning_rate": 9.140000000000001e-07, "loss": 8.3421, "step": 1828 }, { "epoch": 0.5733542319749216, "grad_norm": 4.044737815856934, "learning_rate": 9.145000000000001e-07, "loss": 7.2425, "step": 1829 }, { "epoch": 0.5736677115987461, "grad_norm": 2.927945852279663, "learning_rate": 9.15e-07, "loss": 6.6691, "step": 1830 }, { "epoch": 0.5739811912225705, "grad_norm": 3.3857815265655518, "learning_rate": 9.155000000000002e-07, "loss": 6.6235, "step": 1831 }, { "epoch": 0.574294670846395, "grad_norm": 4.590656757354736, "learning_rate": 9.160000000000001e-07, "loss": 11.8911, "step": 1832 }, { "epoch": 0.5746081504702194, "grad_norm": 4.849060535430908, "learning_rate": 9.165e-07, "loss": 9.1651, "step": 1833 }, { "epoch": 0.5749216300940438, "grad_norm": 3.661118984222412, "learning_rate": 9.170000000000001e-07, "loss": 8.3145, "step": 1834 }, { "epoch": 0.5752351097178683, "grad_norm": 4.550508975982666, "learning_rate": 9.175000000000001e-07, "loss": 10.9553, "step": 1835 }, { "epoch": 0.5755485893416928, "grad_norm": 4.856189250946045, "learning_rate": 9.180000000000001e-07, "loss": 10.9995, "step": 1836 }, { "epoch": 0.5758620689655173, "grad_norm": 3.962833881378174, "learning_rate": 9.185000000000001e-07, "loss": 7.6791, "step": 1837 }, { "epoch": 0.5761755485893417, "grad_norm": 3.818290948867798, "learning_rate": 9.19e-07, "loss": 9.4411, "step": 1838 }, { "epoch": 0.5764890282131662, "grad_norm": 4.265783309936523, "learning_rate": 9.195000000000002e-07, "loss": 8.6175, "step": 1839 }, { "epoch": 0.5768025078369906, "grad_norm": 4.399896621704102, "learning_rate": 9.200000000000001e-07, "loss": 8.0593, "step": 1840 }, { "epoch": 0.577115987460815, "grad_norm": 3.487797975540161, "learning_rate": 9.205000000000001e-07, "loss": 7.8994, "step": 1841 }, { "epoch": 0.5774294670846395, "grad_norm": 4.428309440612793, "learning_rate": 9.210000000000001e-07, "loss": 7.2103, "step": 1842 }, { "epoch": 0.5777429467084639, "grad_norm": 4.569345951080322, "learning_rate": 9.215e-07, "loss": 9.6218, "step": 1843 }, { "epoch": 0.5780564263322884, "grad_norm": 3.990476369857788, "learning_rate": 9.220000000000001e-07, "loss": 9.4368, "step": 1844 }, { "epoch": 0.5783699059561128, "grad_norm": 3.2364110946655273, "learning_rate": 9.225000000000001e-07, "loss": 6.377, "step": 1845 }, { "epoch": 0.5786833855799373, "grad_norm": 4.474930763244629, "learning_rate": 9.23e-07, "loss": 8.6538, "step": 1846 }, { "epoch": 0.5789968652037617, "grad_norm": 4.316197872161865, "learning_rate": 9.235000000000001e-07, "loss": 9.8697, "step": 1847 }, { "epoch": 0.5793103448275863, "grad_norm": 3.6721043586730957, "learning_rate": 9.240000000000001e-07, "loss": 7.6369, "step": 1848 }, { "epoch": 0.5796238244514107, "grad_norm": 4.066787242889404, "learning_rate": 9.245000000000001e-07, "loss": 8.1153, "step": 1849 }, { "epoch": 0.5799373040752351, "grad_norm": 4.483874320983887, "learning_rate": 9.25e-07, "loss": 7.9151, "step": 1850 }, { "epoch": 0.5802507836990596, "grad_norm": 6.05754280090332, "learning_rate": 9.255e-07, "loss": 11.5638, "step": 1851 }, { "epoch": 0.580564263322884, "grad_norm": 4.692145824432373, "learning_rate": 9.260000000000001e-07, "loss": 9.0891, "step": 1852 }, { "epoch": 0.5808777429467085, "grad_norm": 4.905162334442139, "learning_rate": 9.265e-07, "loss": 11.009, "step": 1853 }, { "epoch": 0.5811912225705329, "grad_norm": 3.9991772174835205, "learning_rate": 9.270000000000001e-07, "loss": 7.1689, "step": 1854 }, { "epoch": 0.5815047021943573, "grad_norm": 3.545417308807373, "learning_rate": 9.275000000000001e-07, "loss": 8.8178, "step": 1855 }, { "epoch": 0.5818181818181818, "grad_norm": 3.4547829627990723, "learning_rate": 9.28e-07, "loss": 6.4124, "step": 1856 }, { "epoch": 0.5821316614420062, "grad_norm": 5.388890266418457, "learning_rate": 9.285000000000001e-07, "loss": 9.8751, "step": 1857 }, { "epoch": 0.5824451410658307, "grad_norm": 3.773027181625366, "learning_rate": 9.29e-07, "loss": 9.1356, "step": 1858 }, { "epoch": 0.5827586206896552, "grad_norm": 3.524904727935791, "learning_rate": 9.295000000000002e-07, "loss": 7.9023, "step": 1859 }, { "epoch": 0.5830721003134797, "grad_norm": 3.681853771209717, "learning_rate": 9.300000000000001e-07, "loss": 7.6041, "step": 1860 }, { "epoch": 0.5833855799373041, "grad_norm": 4.124235153198242, "learning_rate": 9.305e-07, "loss": 10.3233, "step": 1861 }, { "epoch": 0.5836990595611286, "grad_norm": 4.318082809448242, "learning_rate": 9.310000000000001e-07, "loss": 10.4379, "step": 1862 }, { "epoch": 0.584012539184953, "grad_norm": 3.5927541255950928, "learning_rate": 9.315e-07, "loss": 7.0752, "step": 1863 }, { "epoch": 0.5843260188087774, "grad_norm": 3.7160086631774902, "learning_rate": 9.320000000000001e-07, "loss": 8.0991, "step": 1864 }, { "epoch": 0.5846394984326019, "grad_norm": 4.717742919921875, "learning_rate": 9.325000000000001e-07, "loss": 8.6703, "step": 1865 }, { "epoch": 0.5849529780564263, "grad_norm": 4.089090347290039, "learning_rate": 9.33e-07, "loss": 8.6572, "step": 1866 }, { "epoch": 0.5852664576802508, "grad_norm": 5.522655010223389, "learning_rate": 9.335000000000001e-07, "loss": 9.8966, "step": 1867 }, { "epoch": 0.5855799373040752, "grad_norm": 3.4123260974884033, "learning_rate": 9.340000000000001e-07, "loss": 7.0431, "step": 1868 }, { "epoch": 0.5858934169278996, "grad_norm": 4.160542011260986, "learning_rate": 9.345000000000001e-07, "loss": 8.1701, "step": 1869 }, { "epoch": 0.5862068965517241, "grad_norm": 3.6532208919525146, "learning_rate": 9.35e-07, "loss": 10.5839, "step": 1870 }, { "epoch": 0.5865203761755486, "grad_norm": 4.788308143615723, "learning_rate": 9.355e-07, "loss": 9.6103, "step": 1871 }, { "epoch": 0.5868338557993731, "grad_norm": 3.656114339828491, "learning_rate": 9.360000000000001e-07, "loss": 9.0917, "step": 1872 }, { "epoch": 0.5871473354231975, "grad_norm": 3.7740187644958496, "learning_rate": 9.365000000000001e-07, "loss": 10.3359, "step": 1873 }, { "epoch": 0.587460815047022, "grad_norm": 3.203486680984497, "learning_rate": 9.370000000000001e-07, "loss": 6.0882, "step": 1874 }, { "epoch": 0.5877742946708464, "grad_norm": 4.1461920738220215, "learning_rate": 9.375000000000001e-07, "loss": 10.5749, "step": 1875 }, { "epoch": 0.5880877742946709, "grad_norm": 4.353470802307129, "learning_rate": 9.380000000000001e-07, "loss": 8.5529, "step": 1876 }, { "epoch": 0.5884012539184953, "grad_norm": 4.317278861999512, "learning_rate": 9.385000000000001e-07, "loss": 9.1286, "step": 1877 }, { "epoch": 0.5887147335423197, "grad_norm": 4.176400661468506, "learning_rate": 9.39e-07, "loss": 9.2643, "step": 1878 }, { "epoch": 0.5890282131661442, "grad_norm": 3.544487953186035, "learning_rate": 9.395000000000002e-07, "loss": 7.9035, "step": 1879 }, { "epoch": 0.5893416927899686, "grad_norm": 4.3520731925964355, "learning_rate": 9.400000000000001e-07, "loss": 8.9988, "step": 1880 }, { "epoch": 0.5896551724137931, "grad_norm": 4.328791618347168, "learning_rate": 9.405e-07, "loss": 8.2385, "step": 1881 }, { "epoch": 0.5899686520376175, "grad_norm": 4.532482147216797, "learning_rate": 9.410000000000001e-07, "loss": 10.0327, "step": 1882 }, { "epoch": 0.590282131661442, "grad_norm": 4.309502124786377, "learning_rate": 9.415e-07, "loss": 9.5955, "step": 1883 }, { "epoch": 0.5905956112852665, "grad_norm": 9.216413497924805, "learning_rate": 9.420000000000002e-07, "loss": 17.2207, "step": 1884 }, { "epoch": 0.5909090909090909, "grad_norm": 4.302111625671387, "learning_rate": 9.425000000000001e-07, "loss": 8.134, "step": 1885 }, { "epoch": 0.5912225705329154, "grad_norm": 3.478489637374878, "learning_rate": 9.43e-07, "loss": 7.1565, "step": 1886 }, { "epoch": 0.5915360501567398, "grad_norm": 5.942442893981934, "learning_rate": 9.435000000000002e-07, "loss": 16.1045, "step": 1887 }, { "epoch": 0.5918495297805643, "grad_norm": 4.080247402191162, "learning_rate": 9.440000000000001e-07, "loss": 10.6923, "step": 1888 }, { "epoch": 0.5921630094043887, "grad_norm": 4.32432222366333, "learning_rate": 9.445000000000001e-07, "loss": 8.9229, "step": 1889 }, { "epoch": 0.5924764890282131, "grad_norm": 3.346331834793091, "learning_rate": 9.450000000000001e-07, "loss": 6.9437, "step": 1890 }, { "epoch": 0.5927899686520376, "grad_norm": 8.710538864135742, "learning_rate": 9.455e-07, "loss": 18.3767, "step": 1891 }, { "epoch": 0.593103448275862, "grad_norm": 6.308311462402344, "learning_rate": 9.460000000000001e-07, "loss": 13.8071, "step": 1892 }, { "epoch": 0.5934169278996865, "grad_norm": 4.4475321769714355, "learning_rate": 9.465000000000001e-07, "loss": 8.8487, "step": 1893 }, { "epoch": 0.5937304075235109, "grad_norm": 5.897817611694336, "learning_rate": 9.470000000000001e-07, "loss": 11.9756, "step": 1894 }, { "epoch": 0.5940438871473355, "grad_norm": 5.22391414642334, "learning_rate": 9.475e-07, "loss": 7.3174, "step": 1895 }, { "epoch": 0.5943573667711599, "grad_norm": 4.082650184631348, "learning_rate": 9.480000000000001e-07, "loss": 7.8726, "step": 1896 }, { "epoch": 0.5946708463949844, "grad_norm": 4.346643447875977, "learning_rate": 9.485000000000001e-07, "loss": 8.1732, "step": 1897 }, { "epoch": 0.5949843260188088, "grad_norm": 6.866816997528076, "learning_rate": 9.49e-07, "loss": 12.5577, "step": 1898 }, { "epoch": 0.5952978056426332, "grad_norm": 3.5348422527313232, "learning_rate": 9.495000000000002e-07, "loss": 6.8785, "step": 1899 }, { "epoch": 0.5956112852664577, "grad_norm": 3.0132737159729004, "learning_rate": 9.500000000000001e-07, "loss": 5.57, "step": 1900 }, { "epoch": 0.5959247648902821, "grad_norm": 3.286238193511963, "learning_rate": 9.505e-07, "loss": 8.4462, "step": 1901 }, { "epoch": 0.5962382445141066, "grad_norm": 5.34050989151001, "learning_rate": 9.510000000000001e-07, "loss": 8.19, "step": 1902 }, { "epoch": 0.596551724137931, "grad_norm": 3.889528751373291, "learning_rate": 9.515e-07, "loss": 7.5043, "step": 1903 }, { "epoch": 0.5968652037617554, "grad_norm": 3.9055964946746826, "learning_rate": 9.520000000000002e-07, "loss": 8.6483, "step": 1904 }, { "epoch": 0.5971786833855799, "grad_norm": 3.298340320587158, "learning_rate": 9.525000000000001e-07, "loss": 6.4088, "step": 1905 }, { "epoch": 0.5974921630094044, "grad_norm": 3.5493357181549072, "learning_rate": 9.53e-07, "loss": 7.4659, "step": 1906 }, { "epoch": 0.5978056426332289, "grad_norm": 4.918447494506836, "learning_rate": 9.535000000000002e-07, "loss": 9.6589, "step": 1907 }, { "epoch": 0.5981191222570533, "grad_norm": 3.490692615509033, "learning_rate": 9.54e-07, "loss": 7.0492, "step": 1908 }, { "epoch": 0.5984326018808778, "grad_norm": 4.562289237976074, "learning_rate": 9.545e-07, "loss": 9.4843, "step": 1909 }, { "epoch": 0.5987460815047022, "grad_norm": 5.618401050567627, "learning_rate": 9.550000000000002e-07, "loss": 15.0284, "step": 1910 }, { "epoch": 0.5990595611285267, "grad_norm": 5.000195503234863, "learning_rate": 9.555e-07, "loss": 10.3012, "step": 1911 }, { "epoch": 0.5993730407523511, "grad_norm": 4.634150505065918, "learning_rate": 9.56e-07, "loss": 8.0312, "step": 1912 }, { "epoch": 0.5996865203761755, "grad_norm": 4.144935131072998, "learning_rate": 9.565e-07, "loss": 7.6425, "step": 1913 }, { "epoch": 0.6, "grad_norm": 6.016616344451904, "learning_rate": 9.570000000000001e-07, "loss": 10.0162, "step": 1914 }, { "epoch": 0.6003134796238244, "grad_norm": 4.132763862609863, "learning_rate": 9.575000000000001e-07, "loss": 7.813, "step": 1915 }, { "epoch": 0.6006269592476489, "grad_norm": 5.254676818847656, "learning_rate": 9.58e-07, "loss": 9.9075, "step": 1916 }, { "epoch": 0.6009404388714733, "grad_norm": 3.9892544746398926, "learning_rate": 9.585000000000002e-07, "loss": 8.0493, "step": 1917 }, { "epoch": 0.6012539184952979, "grad_norm": 4.14557409286499, "learning_rate": 9.59e-07, "loss": 8.4252, "step": 1918 }, { "epoch": 0.6015673981191223, "grad_norm": 5.171091079711914, "learning_rate": 9.595e-07, "loss": 9.5018, "step": 1919 }, { "epoch": 0.6018808777429467, "grad_norm": 5.105014801025391, "learning_rate": 9.600000000000001e-07, "loss": 8.7117, "step": 1920 }, { "epoch": 0.6021943573667712, "grad_norm": 4.491677284240723, "learning_rate": 9.605e-07, "loss": 9.022, "step": 1921 }, { "epoch": 0.6025078369905956, "grad_norm": 4.195937156677246, "learning_rate": 9.610000000000002e-07, "loss": 8.2956, "step": 1922 }, { "epoch": 0.6028213166144201, "grad_norm": 2.9639382362365723, "learning_rate": 9.615e-07, "loss": 6.2187, "step": 1923 }, { "epoch": 0.6031347962382445, "grad_norm": 5.277063369750977, "learning_rate": 9.62e-07, "loss": 9.245, "step": 1924 }, { "epoch": 0.603448275862069, "grad_norm": 3.676309585571289, "learning_rate": 9.625e-07, "loss": 7.6557, "step": 1925 }, { "epoch": 0.6037617554858934, "grad_norm": 5.051148891448975, "learning_rate": 9.630000000000001e-07, "loss": 11.8118, "step": 1926 }, { "epoch": 0.6040752351097178, "grad_norm": 4.08391809463501, "learning_rate": 9.635000000000002e-07, "loss": 8.8092, "step": 1927 }, { "epoch": 0.6043887147335423, "grad_norm": 4.592138290405273, "learning_rate": 9.64e-07, "loss": 8.8694, "step": 1928 }, { "epoch": 0.6047021943573667, "grad_norm": 6.512213230133057, "learning_rate": 9.645e-07, "loss": 10.43, "step": 1929 }, { "epoch": 0.6050156739811913, "grad_norm": 5.282738208770752, "learning_rate": 9.65e-07, "loss": 11.6431, "step": 1930 }, { "epoch": 0.6053291536050157, "grad_norm": 3.4257993698120117, "learning_rate": 9.655000000000001e-07, "loss": 7.8864, "step": 1931 }, { "epoch": 0.6056426332288402, "grad_norm": 3.866560459136963, "learning_rate": 9.660000000000002e-07, "loss": 7.6513, "step": 1932 }, { "epoch": 0.6059561128526646, "grad_norm": 4.288784503936768, "learning_rate": 9.665e-07, "loss": 8.4351, "step": 1933 }, { "epoch": 0.606269592476489, "grad_norm": 5.792541980743408, "learning_rate": 9.67e-07, "loss": 9.6173, "step": 1934 }, { "epoch": 0.6065830721003135, "grad_norm": 2.695124626159668, "learning_rate": 9.675e-07, "loss": 6.1397, "step": 1935 }, { "epoch": 0.6068965517241379, "grad_norm": 4.266729354858398, "learning_rate": 9.68e-07, "loss": 7.2699, "step": 1936 }, { "epoch": 0.6072100313479624, "grad_norm": 3.4654881954193115, "learning_rate": 9.685000000000001e-07, "loss": 6.0948, "step": 1937 }, { "epoch": 0.6075235109717868, "grad_norm": 4.1121954917907715, "learning_rate": 9.690000000000002e-07, "loss": 8.0899, "step": 1938 }, { "epoch": 0.6078369905956112, "grad_norm": 4.078955173492432, "learning_rate": 9.695e-07, "loss": 7.1175, "step": 1939 }, { "epoch": 0.6081504702194357, "grad_norm": 7.904345512390137, "learning_rate": 9.7e-07, "loss": 14.9369, "step": 1940 }, { "epoch": 0.6084639498432602, "grad_norm": 3.5593857765197754, "learning_rate": 9.705e-07, "loss": 6.9494, "step": 1941 }, { "epoch": 0.6087774294670847, "grad_norm": 3.992682933807373, "learning_rate": 9.71e-07, "loss": 8.6618, "step": 1942 }, { "epoch": 0.6090909090909091, "grad_norm": 3.1125354766845703, "learning_rate": 9.715000000000001e-07, "loss": 5.9992, "step": 1943 }, { "epoch": 0.6094043887147336, "grad_norm": 3.825096845626831, "learning_rate": 9.72e-07, "loss": 7.6425, "step": 1944 }, { "epoch": 0.609717868338558, "grad_norm": 3.6271681785583496, "learning_rate": 9.725e-07, "loss": 7.7853, "step": 1945 }, { "epoch": 0.6100313479623825, "grad_norm": 6.750822067260742, "learning_rate": 9.73e-07, "loss": 16.4377, "step": 1946 }, { "epoch": 0.6103448275862069, "grad_norm": 3.6327602863311768, "learning_rate": 9.735e-07, "loss": 8.6631, "step": 1947 }, { "epoch": 0.6106583072100313, "grad_norm": 3.878876209259033, "learning_rate": 9.740000000000001e-07, "loss": 6.7651, "step": 1948 }, { "epoch": 0.6109717868338558, "grad_norm": 3.9936046600341797, "learning_rate": 9.745e-07, "loss": 7.4598, "step": 1949 }, { "epoch": 0.6112852664576802, "grad_norm": 3.6358838081359863, "learning_rate": 9.750000000000002e-07, "loss": 6.9454, "step": 1950 }, { "epoch": 0.6115987460815047, "grad_norm": 4.248730182647705, "learning_rate": 9.755e-07, "loss": 8.5024, "step": 1951 }, { "epoch": 0.6119122257053291, "grad_norm": 3.337886095046997, "learning_rate": 9.76e-07, "loss": 6.4199, "step": 1952 }, { "epoch": 0.6122257053291537, "grad_norm": 3.8960623741149902, "learning_rate": 9.765e-07, "loss": 7.2547, "step": 1953 }, { "epoch": 0.6125391849529781, "grad_norm": 3.7697343826293945, "learning_rate": 9.770000000000001e-07, "loss": 8.3044, "step": 1954 }, { "epoch": 0.6128526645768025, "grad_norm": 4.274561882019043, "learning_rate": 9.775000000000002e-07, "loss": 9.8826, "step": 1955 }, { "epoch": 0.613166144200627, "grad_norm": 4.1365461349487305, "learning_rate": 9.78e-07, "loss": 6.5002, "step": 1956 }, { "epoch": 0.6134796238244514, "grad_norm": 4.854091167449951, "learning_rate": 9.785000000000002e-07, "loss": 9.7826, "step": 1957 }, { "epoch": 0.6137931034482759, "grad_norm": 4.134148597717285, "learning_rate": 9.79e-07, "loss": 7.5184, "step": 1958 }, { "epoch": 0.6141065830721003, "grad_norm": 3.301070213317871, "learning_rate": 9.795000000000001e-07, "loss": 7.5854, "step": 1959 }, { "epoch": 0.6144200626959248, "grad_norm": 5.0885820388793945, "learning_rate": 9.800000000000001e-07, "loss": 12.4143, "step": 1960 }, { "epoch": 0.6147335423197492, "grad_norm": 3.778027296066284, "learning_rate": 9.805e-07, "loss": 6.6442, "step": 1961 }, { "epoch": 0.6150470219435736, "grad_norm": 4.023343086242676, "learning_rate": 9.810000000000002e-07, "loss": 9.6106, "step": 1962 }, { "epoch": 0.6153605015673981, "grad_norm": 4.735710144042969, "learning_rate": 9.815e-07, "loss": 8.8034, "step": 1963 }, { "epoch": 0.6156739811912225, "grad_norm": 4.353292942047119, "learning_rate": 9.82e-07, "loss": 9.1107, "step": 1964 }, { "epoch": 0.6159874608150471, "grad_norm": 4.989565849304199, "learning_rate": 9.825000000000001e-07, "loss": 9.7556, "step": 1965 }, { "epoch": 0.6163009404388715, "grad_norm": 3.5673139095306396, "learning_rate": 9.830000000000002e-07, "loss": 6.4914, "step": 1966 }, { "epoch": 0.616614420062696, "grad_norm": 4.158758640289307, "learning_rate": 9.835000000000002e-07, "loss": 7.9705, "step": 1967 }, { "epoch": 0.6169278996865204, "grad_norm": 7.679725170135498, "learning_rate": 9.84e-07, "loss": 9.2285, "step": 1968 }, { "epoch": 0.6172413793103448, "grad_norm": 5.259065628051758, "learning_rate": 9.845e-07, "loss": 12.6733, "step": 1969 }, { "epoch": 0.6175548589341693, "grad_norm": 4.931044578552246, "learning_rate": 9.85e-07, "loss": 9.4881, "step": 1970 }, { "epoch": 0.6178683385579937, "grad_norm": 4.3828125, "learning_rate": 9.855000000000001e-07, "loss": 8.3451, "step": 1971 }, { "epoch": 0.6181818181818182, "grad_norm": 5.446940898895264, "learning_rate": 9.86e-07, "loss": 8.3187, "step": 1972 }, { "epoch": 0.6184952978056426, "grad_norm": 5.320723533630371, "learning_rate": 9.865e-07, "loss": 9.4323, "step": 1973 }, { "epoch": 0.618808777429467, "grad_norm": 4.248854637145996, "learning_rate": 9.87e-07, "loss": 7.6606, "step": 1974 }, { "epoch": 0.6191222570532915, "grad_norm": 4.689657211303711, "learning_rate": 9.875e-07, "loss": 6.7403, "step": 1975 }, { "epoch": 0.6194357366771159, "grad_norm": 4.264988422393799, "learning_rate": 9.880000000000001e-07, "loss": 9.1217, "step": 1976 }, { "epoch": 0.6197492163009405, "grad_norm": 5.480489253997803, "learning_rate": 9.885e-07, "loss": 8.779, "step": 1977 }, { "epoch": 0.6200626959247649, "grad_norm": 4.787638187408447, "learning_rate": 9.890000000000002e-07, "loss": 8.9536, "step": 1978 }, { "epoch": 0.6203761755485894, "grad_norm": 4.072272300720215, "learning_rate": 9.895e-07, "loss": 8.668, "step": 1979 }, { "epoch": 0.6206896551724138, "grad_norm": 3.5888800621032715, "learning_rate": 9.9e-07, "loss": 7.6795, "step": 1980 }, { "epoch": 0.6210031347962383, "grad_norm": 5.570566177368164, "learning_rate": 9.905e-07, "loss": 12.1096, "step": 1981 }, { "epoch": 0.6213166144200627, "grad_norm": 4.423348426818848, "learning_rate": 9.91e-07, "loss": 9.1405, "step": 1982 }, { "epoch": 0.6216300940438871, "grad_norm": 4.6284356117248535, "learning_rate": 9.915000000000002e-07, "loss": 7.2074, "step": 1983 }, { "epoch": 0.6219435736677116, "grad_norm": 4.048377990722656, "learning_rate": 9.92e-07, "loss": 7.6424, "step": 1984 }, { "epoch": 0.622257053291536, "grad_norm": 3.3591959476470947, "learning_rate": 9.925e-07, "loss": 7.673, "step": 1985 }, { "epoch": 0.6225705329153605, "grad_norm": 3.9147262573242188, "learning_rate": 9.93e-07, "loss": 7.3318, "step": 1986 }, { "epoch": 0.6228840125391849, "grad_norm": 3.4501211643218994, "learning_rate": 9.935e-07, "loss": 7.3048, "step": 1987 }, { "epoch": 0.6231974921630095, "grad_norm": 3.9171688556671143, "learning_rate": 9.940000000000001e-07, "loss": 7.2895, "step": 1988 }, { "epoch": 0.6235109717868339, "grad_norm": 4.911311626434326, "learning_rate": 9.945e-07, "loss": 7.0245, "step": 1989 }, { "epoch": 0.6238244514106583, "grad_norm": 3.89208722114563, "learning_rate": 9.950000000000002e-07, "loss": 9.1831, "step": 1990 }, { "epoch": 0.6241379310344828, "grad_norm": 5.180747032165527, "learning_rate": 9.955e-07, "loss": 9.1415, "step": 1991 }, { "epoch": 0.6244514106583072, "grad_norm": 4.259772777557373, "learning_rate": 9.96e-07, "loss": 9.0887, "step": 1992 }, { "epoch": 0.6247648902821317, "grad_norm": 3.5109479427337646, "learning_rate": 9.965000000000001e-07, "loss": 7.4708, "step": 1993 }, { "epoch": 0.6250783699059561, "grad_norm": 3.7264599800109863, "learning_rate": 9.970000000000002e-07, "loss": 8.0124, "step": 1994 }, { "epoch": 0.6253918495297806, "grad_norm": 6.5949249267578125, "learning_rate": 9.975000000000002e-07, "loss": 12.249, "step": 1995 }, { "epoch": 0.625705329153605, "grad_norm": 5.1157450675964355, "learning_rate": 9.98e-07, "loss": 10.3741, "step": 1996 }, { "epoch": 0.6260188087774294, "grad_norm": 4.158992767333984, "learning_rate": 9.985e-07, "loss": 7.6302, "step": 1997 }, { "epoch": 0.6263322884012539, "grad_norm": 5.8107428550720215, "learning_rate": 9.99e-07, "loss": 14.3595, "step": 1998 }, { "epoch": 0.6266457680250783, "grad_norm": 5.154208660125732, "learning_rate": 9.995000000000001e-07, "loss": 8.8518, "step": 1999 }, { "epoch": 0.6269592476489029, "grad_norm": 4.65903902053833, "learning_rate": 1.0000000000000002e-06, "loss": 7.9164, "step": 2000 }, { "epoch": 0.6272727272727273, "grad_norm": 4.164371013641357, "learning_rate": 1.0005e-06, "loss": 8.2315, "step": 2001 }, { "epoch": 0.6275862068965518, "grad_norm": 4.161435604095459, "learning_rate": 1.001e-06, "loss": 9.4845, "step": 2002 }, { "epoch": 0.6278996865203762, "grad_norm": 6.261343002319336, "learning_rate": 1.0015e-06, "loss": 14.0749, "step": 2003 }, { "epoch": 0.6282131661442006, "grad_norm": 6.03980827331543, "learning_rate": 1.002e-06, "loss": 12.252, "step": 2004 }, { "epoch": 0.6285266457680251, "grad_norm": 4.8788862228393555, "learning_rate": 1.0025000000000001e-06, "loss": 8.6274, "step": 2005 }, { "epoch": 0.6288401253918495, "grad_norm": 4.340915679931641, "learning_rate": 1.0030000000000002e-06, "loss": 8.0098, "step": 2006 }, { "epoch": 0.629153605015674, "grad_norm": 4.664811611175537, "learning_rate": 1.0035e-06, "loss": 8.8534, "step": 2007 }, { "epoch": 0.6294670846394984, "grad_norm": 3.558058977127075, "learning_rate": 1.004e-06, "loss": 7.5891, "step": 2008 }, { "epoch": 0.6297805642633229, "grad_norm": 5.759692668914795, "learning_rate": 1.0045e-06, "loss": 10.9539, "step": 2009 }, { "epoch": 0.6300940438871473, "grad_norm": 3.947463035583496, "learning_rate": 1.0050000000000001e-06, "loss": 6.9227, "step": 2010 }, { "epoch": 0.6304075235109717, "grad_norm": 4.328242778778076, "learning_rate": 1.0055000000000002e-06, "loss": 8.2226, "step": 2011 }, { "epoch": 0.6307210031347963, "grad_norm": 5.028584957122803, "learning_rate": 1.006e-06, "loss": 11.2504, "step": 2012 }, { "epoch": 0.6310344827586207, "grad_norm": 5.828540325164795, "learning_rate": 1.0065e-06, "loss": 12.0494, "step": 2013 }, { "epoch": 0.6313479623824452, "grad_norm": 5.526819705963135, "learning_rate": 1.007e-06, "loss": 13.3581, "step": 2014 }, { "epoch": 0.6316614420062696, "grad_norm": 4.654907703399658, "learning_rate": 1.0075e-06, "loss": 9.1679, "step": 2015 }, { "epoch": 0.631974921630094, "grad_norm": 4.357663154602051, "learning_rate": 1.0080000000000001e-06, "loss": 8.1994, "step": 2016 }, { "epoch": 0.6322884012539185, "grad_norm": 5.075762748718262, "learning_rate": 1.0085e-06, "loss": 10.5631, "step": 2017 }, { "epoch": 0.6326018808777429, "grad_norm": 3.278012275695801, "learning_rate": 1.0090000000000002e-06, "loss": 7.5705, "step": 2018 }, { "epoch": 0.6329153605015674, "grad_norm": 3.894144058227539, "learning_rate": 1.0095e-06, "loss": 7.3649, "step": 2019 }, { "epoch": 0.6332288401253918, "grad_norm": 3.737834930419922, "learning_rate": 1.01e-06, "loss": 8.4081, "step": 2020 }, { "epoch": 0.6335423197492163, "grad_norm": 3.789785623550415, "learning_rate": 1.0105000000000001e-06, "loss": 6.3057, "step": 2021 }, { "epoch": 0.6338557993730407, "grad_norm": 3.5118370056152344, "learning_rate": 1.011e-06, "loss": 5.7457, "step": 2022 }, { "epoch": 0.6341692789968651, "grad_norm": 3.056246519088745, "learning_rate": 1.0115000000000002e-06, "loss": 5.266, "step": 2023 }, { "epoch": 0.6344827586206897, "grad_norm": 4.797995567321777, "learning_rate": 1.012e-06, "loss": 7.2787, "step": 2024 }, { "epoch": 0.6347962382445141, "grad_norm": 3.8254222869873047, "learning_rate": 1.0125e-06, "loss": 7.5975, "step": 2025 }, { "epoch": 0.6351097178683386, "grad_norm": 3.7210936546325684, "learning_rate": 1.013e-06, "loss": 7.1766, "step": 2026 }, { "epoch": 0.635423197492163, "grad_norm": 4.653789520263672, "learning_rate": 1.0135000000000001e-06, "loss": 8.7087, "step": 2027 }, { "epoch": 0.6357366771159875, "grad_norm": 5.211480617523193, "learning_rate": 1.0140000000000002e-06, "loss": 11.3491, "step": 2028 }, { "epoch": 0.6360501567398119, "grad_norm": 4.071037292480469, "learning_rate": 1.0145e-06, "loss": 9.2332, "step": 2029 }, { "epoch": 0.6363636363636364, "grad_norm": 3.9774158000946045, "learning_rate": 1.0150000000000002e-06, "loss": 7.939, "step": 2030 }, { "epoch": 0.6366771159874608, "grad_norm": 4.108770370483398, "learning_rate": 1.0155e-06, "loss": 8.5832, "step": 2031 }, { "epoch": 0.6369905956112852, "grad_norm": 3.7771849632263184, "learning_rate": 1.016e-06, "loss": 7.9219, "step": 2032 }, { "epoch": 0.6373040752351097, "grad_norm": 4.883484363555908, "learning_rate": 1.0165000000000001e-06, "loss": 7.4839, "step": 2033 }, { "epoch": 0.6376175548589341, "grad_norm": 5.540393829345703, "learning_rate": 1.0170000000000002e-06, "loss": 11.3517, "step": 2034 }, { "epoch": 0.6379310344827587, "grad_norm": 3.476276397705078, "learning_rate": 1.0175e-06, "loss": 5.4489, "step": 2035 }, { "epoch": 0.6382445141065831, "grad_norm": 4.134171485900879, "learning_rate": 1.018e-06, "loss": 9.0043, "step": 2036 }, { "epoch": 0.6385579937304076, "grad_norm": 3.8782660961151123, "learning_rate": 1.0185e-06, "loss": 7.7151, "step": 2037 }, { "epoch": 0.638871473354232, "grad_norm": 5.3437347412109375, "learning_rate": 1.0190000000000001e-06, "loss": 8.5708, "step": 2038 }, { "epoch": 0.6391849529780564, "grad_norm": 4.901749610900879, "learning_rate": 1.0195000000000001e-06, "loss": 9.5016, "step": 2039 }, { "epoch": 0.6394984326018809, "grad_norm": 9.612685203552246, "learning_rate": 1.02e-06, "loss": 25.8075, "step": 2040 }, { "epoch": 0.6398119122257053, "grad_norm": 4.421562671661377, "learning_rate": 1.0205e-06, "loss": 8.9058, "step": 2041 }, { "epoch": 0.6401253918495298, "grad_norm": 4.578634738922119, "learning_rate": 1.021e-06, "loss": 8.4515, "step": 2042 }, { "epoch": 0.6404388714733542, "grad_norm": 5.676279067993164, "learning_rate": 1.0215e-06, "loss": 12.9783, "step": 2043 }, { "epoch": 0.6407523510971787, "grad_norm": 4.57780647277832, "learning_rate": 1.0220000000000001e-06, "loss": 7.5248, "step": 2044 }, { "epoch": 0.6410658307210031, "grad_norm": 4.466324806213379, "learning_rate": 1.0225e-06, "loss": 9.4472, "step": 2045 }, { "epoch": 0.6413793103448275, "grad_norm": 4.786015510559082, "learning_rate": 1.0230000000000002e-06, "loss": 7.8918, "step": 2046 }, { "epoch": 0.6416927899686521, "grad_norm": 4.978856086730957, "learning_rate": 1.0235e-06, "loss": 8.9204, "step": 2047 }, { "epoch": 0.6420062695924765, "grad_norm": 3.4786014556884766, "learning_rate": 1.024e-06, "loss": 7.3538, "step": 2048 }, { "epoch": 0.642319749216301, "grad_norm": 6.084647178649902, "learning_rate": 1.0245e-06, "loss": 8.8253, "step": 2049 }, { "epoch": 0.6426332288401254, "grad_norm": 5.3138556480407715, "learning_rate": 1.025e-06, "loss": 11.3613, "step": 2050 }, { "epoch": 0.6429467084639499, "grad_norm": 3.573272943496704, "learning_rate": 1.0255000000000002e-06, "loss": 7.1191, "step": 2051 }, { "epoch": 0.6432601880877743, "grad_norm": 4.325623035430908, "learning_rate": 1.026e-06, "loss": 6.4057, "step": 2052 }, { "epoch": 0.6435736677115987, "grad_norm": 6.643167972564697, "learning_rate": 1.0265e-06, "loss": 16.8502, "step": 2053 }, { "epoch": 0.6438871473354232, "grad_norm": 4.033902645111084, "learning_rate": 1.027e-06, "loss": 7.7638, "step": 2054 }, { "epoch": 0.6442006269592476, "grad_norm": 3.4600517749786377, "learning_rate": 1.0275000000000001e-06, "loss": 6.4728, "step": 2055 }, { "epoch": 0.6445141065830721, "grad_norm": 5.4869818687438965, "learning_rate": 1.0280000000000002e-06, "loss": 11.0305, "step": 2056 }, { "epoch": 0.6448275862068965, "grad_norm": 2.9913864135742188, "learning_rate": 1.0285e-06, "loss": 6.2851, "step": 2057 }, { "epoch": 0.645141065830721, "grad_norm": 4.9674296379089355, "learning_rate": 1.0290000000000002e-06, "loss": 10.5091, "step": 2058 }, { "epoch": 0.6454545454545455, "grad_norm": 4.117265224456787, "learning_rate": 1.0295e-06, "loss": 7.9589, "step": 2059 }, { "epoch": 0.64576802507837, "grad_norm": 4.783974647521973, "learning_rate": 1.03e-06, "loss": 10.0756, "step": 2060 }, { "epoch": 0.6460815047021944, "grad_norm": 3.9700472354888916, "learning_rate": 1.0305000000000001e-06, "loss": 8.5293, "step": 2061 }, { "epoch": 0.6463949843260188, "grad_norm": 3.448155641555786, "learning_rate": 1.031e-06, "loss": 6.8082, "step": 2062 }, { "epoch": 0.6467084639498433, "grad_norm": 5.751345157623291, "learning_rate": 1.0315000000000002e-06, "loss": 10.7494, "step": 2063 }, { "epoch": 0.6470219435736677, "grad_norm": 4.36277961730957, "learning_rate": 1.032e-06, "loss": 7.5388, "step": 2064 }, { "epoch": 0.6473354231974922, "grad_norm": 4.9406633377075195, "learning_rate": 1.0325e-06, "loss": 8.7483, "step": 2065 }, { "epoch": 0.6476489028213166, "grad_norm": 3.610996961593628, "learning_rate": 1.033e-06, "loss": 5.993, "step": 2066 }, { "epoch": 0.647962382445141, "grad_norm": 8.09903621673584, "learning_rate": 1.0335000000000001e-06, "loss": 18.3466, "step": 2067 }, { "epoch": 0.6482758620689655, "grad_norm": 4.765588760375977, "learning_rate": 1.0340000000000002e-06, "loss": 9.1752, "step": 2068 }, { "epoch": 0.6485893416927899, "grad_norm": 5.2749199867248535, "learning_rate": 1.0345e-06, "loss": 11.2824, "step": 2069 }, { "epoch": 0.6489028213166145, "grad_norm": 5.744815826416016, "learning_rate": 1.035e-06, "loss": 12.4292, "step": 2070 }, { "epoch": 0.6492163009404389, "grad_norm": 4.47613525390625, "learning_rate": 1.0355e-06, "loss": 7.8614, "step": 2071 }, { "epoch": 0.6495297805642634, "grad_norm": 6.598586082458496, "learning_rate": 1.0360000000000001e-06, "loss": 11.9815, "step": 2072 }, { "epoch": 0.6498432601880878, "grad_norm": 5.2339606285095215, "learning_rate": 1.0365000000000002e-06, "loss": 11.3129, "step": 2073 }, { "epoch": 0.6501567398119122, "grad_norm": 4.304049015045166, "learning_rate": 1.0370000000000002e-06, "loss": 7.8178, "step": 2074 }, { "epoch": 0.6504702194357367, "grad_norm": 5.00603723526001, "learning_rate": 1.0375e-06, "loss": 10.3162, "step": 2075 }, { "epoch": 0.6507836990595611, "grad_norm": 3.639469623565674, "learning_rate": 1.038e-06, "loss": 6.6105, "step": 2076 }, { "epoch": 0.6510971786833856, "grad_norm": 4.230920314788818, "learning_rate": 1.0385e-06, "loss": 7.4984, "step": 2077 }, { "epoch": 0.65141065830721, "grad_norm": 4.557835578918457, "learning_rate": 1.0390000000000001e-06, "loss": 8.9535, "step": 2078 }, { "epoch": 0.6517241379310345, "grad_norm": 4.792398452758789, "learning_rate": 1.0395000000000002e-06, "loss": 8.1143, "step": 2079 }, { "epoch": 0.6520376175548589, "grad_norm": 4.397486209869385, "learning_rate": 1.04e-06, "loss": 9.0494, "step": 2080 }, { "epoch": 0.6523510971786833, "grad_norm": 6.484209060668945, "learning_rate": 1.0405e-06, "loss": 11.6424, "step": 2081 }, { "epoch": 0.6526645768025079, "grad_norm": 5.75565767288208, "learning_rate": 1.041e-06, "loss": 10.6868, "step": 2082 }, { "epoch": 0.6529780564263323, "grad_norm": 4.253330707550049, "learning_rate": 1.0415000000000001e-06, "loss": 8.2279, "step": 2083 }, { "epoch": 0.6532915360501568, "grad_norm": 5.125704288482666, "learning_rate": 1.0420000000000001e-06, "loss": 8.9277, "step": 2084 }, { "epoch": 0.6536050156739812, "grad_norm": 4.079887390136719, "learning_rate": 1.0425e-06, "loss": 7.9315, "step": 2085 }, { "epoch": 0.6539184952978057, "grad_norm": 4.4778361320495605, "learning_rate": 1.0430000000000002e-06, "loss": 8.606, "step": 2086 }, { "epoch": 0.6542319749216301, "grad_norm": 4.556799411773682, "learning_rate": 1.0435e-06, "loss": 6.0047, "step": 2087 }, { "epoch": 0.6545454545454545, "grad_norm": 6.23499870300293, "learning_rate": 1.044e-06, "loss": 11.1014, "step": 2088 }, { "epoch": 0.654858934169279, "grad_norm": 5.879521369934082, "learning_rate": 1.0445000000000001e-06, "loss": 11.7058, "step": 2089 }, { "epoch": 0.6551724137931034, "grad_norm": 6.175170421600342, "learning_rate": 1.045e-06, "loss": 8.7771, "step": 2090 }, { "epoch": 0.6554858934169279, "grad_norm": 4.931567192077637, "learning_rate": 1.0455000000000002e-06, "loss": 10.7524, "step": 2091 }, { "epoch": 0.6557993730407523, "grad_norm": 5.440428733825684, "learning_rate": 1.046e-06, "loss": 7.6057, "step": 2092 }, { "epoch": 0.6561128526645768, "grad_norm": 4.918017387390137, "learning_rate": 1.0465e-06, "loss": 10.069, "step": 2093 }, { "epoch": 0.6564263322884013, "grad_norm": 4.340334892272949, "learning_rate": 1.047e-06, "loss": 8.5315, "step": 2094 }, { "epoch": 0.6567398119122257, "grad_norm": 4.921428680419922, "learning_rate": 1.0475000000000001e-06, "loss": 9.7294, "step": 2095 }, { "epoch": 0.6570532915360502, "grad_norm": 6.337207794189453, "learning_rate": 1.0480000000000002e-06, "loss": 9.9782, "step": 2096 }, { "epoch": 0.6573667711598746, "grad_norm": 4.316909313201904, "learning_rate": 1.0485e-06, "loss": 6.4938, "step": 2097 }, { "epoch": 0.6576802507836991, "grad_norm": 4.156019687652588, "learning_rate": 1.049e-06, "loss": 7.8225, "step": 2098 }, { "epoch": 0.6579937304075235, "grad_norm": 3.3352489471435547, "learning_rate": 1.0495e-06, "loss": 6.4576, "step": 2099 }, { "epoch": 0.658307210031348, "grad_norm": 4.093966007232666, "learning_rate": 1.0500000000000001e-06, "loss": 7.3799, "step": 2100 }, { "epoch": 0.6586206896551724, "grad_norm": 3.7547051906585693, "learning_rate": 1.0505000000000001e-06, "loss": 8.6161, "step": 2101 }, { "epoch": 0.6589341692789968, "grad_norm": 4.8714447021484375, "learning_rate": 1.051e-06, "loss": 8.6568, "step": 2102 }, { "epoch": 0.6592476489028213, "grad_norm": 4.202094078063965, "learning_rate": 1.0515e-06, "loss": 7.711, "step": 2103 }, { "epoch": 0.6595611285266457, "grad_norm": 5.062414169311523, "learning_rate": 1.052e-06, "loss": 8.4271, "step": 2104 }, { "epoch": 0.6598746081504702, "grad_norm": 5.461309909820557, "learning_rate": 1.0525e-06, "loss": 8.2217, "step": 2105 }, { "epoch": 0.6601880877742947, "grad_norm": 4.929152965545654, "learning_rate": 1.0530000000000001e-06, "loss": 7.7316, "step": 2106 }, { "epoch": 0.6605015673981192, "grad_norm": 5.310903072357178, "learning_rate": 1.0535000000000002e-06, "loss": 9.4138, "step": 2107 }, { "epoch": 0.6608150470219436, "grad_norm": 5.781913757324219, "learning_rate": 1.054e-06, "loss": 8.4, "step": 2108 }, { "epoch": 0.661128526645768, "grad_norm": 4.272099494934082, "learning_rate": 1.0545e-06, "loss": 7.6243, "step": 2109 }, { "epoch": 0.6614420062695925, "grad_norm": 3.9875686168670654, "learning_rate": 1.055e-06, "loss": 7.8158, "step": 2110 }, { "epoch": 0.6617554858934169, "grad_norm": 7.379361629486084, "learning_rate": 1.0555e-06, "loss": 13.4704, "step": 2111 }, { "epoch": 0.6620689655172414, "grad_norm": 8.117541313171387, "learning_rate": 1.0560000000000001e-06, "loss": 16.404, "step": 2112 }, { "epoch": 0.6623824451410658, "grad_norm": 3.9060027599334717, "learning_rate": 1.0565e-06, "loss": 5.9239, "step": 2113 }, { "epoch": 0.6626959247648903, "grad_norm": 4.950360298156738, "learning_rate": 1.0570000000000002e-06, "loss": 7.5477, "step": 2114 }, { "epoch": 0.6630094043887147, "grad_norm": 6.759565353393555, "learning_rate": 1.0575e-06, "loss": 13.3141, "step": 2115 }, { "epoch": 0.6633228840125391, "grad_norm": 4.705567836761475, "learning_rate": 1.058e-06, "loss": 10.2262, "step": 2116 }, { "epoch": 0.6636363636363637, "grad_norm": 6.787057876586914, "learning_rate": 1.0585000000000001e-06, "loss": 10.6113, "step": 2117 }, { "epoch": 0.6639498432601881, "grad_norm": 4.983616828918457, "learning_rate": 1.059e-06, "loss": 10.8316, "step": 2118 }, { "epoch": 0.6642633228840126, "grad_norm": 3.8625426292419434, "learning_rate": 1.0595000000000002e-06, "loss": 7.7674, "step": 2119 }, { "epoch": 0.664576802507837, "grad_norm": 4.234352111816406, "learning_rate": 1.06e-06, "loss": 7.263, "step": 2120 }, { "epoch": 0.6648902821316615, "grad_norm": 4.677432537078857, "learning_rate": 1.0605e-06, "loss": 7.1481, "step": 2121 }, { "epoch": 0.6652037617554859, "grad_norm": 4.3010029792785645, "learning_rate": 1.061e-06, "loss": 6.9077, "step": 2122 }, { "epoch": 0.6655172413793103, "grad_norm": 3.675344228744507, "learning_rate": 1.0615000000000001e-06, "loss": 6.9315, "step": 2123 }, { "epoch": 0.6658307210031348, "grad_norm": 5.514710903167725, "learning_rate": 1.0620000000000002e-06, "loss": 11.0089, "step": 2124 }, { "epoch": 0.6661442006269592, "grad_norm": 4.249388694763184, "learning_rate": 1.0625e-06, "loss": 8.537, "step": 2125 }, { "epoch": 0.6664576802507837, "grad_norm": 4.913691997528076, "learning_rate": 1.0630000000000002e-06, "loss": 8.3072, "step": 2126 }, { "epoch": 0.6667711598746081, "grad_norm": 7.471773624420166, "learning_rate": 1.0635e-06, "loss": 9.3104, "step": 2127 }, { "epoch": 0.6670846394984326, "grad_norm": 5.314687728881836, "learning_rate": 1.064e-06, "loss": 9.9356, "step": 2128 }, { "epoch": 0.6673981191222571, "grad_norm": 5.965895175933838, "learning_rate": 1.0645000000000001e-06, "loss": 12.0117, "step": 2129 }, { "epoch": 0.6677115987460815, "grad_norm": 4.372379779815674, "learning_rate": 1.065e-06, "loss": 7.607, "step": 2130 }, { "epoch": 0.668025078369906, "grad_norm": 4.424569606781006, "learning_rate": 1.0655000000000002e-06, "loss": 10.169, "step": 2131 }, { "epoch": 0.6683385579937304, "grad_norm": 3.1224844455718994, "learning_rate": 1.066e-06, "loss": 6.0792, "step": 2132 }, { "epoch": 0.6686520376175549, "grad_norm": 5.132580280303955, "learning_rate": 1.0665e-06, "loss": 9.2276, "step": 2133 }, { "epoch": 0.6689655172413793, "grad_norm": 4.87095832824707, "learning_rate": 1.0670000000000001e-06, "loss": 7.84, "step": 2134 }, { "epoch": 0.6692789968652038, "grad_norm": 5.659262657165527, "learning_rate": 1.0675000000000002e-06, "loss": 8.4573, "step": 2135 }, { "epoch": 0.6695924764890282, "grad_norm": 4.755374908447266, "learning_rate": 1.0680000000000002e-06, "loss": 10.6835, "step": 2136 }, { "epoch": 0.6699059561128526, "grad_norm": 5.089763641357422, "learning_rate": 1.0685e-06, "loss": 7.9605, "step": 2137 }, { "epoch": 0.6702194357366771, "grad_norm": 8.315079689025879, "learning_rate": 1.069e-06, "loss": 11.3606, "step": 2138 }, { "epoch": 0.6705329153605015, "grad_norm": 5.036296367645264, "learning_rate": 1.0695e-06, "loss": 9.4513, "step": 2139 }, { "epoch": 0.670846394984326, "grad_norm": 4.769467353820801, "learning_rate": 1.0700000000000001e-06, "loss": 7.5086, "step": 2140 }, { "epoch": 0.6711598746081505, "grad_norm": 5.184993743896484, "learning_rate": 1.0705000000000002e-06, "loss": 10.644, "step": 2141 }, { "epoch": 0.671473354231975, "grad_norm": 3.7264373302459717, "learning_rate": 1.071e-06, "loss": 6.8944, "step": 2142 }, { "epoch": 0.6717868338557994, "grad_norm": 4.51856803894043, "learning_rate": 1.0715e-06, "loss": 7.1033, "step": 2143 }, { "epoch": 0.6721003134796238, "grad_norm": 4.620069980621338, "learning_rate": 1.072e-06, "loss": 9.0823, "step": 2144 }, { "epoch": 0.6724137931034483, "grad_norm": 4.350764274597168, "learning_rate": 1.0725000000000001e-06, "loss": 7.9142, "step": 2145 }, { "epoch": 0.6727272727272727, "grad_norm": 5.366765022277832, "learning_rate": 1.0730000000000001e-06, "loss": 7.2714, "step": 2146 }, { "epoch": 0.6730407523510972, "grad_norm": 4.46659517288208, "learning_rate": 1.0735000000000002e-06, "loss": 8.6368, "step": 2147 }, { "epoch": 0.6733542319749216, "grad_norm": 4.17587947845459, "learning_rate": 1.074e-06, "loss": 6.8412, "step": 2148 }, { "epoch": 0.673667711598746, "grad_norm": 4.389501571655273, "learning_rate": 1.0745e-06, "loss": 5.929, "step": 2149 }, { "epoch": 0.6739811912225705, "grad_norm": 3.766752004623413, "learning_rate": 1.075e-06, "loss": 7.428, "step": 2150 }, { "epoch": 0.6742946708463949, "grad_norm": 4.460997581481934, "learning_rate": 1.0755000000000001e-06, "loss": 8.5694, "step": 2151 }, { "epoch": 0.6746081504702194, "grad_norm": 3.7466864585876465, "learning_rate": 1.0760000000000002e-06, "loss": 6.5698, "step": 2152 }, { "epoch": 0.6749216300940439, "grad_norm": 5.410111904144287, "learning_rate": 1.0765e-06, "loss": 13.9963, "step": 2153 }, { "epoch": 0.6752351097178684, "grad_norm": 6.007702827453613, "learning_rate": 1.0770000000000002e-06, "loss": 11.1922, "step": 2154 }, { "epoch": 0.6755485893416928, "grad_norm": 3.324204206466675, "learning_rate": 1.0775e-06, "loss": 7.0282, "step": 2155 }, { "epoch": 0.6758620689655173, "grad_norm": 6.3619914054870605, "learning_rate": 1.078e-06, "loss": 8.3813, "step": 2156 }, { "epoch": 0.6761755485893417, "grad_norm": 5.453836441040039, "learning_rate": 1.0785000000000001e-06, "loss": 8.7658, "step": 2157 }, { "epoch": 0.6764890282131661, "grad_norm": 4.859471321105957, "learning_rate": 1.079e-06, "loss": 7.7485, "step": 2158 }, { "epoch": 0.6768025078369906, "grad_norm": 5.14132022857666, "learning_rate": 1.0795000000000002e-06, "loss": 10.0406, "step": 2159 }, { "epoch": 0.677115987460815, "grad_norm": 4.963798522949219, "learning_rate": 1.08e-06, "loss": 9.1328, "step": 2160 }, { "epoch": 0.6774294670846395, "grad_norm": 5.451442718505859, "learning_rate": 1.0805e-06, "loss": 7.1831, "step": 2161 }, { "epoch": 0.6777429467084639, "grad_norm": 4.889834403991699, "learning_rate": 1.0810000000000001e-06, "loss": 8.8367, "step": 2162 }, { "epoch": 0.6780564263322884, "grad_norm": 3.9947681427001953, "learning_rate": 1.0815000000000001e-06, "loss": 6.2821, "step": 2163 }, { "epoch": 0.6783699059561129, "grad_norm": 6.472747325897217, "learning_rate": 1.0820000000000002e-06, "loss": 12.1777, "step": 2164 }, { "epoch": 0.6786833855799373, "grad_norm": 3.9725639820098877, "learning_rate": 1.0825e-06, "loss": 7.2472, "step": 2165 }, { "epoch": 0.6789968652037618, "grad_norm": 4.466004371643066, "learning_rate": 1.083e-06, "loss": 7.1208, "step": 2166 }, { "epoch": 0.6793103448275862, "grad_norm": 8.549521446228027, "learning_rate": 1.0835e-06, "loss": 14.6298, "step": 2167 }, { "epoch": 0.6796238244514107, "grad_norm": 3.372896671295166, "learning_rate": 1.0840000000000001e-06, "loss": 6.0872, "step": 2168 }, { "epoch": 0.6799373040752351, "grad_norm": 4.142289161682129, "learning_rate": 1.0845000000000002e-06, "loss": 6.0638, "step": 2169 }, { "epoch": 0.6802507836990596, "grad_norm": 4.548405170440674, "learning_rate": 1.085e-06, "loss": 9.5781, "step": 2170 }, { "epoch": 0.680564263322884, "grad_norm": 4.637272357940674, "learning_rate": 1.0855e-06, "loss": 6.9768, "step": 2171 }, { "epoch": 0.6808777429467084, "grad_norm": 4.340859889984131, "learning_rate": 1.086e-06, "loss": 6.6182, "step": 2172 }, { "epoch": 0.6811912225705329, "grad_norm": 4.831057548522949, "learning_rate": 1.0865e-06, "loss": 8.176, "step": 2173 }, { "epoch": 0.6815047021943573, "grad_norm": 4.174441337585449, "learning_rate": 1.0870000000000001e-06, "loss": 9.2991, "step": 2174 }, { "epoch": 0.6818181818181818, "grad_norm": 4.800502300262451, "learning_rate": 1.0875000000000002e-06, "loss": 8.0076, "step": 2175 }, { "epoch": 0.6821316614420063, "grad_norm": 5.333560466766357, "learning_rate": 1.088e-06, "loss": 8.8977, "step": 2176 }, { "epoch": 0.6824451410658308, "grad_norm": 4.281090259552002, "learning_rate": 1.0885e-06, "loss": 6.8888, "step": 2177 }, { "epoch": 0.6827586206896552, "grad_norm": 7.420187950134277, "learning_rate": 1.089e-06, "loss": 12.2294, "step": 2178 }, { "epoch": 0.6830721003134796, "grad_norm": 5.143314838409424, "learning_rate": 1.0895000000000001e-06, "loss": 8.4805, "step": 2179 }, { "epoch": 0.6833855799373041, "grad_norm": 5.837392807006836, "learning_rate": 1.0900000000000002e-06, "loss": 9.5465, "step": 2180 }, { "epoch": 0.6836990595611285, "grad_norm": 4.490211009979248, "learning_rate": 1.0905e-06, "loss": 7.1872, "step": 2181 }, { "epoch": 0.684012539184953, "grad_norm": 5.512997150421143, "learning_rate": 1.091e-06, "loss": 10.6141, "step": 2182 }, { "epoch": 0.6843260188087774, "grad_norm": 5.654632568359375, "learning_rate": 1.0915e-06, "loss": 10.529, "step": 2183 }, { "epoch": 0.6846394984326019, "grad_norm": 3.9920573234558105, "learning_rate": 1.092e-06, "loss": 7.4162, "step": 2184 }, { "epoch": 0.6849529780564263, "grad_norm": 9.621570587158203, "learning_rate": 1.0925000000000001e-06, "loss": 17.1348, "step": 2185 }, { "epoch": 0.6852664576802507, "grad_norm": 4.3478007316589355, "learning_rate": 1.093e-06, "loss": 7.5085, "step": 2186 }, { "epoch": 0.6855799373040752, "grad_norm": 4.477773666381836, "learning_rate": 1.0935000000000002e-06, "loss": 8.6712, "step": 2187 }, { "epoch": 0.6858934169278997, "grad_norm": 4.364738941192627, "learning_rate": 1.094e-06, "loss": 8.0755, "step": 2188 }, { "epoch": 0.6862068965517242, "grad_norm": 5.307512283325195, "learning_rate": 1.0945e-06, "loss": 9.8482, "step": 2189 }, { "epoch": 0.6865203761755486, "grad_norm": 5.022215366363525, "learning_rate": 1.095e-06, "loss": 9.1433, "step": 2190 }, { "epoch": 0.6868338557993731, "grad_norm": 5.021544933319092, "learning_rate": 1.0955e-06, "loss": 8.4613, "step": 2191 }, { "epoch": 0.6871473354231975, "grad_norm": 3.7242307662963867, "learning_rate": 1.0960000000000002e-06, "loss": 6.4761, "step": 2192 }, { "epoch": 0.687460815047022, "grad_norm": 4.364880561828613, "learning_rate": 1.0965e-06, "loss": 8.0514, "step": 2193 }, { "epoch": 0.6877742946708464, "grad_norm": 6.270079612731934, "learning_rate": 1.097e-06, "loss": 7.7713, "step": 2194 }, { "epoch": 0.6880877742946708, "grad_norm": 4.493875026702881, "learning_rate": 1.0975e-06, "loss": 7.1808, "step": 2195 }, { "epoch": 0.6884012539184953, "grad_norm": 5.605881214141846, "learning_rate": 1.0980000000000001e-06, "loss": 8.8931, "step": 2196 }, { "epoch": 0.6887147335423197, "grad_norm": 5.781085014343262, "learning_rate": 1.0985000000000002e-06, "loss": 11.2424, "step": 2197 }, { "epoch": 0.6890282131661442, "grad_norm": 6.997844219207764, "learning_rate": 1.099e-06, "loss": 14.7351, "step": 2198 }, { "epoch": 0.6893416927899687, "grad_norm": 5.427860260009766, "learning_rate": 1.0995000000000002e-06, "loss": 8.0223, "step": 2199 }, { "epoch": 0.6896551724137931, "grad_norm": 5.856205940246582, "learning_rate": 1.1e-06, "loss": 10.776, "step": 2200 }, { "epoch": 0.6899686520376176, "grad_norm": 5.698307037353516, "learning_rate": 1.1005e-06, "loss": 10.4188, "step": 2201 }, { "epoch": 0.690282131661442, "grad_norm": 3.5812063217163086, "learning_rate": 1.1010000000000001e-06, "loss": 5.7494, "step": 2202 }, { "epoch": 0.6905956112852665, "grad_norm": 6.817094802856445, "learning_rate": 1.1015000000000002e-06, "loss": 15.1615, "step": 2203 }, { "epoch": 0.6909090909090909, "grad_norm": 5.15540075302124, "learning_rate": 1.1020000000000002e-06, "loss": 7.7424, "step": 2204 }, { "epoch": 0.6912225705329154, "grad_norm": 5.062628746032715, "learning_rate": 1.1025e-06, "loss": 6.6624, "step": 2205 }, { "epoch": 0.6915360501567398, "grad_norm": 4.202182769775391, "learning_rate": 1.103e-06, "loss": 7.4259, "step": 2206 }, { "epoch": 0.6918495297805642, "grad_norm": 4.714681148529053, "learning_rate": 1.1035000000000001e-06, "loss": 9.4869, "step": 2207 }, { "epoch": 0.6921630094043887, "grad_norm": 5.397708415985107, "learning_rate": 1.1040000000000001e-06, "loss": 8.4847, "step": 2208 }, { "epoch": 0.6924764890282131, "grad_norm": 6.186098098754883, "learning_rate": 1.1045000000000002e-06, "loss": 9.6202, "step": 2209 }, { "epoch": 0.6927899686520376, "grad_norm": 5.049215793609619, "learning_rate": 1.105e-06, "loss": 8.4358, "step": 2210 }, { "epoch": 0.6931034482758621, "grad_norm": 4.837051868438721, "learning_rate": 1.1055e-06, "loss": 7.3944, "step": 2211 }, { "epoch": 0.6934169278996866, "grad_norm": 5.781193256378174, "learning_rate": 1.106e-06, "loss": 10.1386, "step": 2212 }, { "epoch": 0.693730407523511, "grad_norm": 4.360887050628662, "learning_rate": 1.1065000000000001e-06, "loss": 7.0723, "step": 2213 }, { "epoch": 0.6940438871473354, "grad_norm": 4.668460845947266, "learning_rate": 1.1070000000000002e-06, "loss": 8.563, "step": 2214 }, { "epoch": 0.6943573667711599, "grad_norm": 5.356121063232422, "learning_rate": 1.1075000000000002e-06, "loss": 9.5143, "step": 2215 }, { "epoch": 0.6946708463949843, "grad_norm": 4.979715824127197, "learning_rate": 1.108e-06, "loss": 8.1418, "step": 2216 }, { "epoch": 0.6949843260188088, "grad_norm": 4.595643520355225, "learning_rate": 1.1085e-06, "loss": 9.48, "step": 2217 }, { "epoch": 0.6952978056426332, "grad_norm": 5.082688331604004, "learning_rate": 1.109e-06, "loss": 7.4409, "step": 2218 }, { "epoch": 0.6956112852664577, "grad_norm": 5.961426258087158, "learning_rate": 1.1095e-06, "loss": 9.6929, "step": 2219 }, { "epoch": 0.6959247648902821, "grad_norm": 4.507264137268066, "learning_rate": 1.1100000000000002e-06, "loss": 9.0472, "step": 2220 }, { "epoch": 0.6962382445141065, "grad_norm": 6.450530052185059, "learning_rate": 1.1105e-06, "loss": 11.311, "step": 2221 }, { "epoch": 0.696551724137931, "grad_norm": 6.884239673614502, "learning_rate": 1.111e-06, "loss": 9.9046, "step": 2222 }, { "epoch": 0.6968652037617555, "grad_norm": 4.918006420135498, "learning_rate": 1.1115e-06, "loss": 7.7746, "step": 2223 }, { "epoch": 0.69717868338558, "grad_norm": 3.9736998081207275, "learning_rate": 1.1120000000000001e-06, "loss": 8.2501, "step": 2224 }, { "epoch": 0.6974921630094044, "grad_norm": 5.021994590759277, "learning_rate": 1.1125000000000001e-06, "loss": 9.8031, "step": 2225 }, { "epoch": 0.6978056426332289, "grad_norm": 4.272434711456299, "learning_rate": 1.113e-06, "loss": 6.6057, "step": 2226 }, { "epoch": 0.6981191222570533, "grad_norm": 4.612613677978516, "learning_rate": 1.1135000000000002e-06, "loss": 8.5593, "step": 2227 }, { "epoch": 0.6984326018808777, "grad_norm": 6.331014633178711, "learning_rate": 1.114e-06, "loss": 8.7987, "step": 2228 }, { "epoch": 0.6987460815047022, "grad_norm": 6.129283905029297, "learning_rate": 1.1145e-06, "loss": 8.4672, "step": 2229 }, { "epoch": 0.6990595611285266, "grad_norm": 4.61047887802124, "learning_rate": 1.1150000000000001e-06, "loss": 9.5344, "step": 2230 }, { "epoch": 0.6993730407523511, "grad_norm": 4.625089168548584, "learning_rate": 1.1155e-06, "loss": 8.4914, "step": 2231 }, { "epoch": 0.6996865203761755, "grad_norm": 7.176419258117676, "learning_rate": 1.1160000000000002e-06, "loss": 10.456, "step": 2232 }, { "epoch": 0.7, "grad_norm": 4.757896900177002, "learning_rate": 1.1165e-06, "loss": 8.0939, "step": 2233 }, { "epoch": 0.7003134796238244, "grad_norm": 4.703888893127441, "learning_rate": 1.117e-06, "loss": 8.9324, "step": 2234 }, { "epoch": 0.700626959247649, "grad_norm": 5.082801342010498, "learning_rate": 1.1175e-06, "loss": 7.9512, "step": 2235 }, { "epoch": 0.7009404388714734, "grad_norm": 7.074932098388672, "learning_rate": 1.1180000000000001e-06, "loss": 10.5051, "step": 2236 }, { "epoch": 0.7012539184952978, "grad_norm": 5.920054912567139, "learning_rate": 1.1185000000000002e-06, "loss": 10.0657, "step": 2237 }, { "epoch": 0.7015673981191223, "grad_norm": 4.017765045166016, "learning_rate": 1.119e-06, "loss": 7.4091, "step": 2238 }, { "epoch": 0.7018808777429467, "grad_norm": 4.405153274536133, "learning_rate": 1.1195e-06, "loss": 8.9839, "step": 2239 }, { "epoch": 0.7021943573667712, "grad_norm": 5.551645755767822, "learning_rate": 1.12e-06, "loss": 8.2674, "step": 2240 }, { "epoch": 0.7025078369905956, "grad_norm": 5.920106887817383, "learning_rate": 1.1205000000000001e-06, "loss": 10.3074, "step": 2241 }, { "epoch": 0.70282131661442, "grad_norm": 6.09196138381958, "learning_rate": 1.1210000000000002e-06, "loss": 9.587, "step": 2242 }, { "epoch": 0.7031347962382445, "grad_norm": 6.698037624359131, "learning_rate": 1.1215000000000002e-06, "loss": 10.1409, "step": 2243 }, { "epoch": 0.7034482758620689, "grad_norm": 6.3669962882995605, "learning_rate": 1.122e-06, "loss": 10.6705, "step": 2244 }, { "epoch": 0.7037617554858934, "grad_norm": 4.820910930633545, "learning_rate": 1.1225e-06, "loss": 10.7634, "step": 2245 }, { "epoch": 0.7040752351097179, "grad_norm": 6.152073860168457, "learning_rate": 1.123e-06, "loss": 9.9762, "step": 2246 }, { "epoch": 0.7043887147335424, "grad_norm": 6.534127235412598, "learning_rate": 1.1235000000000001e-06, "loss": 9.1076, "step": 2247 }, { "epoch": 0.7047021943573668, "grad_norm": 4.584049224853516, "learning_rate": 1.1240000000000002e-06, "loss": 6.3946, "step": 2248 }, { "epoch": 0.7050156739811912, "grad_norm": 6.2237653732299805, "learning_rate": 1.1245e-06, "loss": 10.3125, "step": 2249 }, { "epoch": 0.7053291536050157, "grad_norm": 7.915231227874756, "learning_rate": 1.125e-06, "loss": 11.7329, "step": 2250 }, { "epoch": 0.7056426332288401, "grad_norm": 5.413792610168457, "learning_rate": 1.1255e-06, "loss": 7.3437, "step": 2251 }, { "epoch": 0.7059561128526646, "grad_norm": 6.203031063079834, "learning_rate": 1.126e-06, "loss": 8.8131, "step": 2252 }, { "epoch": 0.706269592476489, "grad_norm": 5.079891681671143, "learning_rate": 1.1265000000000001e-06, "loss": 8.0062, "step": 2253 }, { "epoch": 0.7065830721003135, "grad_norm": 5.417928218841553, "learning_rate": 1.127e-06, "loss": 7.8545, "step": 2254 }, { "epoch": 0.7068965517241379, "grad_norm": 8.853837966918945, "learning_rate": 1.1275000000000002e-06, "loss": 12.7805, "step": 2255 }, { "epoch": 0.7072100313479623, "grad_norm": 5.80386209487915, "learning_rate": 1.128e-06, "loss": 10.7414, "step": 2256 }, { "epoch": 0.7075235109717868, "grad_norm": 5.670801639556885, "learning_rate": 1.1285e-06, "loss": 8.2194, "step": 2257 }, { "epoch": 0.7078369905956113, "grad_norm": 3.81965708732605, "learning_rate": 1.1290000000000001e-06, "loss": 6.5429, "step": 2258 }, { "epoch": 0.7081504702194358, "grad_norm": 5.524906635284424, "learning_rate": 1.1295e-06, "loss": 10.1136, "step": 2259 }, { "epoch": 0.7084639498432602, "grad_norm": 7.865563869476318, "learning_rate": 1.1300000000000002e-06, "loss": 11.7016, "step": 2260 }, { "epoch": 0.7087774294670847, "grad_norm": 4.466341495513916, "learning_rate": 1.1305e-06, "loss": 6.4666, "step": 2261 }, { "epoch": 0.7090909090909091, "grad_norm": 4.703956127166748, "learning_rate": 1.131e-06, "loss": 7.191, "step": 2262 }, { "epoch": 0.7094043887147335, "grad_norm": 4.839312553405762, "learning_rate": 1.1315e-06, "loss": 8.4636, "step": 2263 }, { "epoch": 0.709717868338558, "grad_norm": 6.889242172241211, "learning_rate": 1.1320000000000001e-06, "loss": 9.0166, "step": 2264 }, { "epoch": 0.7100313479623824, "grad_norm": 5.956830978393555, "learning_rate": 1.1325000000000002e-06, "loss": 7.6537, "step": 2265 }, { "epoch": 0.7103448275862069, "grad_norm": 3.948118209838867, "learning_rate": 1.133e-06, "loss": 6.8683, "step": 2266 }, { "epoch": 0.7106583072100313, "grad_norm": 5.981534004211426, "learning_rate": 1.1335000000000002e-06, "loss": 9.1839, "step": 2267 }, { "epoch": 0.7109717868338558, "grad_norm": 6.075943470001221, "learning_rate": 1.134e-06, "loss": 8.9451, "step": 2268 }, { "epoch": 0.7112852664576802, "grad_norm": 7.1982879638671875, "learning_rate": 1.1345000000000001e-06, "loss": 10.2837, "step": 2269 }, { "epoch": 0.7115987460815048, "grad_norm": 4.65557861328125, "learning_rate": 1.1350000000000001e-06, "loss": 7.0759, "step": 2270 }, { "epoch": 0.7119122257053292, "grad_norm": 7.2853007316589355, "learning_rate": 1.1355e-06, "loss": 10.6811, "step": 2271 }, { "epoch": 0.7122257053291536, "grad_norm": 4.625248432159424, "learning_rate": 1.1360000000000002e-06, "loss": 6.6552, "step": 2272 }, { "epoch": 0.7125391849529781, "grad_norm": 5.515805244445801, "learning_rate": 1.1365e-06, "loss": 11.0095, "step": 2273 }, { "epoch": 0.7128526645768025, "grad_norm": 8.214932441711426, "learning_rate": 1.137e-06, "loss": 9.5016, "step": 2274 }, { "epoch": 0.713166144200627, "grad_norm": 4.511641979217529, "learning_rate": 1.1375000000000001e-06, "loss": 9.456, "step": 2275 }, { "epoch": 0.7134796238244514, "grad_norm": 4.926652908325195, "learning_rate": 1.1380000000000002e-06, "loss": 6.4554, "step": 2276 }, { "epoch": 0.7137931034482758, "grad_norm": 5.701582908630371, "learning_rate": 1.1385000000000002e-06, "loss": 7.8086, "step": 2277 }, { "epoch": 0.7141065830721003, "grad_norm": 5.361356735229492, "learning_rate": 1.139e-06, "loss": 5.5772, "step": 2278 }, { "epoch": 0.7144200626959247, "grad_norm": 5.268955707550049, "learning_rate": 1.1395e-06, "loss": 6.9328, "step": 2279 }, { "epoch": 0.7147335423197492, "grad_norm": 6.054418087005615, "learning_rate": 1.14e-06, "loss": 11.8742, "step": 2280 }, { "epoch": 0.7150470219435736, "grad_norm": 6.118603229522705, "learning_rate": 1.1405000000000001e-06, "loss": 8.7145, "step": 2281 }, { "epoch": 0.7153605015673982, "grad_norm": 5.574398517608643, "learning_rate": 1.141e-06, "loss": 8.354, "step": 2282 }, { "epoch": 0.7156739811912226, "grad_norm": 8.102057456970215, "learning_rate": 1.1415000000000002e-06, "loss": 9.6707, "step": 2283 }, { "epoch": 0.715987460815047, "grad_norm": 6.044636249542236, "learning_rate": 1.142e-06, "loss": 8.4545, "step": 2284 }, { "epoch": 0.7163009404388715, "grad_norm": 5.610233783721924, "learning_rate": 1.1425e-06, "loss": 6.494, "step": 2285 }, { "epoch": 0.7166144200626959, "grad_norm": 7.252960681915283, "learning_rate": 1.1430000000000001e-06, "loss": 9.9181, "step": 2286 }, { "epoch": 0.7169278996865204, "grad_norm": 7.041637897491455, "learning_rate": 1.1435e-06, "loss": 9.8605, "step": 2287 }, { "epoch": 0.7172413793103448, "grad_norm": 7.529675006866455, "learning_rate": 1.1440000000000002e-06, "loss": 9.2948, "step": 2288 }, { "epoch": 0.7175548589341693, "grad_norm": 6.867233753204346, "learning_rate": 1.1445e-06, "loss": 10.663, "step": 2289 }, { "epoch": 0.7178683385579937, "grad_norm": 4.269197463989258, "learning_rate": 1.145e-06, "loss": 6.5775, "step": 2290 }, { "epoch": 0.7181818181818181, "grad_norm": 5.0431060791015625, "learning_rate": 1.1455e-06, "loss": 8.0019, "step": 2291 }, { "epoch": 0.7184952978056426, "grad_norm": 5.916482925415039, "learning_rate": 1.1460000000000001e-06, "loss": 9.4506, "step": 2292 }, { "epoch": 0.7188087774294671, "grad_norm": 8.643074035644531, "learning_rate": 1.1465000000000002e-06, "loss": 11.3396, "step": 2293 }, { "epoch": 0.7191222570532916, "grad_norm": 4.569228649139404, "learning_rate": 1.147e-06, "loss": 6.5566, "step": 2294 }, { "epoch": 0.719435736677116, "grad_norm": 5.326075077056885, "learning_rate": 1.1475000000000002e-06, "loss": 8.4556, "step": 2295 }, { "epoch": 0.7197492163009405, "grad_norm": 5.192141532897949, "learning_rate": 1.148e-06, "loss": 8.5234, "step": 2296 }, { "epoch": 0.7200626959247649, "grad_norm": 4.826154708862305, "learning_rate": 1.1485e-06, "loss": 6.925, "step": 2297 }, { "epoch": 0.7203761755485893, "grad_norm": 6.822600841522217, "learning_rate": 1.1490000000000001e-06, "loss": 12.9681, "step": 2298 }, { "epoch": 0.7206896551724138, "grad_norm": 7.6909003257751465, "learning_rate": 1.1495e-06, "loss": 10.0613, "step": 2299 }, { "epoch": 0.7210031347962382, "grad_norm": 6.721874713897705, "learning_rate": 1.1500000000000002e-06, "loss": 11.6405, "step": 2300 }, { "epoch": 0.7213166144200627, "grad_norm": 5.8059163093566895, "learning_rate": 1.1505e-06, "loss": 8.9701, "step": 2301 }, { "epoch": 0.7216300940438871, "grad_norm": 8.074931144714355, "learning_rate": 1.151e-06, "loss": 9.3916, "step": 2302 }, { "epoch": 0.7219435736677116, "grad_norm": 7.6194987297058105, "learning_rate": 1.1515000000000001e-06, "loss": 11.8245, "step": 2303 }, { "epoch": 0.722257053291536, "grad_norm": 4.169922351837158, "learning_rate": 1.1520000000000002e-06, "loss": 6.3245, "step": 2304 }, { "epoch": 0.7225705329153606, "grad_norm": 7.672272682189941, "learning_rate": 1.1525000000000002e-06, "loss": 10.7136, "step": 2305 }, { "epoch": 0.722884012539185, "grad_norm": 6.1154656410217285, "learning_rate": 1.153e-06, "loss": 10.13, "step": 2306 }, { "epoch": 0.7231974921630094, "grad_norm": 5.131744861602783, "learning_rate": 1.1535e-06, "loss": 6.4488, "step": 2307 }, { "epoch": 0.7235109717868339, "grad_norm": 4.942484378814697, "learning_rate": 1.154e-06, "loss": 7.9078, "step": 2308 }, { "epoch": 0.7238244514106583, "grad_norm": 6.5893449783325195, "learning_rate": 1.1545000000000001e-06, "loss": 10.2193, "step": 2309 }, { "epoch": 0.7241379310344828, "grad_norm": 5.4219136238098145, "learning_rate": 1.1550000000000002e-06, "loss": 7.0708, "step": 2310 }, { "epoch": 0.7244514106583072, "grad_norm": 8.25130558013916, "learning_rate": 1.1555e-06, "loss": 10.7496, "step": 2311 }, { "epoch": 0.7247648902821316, "grad_norm": 4.836966514587402, "learning_rate": 1.156e-06, "loss": 6.6638, "step": 2312 }, { "epoch": 0.7250783699059561, "grad_norm": 7.104246139526367, "learning_rate": 1.1565e-06, "loss": 8.5781, "step": 2313 }, { "epoch": 0.7253918495297805, "grad_norm": 5.00009822845459, "learning_rate": 1.157e-06, "loss": 6.4871, "step": 2314 }, { "epoch": 0.725705329153605, "grad_norm": 4.757653713226318, "learning_rate": 1.1575000000000001e-06, "loss": 7.6189, "step": 2315 }, { "epoch": 0.7260188087774294, "grad_norm": 6.125999450683594, "learning_rate": 1.1580000000000002e-06, "loss": 7.1317, "step": 2316 }, { "epoch": 0.726332288401254, "grad_norm": 6.397932052612305, "learning_rate": 1.1585e-06, "loss": 7.738, "step": 2317 }, { "epoch": 0.7266457680250784, "grad_norm": 5.191574573516846, "learning_rate": 1.159e-06, "loss": 7.8031, "step": 2318 }, { "epoch": 0.7269592476489029, "grad_norm": 6.020807266235352, "learning_rate": 1.1595e-06, "loss": 9.9713, "step": 2319 }, { "epoch": 0.7272727272727273, "grad_norm": 4.571470260620117, "learning_rate": 1.1600000000000001e-06, "loss": 5.8561, "step": 2320 }, { "epoch": 0.7275862068965517, "grad_norm": 6.151414394378662, "learning_rate": 1.1605000000000002e-06, "loss": 8.3401, "step": 2321 }, { "epoch": 0.7278996865203762, "grad_norm": 5.342029094696045, "learning_rate": 1.161e-06, "loss": 7.4286, "step": 2322 }, { "epoch": 0.7282131661442006, "grad_norm": 8.054902076721191, "learning_rate": 1.1615000000000002e-06, "loss": 11.2857, "step": 2323 }, { "epoch": 0.7285266457680251, "grad_norm": 8.631732940673828, "learning_rate": 1.162e-06, "loss": 11.7461, "step": 2324 }, { "epoch": 0.7288401253918495, "grad_norm": 5.86944580078125, "learning_rate": 1.1625e-06, "loss": 8.2415, "step": 2325 }, { "epoch": 0.729153605015674, "grad_norm": 6.775248050689697, "learning_rate": 1.1630000000000001e-06, "loss": 9.1465, "step": 2326 }, { "epoch": 0.7294670846394984, "grad_norm": 5.014388561248779, "learning_rate": 1.1635e-06, "loss": 9.4963, "step": 2327 }, { "epoch": 0.7297805642633229, "grad_norm": 6.137758731842041, "learning_rate": 1.1640000000000002e-06, "loss": 7.47, "step": 2328 }, { "epoch": 0.7300940438871474, "grad_norm": 7.857049942016602, "learning_rate": 1.1645e-06, "loss": 12.0102, "step": 2329 }, { "epoch": 0.7304075235109718, "grad_norm": 5.5146002769470215, "learning_rate": 1.165e-06, "loss": 7.7632, "step": 2330 }, { "epoch": 0.7307210031347963, "grad_norm": 5.327179431915283, "learning_rate": 1.1655000000000001e-06, "loss": 9.2846, "step": 2331 }, { "epoch": 0.7310344827586207, "grad_norm": 5.694344997406006, "learning_rate": 1.1660000000000001e-06, "loss": 8.7609, "step": 2332 }, { "epoch": 0.7313479623824451, "grad_norm": 6.549810886383057, "learning_rate": 1.1665000000000002e-06, "loss": 8.8996, "step": 2333 }, { "epoch": 0.7316614420062696, "grad_norm": 4.765985488891602, "learning_rate": 1.167e-06, "loss": 7.4456, "step": 2334 }, { "epoch": 0.731974921630094, "grad_norm": 4.749044418334961, "learning_rate": 1.1675000000000003e-06, "loss": 6.8727, "step": 2335 }, { "epoch": 0.7322884012539185, "grad_norm": 6.0622711181640625, "learning_rate": 1.168e-06, "loss": 8.9842, "step": 2336 }, { "epoch": 0.7326018808777429, "grad_norm": 5.587085723876953, "learning_rate": 1.1685000000000001e-06, "loss": 7.9121, "step": 2337 }, { "epoch": 0.7329153605015674, "grad_norm": 6.6463189125061035, "learning_rate": 1.1690000000000002e-06, "loss": 7.1128, "step": 2338 }, { "epoch": 0.7332288401253918, "grad_norm": 6.599662780761719, "learning_rate": 1.1695e-06, "loss": 8.0378, "step": 2339 }, { "epoch": 0.7335423197492164, "grad_norm": 5.061799049377441, "learning_rate": 1.1700000000000002e-06, "loss": 6.4478, "step": 2340 }, { "epoch": 0.7338557993730408, "grad_norm": 5.785806655883789, "learning_rate": 1.1705e-06, "loss": 10.5814, "step": 2341 }, { "epoch": 0.7341692789968652, "grad_norm": 5.782971382141113, "learning_rate": 1.171e-06, "loss": 6.4085, "step": 2342 }, { "epoch": 0.7344827586206897, "grad_norm": 5.601132392883301, "learning_rate": 1.1715000000000001e-06, "loss": 9.9022, "step": 2343 }, { "epoch": 0.7347962382445141, "grad_norm": 5.016490459442139, "learning_rate": 1.1720000000000002e-06, "loss": 7.146, "step": 2344 }, { "epoch": 0.7351097178683386, "grad_norm": 5.3097310066223145, "learning_rate": 1.1725e-06, "loss": 7.8484, "step": 2345 }, { "epoch": 0.735423197492163, "grad_norm": 5.761673927307129, "learning_rate": 1.173e-06, "loss": 6.9101, "step": 2346 }, { "epoch": 0.7357366771159874, "grad_norm": 6.155106544494629, "learning_rate": 1.1735e-06, "loss": 7.6222, "step": 2347 }, { "epoch": 0.7360501567398119, "grad_norm": 5.324407577514648, "learning_rate": 1.1740000000000001e-06, "loss": 7.18, "step": 2348 }, { "epoch": 0.7363636363636363, "grad_norm": 5.507112503051758, "learning_rate": 1.1745000000000001e-06, "loss": 7.4581, "step": 2349 }, { "epoch": 0.7366771159874608, "grad_norm": 6.135045528411865, "learning_rate": 1.175e-06, "loss": 8.3784, "step": 2350 }, { "epoch": 0.7369905956112852, "grad_norm": 5.607395172119141, "learning_rate": 1.1755e-06, "loss": 8.0636, "step": 2351 }, { "epoch": 0.7373040752351098, "grad_norm": 4.155460834503174, "learning_rate": 1.176e-06, "loss": 7.7036, "step": 2352 }, { "epoch": 0.7376175548589342, "grad_norm": 6.254929542541504, "learning_rate": 1.1765e-06, "loss": 7.4921, "step": 2353 }, { "epoch": 0.7379310344827587, "grad_norm": 7.738031387329102, "learning_rate": 1.1770000000000001e-06, "loss": 8.3682, "step": 2354 }, { "epoch": 0.7382445141065831, "grad_norm": 4.532285213470459, "learning_rate": 1.1775e-06, "loss": 6.7809, "step": 2355 }, { "epoch": 0.7385579937304075, "grad_norm": 5.4931640625, "learning_rate": 1.1780000000000002e-06, "loss": 8.2121, "step": 2356 }, { "epoch": 0.738871473354232, "grad_norm": 5.828548908233643, "learning_rate": 1.1785e-06, "loss": 7.4208, "step": 2357 }, { "epoch": 0.7391849529780564, "grad_norm": 8.793325424194336, "learning_rate": 1.179e-06, "loss": 10.5305, "step": 2358 }, { "epoch": 0.7394984326018809, "grad_norm": 7.540804386138916, "learning_rate": 1.1795e-06, "loss": 10.3455, "step": 2359 }, { "epoch": 0.7398119122257053, "grad_norm": 7.395792007446289, "learning_rate": 1.1800000000000001e-06, "loss": 8.6099, "step": 2360 }, { "epoch": 0.7401253918495297, "grad_norm": 7.034842491149902, "learning_rate": 1.1805000000000002e-06, "loss": 12.9946, "step": 2361 }, { "epoch": 0.7404388714733542, "grad_norm": 6.157832145690918, "learning_rate": 1.181e-06, "loss": 9.0294, "step": 2362 }, { "epoch": 0.7407523510971786, "grad_norm": 5.958568096160889, "learning_rate": 1.1815000000000002e-06, "loss": 9.4815, "step": 2363 }, { "epoch": 0.7410658307210032, "grad_norm": 5.166853427886963, "learning_rate": 1.182e-06, "loss": 7.8122, "step": 2364 }, { "epoch": 0.7413793103448276, "grad_norm": 6.027432441711426, "learning_rate": 1.1825000000000001e-06, "loss": 9.6608, "step": 2365 }, { "epoch": 0.7416927899686521, "grad_norm": 8.379853248596191, "learning_rate": 1.1830000000000002e-06, "loss": 10.2204, "step": 2366 }, { "epoch": 0.7420062695924765, "grad_norm": 6.882002353668213, "learning_rate": 1.1835e-06, "loss": 9.1155, "step": 2367 }, { "epoch": 0.742319749216301, "grad_norm": 5.61342191696167, "learning_rate": 1.1840000000000002e-06, "loss": 9.3164, "step": 2368 }, { "epoch": 0.7426332288401254, "grad_norm": 7.321317195892334, "learning_rate": 1.1845e-06, "loss": 7.8616, "step": 2369 }, { "epoch": 0.7429467084639498, "grad_norm": 5.494234561920166, "learning_rate": 1.185e-06, "loss": 6.7372, "step": 2370 }, { "epoch": 0.7432601880877743, "grad_norm": 6.378052234649658, "learning_rate": 1.1855000000000001e-06, "loss": 8.2405, "step": 2371 }, { "epoch": 0.7435736677115987, "grad_norm": 9.557308197021484, "learning_rate": 1.1860000000000002e-06, "loss": 12.0775, "step": 2372 }, { "epoch": 0.7438871473354232, "grad_norm": 7.571987152099609, "learning_rate": 1.1865000000000002e-06, "loss": 9.8201, "step": 2373 }, { "epoch": 0.7442006269592476, "grad_norm": 5.4964213371276855, "learning_rate": 1.187e-06, "loss": 6.8091, "step": 2374 }, { "epoch": 0.7445141065830722, "grad_norm": 5.78679895401001, "learning_rate": 1.1875e-06, "loss": 7.4889, "step": 2375 }, { "epoch": 0.7448275862068966, "grad_norm": 5.633636474609375, "learning_rate": 1.188e-06, "loss": 6.6415, "step": 2376 }, { "epoch": 0.745141065830721, "grad_norm": 6.5983428955078125, "learning_rate": 1.1885000000000001e-06, "loss": 7.7574, "step": 2377 }, { "epoch": 0.7454545454545455, "grad_norm": 9.485814094543457, "learning_rate": 1.1890000000000002e-06, "loss": 12.1369, "step": 2378 }, { "epoch": 0.7457680250783699, "grad_norm": 6.415633201599121, "learning_rate": 1.1895e-06, "loss": 7.9947, "step": 2379 }, { "epoch": 0.7460815047021944, "grad_norm": 6.807877063751221, "learning_rate": 1.19e-06, "loss": 8.5465, "step": 2380 }, { "epoch": 0.7463949843260188, "grad_norm": 6.27053165435791, "learning_rate": 1.1905e-06, "loss": 7.1279, "step": 2381 }, { "epoch": 0.7467084639498432, "grad_norm": 5.453633785247803, "learning_rate": 1.1910000000000001e-06, "loss": 7.1849, "step": 2382 }, { "epoch": 0.7470219435736677, "grad_norm": 6.075678825378418, "learning_rate": 1.1915000000000002e-06, "loss": 8.3007, "step": 2383 }, { "epoch": 0.7473354231974921, "grad_norm": 10.42825984954834, "learning_rate": 1.1920000000000002e-06, "loss": 10.672, "step": 2384 }, { "epoch": 0.7476489028213166, "grad_norm": 5.732131481170654, "learning_rate": 1.1925e-06, "loss": 6.1683, "step": 2385 }, { "epoch": 0.747962382445141, "grad_norm": 6.508482456207275, "learning_rate": 1.193e-06, "loss": 9.2361, "step": 2386 }, { "epoch": 0.7482758620689656, "grad_norm": 5.837035179138184, "learning_rate": 1.1935e-06, "loss": 7.1001, "step": 2387 }, { "epoch": 0.74858934169279, "grad_norm": 9.754945755004883, "learning_rate": 1.1940000000000001e-06, "loss": 10.2582, "step": 2388 }, { "epoch": 0.7489028213166145, "grad_norm": 5.980183124542236, "learning_rate": 1.1945000000000002e-06, "loss": 8.5382, "step": 2389 }, { "epoch": 0.7492163009404389, "grad_norm": 5.945464134216309, "learning_rate": 1.195e-06, "loss": 9.5149, "step": 2390 }, { "epoch": 0.7495297805642633, "grad_norm": 6.858036041259766, "learning_rate": 1.1955e-06, "loss": 9.1617, "step": 2391 }, { "epoch": 0.7498432601880878, "grad_norm": 7.403122901916504, "learning_rate": 1.196e-06, "loss": 9.4436, "step": 2392 }, { "epoch": 0.7501567398119122, "grad_norm": 5.871950626373291, "learning_rate": 1.1965000000000001e-06, "loss": 7.0343, "step": 2393 }, { "epoch": 0.7504702194357367, "grad_norm": 9.108241081237793, "learning_rate": 1.1970000000000001e-06, "loss": 11.0054, "step": 2394 }, { "epoch": 0.7504702194357367, "eval_loss": 27.862751007080078, "eval_runtime": 20.9206, "eval_samples_per_second": 128.438, "eval_steps_per_second": 8.03, "step": 2394 }, { "epoch": 0.7507836990595611, "grad_norm": 6.919027805328369, "learning_rate": 1.1975e-06, "loss": 8.3136, "step": 2395 }, { "epoch": 0.7510971786833855, "grad_norm": 5.883973598480225, "learning_rate": 1.1980000000000002e-06, "loss": 7.4282, "step": 2396 }, { "epoch": 0.75141065830721, "grad_norm": 6.176295280456543, "learning_rate": 1.1985e-06, "loss": 8.8492, "step": 2397 }, { "epoch": 0.7517241379310344, "grad_norm": 6.810075283050537, "learning_rate": 1.199e-06, "loss": 9.706, "step": 2398 }, { "epoch": 0.752037617554859, "grad_norm": 5.519807815551758, "learning_rate": 1.1995000000000001e-06, "loss": 8.259, "step": 2399 }, { "epoch": 0.7523510971786834, "grad_norm": 4.797094821929932, "learning_rate": 1.2000000000000002e-06, "loss": 7.2727, "step": 2400 }, { "epoch": 0.7526645768025079, "grad_norm": 6.261948108673096, "learning_rate": 1.2005000000000002e-06, "loss": 9.45, "step": 2401 }, { "epoch": 0.7529780564263323, "grad_norm": 5.316954135894775, "learning_rate": 1.201e-06, "loss": 6.7442, "step": 2402 }, { "epoch": 0.7532915360501568, "grad_norm": 6.721187591552734, "learning_rate": 1.2015000000000003e-06, "loss": 7.6002, "step": 2403 }, { "epoch": 0.7536050156739812, "grad_norm": 5.482544898986816, "learning_rate": 1.202e-06, "loss": 6.7844, "step": 2404 }, { "epoch": 0.7539184952978056, "grad_norm": 5.34820556640625, "learning_rate": 1.2025000000000001e-06, "loss": 7.4373, "step": 2405 }, { "epoch": 0.7542319749216301, "grad_norm": 7.171077728271484, "learning_rate": 1.2030000000000002e-06, "loss": 7.6944, "step": 2406 }, { "epoch": 0.7545454545454545, "grad_norm": 5.950673580169678, "learning_rate": 1.2035e-06, "loss": 8.4961, "step": 2407 }, { "epoch": 0.754858934169279, "grad_norm": 7.122339725494385, "learning_rate": 1.204e-06, "loss": 8.6441, "step": 2408 }, { "epoch": 0.7551724137931034, "grad_norm": 8.164888381958008, "learning_rate": 1.2045e-06, "loss": 10.9008, "step": 2409 }, { "epoch": 0.7554858934169278, "grad_norm": 8.712209701538086, "learning_rate": 1.2050000000000001e-06, "loss": 11.4492, "step": 2410 }, { "epoch": 0.7557993730407524, "grad_norm": 5.684695720672607, "learning_rate": 1.2055000000000001e-06, "loss": 7.7799, "step": 2411 }, { "epoch": 0.7561128526645768, "grad_norm": 5.714209079742432, "learning_rate": 1.2060000000000002e-06, "loss": 8.4314, "step": 2412 }, { "epoch": 0.7564263322884013, "grad_norm": 5.888979434967041, "learning_rate": 1.2065e-06, "loss": 6.7525, "step": 2413 }, { "epoch": 0.7567398119122257, "grad_norm": 7.8231425285339355, "learning_rate": 1.207e-06, "loss": 12.5721, "step": 2414 }, { "epoch": 0.7570532915360502, "grad_norm": 6.4952898025512695, "learning_rate": 1.2075e-06, "loss": 10.1876, "step": 2415 }, { "epoch": 0.7573667711598746, "grad_norm": 6.374924182891846, "learning_rate": 1.2080000000000001e-06, "loss": 7.2975, "step": 2416 }, { "epoch": 0.757680250783699, "grad_norm": 5.737400531768799, "learning_rate": 1.2085000000000002e-06, "loss": 8.371, "step": 2417 }, { "epoch": 0.7579937304075235, "grad_norm": 5.425909042358398, "learning_rate": 1.209e-06, "loss": 6.2901, "step": 2418 }, { "epoch": 0.7583072100313479, "grad_norm": 7.00061559677124, "learning_rate": 1.2095e-06, "loss": 6.5716, "step": 2419 }, { "epoch": 0.7586206896551724, "grad_norm": 9.082880020141602, "learning_rate": 1.21e-06, "loss": 9.7631, "step": 2420 }, { "epoch": 0.7589341692789968, "grad_norm": 5.837677478790283, "learning_rate": 1.2105e-06, "loss": 7.0652, "step": 2421 }, { "epoch": 0.7592476489028214, "grad_norm": 10.264467239379883, "learning_rate": 1.2110000000000001e-06, "loss": 13.0527, "step": 2422 }, { "epoch": 0.7595611285266458, "grad_norm": 6.860038757324219, "learning_rate": 1.2115e-06, "loss": 8.581, "step": 2423 }, { "epoch": 0.7598746081504703, "grad_norm": 8.064723014831543, "learning_rate": 1.2120000000000002e-06, "loss": 7.2021, "step": 2424 }, { "epoch": 0.7601880877742947, "grad_norm": 5.945399761199951, "learning_rate": 1.2125e-06, "loss": 8.6018, "step": 2425 }, { "epoch": 0.7605015673981191, "grad_norm": 8.428067207336426, "learning_rate": 1.213e-06, "loss": 9.2994, "step": 2426 }, { "epoch": 0.7608150470219436, "grad_norm": 6.145665168762207, "learning_rate": 1.2135000000000001e-06, "loss": 7.1988, "step": 2427 }, { "epoch": 0.761128526645768, "grad_norm": 7.332749843597412, "learning_rate": 1.214e-06, "loss": 8.2611, "step": 2428 }, { "epoch": 0.7614420062695925, "grad_norm": 13.179669380187988, "learning_rate": 1.2145000000000002e-06, "loss": 15.9689, "step": 2429 }, { "epoch": 0.7617554858934169, "grad_norm": 5.4208197593688965, "learning_rate": 1.215e-06, "loss": 6.9613, "step": 2430 }, { "epoch": 0.7620689655172413, "grad_norm": 5.367392063140869, "learning_rate": 1.2155e-06, "loss": 6.4245, "step": 2431 }, { "epoch": 0.7623824451410658, "grad_norm": 5.860224723815918, "learning_rate": 1.216e-06, "loss": 6.6126, "step": 2432 }, { "epoch": 0.7626959247648902, "grad_norm": 10.526330947875977, "learning_rate": 1.2165000000000001e-06, "loss": 13.7313, "step": 2433 }, { "epoch": 0.7630094043887148, "grad_norm": 5.663205623626709, "learning_rate": 1.2170000000000002e-06, "loss": 8.9684, "step": 2434 }, { "epoch": 0.7633228840125392, "grad_norm": 7.089934825897217, "learning_rate": 1.2175e-06, "loss": 7.9986, "step": 2435 }, { "epoch": 0.7636363636363637, "grad_norm": 5.609120845794678, "learning_rate": 1.2180000000000002e-06, "loss": 6.8416, "step": 2436 }, { "epoch": 0.7639498432601881, "grad_norm": 6.426872730255127, "learning_rate": 1.2185e-06, "loss": 7.9864, "step": 2437 }, { "epoch": 0.7642633228840126, "grad_norm": 8.020556449890137, "learning_rate": 1.219e-06, "loss": 8.7561, "step": 2438 }, { "epoch": 0.764576802507837, "grad_norm": 14.095486640930176, "learning_rate": 1.2195000000000001e-06, "loss": 14.5934, "step": 2439 }, { "epoch": 0.7648902821316614, "grad_norm": 6.615420341491699, "learning_rate": 1.2200000000000002e-06, "loss": 6.5653, "step": 2440 }, { "epoch": 0.7652037617554859, "grad_norm": 6.877127647399902, "learning_rate": 1.2205000000000002e-06, "loss": 9.0697, "step": 2441 }, { "epoch": 0.7655172413793103, "grad_norm": 7.938243389129639, "learning_rate": 1.221e-06, "loss": 7.9308, "step": 2442 }, { "epoch": 0.7658307210031348, "grad_norm": 5.383735656738281, "learning_rate": 1.2215e-06, "loss": 6.4287, "step": 2443 }, { "epoch": 0.7661442006269592, "grad_norm": 12.071537971496582, "learning_rate": 1.2220000000000001e-06, "loss": 9.1741, "step": 2444 }, { "epoch": 0.7664576802507836, "grad_norm": 8.097783088684082, "learning_rate": 1.2225000000000002e-06, "loss": 9.6469, "step": 2445 }, { "epoch": 0.7667711598746082, "grad_norm": 10.031937599182129, "learning_rate": 1.2230000000000002e-06, "loss": 11.3807, "step": 2446 }, { "epoch": 0.7670846394984326, "grad_norm": 6.421173572540283, "learning_rate": 1.2235e-06, "loss": 7.5599, "step": 2447 }, { "epoch": 0.7673981191222571, "grad_norm": 7.706267356872559, "learning_rate": 1.224e-06, "loss": 9.3446, "step": 2448 }, { "epoch": 0.7677115987460815, "grad_norm": 8.22382926940918, "learning_rate": 1.2245e-06, "loss": 10.4658, "step": 2449 }, { "epoch": 0.768025078369906, "grad_norm": 9.478100776672363, "learning_rate": 1.2250000000000001e-06, "loss": 9.6859, "step": 2450 }, { "epoch": 0.7683385579937304, "grad_norm": 10.563023567199707, "learning_rate": 1.2255000000000002e-06, "loss": 9.8905, "step": 2451 }, { "epoch": 0.7686520376175549, "grad_norm": 6.427543640136719, "learning_rate": 1.2260000000000002e-06, "loss": 7.7129, "step": 2452 }, { "epoch": 0.7689655172413793, "grad_norm": 7.049678325653076, "learning_rate": 1.2265e-06, "loss": 8.6615, "step": 2453 }, { "epoch": 0.7692789968652037, "grad_norm": 8.58466911315918, "learning_rate": 1.227e-06, "loss": 8.6398, "step": 2454 }, { "epoch": 0.7695924764890282, "grad_norm": 5.85331392288208, "learning_rate": 1.2275000000000001e-06, "loss": 7.8588, "step": 2455 }, { "epoch": 0.7699059561128526, "grad_norm": 7.215764999389648, "learning_rate": 1.2280000000000001e-06, "loss": 8.4039, "step": 2456 }, { "epoch": 0.7702194357366771, "grad_norm": 13.46800422668457, "learning_rate": 1.2285000000000002e-06, "loss": 11.4974, "step": 2457 }, { "epoch": 0.7705329153605016, "grad_norm": 7.22824764251709, "learning_rate": 1.229e-06, "loss": 8.1564, "step": 2458 }, { "epoch": 0.770846394984326, "grad_norm": 6.372166156768799, "learning_rate": 1.2295e-06, "loss": 6.8613, "step": 2459 }, { "epoch": 0.7711598746081505, "grad_norm": 8.008846282958984, "learning_rate": 1.23e-06, "loss": 8.2465, "step": 2460 }, { "epoch": 0.7714733542319749, "grad_norm": 10.947956085205078, "learning_rate": 1.2305000000000001e-06, "loss": 11.4562, "step": 2461 }, { "epoch": 0.7717868338557994, "grad_norm": 7.392462253570557, "learning_rate": 1.2310000000000002e-06, "loss": 9.5973, "step": 2462 }, { "epoch": 0.7721003134796238, "grad_norm": 6.9039082527160645, "learning_rate": 1.2315e-06, "loss": 7.2863, "step": 2463 }, { "epoch": 0.7724137931034483, "grad_norm": 6.517736911773682, "learning_rate": 1.2320000000000002e-06, "loss": 6.8215, "step": 2464 }, { "epoch": 0.7727272727272727, "grad_norm": 6.294345378875732, "learning_rate": 1.2325e-06, "loss": 6.9459, "step": 2465 }, { "epoch": 0.7730407523510971, "grad_norm": 7.055344581604004, "learning_rate": 1.233e-06, "loss": 7.9461, "step": 2466 }, { "epoch": 0.7733542319749216, "grad_norm": 8.949790954589844, "learning_rate": 1.2335000000000001e-06, "loss": 10.2358, "step": 2467 }, { "epoch": 0.773667711598746, "grad_norm": 12.344834327697754, "learning_rate": 1.234e-06, "loss": 11.9022, "step": 2468 }, { "epoch": 0.7739811912225706, "grad_norm": 13.830575942993164, "learning_rate": 1.2345000000000002e-06, "loss": 17.4536, "step": 2469 }, { "epoch": 0.774294670846395, "grad_norm": 10.294656753540039, "learning_rate": 1.235e-06, "loss": 7.7727, "step": 2470 }, { "epoch": 0.7746081504702195, "grad_norm": 6.9053874015808105, "learning_rate": 1.2355e-06, "loss": 8.134, "step": 2471 }, { "epoch": 0.7749216300940439, "grad_norm": 8.573596954345703, "learning_rate": 1.2360000000000001e-06, "loss": 9.5387, "step": 2472 }, { "epoch": 0.7752351097178684, "grad_norm": 8.290820121765137, "learning_rate": 1.2365000000000001e-06, "loss": 11.1582, "step": 2473 }, { "epoch": 0.7755485893416928, "grad_norm": 7.78951358795166, "learning_rate": 1.2370000000000002e-06, "loss": 9.6967, "step": 2474 }, { "epoch": 0.7758620689655172, "grad_norm": 9.113204956054688, "learning_rate": 1.2375e-06, "loss": 8.3096, "step": 2475 }, { "epoch": 0.7761755485893417, "grad_norm": 8.999139785766602, "learning_rate": 1.238e-06, "loss": 8.9482, "step": 2476 }, { "epoch": 0.7764890282131661, "grad_norm": 7.273940563201904, "learning_rate": 1.2385e-06, "loss": 7.3428, "step": 2477 }, { "epoch": 0.7768025078369906, "grad_norm": 9.114079475402832, "learning_rate": 1.2390000000000001e-06, "loss": 9.9958, "step": 2478 }, { "epoch": 0.777115987460815, "grad_norm": 9.299750328063965, "learning_rate": 1.2395000000000002e-06, "loss": 9.2364, "step": 2479 }, { "epoch": 0.7774294670846394, "grad_norm": 7.594107151031494, "learning_rate": 1.2400000000000002e-06, "loss": 8.9441, "step": 2480 }, { "epoch": 0.777742946708464, "grad_norm": 15.933536529541016, "learning_rate": 1.2405e-06, "loss": 15.7072, "step": 2481 }, { "epoch": 0.7780564263322884, "grad_norm": 6.593420028686523, "learning_rate": 1.241e-06, "loss": 7.2379, "step": 2482 }, { "epoch": 0.7783699059561129, "grad_norm": 10.504799842834473, "learning_rate": 1.2415e-06, "loss": 8.9226, "step": 2483 }, { "epoch": 0.7786833855799373, "grad_norm": 7.769631385803223, "learning_rate": 1.2420000000000001e-06, "loss": 7.9376, "step": 2484 }, { "epoch": 0.7789968652037618, "grad_norm": 9.674359321594238, "learning_rate": 1.2425000000000002e-06, "loss": 6.172, "step": 2485 }, { "epoch": 0.7793103448275862, "grad_norm": 10.36740493774414, "learning_rate": 1.243e-06, "loss": 10.3587, "step": 2486 }, { "epoch": 0.7796238244514107, "grad_norm": 10.151678085327148, "learning_rate": 1.2435e-06, "loss": 8.8859, "step": 2487 }, { "epoch": 0.7799373040752351, "grad_norm": 5.526301860809326, "learning_rate": 1.244e-06, "loss": 6.5573, "step": 2488 }, { "epoch": 0.7802507836990595, "grad_norm": 5.857382774353027, "learning_rate": 1.2445000000000001e-06, "loss": 6.264, "step": 2489 }, { "epoch": 0.780564263322884, "grad_norm": 7.313165187835693, "learning_rate": 1.2450000000000002e-06, "loss": 6.587, "step": 2490 }, { "epoch": 0.7808777429467084, "grad_norm": 10.243111610412598, "learning_rate": 1.2455e-06, "loss": 9.8566, "step": 2491 }, { "epoch": 0.7811912225705329, "grad_norm": 8.093012809753418, "learning_rate": 1.2460000000000002e-06, "loss": 7.5616, "step": 2492 }, { "epoch": 0.7815047021943574, "grad_norm": 9.839873313903809, "learning_rate": 1.2465e-06, "loss": 9.569, "step": 2493 }, { "epoch": 0.7818181818181819, "grad_norm": 9.529997825622559, "learning_rate": 1.247e-06, "loss": 8.8561, "step": 2494 }, { "epoch": 0.7821316614420063, "grad_norm": 7.144129276275635, "learning_rate": 1.2475000000000001e-06, "loss": 6.8548, "step": 2495 }, { "epoch": 0.7824451410658307, "grad_norm": 9.005447387695312, "learning_rate": 1.248e-06, "loss": 9.8978, "step": 2496 }, { "epoch": 0.7827586206896552, "grad_norm": 6.3512797355651855, "learning_rate": 1.2485000000000002e-06, "loss": 7.3197, "step": 2497 }, { "epoch": 0.7830721003134796, "grad_norm": 13.88167667388916, "learning_rate": 1.249e-06, "loss": 10.638, "step": 2498 }, { "epoch": 0.7833855799373041, "grad_norm": 8.67345905303955, "learning_rate": 1.2495e-06, "loss": 8.4422, "step": 2499 }, { "epoch": 0.7836990595611285, "grad_norm": 7.372941970825195, "learning_rate": 1.25e-06, "loss": 7.1321, "step": 2500 }, { "epoch": 0.784012539184953, "grad_norm": 7.833611488342285, "learning_rate": 1.2505000000000001e-06, "loss": 7.717, "step": 2501 }, { "epoch": 0.7843260188087774, "grad_norm": 7.675827980041504, "learning_rate": 1.251e-06, "loss": 6.8507, "step": 2502 }, { "epoch": 0.7846394984326018, "grad_norm": 7.610905170440674, "learning_rate": 1.2515000000000002e-06, "loss": 7.2601, "step": 2503 }, { "epoch": 0.7849529780564264, "grad_norm": 8.251374244689941, "learning_rate": 1.2520000000000003e-06, "loss": 7.8607, "step": 2504 }, { "epoch": 0.7852664576802508, "grad_norm": 8.651074409484863, "learning_rate": 1.2525e-06, "loss": 9.5066, "step": 2505 }, { "epoch": 0.7855799373040753, "grad_norm": 8.352408409118652, "learning_rate": 1.2530000000000001e-06, "loss": 6.5008, "step": 2506 }, { "epoch": 0.7858934169278997, "grad_norm": 8.274674415588379, "learning_rate": 1.2535e-06, "loss": 6.7467, "step": 2507 }, { "epoch": 0.7862068965517242, "grad_norm": 14.473176956176758, "learning_rate": 1.2540000000000002e-06, "loss": 14.1415, "step": 2508 }, { "epoch": 0.7865203761755486, "grad_norm": 7.974168300628662, "learning_rate": 1.2545000000000002e-06, "loss": 7.2298, "step": 2509 }, { "epoch": 0.786833855799373, "grad_norm": 7.596717357635498, "learning_rate": 1.255e-06, "loss": 7.7122, "step": 2510 }, { "epoch": 0.7871473354231975, "grad_norm": 11.445423126220703, "learning_rate": 1.2555e-06, "loss": 10.7785, "step": 2511 }, { "epoch": 0.7874608150470219, "grad_norm": 7.318839073181152, "learning_rate": 1.256e-06, "loss": 7.747, "step": 2512 }, { "epoch": 0.7877742946708464, "grad_norm": 8.10612964630127, "learning_rate": 1.2565000000000002e-06, "loss": 7.516, "step": 2513 }, { "epoch": 0.7880877742946708, "grad_norm": 6.949522972106934, "learning_rate": 1.2570000000000002e-06, "loss": 8.044, "step": 2514 }, { "epoch": 0.7884012539184952, "grad_norm": 10.29315185546875, "learning_rate": 1.2575e-06, "loss": 9.012, "step": 2515 }, { "epoch": 0.7887147335423198, "grad_norm": 8.57775592803955, "learning_rate": 1.258e-06, "loss": 6.2847, "step": 2516 }, { "epoch": 0.7890282131661442, "grad_norm": 11.213354110717773, "learning_rate": 1.2584999999999999e-06, "loss": 10.073, "step": 2517 }, { "epoch": 0.7893416927899687, "grad_norm": 12.789294242858887, "learning_rate": 1.2590000000000001e-06, "loss": 12.4063, "step": 2518 }, { "epoch": 0.7896551724137931, "grad_norm": 6.868427753448486, "learning_rate": 1.2595000000000002e-06, "loss": 5.4573, "step": 2519 }, { "epoch": 0.7899686520376176, "grad_norm": 6.975144863128662, "learning_rate": 1.26e-06, "loss": 7.0482, "step": 2520 }, { "epoch": 0.790282131661442, "grad_norm": 9.508519172668457, "learning_rate": 1.2605e-06, "loss": 9.9895, "step": 2521 }, { "epoch": 0.7905956112852665, "grad_norm": 12.231802940368652, "learning_rate": 1.261e-06, "loss": 10.782, "step": 2522 }, { "epoch": 0.7909090909090909, "grad_norm": 7.636473178863525, "learning_rate": 1.2615000000000001e-06, "loss": 7.4447, "step": 2523 }, { "epoch": 0.7912225705329153, "grad_norm": 14.089887619018555, "learning_rate": 1.2620000000000002e-06, "loss": 11.8825, "step": 2524 }, { "epoch": 0.7915360501567398, "grad_norm": 11.20523738861084, "learning_rate": 1.2625000000000002e-06, "loss": 10.0402, "step": 2525 }, { "epoch": 0.7918495297805642, "grad_norm": 10.182788848876953, "learning_rate": 1.263e-06, "loss": 7.4292, "step": 2526 }, { "epoch": 0.7921630094043887, "grad_norm": 8.2036771774292, "learning_rate": 1.2635e-06, "loss": 7.3361, "step": 2527 }, { "epoch": 0.7924764890282132, "grad_norm": 9.289824485778809, "learning_rate": 1.2640000000000003e-06, "loss": 7.6616, "step": 2528 }, { "epoch": 0.7927899686520377, "grad_norm": 10.154131889343262, "learning_rate": 1.2645000000000001e-06, "loss": 9.678, "step": 2529 }, { "epoch": 0.7931034482758621, "grad_norm": 8.044170379638672, "learning_rate": 1.2650000000000002e-06, "loss": 7.7272, "step": 2530 }, { "epoch": 0.7934169278996865, "grad_norm": 14.741924285888672, "learning_rate": 1.2655e-06, "loss": 11.6074, "step": 2531 }, { "epoch": 0.793730407523511, "grad_norm": 12.883913040161133, "learning_rate": 1.266e-06, "loss": 7.45, "step": 2532 }, { "epoch": 0.7940438871473354, "grad_norm": 14.291522026062012, "learning_rate": 1.2665000000000003e-06, "loss": 8.8233, "step": 2533 }, { "epoch": 0.7943573667711599, "grad_norm": 10.920492172241211, "learning_rate": 1.2670000000000001e-06, "loss": 8.2044, "step": 2534 }, { "epoch": 0.7946708463949843, "grad_norm": 9.552704811096191, "learning_rate": 1.2675000000000001e-06, "loss": 7.1354, "step": 2535 }, { "epoch": 0.7949843260188088, "grad_norm": 13.507481575012207, "learning_rate": 1.268e-06, "loss": 11.1275, "step": 2536 }, { "epoch": 0.7952978056426332, "grad_norm": 8.592397689819336, "learning_rate": 1.2685e-06, "loss": 6.2247, "step": 2537 }, { "epoch": 0.7956112852664576, "grad_norm": 8.421808242797852, "learning_rate": 1.2690000000000003e-06, "loss": 6.4027, "step": 2538 }, { "epoch": 0.7959247648902821, "grad_norm": 8.067540168762207, "learning_rate": 1.2695e-06, "loss": 8.3162, "step": 2539 }, { "epoch": 0.7962382445141066, "grad_norm": 8.634953498840332, "learning_rate": 1.2700000000000001e-06, "loss": 6.8064, "step": 2540 }, { "epoch": 0.7965517241379311, "grad_norm": 13.059370994567871, "learning_rate": 1.2705000000000002e-06, "loss": 10.4157, "step": 2541 }, { "epoch": 0.7968652037617555, "grad_norm": 7.6555304527282715, "learning_rate": 1.271e-06, "loss": 6.2527, "step": 2542 }, { "epoch": 0.79717868338558, "grad_norm": 9.67294692993164, "learning_rate": 1.2715000000000002e-06, "loss": 6.2133, "step": 2543 }, { "epoch": 0.7974921630094044, "grad_norm": 8.20207691192627, "learning_rate": 1.2720000000000003e-06, "loss": 7.4998, "step": 2544 }, { "epoch": 0.7978056426332288, "grad_norm": 9.94020938873291, "learning_rate": 1.2725e-06, "loss": 8.4282, "step": 2545 }, { "epoch": 0.7981191222570533, "grad_norm": 7.922036647796631, "learning_rate": 1.2730000000000001e-06, "loss": 6.9887, "step": 2546 }, { "epoch": 0.7984326018808777, "grad_norm": 11.113056182861328, "learning_rate": 1.2735e-06, "loss": 7.7985, "step": 2547 }, { "epoch": 0.7987460815047022, "grad_norm": 12.114452362060547, "learning_rate": 1.2740000000000002e-06, "loss": 8.92, "step": 2548 }, { "epoch": 0.7990595611285266, "grad_norm": 12.305609703063965, "learning_rate": 1.2745000000000002e-06, "loss": 10.3891, "step": 2549 }, { "epoch": 0.799373040752351, "grad_norm": 7.7950215339660645, "learning_rate": 1.275e-06, "loss": 6.9325, "step": 2550 }, { "epoch": 0.7996865203761756, "grad_norm": 12.936772346496582, "learning_rate": 1.2755000000000001e-06, "loss": 9.9608, "step": 2551 }, { "epoch": 0.8, "grad_norm": 9.42872428894043, "learning_rate": 1.276e-06, "loss": 7.2778, "step": 2552 }, { "epoch": 0.8003134796238245, "grad_norm": 12.10912036895752, "learning_rate": 1.2765000000000002e-06, "loss": 8.8379, "step": 2553 }, { "epoch": 0.8006269592476489, "grad_norm": 10.744391441345215, "learning_rate": 1.2770000000000002e-06, "loss": 8.0975, "step": 2554 }, { "epoch": 0.8009404388714734, "grad_norm": 9.00346565246582, "learning_rate": 1.2775e-06, "loss": 6.8766, "step": 2555 }, { "epoch": 0.8012539184952978, "grad_norm": 11.213250160217285, "learning_rate": 1.278e-06, "loss": 8.3033, "step": 2556 }, { "epoch": 0.8015673981191223, "grad_norm": 10.06607723236084, "learning_rate": 1.2785e-06, "loss": 6.8206, "step": 2557 }, { "epoch": 0.8018808777429467, "grad_norm": 16.033727645874023, "learning_rate": 1.2790000000000002e-06, "loss": 10.1014, "step": 2558 }, { "epoch": 0.8021943573667711, "grad_norm": 10.360695838928223, "learning_rate": 1.2795000000000002e-06, "loss": 7.3716, "step": 2559 }, { "epoch": 0.8025078369905956, "grad_norm": 16.054412841796875, "learning_rate": 1.28e-06, "loss": 9.5552, "step": 2560 }, { "epoch": 0.80282131661442, "grad_norm": 10.377695083618164, "learning_rate": 1.2805e-06, "loss": 7.2903, "step": 2561 }, { "epoch": 0.8031347962382445, "grad_norm": 10.614144325256348, "learning_rate": 1.281e-06, "loss": 7.9967, "step": 2562 }, { "epoch": 0.803448275862069, "grad_norm": 10.514567375183105, "learning_rate": 1.2815e-06, "loss": 6.7008, "step": 2563 }, { "epoch": 0.8037617554858935, "grad_norm": 8.607209205627441, "learning_rate": 1.2820000000000002e-06, "loss": 5.8722, "step": 2564 }, { "epoch": 0.8040752351097179, "grad_norm": 10.558499336242676, "learning_rate": 1.2825000000000002e-06, "loss": 7.2417, "step": 2565 }, { "epoch": 0.8043887147335423, "grad_norm": 17.161361694335938, "learning_rate": 1.283e-06, "loss": 13.5513, "step": 2566 }, { "epoch": 0.8047021943573668, "grad_norm": 11.054380416870117, "learning_rate": 1.2835e-06, "loss": 8.4185, "step": 2567 }, { "epoch": 0.8050156739811912, "grad_norm": 15.804526329040527, "learning_rate": 1.284e-06, "loss": 12.1668, "step": 2568 }, { "epoch": 0.8053291536050157, "grad_norm": 15.922082901000977, "learning_rate": 1.2845000000000002e-06, "loss": 11.7197, "step": 2569 }, { "epoch": 0.8056426332288401, "grad_norm": 15.542136192321777, "learning_rate": 1.2850000000000002e-06, "loss": 5.9863, "step": 2570 }, { "epoch": 0.8059561128526646, "grad_norm": 13.482390403747559, "learning_rate": 1.2855e-06, "loss": 8.9265, "step": 2571 }, { "epoch": 0.806269592476489, "grad_norm": 15.561339378356934, "learning_rate": 1.286e-06, "loss": 8.5574, "step": 2572 }, { "epoch": 0.8065830721003134, "grad_norm": 14.538595199584961, "learning_rate": 1.2864999999999999e-06, "loss": 11.4279, "step": 2573 }, { "epoch": 0.8068965517241379, "grad_norm": 14.449934005737305, "learning_rate": 1.2870000000000001e-06, "loss": 12.0358, "step": 2574 }, { "epoch": 0.8072100313479624, "grad_norm": 13.869661331176758, "learning_rate": 1.2875000000000002e-06, "loss": 10.5568, "step": 2575 }, { "epoch": 0.8075235109717869, "grad_norm": 10.084548950195312, "learning_rate": 1.288e-06, "loss": 6.7925, "step": 2576 }, { "epoch": 0.8078369905956113, "grad_norm": 8.329049110412598, "learning_rate": 1.2885e-06, "loss": 7.129, "step": 2577 }, { "epoch": 0.8081504702194358, "grad_norm": 15.03673267364502, "learning_rate": 1.289e-06, "loss": 10.9492, "step": 2578 }, { "epoch": 0.8084639498432602, "grad_norm": 11.203147888183594, "learning_rate": 1.2895e-06, "loss": 6.868, "step": 2579 }, { "epoch": 0.8087774294670846, "grad_norm": 14.008033752441406, "learning_rate": 1.2900000000000001e-06, "loss": 7.7931, "step": 2580 }, { "epoch": 0.8090909090909091, "grad_norm": 11.999140739440918, "learning_rate": 1.2905000000000002e-06, "loss": 5.7815, "step": 2581 }, { "epoch": 0.8094043887147335, "grad_norm": 14.67009449005127, "learning_rate": 1.291e-06, "loss": 10.9111, "step": 2582 }, { "epoch": 0.809717868338558, "grad_norm": 10.367982864379883, "learning_rate": 1.2915e-06, "loss": 7.1978, "step": 2583 }, { "epoch": 0.8100313479623824, "grad_norm": 15.708711624145508, "learning_rate": 1.2920000000000003e-06, "loss": 8.7541, "step": 2584 }, { "epoch": 0.8103448275862069, "grad_norm": 21.055742263793945, "learning_rate": 1.2925000000000001e-06, "loss": 9.4178, "step": 2585 }, { "epoch": 0.8106583072100313, "grad_norm": 11.601629257202148, "learning_rate": 1.2930000000000002e-06, "loss": 7.239, "step": 2586 }, { "epoch": 0.8109717868338558, "grad_norm": 19.32367515563965, "learning_rate": 1.2935e-06, "loss": 12.215, "step": 2587 }, { "epoch": 0.8112852664576803, "grad_norm": 12.446346282958984, "learning_rate": 1.294e-06, "loss": 8.1503, "step": 2588 }, { "epoch": 0.8115987460815047, "grad_norm": 15.01457691192627, "learning_rate": 1.2945000000000003e-06, "loss": 9.3474, "step": 2589 }, { "epoch": 0.8119122257053292, "grad_norm": 8.85611629486084, "learning_rate": 1.295e-06, "loss": 6.0037, "step": 2590 }, { "epoch": 0.8122257053291536, "grad_norm": 12.6907320022583, "learning_rate": 1.2955000000000001e-06, "loss": 6.035, "step": 2591 }, { "epoch": 0.812539184952978, "grad_norm": 29.200088500976562, "learning_rate": 1.296e-06, "loss": 15.6593, "step": 2592 }, { "epoch": 0.8128526645768025, "grad_norm": 13.449995994567871, "learning_rate": 1.2965e-06, "loss": 7.4237, "step": 2593 }, { "epoch": 0.8131661442006269, "grad_norm": 12.185378074645996, "learning_rate": 1.2970000000000002e-06, "loss": 7.6051, "step": 2594 }, { "epoch": 0.8134796238244514, "grad_norm": 13.01950454711914, "learning_rate": 1.2975e-06, "loss": 9.8676, "step": 2595 }, { "epoch": 0.8137931034482758, "grad_norm": 12.636890411376953, "learning_rate": 1.2980000000000001e-06, "loss": 7.8251, "step": 2596 }, { "epoch": 0.8141065830721003, "grad_norm": 14.959136962890625, "learning_rate": 1.2985e-06, "loss": 8.8174, "step": 2597 }, { "epoch": 0.8144200626959248, "grad_norm": 12.237345695495605, "learning_rate": 1.299e-06, "loss": 6.9888, "step": 2598 }, { "epoch": 0.8147335423197493, "grad_norm": 9.535272598266602, "learning_rate": 1.2995000000000002e-06, "loss": 5.9116, "step": 2599 }, { "epoch": 0.8150470219435737, "grad_norm": 15.491859436035156, "learning_rate": 1.3e-06, "loss": 7.9926, "step": 2600 }, { "epoch": 0.8153605015673981, "grad_norm": 11.194720268249512, "learning_rate": 1.3005e-06, "loss": 7.7533, "step": 2601 }, { "epoch": 0.8156739811912226, "grad_norm": 18.05571746826172, "learning_rate": 1.3010000000000001e-06, "loss": 9.5349, "step": 2602 }, { "epoch": 0.815987460815047, "grad_norm": 9.782732009887695, "learning_rate": 1.3015e-06, "loss": 6.5899, "step": 2603 }, { "epoch": 0.8163009404388715, "grad_norm": 15.516654968261719, "learning_rate": 1.3020000000000002e-06, "loss": 7.107, "step": 2604 }, { "epoch": 0.8166144200626959, "grad_norm": 13.18638801574707, "learning_rate": 1.3025000000000002e-06, "loss": 6.44, "step": 2605 }, { "epoch": 0.8169278996865204, "grad_norm": 14.614501953125, "learning_rate": 1.303e-06, "loss": 7.5293, "step": 2606 }, { "epoch": 0.8172413793103448, "grad_norm": 11.227118492126465, "learning_rate": 1.3035e-06, "loss": 7.8937, "step": 2607 }, { "epoch": 0.8175548589341692, "grad_norm": 15.58757495880127, "learning_rate": 1.304e-06, "loss": 8.0596, "step": 2608 }, { "epoch": 0.8178683385579937, "grad_norm": 22.042787551879883, "learning_rate": 1.3045000000000002e-06, "loss": 8.2819, "step": 2609 }, { "epoch": 0.8181818181818182, "grad_norm": 21.173282623291016, "learning_rate": 1.3050000000000002e-06, "loss": 10.4288, "step": 2610 }, { "epoch": 0.8184952978056427, "grad_norm": 18.32832145690918, "learning_rate": 1.3055e-06, "loss": 12.1147, "step": 2611 }, { "epoch": 0.8188087774294671, "grad_norm": 15.920726776123047, "learning_rate": 1.306e-06, "loss": 9.0858, "step": 2612 }, { "epoch": 0.8191222570532916, "grad_norm": 12.705510139465332, "learning_rate": 1.3065e-06, "loss": 8.1776, "step": 2613 }, { "epoch": 0.819435736677116, "grad_norm": 13.502321243286133, "learning_rate": 1.3070000000000001e-06, "loss": 7.3692, "step": 2614 }, { "epoch": 0.8197492163009404, "grad_norm": 15.334294319152832, "learning_rate": 1.3075000000000002e-06, "loss": 8.4849, "step": 2615 }, { "epoch": 0.8200626959247649, "grad_norm": 13.238600730895996, "learning_rate": 1.308e-06, "loss": 6.9999, "step": 2616 }, { "epoch": 0.8203761755485893, "grad_norm": 20.93524742126465, "learning_rate": 1.3085e-06, "loss": 7.2483, "step": 2617 }, { "epoch": 0.8206896551724138, "grad_norm": 13.544681549072266, "learning_rate": 1.309e-06, "loss": 7.8575, "step": 2618 }, { "epoch": 0.8210031347962382, "grad_norm": 14.38907241821289, "learning_rate": 1.3095000000000001e-06, "loss": 6.2083, "step": 2619 }, { "epoch": 0.8213166144200627, "grad_norm": 20.6708927154541, "learning_rate": 1.3100000000000002e-06, "loss": 9.0823, "step": 2620 }, { "epoch": 0.8216300940438871, "grad_norm": 12.318846702575684, "learning_rate": 1.3105000000000002e-06, "loss": 6.4326, "step": 2621 }, { "epoch": 0.8219435736677116, "grad_norm": 40.32315444946289, "learning_rate": 1.311e-06, "loss": 14.9634, "step": 2622 }, { "epoch": 0.8222570532915361, "grad_norm": 15.944631576538086, "learning_rate": 1.3115e-06, "loss": 7.8842, "step": 2623 }, { "epoch": 0.8225705329153605, "grad_norm": 12.112171173095703, "learning_rate": 1.3120000000000003e-06, "loss": 5.5937, "step": 2624 }, { "epoch": 0.822884012539185, "grad_norm": 13.884896278381348, "learning_rate": 1.3125000000000001e-06, "loss": 6.3687, "step": 2625 }, { "epoch": 0.8231974921630094, "grad_norm": 22.745121002197266, "learning_rate": 1.3130000000000002e-06, "loss": 12.1536, "step": 2626 }, { "epoch": 0.8235109717868339, "grad_norm": 18.648685455322266, "learning_rate": 1.3135e-06, "loss": 7.5739, "step": 2627 }, { "epoch": 0.8238244514106583, "grad_norm": 18.585302352905273, "learning_rate": 1.314e-06, "loss": 8.4919, "step": 2628 }, { "epoch": 0.8241379310344827, "grad_norm": 26.631200790405273, "learning_rate": 1.3145000000000003e-06, "loss": 8.0334, "step": 2629 }, { "epoch": 0.8244514106583072, "grad_norm": 22.677764892578125, "learning_rate": 1.3150000000000001e-06, "loss": 8.3661, "step": 2630 }, { "epoch": 0.8247648902821316, "grad_norm": 27.903852462768555, "learning_rate": 1.3155000000000002e-06, "loss": 10.0715, "step": 2631 }, { "epoch": 0.8250783699059561, "grad_norm": 26.193275451660156, "learning_rate": 1.316e-06, "loss": 9.49, "step": 2632 }, { "epoch": 0.8253918495297806, "grad_norm": 15.721944808959961, "learning_rate": 1.3165e-06, "loss": 7.3057, "step": 2633 }, { "epoch": 0.8257053291536051, "grad_norm": 16.153507232666016, "learning_rate": 1.3170000000000003e-06, "loss": 6.6323, "step": 2634 }, { "epoch": 0.8260188087774295, "grad_norm": 17.67839813232422, "learning_rate": 1.3175e-06, "loss": 7.1916, "step": 2635 }, { "epoch": 0.826332288401254, "grad_norm": 18.60005760192871, "learning_rate": 1.3180000000000001e-06, "loss": 7.4737, "step": 2636 }, { "epoch": 0.8266457680250784, "grad_norm": 16.918989181518555, "learning_rate": 1.3185e-06, "loss": 9.5991, "step": 2637 }, { "epoch": 0.8269592476489028, "grad_norm": 28.17279624938965, "learning_rate": 1.319e-06, "loss": 9.0958, "step": 2638 }, { "epoch": 0.8272727272727273, "grad_norm": 22.73778533935547, "learning_rate": 1.3195000000000002e-06, "loss": 6.3333, "step": 2639 }, { "epoch": 0.8275862068965517, "grad_norm": 17.459001541137695, "learning_rate": 1.32e-06, "loss": 6.0962, "step": 2640 }, { "epoch": 0.8278996865203762, "grad_norm": 13.217561721801758, "learning_rate": 1.3205e-06, "loss": 6.563, "step": 2641 }, { "epoch": 0.8282131661442006, "grad_norm": 15.55752182006836, "learning_rate": 1.3210000000000001e-06, "loss": 5.5877, "step": 2642 }, { "epoch": 0.828526645768025, "grad_norm": 56.43824005126953, "learning_rate": 1.3215e-06, "loss": 7.7625, "step": 2643 }, { "epoch": 0.8288401253918495, "grad_norm": 21.975671768188477, "learning_rate": 1.3220000000000002e-06, "loss": 9.7735, "step": 2644 }, { "epoch": 0.829153605015674, "grad_norm": 19.311349868774414, "learning_rate": 1.3225000000000003e-06, "loss": 8.4531, "step": 2645 }, { "epoch": 0.8294670846394985, "grad_norm": 39.477603912353516, "learning_rate": 1.323e-06, "loss": 14.1747, "step": 2646 }, { "epoch": 0.8297805642633229, "grad_norm": 23.62912368774414, "learning_rate": 1.3235000000000001e-06, "loss": 9.9994, "step": 2647 }, { "epoch": 0.8300940438871474, "grad_norm": 17.039447784423828, "learning_rate": 1.324e-06, "loss": 6.8518, "step": 2648 }, { "epoch": 0.8304075235109718, "grad_norm": 21.798898696899414, "learning_rate": 1.3245000000000002e-06, "loss": 8.8252, "step": 2649 }, { "epoch": 0.8307210031347962, "grad_norm": 31.107290267944336, "learning_rate": 1.3250000000000002e-06, "loss": 9.1876, "step": 2650 }, { "epoch": 0.8310344827586207, "grad_norm": 21.375978469848633, "learning_rate": 1.3255e-06, "loss": 6.6534, "step": 2651 }, { "epoch": 0.8313479623824451, "grad_norm": 16.57238006591797, "learning_rate": 1.326e-06, "loss": 5.7842, "step": 2652 }, { "epoch": 0.8316614420062696, "grad_norm": 25.424558639526367, "learning_rate": 1.3265e-06, "loss": 8.5218, "step": 2653 }, { "epoch": 0.831974921630094, "grad_norm": 32.927066802978516, "learning_rate": 1.3270000000000002e-06, "loss": 9.3753, "step": 2654 }, { "epoch": 0.8322884012539185, "grad_norm": 18.978023529052734, "learning_rate": 1.3275000000000002e-06, "loss": 6.1247, "step": 2655 }, { "epoch": 0.8326018808777429, "grad_norm": 38.223243713378906, "learning_rate": 1.328e-06, "loss": 11.0785, "step": 2656 }, { "epoch": 0.8329153605015674, "grad_norm": 16.340456008911133, "learning_rate": 1.3285e-06, "loss": 6.4915, "step": 2657 }, { "epoch": 0.8332288401253919, "grad_norm": 18.187742233276367, "learning_rate": 1.3290000000000001e-06, "loss": 8.5854, "step": 2658 }, { "epoch": 0.8335423197492163, "grad_norm": 17.474607467651367, "learning_rate": 1.3295000000000001e-06, "loss": 7.5377, "step": 2659 }, { "epoch": 0.8338557993730408, "grad_norm": 19.971267700195312, "learning_rate": 1.3300000000000002e-06, "loss": 7.2454, "step": 2660 }, { "epoch": 0.8341692789968652, "grad_norm": 14.669326782226562, "learning_rate": 1.3305000000000002e-06, "loss": 6.1117, "step": 2661 }, { "epoch": 0.8344827586206897, "grad_norm": 18.929859161376953, "learning_rate": 1.331e-06, "loss": 7.8862, "step": 2662 }, { "epoch": 0.8347962382445141, "grad_norm": 29.54069709777832, "learning_rate": 1.3315e-06, "loss": 7.5074, "step": 2663 }, { "epoch": 0.8351097178683385, "grad_norm": 21.967472076416016, "learning_rate": 1.3320000000000003e-06, "loss": 7.0764, "step": 2664 }, { "epoch": 0.835423197492163, "grad_norm": 45.06687927246094, "learning_rate": 1.3325000000000002e-06, "loss": 9.701, "step": 2665 }, { "epoch": 0.8357366771159874, "grad_norm": 24.894472122192383, "learning_rate": 1.3330000000000002e-06, "loss": 8.708, "step": 2666 }, { "epoch": 0.8360501567398119, "grad_norm": 20.155099868774414, "learning_rate": 1.3335e-06, "loss": 6.1855, "step": 2667 }, { "epoch": 0.8363636363636363, "grad_norm": 20.541330337524414, "learning_rate": 1.334e-06, "loss": 6.6755, "step": 2668 }, { "epoch": 0.8366771159874609, "grad_norm": 22.628978729248047, "learning_rate": 1.3345000000000003e-06, "loss": 8.2543, "step": 2669 }, { "epoch": 0.8369905956112853, "grad_norm": 22.389394760131836, "learning_rate": 1.3350000000000001e-06, "loss": 7.0468, "step": 2670 }, { "epoch": 0.8373040752351097, "grad_norm": 23.417722702026367, "learning_rate": 1.3355000000000002e-06, "loss": 6.792, "step": 2671 }, { "epoch": 0.8376175548589342, "grad_norm": 21.459794998168945, "learning_rate": 1.336e-06, "loss": 6.9663, "step": 2672 }, { "epoch": 0.8379310344827586, "grad_norm": 32.657989501953125, "learning_rate": 1.3365e-06, "loss": 5.9438, "step": 2673 }, { "epoch": 0.8382445141065831, "grad_norm": 47.682586669921875, "learning_rate": 1.3370000000000003e-06, "loss": 10.6549, "step": 2674 }, { "epoch": 0.8385579937304075, "grad_norm": 20.36661720275879, "learning_rate": 1.3375000000000001e-06, "loss": 6.8448, "step": 2675 }, { "epoch": 0.838871473354232, "grad_norm": 14.008045196533203, "learning_rate": 1.3380000000000001e-06, "loss": 6.8451, "step": 2676 }, { "epoch": 0.8391849529780564, "grad_norm": 17.530170440673828, "learning_rate": 1.3385e-06, "loss": 5.8831, "step": 2677 }, { "epoch": 0.8394984326018808, "grad_norm": 17.58102798461914, "learning_rate": 1.339e-06, "loss": 6.3528, "step": 2678 }, { "epoch": 0.8398119122257053, "grad_norm": 23.83525848388672, "learning_rate": 1.3395000000000003e-06, "loss": 7.7174, "step": 2679 }, { "epoch": 0.8401253918495298, "grad_norm": 27.301860809326172, "learning_rate": 1.34e-06, "loss": 5.6273, "step": 2680 }, { "epoch": 0.8404388714733543, "grad_norm": 29.820091247558594, "learning_rate": 1.3405000000000001e-06, "loss": 7.9886, "step": 2681 }, { "epoch": 0.8407523510971787, "grad_norm": 31.963327407836914, "learning_rate": 1.3410000000000002e-06, "loss": 6.5933, "step": 2682 }, { "epoch": 0.8410658307210032, "grad_norm": 23.809494018554688, "learning_rate": 1.3415e-06, "loss": 7.4239, "step": 2683 }, { "epoch": 0.8413793103448276, "grad_norm": 26.722423553466797, "learning_rate": 1.3420000000000002e-06, "loss": 6.8609, "step": 2684 }, { "epoch": 0.841692789968652, "grad_norm": 20.260234832763672, "learning_rate": 1.3425000000000003e-06, "loss": 7.6539, "step": 2685 }, { "epoch": 0.8420062695924765, "grad_norm": 29.476564407348633, "learning_rate": 1.343e-06, "loss": 8.3136, "step": 2686 }, { "epoch": 0.8423197492163009, "grad_norm": 17.281204223632812, "learning_rate": 1.3435000000000001e-06, "loss": 5.9271, "step": 2687 }, { "epoch": 0.8426332288401254, "grad_norm": 39.53363037109375, "learning_rate": 1.344e-06, "loss": 8.3828, "step": 2688 }, { "epoch": 0.8429467084639498, "grad_norm": 19.412582397460938, "learning_rate": 1.3445e-06, "loss": 6.6062, "step": 2689 }, { "epoch": 0.8432601880877743, "grad_norm": 31.106475830078125, "learning_rate": 1.3450000000000003e-06, "loss": 7.9131, "step": 2690 }, { "epoch": 0.8435736677115987, "grad_norm": 37.35240936279297, "learning_rate": 1.3455e-06, "loss": 8.8415, "step": 2691 }, { "epoch": 0.8438871473354232, "grad_norm": 26.39192771911621, "learning_rate": 1.3460000000000001e-06, "loss": 6.637, "step": 2692 }, { "epoch": 0.8442006269592477, "grad_norm": 21.605411529541016, "learning_rate": 1.3465e-06, "loss": 6.3293, "step": 2693 }, { "epoch": 0.8445141065830721, "grad_norm": 21.46399688720703, "learning_rate": 1.347e-06, "loss": 6.8912, "step": 2694 }, { "epoch": 0.8448275862068966, "grad_norm": 26.33460807800293, "learning_rate": 1.3475000000000002e-06, "loss": 8.7624, "step": 2695 }, { "epoch": 0.845141065830721, "grad_norm": 21.527713775634766, "learning_rate": 1.348e-06, "loss": 5.8286, "step": 2696 }, { "epoch": 0.8454545454545455, "grad_norm": 30.383634567260742, "learning_rate": 1.3485e-06, "loss": 9.5837, "step": 2697 }, { "epoch": 0.8457680250783699, "grad_norm": 33.206356048583984, "learning_rate": 1.3490000000000001e-06, "loss": 7.8532, "step": 2698 }, { "epoch": 0.8460815047021943, "grad_norm": 36.5329475402832, "learning_rate": 1.3495e-06, "loss": 5.4797, "step": 2699 }, { "epoch": 0.8463949843260188, "grad_norm": 30.74308204650879, "learning_rate": 1.3500000000000002e-06, "loss": 7.289, "step": 2700 }, { "epoch": 0.8467084639498432, "grad_norm": 38.16331481933594, "learning_rate": 1.3505000000000002e-06, "loss": 7.9117, "step": 2701 }, { "epoch": 0.8470219435736677, "grad_norm": 26.741302490234375, "learning_rate": 1.351e-06, "loss": 5.9725, "step": 2702 }, { "epoch": 0.8473354231974921, "grad_norm": 34.44478988647461, "learning_rate": 1.3515e-06, "loss": 7.5997, "step": 2703 }, { "epoch": 0.8476489028213167, "grad_norm": 23.709253311157227, "learning_rate": 1.352e-06, "loss": 6.8915, "step": 2704 }, { "epoch": 0.8479623824451411, "grad_norm": 24.891733169555664, "learning_rate": 1.3525000000000002e-06, "loss": 5.9107, "step": 2705 }, { "epoch": 0.8482758620689655, "grad_norm": 24.73875617980957, "learning_rate": 1.3530000000000002e-06, "loss": 6.2778, "step": 2706 }, { "epoch": 0.84858934169279, "grad_norm": 18.066926956176758, "learning_rate": 1.3535e-06, "loss": 6.1856, "step": 2707 }, { "epoch": 0.8489028213166144, "grad_norm": 40.347476959228516, "learning_rate": 1.354e-06, "loss": 7.0503, "step": 2708 }, { "epoch": 0.8492163009404389, "grad_norm": 31.80996322631836, "learning_rate": 1.3545e-06, "loss": 8.8319, "step": 2709 }, { "epoch": 0.8495297805642633, "grad_norm": 27.470136642456055, "learning_rate": 1.3550000000000002e-06, "loss": 7.3765, "step": 2710 }, { "epoch": 0.8498432601880878, "grad_norm": 83.07367706298828, "learning_rate": 1.3555000000000002e-06, "loss": 20.241, "step": 2711 }, { "epoch": 0.8501567398119122, "grad_norm": 30.60701560974121, "learning_rate": 1.356e-06, "loss": 6.9115, "step": 2712 }, { "epoch": 0.8504702194357366, "grad_norm": 31.232662200927734, "learning_rate": 1.3565e-06, "loss": 7.2203, "step": 2713 }, { "epoch": 0.8507836990595611, "grad_norm": 29.009035110473633, "learning_rate": 1.3569999999999999e-06, "loss": 7.3232, "step": 2714 }, { "epoch": 0.8510971786833855, "grad_norm": 20.113876342773438, "learning_rate": 1.3575000000000001e-06, "loss": 5.1684, "step": 2715 }, { "epoch": 0.8514106583072101, "grad_norm": 21.815345764160156, "learning_rate": 1.3580000000000002e-06, "loss": 6.1103, "step": 2716 }, { "epoch": 0.8517241379310345, "grad_norm": 28.0091552734375, "learning_rate": 1.3585e-06, "loss": 7.3426, "step": 2717 }, { "epoch": 0.852037617554859, "grad_norm": 30.732309341430664, "learning_rate": 1.359e-06, "loss": 6.6737, "step": 2718 }, { "epoch": 0.8523510971786834, "grad_norm": 24.529449462890625, "learning_rate": 1.3595e-06, "loss": 6.6836, "step": 2719 }, { "epoch": 0.8526645768025078, "grad_norm": 21.116939544677734, "learning_rate": 1.3600000000000001e-06, "loss": 6.1635, "step": 2720 }, { "epoch": 0.8529780564263323, "grad_norm": 27.107351303100586, "learning_rate": 1.3605000000000001e-06, "loss": 6.0729, "step": 2721 }, { "epoch": 0.8532915360501567, "grad_norm": 19.72542953491211, "learning_rate": 1.3610000000000002e-06, "loss": 4.9322, "step": 2722 }, { "epoch": 0.8536050156739812, "grad_norm": 20.744781494140625, "learning_rate": 1.3615e-06, "loss": 6.2112, "step": 2723 }, { "epoch": 0.8539184952978056, "grad_norm": 23.486122131347656, "learning_rate": 1.362e-06, "loss": 6.0354, "step": 2724 }, { "epoch": 0.85423197492163, "grad_norm": 37.22897720336914, "learning_rate": 1.3625000000000003e-06, "loss": 6.7026, "step": 2725 }, { "epoch": 0.8545454545454545, "grad_norm": 28.69023323059082, "learning_rate": 1.3630000000000001e-06, "loss": 6.349, "step": 2726 }, { "epoch": 0.854858934169279, "grad_norm": 31.381017684936523, "learning_rate": 1.3635000000000002e-06, "loss": 6.7903, "step": 2727 }, { "epoch": 0.8551724137931035, "grad_norm": 23.358259201049805, "learning_rate": 1.364e-06, "loss": 5.7168, "step": 2728 }, { "epoch": 0.8554858934169279, "grad_norm": 29.059661865234375, "learning_rate": 1.3645e-06, "loss": 6.2689, "step": 2729 }, { "epoch": 0.8557993730407524, "grad_norm": 28.51305389404297, "learning_rate": 1.3650000000000003e-06, "loss": 5.9654, "step": 2730 }, { "epoch": 0.8561128526645768, "grad_norm": 24.55581283569336, "learning_rate": 1.3655e-06, "loss": 5.0936, "step": 2731 }, { "epoch": 0.8564263322884013, "grad_norm": 28.580116271972656, "learning_rate": 1.3660000000000001e-06, "loss": 6.2048, "step": 2732 }, { "epoch": 0.8567398119122257, "grad_norm": 37.072593688964844, "learning_rate": 1.3665e-06, "loss": 6.9439, "step": 2733 }, { "epoch": 0.8570532915360501, "grad_norm": 24.373992919921875, "learning_rate": 1.367e-06, "loss": 6.8616, "step": 2734 }, { "epoch": 0.8573667711598746, "grad_norm": 37.73335647583008, "learning_rate": 1.3675000000000002e-06, "loss": 6.2993, "step": 2735 }, { "epoch": 0.857680250783699, "grad_norm": 80.02848815917969, "learning_rate": 1.368e-06, "loss": 14.2041, "step": 2736 }, { "epoch": 0.8579937304075235, "grad_norm": 52.7184944152832, "learning_rate": 1.3685000000000001e-06, "loss": 8.9868, "step": 2737 }, { "epoch": 0.8583072100313479, "grad_norm": 31.193477630615234, "learning_rate": 1.3690000000000001e-06, "loss": 7.0294, "step": 2738 }, { "epoch": 0.8586206896551725, "grad_norm": 30.320158004760742, "learning_rate": 1.3695e-06, "loss": 6.4768, "step": 2739 }, { "epoch": 0.8589341692789969, "grad_norm": 30.57577896118164, "learning_rate": 1.3700000000000002e-06, "loss": 5.7312, "step": 2740 }, { "epoch": 0.8592476489028213, "grad_norm": 25.569894790649414, "learning_rate": 1.3705000000000003e-06, "loss": 5.5384, "step": 2741 }, { "epoch": 0.8595611285266458, "grad_norm": 40.88070297241211, "learning_rate": 1.371e-06, "loss": 7.8681, "step": 2742 }, { "epoch": 0.8598746081504702, "grad_norm": 32.0083122253418, "learning_rate": 1.3715000000000001e-06, "loss": 6.5132, "step": 2743 }, { "epoch": 0.8601880877742947, "grad_norm": 33.715633392333984, "learning_rate": 1.372e-06, "loss": 6.0128, "step": 2744 }, { "epoch": 0.8605015673981191, "grad_norm": 43.099884033203125, "learning_rate": 1.3725000000000002e-06, "loss": 6.8138, "step": 2745 }, { "epoch": 0.8608150470219436, "grad_norm": 47.59553909301758, "learning_rate": 1.3730000000000002e-06, "loss": 8.2016, "step": 2746 }, { "epoch": 0.861128526645768, "grad_norm": 40.0462532043457, "learning_rate": 1.3735e-06, "loss": 6.6381, "step": 2747 }, { "epoch": 0.8614420062695924, "grad_norm": 35.96194076538086, "learning_rate": 1.374e-06, "loss": 7.399, "step": 2748 }, { "epoch": 0.8617554858934169, "grad_norm": 39.26766586303711, "learning_rate": 1.3745e-06, "loss": 6.7996, "step": 2749 }, { "epoch": 0.8620689655172413, "grad_norm": 51.32083511352539, "learning_rate": 1.3750000000000002e-06, "loss": 8.2172, "step": 2750 }, { "epoch": 0.8623824451410659, "grad_norm": 18.551576614379883, "learning_rate": 1.3755000000000002e-06, "loss": 6.0155, "step": 2751 }, { "epoch": 0.8626959247648903, "grad_norm": 20.321107864379883, "learning_rate": 1.376e-06, "loss": 5.1774, "step": 2752 }, { "epoch": 0.8630094043887148, "grad_norm": 25.40889549255371, "learning_rate": 1.3765e-06, "loss": 5.933, "step": 2753 }, { "epoch": 0.8633228840125392, "grad_norm": 40.312984466552734, "learning_rate": 1.377e-06, "loss": 6.7176, "step": 2754 }, { "epoch": 0.8636363636363636, "grad_norm": 38.97128677368164, "learning_rate": 1.3775000000000002e-06, "loss": 5.2988, "step": 2755 }, { "epoch": 0.8639498432601881, "grad_norm": 39.15882110595703, "learning_rate": 1.3780000000000002e-06, "loss": 7.5587, "step": 2756 }, { "epoch": 0.8642633228840125, "grad_norm": 37.12295913696289, "learning_rate": 1.3785e-06, "loss": 5.3687, "step": 2757 }, { "epoch": 0.864576802507837, "grad_norm": 69.34944152832031, "learning_rate": 1.379e-06, "loss": 10.4011, "step": 2758 }, { "epoch": 0.8648902821316614, "grad_norm": 46.22438430786133, "learning_rate": 1.3795e-06, "loss": 7.7162, "step": 2759 }, { "epoch": 0.8652037617554859, "grad_norm": 21.386926651000977, "learning_rate": 1.3800000000000001e-06, "loss": 4.8139, "step": 2760 }, { "epoch": 0.8655172413793103, "grad_norm": 41.118896484375, "learning_rate": 1.3805000000000002e-06, "loss": 6.995, "step": 2761 }, { "epoch": 0.8658307210031349, "grad_norm": 29.520702362060547, "learning_rate": 1.3810000000000002e-06, "loss": 5.1273, "step": 2762 }, { "epoch": 0.8661442006269593, "grad_norm": 36.20766830444336, "learning_rate": 1.3815e-06, "loss": 6.9384, "step": 2763 }, { "epoch": 0.8664576802507837, "grad_norm": 26.937095642089844, "learning_rate": 1.382e-06, "loss": 6.1545, "step": 2764 }, { "epoch": 0.8667711598746082, "grad_norm": 61.82893753051758, "learning_rate": 1.3825000000000003e-06, "loss": 8.894, "step": 2765 }, { "epoch": 0.8670846394984326, "grad_norm": 27.976224899291992, "learning_rate": 1.3830000000000001e-06, "loss": 4.7557, "step": 2766 }, { "epoch": 0.8673981191222571, "grad_norm": 79.30841827392578, "learning_rate": 1.3835000000000002e-06, "loss": 8.8842, "step": 2767 }, { "epoch": 0.8677115987460815, "grad_norm": 32.45915985107422, "learning_rate": 1.384e-06, "loss": 5.9744, "step": 2768 }, { "epoch": 0.868025078369906, "grad_norm": 27.075918197631836, "learning_rate": 1.3845e-06, "loss": 4.6374, "step": 2769 }, { "epoch": 0.8683385579937304, "grad_norm": 60.21990203857422, "learning_rate": 1.3850000000000003e-06, "loss": 6.1392, "step": 2770 }, { "epoch": 0.8686520376175548, "grad_norm": 38.180335998535156, "learning_rate": 1.3855000000000001e-06, "loss": 5.6472, "step": 2771 }, { "epoch": 0.8689655172413793, "grad_norm": 44.778709411621094, "learning_rate": 1.3860000000000002e-06, "loss": 6.1988, "step": 2772 }, { "epoch": 0.8692789968652037, "grad_norm": 44.12003707885742, "learning_rate": 1.3865e-06, "loss": 6.2107, "step": 2773 }, { "epoch": 0.8695924764890283, "grad_norm": 115.1012954711914, "learning_rate": 1.387e-06, "loss": 10.4227, "step": 2774 }, { "epoch": 0.8699059561128527, "grad_norm": 47.72286605834961, "learning_rate": 1.3875000000000003e-06, "loss": 5.7027, "step": 2775 }, { "epoch": 0.8702194357366771, "grad_norm": 42.00877380371094, "learning_rate": 1.388e-06, "loss": 7.9404, "step": 2776 }, { "epoch": 0.8705329153605016, "grad_norm": 30.263233184814453, "learning_rate": 1.3885000000000001e-06, "loss": 5.9391, "step": 2777 }, { "epoch": 0.870846394984326, "grad_norm": 32.171573638916016, "learning_rate": 1.3890000000000002e-06, "loss": 5.6703, "step": 2778 }, { "epoch": 0.8711598746081505, "grad_norm": 29.78256607055664, "learning_rate": 1.3895e-06, "loss": 5.0913, "step": 2779 }, { "epoch": 0.8714733542319749, "grad_norm": 60.958702087402344, "learning_rate": 1.3900000000000002e-06, "loss": 8.0207, "step": 2780 }, { "epoch": 0.8717868338557994, "grad_norm": 43.16213607788086, "learning_rate": 1.3905000000000003e-06, "loss": 6.1787, "step": 2781 }, { "epoch": 0.8721003134796238, "grad_norm": 46.14605712890625, "learning_rate": 1.3910000000000001e-06, "loss": 5.719, "step": 2782 }, { "epoch": 0.8724137931034482, "grad_norm": 43.69407272338867, "learning_rate": 1.3915000000000001e-06, "loss": 7.7847, "step": 2783 }, { "epoch": 0.8727272727272727, "grad_norm": 41.964820861816406, "learning_rate": 1.392e-06, "loss": 5.3784, "step": 2784 }, { "epoch": 0.8730407523510971, "grad_norm": 33.25990295410156, "learning_rate": 1.3925000000000002e-06, "loss": 5.3528, "step": 2785 }, { "epoch": 0.8733542319749217, "grad_norm": 37.829498291015625, "learning_rate": 1.3930000000000003e-06, "loss": 5.1219, "step": 2786 }, { "epoch": 0.8736677115987461, "grad_norm": 42.88932418823242, "learning_rate": 1.3935e-06, "loss": 6.6543, "step": 2787 }, { "epoch": 0.8739811912225706, "grad_norm": 63.86898422241211, "learning_rate": 1.3940000000000001e-06, "loss": 9.7226, "step": 2788 }, { "epoch": 0.874294670846395, "grad_norm": 45.50909423828125, "learning_rate": 1.3945e-06, "loss": 7.3516, "step": 2789 }, { "epoch": 0.8746081504702194, "grad_norm": 40.1636962890625, "learning_rate": 1.3950000000000002e-06, "loss": 5.1074, "step": 2790 }, { "epoch": 0.8749216300940439, "grad_norm": 37.66518020629883, "learning_rate": 1.3955000000000002e-06, "loss": 5.9449, "step": 2791 }, { "epoch": 0.8752351097178683, "grad_norm": 77.52034759521484, "learning_rate": 1.396e-06, "loss": 8.5409, "step": 2792 }, { "epoch": 0.8755485893416928, "grad_norm": 45.52507019042969, "learning_rate": 1.3965e-06, "loss": 6.171, "step": 2793 }, { "epoch": 0.8758620689655172, "grad_norm": 41.04618835449219, "learning_rate": 1.397e-06, "loss": 5.7984, "step": 2794 }, { "epoch": 0.8761755485893417, "grad_norm": 57.51970672607422, "learning_rate": 1.3975000000000002e-06, "loss": 6.6904, "step": 2795 }, { "epoch": 0.8764890282131661, "grad_norm": 59.90044403076172, "learning_rate": 1.3980000000000002e-06, "loss": 6.9548, "step": 2796 }, { "epoch": 0.8768025078369905, "grad_norm": 37.544612884521484, "learning_rate": 1.3985e-06, "loss": 6.0155, "step": 2797 }, { "epoch": 0.8771159874608151, "grad_norm": 43.11286926269531, "learning_rate": 1.399e-06, "loss": 5.8831, "step": 2798 }, { "epoch": 0.8774294670846395, "grad_norm": 22.86078453063965, "learning_rate": 1.3995000000000001e-06, "loss": 3.8691, "step": 2799 }, { "epoch": 0.877742946708464, "grad_norm": 24.594881057739258, "learning_rate": 1.4000000000000001e-06, "loss": 4.9944, "step": 2800 }, { "epoch": 0.8780564263322884, "grad_norm": 32.79366683959961, "learning_rate": 1.4005000000000002e-06, "loss": 4.5094, "step": 2801 }, { "epoch": 0.8783699059561129, "grad_norm": 40.043540954589844, "learning_rate": 1.4010000000000002e-06, "loss": 5.0449, "step": 2802 }, { "epoch": 0.8786833855799373, "grad_norm": 39.7636604309082, "learning_rate": 1.4015e-06, "loss": 4.8512, "step": 2803 }, { "epoch": 0.8789968652037617, "grad_norm": 38.312355041503906, "learning_rate": 1.402e-06, "loss": 4.9508, "step": 2804 }, { "epoch": 0.8793103448275862, "grad_norm": 40.00060272216797, "learning_rate": 1.4025000000000003e-06, "loss": 6.2614, "step": 2805 }, { "epoch": 0.8796238244514106, "grad_norm": 38.401275634765625, "learning_rate": 1.4030000000000002e-06, "loss": 5.2398, "step": 2806 }, { "epoch": 0.8799373040752351, "grad_norm": 40.27410125732422, "learning_rate": 1.4035000000000002e-06, "loss": 5.9456, "step": 2807 }, { "epoch": 0.8802507836990595, "grad_norm": 28.530675888061523, "learning_rate": 1.404e-06, "loss": 4.5628, "step": 2808 }, { "epoch": 0.8805642633228841, "grad_norm": 39.466033935546875, "learning_rate": 1.4045e-06, "loss": 5.2629, "step": 2809 }, { "epoch": 0.8808777429467085, "grad_norm": 29.432931900024414, "learning_rate": 1.4050000000000003e-06, "loss": 4.6265, "step": 2810 }, { "epoch": 0.881191222570533, "grad_norm": 48.174129486083984, "learning_rate": 1.4055000000000001e-06, "loss": 4.853, "step": 2811 }, { "epoch": 0.8815047021943574, "grad_norm": 53.55119323730469, "learning_rate": 1.4060000000000002e-06, "loss": 7.2377, "step": 2812 }, { "epoch": 0.8818181818181818, "grad_norm": 36.38772964477539, "learning_rate": 1.4065e-06, "loss": 5.2177, "step": 2813 }, { "epoch": 0.8821316614420063, "grad_norm": 34.217830657958984, "learning_rate": 1.407e-06, "loss": 5.3052, "step": 2814 }, { "epoch": 0.8824451410658307, "grad_norm": 42.84861373901367, "learning_rate": 1.4075e-06, "loss": 5.6933, "step": 2815 }, { "epoch": 0.8827586206896552, "grad_norm": 53.72264862060547, "learning_rate": 1.4080000000000001e-06, "loss": 5.3901, "step": 2816 }, { "epoch": 0.8830721003134796, "grad_norm": 53.93974685668945, "learning_rate": 1.4085000000000002e-06, "loss": 5.2435, "step": 2817 }, { "epoch": 0.883385579937304, "grad_norm": 28.91839027404785, "learning_rate": 1.4090000000000002e-06, "loss": 3.7364, "step": 2818 }, { "epoch": 0.8836990595611285, "grad_norm": 36.89215850830078, "learning_rate": 1.4095e-06, "loss": 4.5822, "step": 2819 }, { "epoch": 0.8840125391849529, "grad_norm": 33.66713333129883, "learning_rate": 1.41e-06, "loss": 4.6832, "step": 2820 }, { "epoch": 0.8843260188087775, "grad_norm": 44.25896453857422, "learning_rate": 1.4105000000000003e-06, "loss": 6.3235, "step": 2821 }, { "epoch": 0.8846394984326019, "grad_norm": 34.358551025390625, "learning_rate": 1.4110000000000001e-06, "loss": 5.9712, "step": 2822 }, { "epoch": 0.8849529780564264, "grad_norm": 36.53178787231445, "learning_rate": 1.4115000000000002e-06, "loss": 5.4185, "step": 2823 }, { "epoch": 0.8852664576802508, "grad_norm": 61.49184036254883, "learning_rate": 1.412e-06, "loss": 6.4534, "step": 2824 }, { "epoch": 0.8855799373040752, "grad_norm": 96.07270812988281, "learning_rate": 1.4125e-06, "loss": 9.2125, "step": 2825 }, { "epoch": 0.8858934169278997, "grad_norm": 44.58479690551758, "learning_rate": 1.4130000000000003e-06, "loss": 4.9576, "step": 2826 }, { "epoch": 0.8862068965517241, "grad_norm": 52.979209899902344, "learning_rate": 1.4135e-06, "loss": 5.3499, "step": 2827 }, { "epoch": 0.8865203761755486, "grad_norm": 37.45901107788086, "learning_rate": 1.4140000000000001e-06, "loss": 4.4372, "step": 2828 }, { "epoch": 0.886833855799373, "grad_norm": 44.921173095703125, "learning_rate": 1.4145e-06, "loss": 5.9111, "step": 2829 }, { "epoch": 0.8871473354231975, "grad_norm": 38.52549362182617, "learning_rate": 1.415e-06, "loss": 4.8988, "step": 2830 }, { "epoch": 0.8874608150470219, "grad_norm": 48.08058166503906, "learning_rate": 1.4155000000000003e-06, "loss": 4.7856, "step": 2831 }, { "epoch": 0.8877742946708463, "grad_norm": 32.446327209472656, "learning_rate": 1.416e-06, "loss": 4.5528, "step": 2832 }, { "epoch": 0.8880877742946709, "grad_norm": 86.36289978027344, "learning_rate": 1.4165000000000001e-06, "loss": 7.3174, "step": 2833 }, { "epoch": 0.8884012539184953, "grad_norm": 64.09017944335938, "learning_rate": 1.417e-06, "loss": 6.8142, "step": 2834 }, { "epoch": 0.8887147335423198, "grad_norm": 52.754486083984375, "learning_rate": 1.4175e-06, "loss": 6.6618, "step": 2835 }, { "epoch": 0.8890282131661442, "grad_norm": 43.92601013183594, "learning_rate": 1.4180000000000002e-06, "loss": 5.7932, "step": 2836 }, { "epoch": 0.8893416927899687, "grad_norm": 39.43013000488281, "learning_rate": 1.4185e-06, "loss": 5.5289, "step": 2837 }, { "epoch": 0.8896551724137931, "grad_norm": 42.97591018676758, "learning_rate": 1.419e-06, "loss": 4.7558, "step": 2838 }, { "epoch": 0.8899686520376175, "grad_norm": 43.5150032043457, "learning_rate": 1.4195000000000001e-06, "loss": 4.532, "step": 2839 }, { "epoch": 0.890282131661442, "grad_norm": 40.31664276123047, "learning_rate": 1.42e-06, "loss": 5.3421, "step": 2840 }, { "epoch": 0.8905956112852664, "grad_norm": 34.56455612182617, "learning_rate": 1.4205000000000002e-06, "loss": 4.0237, "step": 2841 }, { "epoch": 0.8909090909090909, "grad_norm": 30.62310218811035, "learning_rate": 1.4210000000000002e-06, "loss": 4.2576, "step": 2842 }, { "epoch": 0.8912225705329153, "grad_norm": 40.746028900146484, "learning_rate": 1.4215e-06, "loss": 4.587, "step": 2843 }, { "epoch": 0.8915360501567398, "grad_norm": 31.952014923095703, "learning_rate": 1.4220000000000001e-06, "loss": 3.8408, "step": 2844 }, { "epoch": 0.8918495297805643, "grad_norm": 26.450883865356445, "learning_rate": 1.4225e-06, "loss": 4.4055, "step": 2845 }, { "epoch": 0.8921630094043888, "grad_norm": 47.278404235839844, "learning_rate": 1.4230000000000002e-06, "loss": 4.9155, "step": 2846 }, { "epoch": 0.8924764890282132, "grad_norm": 55.63157272338867, "learning_rate": 1.4235000000000002e-06, "loss": 5.8403, "step": 2847 }, { "epoch": 0.8927899686520376, "grad_norm": 30.85458755493164, "learning_rate": 1.424e-06, "loss": 4.5499, "step": 2848 }, { "epoch": 0.8931034482758621, "grad_norm": 36.83681106567383, "learning_rate": 1.4245e-06, "loss": 5.0289, "step": 2849 }, { "epoch": 0.8934169278996865, "grad_norm": 32.328121185302734, "learning_rate": 1.425e-06, "loss": 4.2722, "step": 2850 }, { "epoch": 0.893730407523511, "grad_norm": 44.29472732543945, "learning_rate": 1.4255000000000002e-06, "loss": 4.4436, "step": 2851 }, { "epoch": 0.8940438871473354, "grad_norm": 46.61187744140625, "learning_rate": 1.4260000000000002e-06, "loss": 5.3737, "step": 2852 }, { "epoch": 0.8943573667711598, "grad_norm": 35.942100524902344, "learning_rate": 1.4265e-06, "loss": 4.4256, "step": 2853 }, { "epoch": 0.8946708463949843, "grad_norm": 32.366519927978516, "learning_rate": 1.427e-06, "loss": 3.8425, "step": 2854 }, { "epoch": 0.8949843260188087, "grad_norm": 35.66120529174805, "learning_rate": 1.4275e-06, "loss": 4.4804, "step": 2855 }, { "epoch": 0.8952978056426333, "grad_norm": 37.548492431640625, "learning_rate": 1.4280000000000001e-06, "loss": 5.4055, "step": 2856 }, { "epoch": 0.8956112852664577, "grad_norm": 41.95212936401367, "learning_rate": 1.4285000000000002e-06, "loss": 4.8837, "step": 2857 }, { "epoch": 0.8959247648902822, "grad_norm": 34.09566879272461, "learning_rate": 1.4290000000000002e-06, "loss": 4.8557, "step": 2858 }, { "epoch": 0.8962382445141066, "grad_norm": 52.16231918334961, "learning_rate": 1.4295e-06, "loss": 4.9953, "step": 2859 }, { "epoch": 0.896551724137931, "grad_norm": 38.20033264160156, "learning_rate": 1.43e-06, "loss": 4.2838, "step": 2860 }, { "epoch": 0.8968652037617555, "grad_norm": 52.53467559814453, "learning_rate": 1.4305000000000003e-06, "loss": 5.1196, "step": 2861 }, { "epoch": 0.8971786833855799, "grad_norm": 68.67937469482422, "learning_rate": 1.4310000000000001e-06, "loss": 5.4295, "step": 2862 }, { "epoch": 0.8974921630094044, "grad_norm": 39.02657699584961, "learning_rate": 1.4315000000000002e-06, "loss": 3.6579, "step": 2863 }, { "epoch": 0.8978056426332288, "grad_norm": 27.187744140625, "learning_rate": 1.432e-06, "loss": 3.7283, "step": 2864 }, { "epoch": 0.8981191222570533, "grad_norm": 42.428802490234375, "learning_rate": 1.4325e-06, "loss": 4.022, "step": 2865 }, { "epoch": 0.8984326018808777, "grad_norm": 29.769405364990234, "learning_rate": 1.4330000000000003e-06, "loss": 3.7161, "step": 2866 }, { "epoch": 0.8987460815047021, "grad_norm": 40.81663513183594, "learning_rate": 1.4335000000000001e-06, "loss": 4.9217, "step": 2867 }, { "epoch": 0.8990595611285267, "grad_norm": 48.31073760986328, "learning_rate": 1.4340000000000002e-06, "loss": 4.4489, "step": 2868 }, { "epoch": 0.8993730407523511, "grad_norm": 32.35671615600586, "learning_rate": 1.4345e-06, "loss": 4.1166, "step": 2869 }, { "epoch": 0.8996865203761756, "grad_norm": 46.619041442871094, "learning_rate": 1.435e-06, "loss": 6.4497, "step": 2870 }, { "epoch": 0.9, "grad_norm": 56.73939514160156, "learning_rate": 1.4355000000000003e-06, "loss": 5.7911, "step": 2871 }, { "epoch": 0.9003134796238245, "grad_norm": 31.99533462524414, "learning_rate": 1.436e-06, "loss": 3.6651, "step": 2872 }, { "epoch": 0.9006269592476489, "grad_norm": 37.6666374206543, "learning_rate": 1.4365000000000001e-06, "loss": 3.6502, "step": 2873 }, { "epoch": 0.9009404388714733, "grad_norm": 23.286277770996094, "learning_rate": 1.437e-06, "loss": 3.7086, "step": 2874 }, { "epoch": 0.9012539184952978, "grad_norm": 42.88998794555664, "learning_rate": 1.4375e-06, "loss": 5.5071, "step": 2875 }, { "epoch": 0.9015673981191222, "grad_norm": 37.3940315246582, "learning_rate": 1.4380000000000003e-06, "loss": 5.1036, "step": 2876 }, { "epoch": 0.9018808777429467, "grad_norm": 48.386104583740234, "learning_rate": 1.4385e-06, "loss": 4.5443, "step": 2877 }, { "epoch": 0.9021943573667711, "grad_norm": 63.84465408325195, "learning_rate": 1.4390000000000001e-06, "loss": 6.2741, "step": 2878 }, { "epoch": 0.9025078369905956, "grad_norm": 32.616859436035156, "learning_rate": 1.4395000000000002e-06, "loss": 4.203, "step": 2879 }, { "epoch": 0.9028213166144201, "grad_norm": 42.289005279541016, "learning_rate": 1.44e-06, "loss": 5.2399, "step": 2880 }, { "epoch": 0.9031347962382446, "grad_norm": 62.790042877197266, "learning_rate": 1.4405000000000002e-06, "loss": 4.648, "step": 2881 }, { "epoch": 0.903448275862069, "grad_norm": 51.915889739990234, "learning_rate": 1.4410000000000003e-06, "loss": 5.2985, "step": 2882 }, { "epoch": 0.9037617554858934, "grad_norm": 75.9524154663086, "learning_rate": 1.4415e-06, "loss": 6.2667, "step": 2883 }, { "epoch": 0.9040752351097179, "grad_norm": 28.834205627441406, "learning_rate": 1.4420000000000001e-06, "loss": 3.4066, "step": 2884 }, { "epoch": 0.9043887147335423, "grad_norm": 51.43450927734375, "learning_rate": 1.4425e-06, "loss": 6.0204, "step": 2885 }, { "epoch": 0.9047021943573668, "grad_norm": 76.39936065673828, "learning_rate": 1.4430000000000002e-06, "loss": 5.3644, "step": 2886 }, { "epoch": 0.9050156739811912, "grad_norm": 39.322025299072266, "learning_rate": 1.4435000000000002e-06, "loss": 4.0316, "step": 2887 }, { "epoch": 0.9053291536050156, "grad_norm": 30.268177032470703, "learning_rate": 1.444e-06, "loss": 4.4225, "step": 2888 }, { "epoch": 0.9056426332288401, "grad_norm": 32.31449508666992, "learning_rate": 1.4445e-06, "loss": 4.538, "step": 2889 }, { "epoch": 0.9059561128526645, "grad_norm": 26.894195556640625, "learning_rate": 1.445e-06, "loss": 3.7654, "step": 2890 }, { "epoch": 0.9062695924764891, "grad_norm": 67.31766510009766, "learning_rate": 1.4455000000000002e-06, "loss": 6.158, "step": 2891 }, { "epoch": 0.9065830721003135, "grad_norm": 24.468050003051758, "learning_rate": 1.4460000000000002e-06, "loss": 3.9294, "step": 2892 }, { "epoch": 0.906896551724138, "grad_norm": 47.52812576293945, "learning_rate": 1.4465e-06, "loss": 5.1586, "step": 2893 }, { "epoch": 0.9072100313479624, "grad_norm": 35.4903450012207, "learning_rate": 1.447e-06, "loss": 4.418, "step": 2894 }, { "epoch": 0.9075235109717869, "grad_norm": 31.2788028717041, "learning_rate": 1.4475000000000001e-06, "loss": 3.6287, "step": 2895 }, { "epoch": 0.9078369905956113, "grad_norm": 31.898345947265625, "learning_rate": 1.4480000000000002e-06, "loss": 3.8173, "step": 2896 }, { "epoch": 0.9081504702194357, "grad_norm": 59.66981506347656, "learning_rate": 1.4485000000000002e-06, "loss": 5.177, "step": 2897 }, { "epoch": 0.9084639498432602, "grad_norm": 50.7946891784668, "learning_rate": 1.4490000000000002e-06, "loss": 5.3483, "step": 2898 }, { "epoch": 0.9087774294670846, "grad_norm": 27.841205596923828, "learning_rate": 1.4495e-06, "loss": 3.2566, "step": 2899 }, { "epoch": 0.9090909090909091, "grad_norm": 29.948871612548828, "learning_rate": 1.45e-06, "loss": 4.0416, "step": 2900 }, { "epoch": 0.9094043887147335, "grad_norm": 23.60964584350586, "learning_rate": 1.4505000000000003e-06, "loss": 3.1585, "step": 2901 }, { "epoch": 0.909717868338558, "grad_norm": 33.11689376831055, "learning_rate": 1.4510000000000002e-06, "loss": 4.4712, "step": 2902 }, { "epoch": 0.9100313479623825, "grad_norm": 62.05201721191406, "learning_rate": 1.4515000000000002e-06, "loss": 5.4037, "step": 2903 }, { "epoch": 0.9103448275862069, "grad_norm": 105.85570526123047, "learning_rate": 1.452e-06, "loss": 5.0095, "step": 2904 }, { "epoch": 0.9106583072100314, "grad_norm": 45.89641189575195, "learning_rate": 1.4525e-06, "loss": 3.9751, "step": 2905 }, { "epoch": 0.9109717868338558, "grad_norm": 52.051944732666016, "learning_rate": 1.4530000000000003e-06, "loss": 4.7415, "step": 2906 }, { "epoch": 0.9112852664576803, "grad_norm": 67.7939682006836, "learning_rate": 1.4535000000000001e-06, "loss": 4.7054, "step": 2907 }, { "epoch": 0.9115987460815047, "grad_norm": 41.19660949707031, "learning_rate": 1.4540000000000002e-06, "loss": 5.2859, "step": 2908 }, { "epoch": 0.9119122257053291, "grad_norm": 46.74116516113281, "learning_rate": 1.4545e-06, "loss": 4.0541, "step": 2909 }, { "epoch": 0.9122257053291536, "grad_norm": 84.27909088134766, "learning_rate": 1.455e-06, "loss": 5.2779, "step": 2910 }, { "epoch": 0.912539184952978, "grad_norm": 53.99125289916992, "learning_rate": 1.4555000000000003e-06, "loss": 4.3379, "step": 2911 }, { "epoch": 0.9128526645768025, "grad_norm": 52.822410583496094, "learning_rate": 1.4560000000000001e-06, "loss": 5.3916, "step": 2912 }, { "epoch": 0.9131661442006269, "grad_norm": 53.39067840576172, "learning_rate": 1.4565000000000002e-06, "loss": 5.6752, "step": 2913 }, { "epoch": 0.9134796238244514, "grad_norm": 42.46992111206055, "learning_rate": 1.457e-06, "loss": 3.9782, "step": 2914 }, { "epoch": 0.9137931034482759, "grad_norm": 31.098447799682617, "learning_rate": 1.4575e-06, "loss": 3.5816, "step": 2915 }, { "epoch": 0.9141065830721004, "grad_norm": 49.15898513793945, "learning_rate": 1.4580000000000003e-06, "loss": 3.5684, "step": 2916 }, { "epoch": 0.9144200626959248, "grad_norm": 49.75162124633789, "learning_rate": 1.4585e-06, "loss": 4.8455, "step": 2917 }, { "epoch": 0.9147335423197492, "grad_norm": 51.76414108276367, "learning_rate": 1.4590000000000001e-06, "loss": 4.1124, "step": 2918 }, { "epoch": 0.9150470219435737, "grad_norm": 41.26679229736328, "learning_rate": 1.4595000000000002e-06, "loss": 4.2793, "step": 2919 }, { "epoch": 0.9153605015673981, "grad_norm": 43.92702865600586, "learning_rate": 1.46e-06, "loss": 4.6872, "step": 2920 }, { "epoch": 0.9156739811912226, "grad_norm": 34.27779769897461, "learning_rate": 1.4605000000000002e-06, "loss": 3.8484, "step": 2921 }, { "epoch": 0.915987460815047, "grad_norm": 31.705305099487305, "learning_rate": 1.4610000000000003e-06, "loss": 4.3887, "step": 2922 }, { "epoch": 0.9163009404388714, "grad_norm": 53.87886428833008, "learning_rate": 1.4615000000000001e-06, "loss": 4.4851, "step": 2923 }, { "epoch": 0.9166144200626959, "grad_norm": 43.97981262207031, "learning_rate": 1.4620000000000001e-06, "loss": 3.952, "step": 2924 }, { "epoch": 0.9169278996865203, "grad_norm": 36.8157958984375, "learning_rate": 1.4625e-06, "loss": 3.4762, "step": 2925 }, { "epoch": 0.9172413793103448, "grad_norm": 35.28675079345703, "learning_rate": 1.4630000000000002e-06, "loss": 4.2781, "step": 2926 }, { "epoch": 0.9175548589341693, "grad_norm": 37.354148864746094, "learning_rate": 1.4635000000000003e-06, "loss": 3.6935, "step": 2927 }, { "epoch": 0.9178683385579938, "grad_norm": 61.448665618896484, "learning_rate": 1.464e-06, "loss": 5.5435, "step": 2928 }, { "epoch": 0.9181818181818182, "grad_norm": 33.78268814086914, "learning_rate": 1.4645000000000001e-06, "loss": 4.5201, "step": 2929 }, { "epoch": 0.9184952978056427, "grad_norm": 37.02507019042969, "learning_rate": 1.465e-06, "loss": 3.5038, "step": 2930 }, { "epoch": 0.9188087774294671, "grad_norm": 29.58689308166504, "learning_rate": 1.4655000000000002e-06, "loss": 3.4179, "step": 2931 }, { "epoch": 0.9191222570532915, "grad_norm": 56.62324142456055, "learning_rate": 1.4660000000000002e-06, "loss": 3.9863, "step": 2932 }, { "epoch": 0.919435736677116, "grad_norm": 30.186080932617188, "learning_rate": 1.4665e-06, "loss": 3.6394, "step": 2933 }, { "epoch": 0.9197492163009404, "grad_norm": 91.04447937011719, "learning_rate": 1.467e-06, "loss": 5.1371, "step": 2934 }, { "epoch": 0.9200626959247649, "grad_norm": 94.30821990966797, "learning_rate": 1.4675000000000001e-06, "loss": 4.2498, "step": 2935 }, { "epoch": 0.9203761755485893, "grad_norm": 42.256168365478516, "learning_rate": 1.4680000000000002e-06, "loss": 4.5557, "step": 2936 }, { "epoch": 0.9206896551724137, "grad_norm": 38.17436599731445, "learning_rate": 1.4685000000000002e-06, "loss": 4.0215, "step": 2937 }, { "epoch": 0.9210031347962383, "grad_norm": 38.624698638916016, "learning_rate": 1.4690000000000003e-06, "loss": 3.5599, "step": 2938 }, { "epoch": 0.9213166144200627, "grad_norm": 38.51246643066406, "learning_rate": 1.4695e-06, "loss": 3.5211, "step": 2939 }, { "epoch": 0.9216300940438872, "grad_norm": 37.080074310302734, "learning_rate": 1.4700000000000001e-06, "loss": 5.7069, "step": 2940 }, { "epoch": 0.9219435736677116, "grad_norm": 37.63076400756836, "learning_rate": 1.4705e-06, "loss": 4.2385, "step": 2941 }, { "epoch": 0.9222570532915361, "grad_norm": 82.26602935791016, "learning_rate": 1.4710000000000002e-06, "loss": 3.6196, "step": 2942 }, { "epoch": 0.9225705329153605, "grad_norm": 44.46656036376953, "learning_rate": 1.4715000000000002e-06, "loss": 4.0862, "step": 2943 }, { "epoch": 0.922884012539185, "grad_norm": 34.88580322265625, "learning_rate": 1.472e-06, "loss": 3.5532, "step": 2944 }, { "epoch": 0.9231974921630094, "grad_norm": 41.69892501831055, "learning_rate": 1.4725e-06, "loss": 4.2038, "step": 2945 }, { "epoch": 0.9235109717868338, "grad_norm": 48.94770431518555, "learning_rate": 1.473e-06, "loss": 4.1548, "step": 2946 }, { "epoch": 0.9238244514106583, "grad_norm": 30.48263168334961, "learning_rate": 1.4735000000000002e-06, "loss": 3.5391, "step": 2947 }, { "epoch": 0.9241379310344827, "grad_norm": 37.1695671081543, "learning_rate": 1.4740000000000002e-06, "loss": 3.9293, "step": 2948 }, { "epoch": 0.9244514106583072, "grad_norm": 132.91793823242188, "learning_rate": 1.4745e-06, "loss": 4.7049, "step": 2949 }, { "epoch": 0.9247648902821317, "grad_norm": 52.81797790527344, "learning_rate": 1.475e-06, "loss": 4.619, "step": 2950 }, { "epoch": 0.9250783699059562, "grad_norm": 27.379749298095703, "learning_rate": 1.4754999999999999e-06, "loss": 3.5159, "step": 2951 }, { "epoch": 0.9253918495297806, "grad_norm": 37.0819206237793, "learning_rate": 1.4760000000000001e-06, "loss": 5.0518, "step": 2952 }, { "epoch": 0.925705329153605, "grad_norm": 60.48858642578125, "learning_rate": 1.4765000000000002e-06, "loss": 4.3286, "step": 2953 }, { "epoch": 0.9260188087774295, "grad_norm": 60.34362030029297, "learning_rate": 1.477e-06, "loss": 4.1523, "step": 2954 }, { "epoch": 0.9263322884012539, "grad_norm": 47.12152862548828, "learning_rate": 1.4775e-06, "loss": 3.5215, "step": 2955 }, { "epoch": 0.9266457680250784, "grad_norm": 33.692054748535156, "learning_rate": 1.478e-06, "loss": 3.7666, "step": 2956 }, { "epoch": 0.9269592476489028, "grad_norm": 32.28064727783203, "learning_rate": 1.4785000000000001e-06, "loss": 3.2797, "step": 2957 }, { "epoch": 0.9272727272727272, "grad_norm": 47.644588470458984, "learning_rate": 1.4790000000000002e-06, "loss": 4.0666, "step": 2958 }, { "epoch": 0.9275862068965517, "grad_norm": 37.37916564941406, "learning_rate": 1.4795000000000002e-06, "loss": 3.9653, "step": 2959 }, { "epoch": 0.9278996865203761, "grad_norm": 34.87968444824219, "learning_rate": 1.48e-06, "loss": 3.4317, "step": 2960 }, { "epoch": 0.9282131661442006, "grad_norm": 75.75358581542969, "learning_rate": 1.4805e-06, "loss": 5.1771, "step": 2961 }, { "epoch": 0.9285266457680251, "grad_norm": 39.766578674316406, "learning_rate": 1.4810000000000003e-06, "loss": 3.8669, "step": 2962 }, { "epoch": 0.9288401253918496, "grad_norm": 69.24491119384766, "learning_rate": 1.4815000000000001e-06, "loss": 4.7225, "step": 2963 }, { "epoch": 0.929153605015674, "grad_norm": 84.43096923828125, "learning_rate": 1.4820000000000002e-06, "loss": 4.2543, "step": 2964 }, { "epoch": 0.9294670846394985, "grad_norm": 41.02848434448242, "learning_rate": 1.4825e-06, "loss": 4.1251, "step": 2965 }, { "epoch": 0.9297805642633229, "grad_norm": 41.528892517089844, "learning_rate": 1.483e-06, "loss": 3.6634, "step": 2966 }, { "epoch": 0.9300940438871473, "grad_norm": 28.742746353149414, "learning_rate": 1.4835000000000003e-06, "loss": 4.2559, "step": 2967 }, { "epoch": 0.9304075235109718, "grad_norm": 52.95411682128906, "learning_rate": 1.4840000000000001e-06, "loss": 3.8098, "step": 2968 }, { "epoch": 0.9307210031347962, "grad_norm": 48.885982513427734, "learning_rate": 1.4845000000000001e-06, "loss": 4.4849, "step": 2969 }, { "epoch": 0.9310344827586207, "grad_norm": 36.8331298828125, "learning_rate": 1.485e-06, "loss": 4.2251, "step": 2970 }, { "epoch": 0.9313479623824451, "grad_norm": 33.98138427734375, "learning_rate": 1.4855e-06, "loss": 3.6462, "step": 2971 }, { "epoch": 0.9316614420062695, "grad_norm": 33.16653823852539, "learning_rate": 1.4860000000000003e-06, "loss": 3.4936, "step": 2972 }, { "epoch": 0.931974921630094, "grad_norm": 23.719505310058594, "learning_rate": 1.4865e-06, "loss": 3.3029, "step": 2973 }, { "epoch": 0.9322884012539185, "grad_norm": 34.3863410949707, "learning_rate": 1.4870000000000001e-06, "loss": 4.0512, "step": 2974 }, { "epoch": 0.932601880877743, "grad_norm": 31.7304744720459, "learning_rate": 1.4875000000000002e-06, "loss": 3.8679, "step": 2975 }, { "epoch": 0.9329153605015674, "grad_norm": 36.177642822265625, "learning_rate": 1.488e-06, "loss": 3.5125, "step": 2976 }, { "epoch": 0.9332288401253919, "grad_norm": 48.70751953125, "learning_rate": 1.4885000000000002e-06, "loss": 4.5736, "step": 2977 }, { "epoch": 0.9335423197492163, "grad_norm": 47.91338348388672, "learning_rate": 1.4890000000000003e-06, "loss": 4.3447, "step": 2978 }, { "epoch": 0.9338557993730408, "grad_norm": 38.861026763916016, "learning_rate": 1.4895e-06, "loss": 3.2831, "step": 2979 }, { "epoch": 0.9341692789968652, "grad_norm": 31.002511978149414, "learning_rate": 1.4900000000000001e-06, "loss": 3.899, "step": 2980 }, { "epoch": 0.9344827586206896, "grad_norm": 45.859554290771484, "learning_rate": 1.4905e-06, "loss": 3.303, "step": 2981 }, { "epoch": 0.9347962382445141, "grad_norm": 37.03654479980469, "learning_rate": 1.4910000000000002e-06, "loss": 3.4903, "step": 2982 }, { "epoch": 0.9351097178683385, "grad_norm": 51.43019485473633, "learning_rate": 1.4915000000000002e-06, "loss": 3.4536, "step": 2983 }, { "epoch": 0.935423197492163, "grad_norm": 24.4971866607666, "learning_rate": 1.492e-06, "loss": 3.258, "step": 2984 }, { "epoch": 0.9357366771159875, "grad_norm": 35.39488220214844, "learning_rate": 1.4925000000000001e-06, "loss": 3.8771, "step": 2985 }, { "epoch": 0.936050156739812, "grad_norm": 33.246829986572266, "learning_rate": 1.493e-06, "loss": 3.5061, "step": 2986 }, { "epoch": 0.9363636363636364, "grad_norm": 80.51388549804688, "learning_rate": 1.4935000000000002e-06, "loss": 3.0297, "step": 2987 }, { "epoch": 0.9366771159874608, "grad_norm": 32.59467315673828, "learning_rate": 1.4940000000000002e-06, "loss": 3.5332, "step": 2988 }, { "epoch": 0.9369905956112853, "grad_norm": 35.27291488647461, "learning_rate": 1.4945e-06, "loss": 3.4688, "step": 2989 }, { "epoch": 0.9373040752351097, "grad_norm": 34.252323150634766, "learning_rate": 1.495e-06, "loss": 4.57, "step": 2990 }, { "epoch": 0.9376175548589342, "grad_norm": 38.402591705322266, "learning_rate": 1.4955e-06, "loss": 3.7867, "step": 2991 }, { "epoch": 0.9379310344827586, "grad_norm": 60.38703918457031, "learning_rate": 1.4960000000000002e-06, "loss": 3.5518, "step": 2992 }, { "epoch": 0.938244514106583, "grad_norm": 49.44574737548828, "learning_rate": 1.4965000000000002e-06, "loss": 4.4428, "step": 2993 }, { "epoch": 0.9385579937304075, "grad_norm": 36.240272521972656, "learning_rate": 1.497e-06, "loss": 3.9223, "step": 2994 }, { "epoch": 0.9388714733542319, "grad_norm": 43.761146545410156, "learning_rate": 1.4975e-06, "loss": 4.0446, "step": 2995 }, { "epoch": 0.9391849529780564, "grad_norm": 37.836631774902344, "learning_rate": 1.498e-06, "loss": 4.5072, "step": 2996 }, { "epoch": 0.9394984326018809, "grad_norm": 43.07557678222656, "learning_rate": 1.4985000000000001e-06, "loss": 3.9492, "step": 2997 }, { "epoch": 0.9398119122257054, "grad_norm": 45.68537521362305, "learning_rate": 1.4990000000000002e-06, "loss": 3.9818, "step": 2998 }, { "epoch": 0.9401253918495298, "grad_norm": 39.437374114990234, "learning_rate": 1.4995000000000002e-06, "loss": 3.8931, "step": 2999 }, { "epoch": 0.9404388714733543, "grad_norm": 43.97903823852539, "learning_rate": 1.5e-06, "loss": 3.9224, "step": 3000 }, { "epoch": 0.9407523510971787, "grad_norm": 39.799964904785156, "learning_rate": 1.5005e-06, "loss": 4.0903, "step": 3001 }, { "epoch": 0.9410658307210031, "grad_norm": 31.609031677246094, "learning_rate": 1.5010000000000003e-06, "loss": 3.1316, "step": 3002 }, { "epoch": 0.9413793103448276, "grad_norm": 44.23056411743164, "learning_rate": 1.5015000000000002e-06, "loss": 3.8267, "step": 3003 }, { "epoch": 0.941692789968652, "grad_norm": 54.30805206298828, "learning_rate": 1.5020000000000002e-06, "loss": 4.5096, "step": 3004 }, { "epoch": 0.9420062695924765, "grad_norm": 35.796661376953125, "learning_rate": 1.5025e-06, "loss": 3.364, "step": 3005 }, { "epoch": 0.9423197492163009, "grad_norm": 38.125091552734375, "learning_rate": 1.503e-06, "loss": 3.3353, "step": 3006 }, { "epoch": 0.9426332288401253, "grad_norm": 55.130775451660156, "learning_rate": 1.5035000000000003e-06, "loss": 3.269, "step": 3007 }, { "epoch": 0.9429467084639498, "grad_norm": 33.1065788269043, "learning_rate": 1.5040000000000001e-06, "loss": 3.8043, "step": 3008 }, { "epoch": 0.9432601880877743, "grad_norm": 32.7559814453125, "learning_rate": 1.5045000000000002e-06, "loss": 3.3734, "step": 3009 }, { "epoch": 0.9435736677115988, "grad_norm": 54.44166946411133, "learning_rate": 1.505e-06, "loss": 3.8441, "step": 3010 }, { "epoch": 0.9438871473354232, "grad_norm": 42.8200569152832, "learning_rate": 1.5055e-06, "loss": 3.2862, "step": 3011 }, { "epoch": 0.9442006269592477, "grad_norm": 33.59670639038086, "learning_rate": 1.5060000000000003e-06, "loss": 3.8924, "step": 3012 }, { "epoch": 0.9445141065830721, "grad_norm": 39.061279296875, "learning_rate": 1.5065e-06, "loss": 3.7316, "step": 3013 }, { "epoch": 0.9448275862068966, "grad_norm": 37.6108283996582, "learning_rate": 1.5070000000000001e-06, "loss": 4.0276, "step": 3014 }, { "epoch": 0.945141065830721, "grad_norm": 31.421833038330078, "learning_rate": 1.5075000000000002e-06, "loss": 3.2679, "step": 3015 }, { "epoch": 0.9454545454545454, "grad_norm": 34.2425422668457, "learning_rate": 1.508e-06, "loss": 3.7938, "step": 3016 }, { "epoch": 0.9457680250783699, "grad_norm": 58.670650482177734, "learning_rate": 1.5085000000000003e-06, "loss": 4.0145, "step": 3017 }, { "epoch": 0.9460815047021943, "grad_norm": 20.024412155151367, "learning_rate": 1.5090000000000003e-06, "loss": 3.1288, "step": 3018 }, { "epoch": 0.9463949843260188, "grad_norm": 58.533390045166016, "learning_rate": 1.5095000000000001e-06, "loss": 3.3225, "step": 3019 }, { "epoch": 0.9467084639498433, "grad_norm": 54.33335876464844, "learning_rate": 1.5100000000000002e-06, "loss": 6.4205, "step": 3020 }, { "epoch": 0.9470219435736678, "grad_norm": 48.962615966796875, "learning_rate": 1.5105e-06, "loss": 4.808, "step": 3021 }, { "epoch": 0.9473354231974922, "grad_norm": 45.55655288696289, "learning_rate": 1.5110000000000002e-06, "loss": 4.7946, "step": 3022 }, { "epoch": 0.9476489028213166, "grad_norm": 50.928199768066406, "learning_rate": 1.5115000000000003e-06, "loss": 2.9686, "step": 3023 }, { "epoch": 0.9479623824451411, "grad_norm": 48.291133880615234, "learning_rate": 1.512e-06, "loss": 4.4833, "step": 3024 }, { "epoch": 0.9482758620689655, "grad_norm": 53.17333221435547, "learning_rate": 1.5125000000000001e-06, "loss": 3.8591, "step": 3025 }, { "epoch": 0.94858934169279, "grad_norm": 39.857887268066406, "learning_rate": 1.513e-06, "loss": 3.6312, "step": 3026 }, { "epoch": 0.9489028213166144, "grad_norm": 54.0508918762207, "learning_rate": 1.5135000000000002e-06, "loss": 3.8041, "step": 3027 }, { "epoch": 0.9492163009404389, "grad_norm": 80.95004272460938, "learning_rate": 1.5140000000000002e-06, "loss": 5.0221, "step": 3028 }, { "epoch": 0.9495297805642633, "grad_norm": 49.28989791870117, "learning_rate": 1.5145e-06, "loss": 3.7739, "step": 3029 }, { "epoch": 0.9498432601880877, "grad_norm": 47.68058395385742, "learning_rate": 1.5150000000000001e-06, "loss": 4.321, "step": 3030 }, { "epoch": 0.9501567398119122, "grad_norm": 38.63204574584961, "learning_rate": 1.5155e-06, "loss": 3.6415, "step": 3031 }, { "epoch": 0.9504702194357367, "grad_norm": 71.45962524414062, "learning_rate": 1.5160000000000002e-06, "loss": 4.3374, "step": 3032 }, { "epoch": 0.9507836990595612, "grad_norm": 33.07627868652344, "learning_rate": 1.5165000000000002e-06, "loss": 4.8849, "step": 3033 }, { "epoch": 0.9510971786833856, "grad_norm": 43.36479187011719, "learning_rate": 1.517e-06, "loss": 4.4194, "step": 3034 }, { "epoch": 0.95141065830721, "grad_norm": 39.327003479003906, "learning_rate": 1.5175e-06, "loss": 3.8361, "step": 3035 }, { "epoch": 0.9517241379310345, "grad_norm": 42.822105407714844, "learning_rate": 1.5180000000000001e-06, "loss": 3.613, "step": 3036 }, { "epoch": 0.9520376175548589, "grad_norm": 36.2182502746582, "learning_rate": 1.5185000000000002e-06, "loss": 2.9997, "step": 3037 }, { "epoch": 0.9523510971786834, "grad_norm": 53.4159049987793, "learning_rate": 1.5190000000000002e-06, "loss": 3.312, "step": 3038 }, { "epoch": 0.9526645768025078, "grad_norm": 31.3487491607666, "learning_rate": 1.5195000000000002e-06, "loss": 3.2757, "step": 3039 }, { "epoch": 0.9529780564263323, "grad_norm": 30.916259765625, "learning_rate": 1.52e-06, "loss": 3.5675, "step": 3040 }, { "epoch": 0.9532915360501567, "grad_norm": 38.4372673034668, "learning_rate": 1.5205e-06, "loss": 3.7267, "step": 3041 }, { "epoch": 0.9536050156739811, "grad_norm": 33.22764587402344, "learning_rate": 1.5210000000000003e-06, "loss": 4.0307, "step": 3042 }, { "epoch": 0.9539184952978056, "grad_norm": 44.29248809814453, "learning_rate": 1.5215000000000002e-06, "loss": 4.2784, "step": 3043 }, { "epoch": 0.9542319749216301, "grad_norm": 28.41790199279785, "learning_rate": 1.5220000000000002e-06, "loss": 4.0886, "step": 3044 }, { "epoch": 0.9545454545454546, "grad_norm": 27.84803009033203, "learning_rate": 1.5225e-06, "loss": 3.059, "step": 3045 }, { "epoch": 0.954858934169279, "grad_norm": 64.40411376953125, "learning_rate": 1.523e-06, "loss": 4.3402, "step": 3046 }, { "epoch": 0.9551724137931035, "grad_norm": 41.377769470214844, "learning_rate": 1.5235000000000003e-06, "loss": 4.1004, "step": 3047 }, { "epoch": 0.9554858934169279, "grad_norm": 28.825105667114258, "learning_rate": 1.5240000000000001e-06, "loss": 3.5855, "step": 3048 }, { "epoch": 0.9557993730407524, "grad_norm": 36.527618408203125, "learning_rate": 1.5245000000000002e-06, "loss": 3.7634, "step": 3049 }, { "epoch": 0.9561128526645768, "grad_norm": 48.31051254272461, "learning_rate": 1.525e-06, "loss": 3.2032, "step": 3050 }, { "epoch": 0.9564263322884012, "grad_norm": 49.092552185058594, "learning_rate": 1.5255e-06, "loss": 3.2192, "step": 3051 }, { "epoch": 0.9567398119122257, "grad_norm": 28.44539451599121, "learning_rate": 1.5260000000000003e-06, "loss": 3.1739, "step": 3052 }, { "epoch": 0.9570532915360501, "grad_norm": 48.597190856933594, "learning_rate": 1.5265000000000001e-06, "loss": 5.3544, "step": 3053 }, { "epoch": 0.9573667711598746, "grad_norm": 37.88104248046875, "learning_rate": 1.5270000000000002e-06, "loss": 3.1668, "step": 3054 }, { "epoch": 0.957680250783699, "grad_norm": 88.76260375976562, "learning_rate": 1.5275000000000002e-06, "loss": 4.3589, "step": 3055 }, { "epoch": 0.9579937304075236, "grad_norm": 31.21636962890625, "learning_rate": 1.528e-06, "loss": 3.3685, "step": 3056 }, { "epoch": 0.958307210031348, "grad_norm": 44.11945343017578, "learning_rate": 1.5285000000000003e-06, "loss": 3.3106, "step": 3057 }, { "epoch": 0.9586206896551724, "grad_norm": 46.66257095336914, "learning_rate": 1.5290000000000003e-06, "loss": 3.5772, "step": 3058 }, { "epoch": 0.9589341692789969, "grad_norm": 45.46147537231445, "learning_rate": 1.5295000000000001e-06, "loss": 6.1372, "step": 3059 }, { "epoch": 0.9592476489028213, "grad_norm": 37.89000701904297, "learning_rate": 1.5300000000000002e-06, "loss": 3.3852, "step": 3060 }, { "epoch": 0.9595611285266458, "grad_norm": 38.45157241821289, "learning_rate": 1.5305e-06, "loss": 3.4452, "step": 3061 }, { "epoch": 0.9598746081504702, "grad_norm": 29.158227920532227, "learning_rate": 1.5310000000000002e-06, "loss": 3.0486, "step": 3062 }, { "epoch": 0.9601880877742947, "grad_norm": 97.31979370117188, "learning_rate": 1.5315000000000003e-06, "loss": 4.3573, "step": 3063 }, { "epoch": 0.9605015673981191, "grad_norm": 63.818050384521484, "learning_rate": 1.5320000000000001e-06, "loss": 4.6664, "step": 3064 }, { "epoch": 0.9608150470219435, "grad_norm": 28.385637283325195, "learning_rate": 1.5325000000000002e-06, "loss": 3.4442, "step": 3065 }, { "epoch": 0.961128526645768, "grad_norm": 40.3345947265625, "learning_rate": 1.533e-06, "loss": 3.7686, "step": 3066 }, { "epoch": 0.9614420062695925, "grad_norm": 35.8496208190918, "learning_rate": 1.5335e-06, "loss": 3.5075, "step": 3067 }, { "epoch": 0.961755485893417, "grad_norm": 31.015871047973633, "learning_rate": 1.5340000000000003e-06, "loss": 3.5269, "step": 3068 }, { "epoch": 0.9620689655172414, "grad_norm": 26.467029571533203, "learning_rate": 1.5345e-06, "loss": 3.0488, "step": 3069 }, { "epoch": 0.9623824451410659, "grad_norm": 57.80925369262695, "learning_rate": 1.5350000000000001e-06, "loss": 4.5774, "step": 3070 }, { "epoch": 0.9626959247648903, "grad_norm": 68.83757019042969, "learning_rate": 1.5355e-06, "loss": 4.7216, "step": 3071 }, { "epoch": 0.9630094043887147, "grad_norm": 44.15302658081055, "learning_rate": 1.536e-06, "loss": 4.4129, "step": 3072 }, { "epoch": 0.9633228840125392, "grad_norm": 48.796966552734375, "learning_rate": 1.5365000000000002e-06, "loss": 3.7593, "step": 3073 }, { "epoch": 0.9636363636363636, "grad_norm": 52.55888748168945, "learning_rate": 1.537e-06, "loss": 3.7959, "step": 3074 }, { "epoch": 0.9639498432601881, "grad_norm": 31.702449798583984, "learning_rate": 1.5375e-06, "loss": 3.7677, "step": 3075 }, { "epoch": 0.9642633228840125, "grad_norm": 47.94102096557617, "learning_rate": 1.5380000000000001e-06, "loss": 3.5232, "step": 3076 }, { "epoch": 0.964576802507837, "grad_norm": 46.60993957519531, "learning_rate": 1.5385e-06, "loss": 4.0139, "step": 3077 }, { "epoch": 0.9648902821316614, "grad_norm": 28.9523983001709, "learning_rate": 1.5390000000000002e-06, "loss": 2.7694, "step": 3078 }, { "epoch": 0.965203761755486, "grad_norm": 44.399925231933594, "learning_rate": 1.5395000000000003e-06, "loss": 3.5384, "step": 3079 }, { "epoch": 0.9655172413793104, "grad_norm": 42.65142822265625, "learning_rate": 1.54e-06, "loss": 3.7671, "step": 3080 }, { "epoch": 0.9658307210031348, "grad_norm": 51.35871887207031, "learning_rate": 1.5405000000000001e-06, "loss": 3.4669, "step": 3081 }, { "epoch": 0.9661442006269593, "grad_norm": 41.16212844848633, "learning_rate": 1.541e-06, "loss": 3.4601, "step": 3082 }, { "epoch": 0.9664576802507837, "grad_norm": 36.21253204345703, "learning_rate": 1.5415000000000002e-06, "loss": 3.063, "step": 3083 }, { "epoch": 0.9667711598746082, "grad_norm": 29.120792388916016, "learning_rate": 1.5420000000000002e-06, "loss": 3.5839, "step": 3084 }, { "epoch": 0.9670846394984326, "grad_norm": 32.0816535949707, "learning_rate": 1.5425e-06, "loss": 3.7305, "step": 3085 }, { "epoch": 0.967398119122257, "grad_norm": 30.316551208496094, "learning_rate": 1.543e-06, "loss": 3.2055, "step": 3086 }, { "epoch": 0.9677115987460815, "grad_norm": 33.83134841918945, "learning_rate": 1.5435e-06, "loss": 3.3693, "step": 3087 }, { "epoch": 0.9680250783699059, "grad_norm": 47.991493225097656, "learning_rate": 1.5440000000000002e-06, "loss": 4.0049, "step": 3088 }, { "epoch": 0.9683385579937304, "grad_norm": 24.988670349121094, "learning_rate": 1.5445000000000002e-06, "loss": 3.81, "step": 3089 }, { "epoch": 0.9686520376175548, "grad_norm": 39.11276626586914, "learning_rate": 1.545e-06, "loss": 3.7266, "step": 3090 }, { "epoch": 0.9689655172413794, "grad_norm": 69.33980560302734, "learning_rate": 1.5455e-06, "loss": 4.3037, "step": 3091 }, { "epoch": 0.9692789968652038, "grad_norm": 43.25937271118164, "learning_rate": 1.546e-06, "loss": 3.6816, "step": 3092 }, { "epoch": 0.9695924764890282, "grad_norm": 56.0028190612793, "learning_rate": 1.5465000000000001e-06, "loss": 3.5904, "step": 3093 }, { "epoch": 0.9699059561128527, "grad_norm": 62.90016555786133, "learning_rate": 1.5470000000000002e-06, "loss": 4.1878, "step": 3094 }, { "epoch": 0.9702194357366771, "grad_norm": 27.32510757446289, "learning_rate": 1.5475000000000002e-06, "loss": 2.9088, "step": 3095 }, { "epoch": 0.9705329153605016, "grad_norm": 29.192907333374023, "learning_rate": 1.548e-06, "loss": 3.6891, "step": 3096 }, { "epoch": 0.970846394984326, "grad_norm": 29.538455963134766, "learning_rate": 1.5485e-06, "loss": 3.5678, "step": 3097 }, { "epoch": 0.9711598746081505, "grad_norm": 39.13743209838867, "learning_rate": 1.5490000000000003e-06, "loss": 3.5836, "step": 3098 }, { "epoch": 0.9714733542319749, "grad_norm": 35.45713424682617, "learning_rate": 1.5495000000000002e-06, "loss": 3.5822, "step": 3099 }, { "epoch": 0.9717868338557993, "grad_norm": 37.34980392456055, "learning_rate": 1.5500000000000002e-06, "loss": 3.4961, "step": 3100 }, { "epoch": 0.9721003134796238, "grad_norm": 52.05093765258789, "learning_rate": 1.5505e-06, "loss": 4.2753, "step": 3101 }, { "epoch": 0.9724137931034482, "grad_norm": 43.123226165771484, "learning_rate": 1.551e-06, "loss": 4.7245, "step": 3102 }, { "epoch": 0.9727272727272728, "grad_norm": 30.87041473388672, "learning_rate": 1.5515000000000003e-06, "loss": 3.187, "step": 3103 }, { "epoch": 0.9730407523510972, "grad_norm": 61.1220588684082, "learning_rate": 1.5520000000000001e-06, "loss": 3.7726, "step": 3104 }, { "epoch": 0.9733542319749217, "grad_norm": 24.472267150878906, "learning_rate": 1.5525000000000002e-06, "loss": 3.2109, "step": 3105 }, { "epoch": 0.9736677115987461, "grad_norm": 41.3580436706543, "learning_rate": 1.553e-06, "loss": 4.827, "step": 3106 }, { "epoch": 0.9739811912225705, "grad_norm": 23.06800079345703, "learning_rate": 1.5535e-06, "loss": 3.6523, "step": 3107 }, { "epoch": 0.974294670846395, "grad_norm": 45.498695373535156, "learning_rate": 1.5540000000000003e-06, "loss": 4.1928, "step": 3108 }, { "epoch": 0.9746081504702194, "grad_norm": 32.11305618286133, "learning_rate": 1.5545000000000001e-06, "loss": 3.395, "step": 3109 }, { "epoch": 0.9749216300940439, "grad_norm": 22.9804744720459, "learning_rate": 1.5550000000000001e-06, "loss": 2.9892, "step": 3110 }, { "epoch": 0.9752351097178683, "grad_norm": 33.303001403808594, "learning_rate": 1.5555e-06, "loss": 3.2164, "step": 3111 }, { "epoch": 0.9755485893416928, "grad_norm": 35.61023712158203, "learning_rate": 1.556e-06, "loss": 3.2413, "step": 3112 }, { "epoch": 0.9758620689655172, "grad_norm": 32.667476654052734, "learning_rate": 1.5565000000000003e-06, "loss": 4.2483, "step": 3113 }, { "epoch": 0.9761755485893417, "grad_norm": 30.902109146118164, "learning_rate": 1.557e-06, "loss": 3.5198, "step": 3114 }, { "epoch": 0.9764890282131662, "grad_norm": 29.37360191345215, "learning_rate": 1.5575000000000001e-06, "loss": 3.6189, "step": 3115 }, { "epoch": 0.9768025078369906, "grad_norm": 41.751956939697266, "learning_rate": 1.5580000000000002e-06, "loss": 3.3907, "step": 3116 }, { "epoch": 0.9771159874608151, "grad_norm": 48.0351676940918, "learning_rate": 1.5585e-06, "loss": 3.2474, "step": 3117 }, { "epoch": 0.9774294670846395, "grad_norm": 33.41267395019531, "learning_rate": 1.5590000000000002e-06, "loss": 3.4043, "step": 3118 }, { "epoch": 0.977742946708464, "grad_norm": 26.803760528564453, "learning_rate": 1.5595000000000003e-06, "loss": 3.016, "step": 3119 }, { "epoch": 0.9780564263322884, "grad_norm": 59.20711135864258, "learning_rate": 1.56e-06, "loss": 3.8982, "step": 3120 }, { "epoch": 0.9783699059561128, "grad_norm": 25.695276260375977, "learning_rate": 1.5605000000000001e-06, "loss": 2.929, "step": 3121 }, { "epoch": 0.9786833855799373, "grad_norm": 41.91707229614258, "learning_rate": 1.561e-06, "loss": 3.3785, "step": 3122 }, { "epoch": 0.9789968652037617, "grad_norm": 26.014942169189453, "learning_rate": 1.5615000000000002e-06, "loss": 4.3426, "step": 3123 }, { "epoch": 0.9793103448275862, "grad_norm": 43.355995178222656, "learning_rate": 1.5620000000000002e-06, "loss": 3.191, "step": 3124 }, { "epoch": 0.9796238244514106, "grad_norm": 52.087005615234375, "learning_rate": 1.5625e-06, "loss": 3.2585, "step": 3125 }, { "epoch": 0.9799373040752352, "grad_norm": 61.68992233276367, "learning_rate": 1.5630000000000001e-06, "loss": 3.7206, "step": 3126 }, { "epoch": 0.9802507836990596, "grad_norm": 34.2900390625, "learning_rate": 1.5635e-06, "loss": 3.2416, "step": 3127 }, { "epoch": 0.980564263322884, "grad_norm": 51.79576873779297, "learning_rate": 1.5640000000000002e-06, "loss": 5.8384, "step": 3128 }, { "epoch": 0.9808777429467085, "grad_norm": 26.661314010620117, "learning_rate": 1.5645000000000002e-06, "loss": 3.4689, "step": 3129 }, { "epoch": 0.9811912225705329, "grad_norm": 35.80608367919922, "learning_rate": 1.565e-06, "loss": 3.2023, "step": 3130 }, { "epoch": 0.9815047021943574, "grad_norm": 33.74555587768555, "learning_rate": 1.5655e-06, "loss": 3.3833, "step": 3131 }, { "epoch": 0.9818181818181818, "grad_norm": 40.17083740234375, "learning_rate": 1.566e-06, "loss": 3.5556, "step": 3132 }, { "epoch": 0.9821316614420063, "grad_norm": 39.58924865722656, "learning_rate": 1.5665000000000002e-06, "loss": 3.0832, "step": 3133 }, { "epoch": 0.9824451410658307, "grad_norm": 45.79573440551758, "learning_rate": 1.5670000000000002e-06, "loss": 3.5993, "step": 3134 }, { "epoch": 0.9827586206896551, "grad_norm": 30.124801635742188, "learning_rate": 1.5675e-06, "loss": 3.0965, "step": 3135 }, { "epoch": 0.9830721003134796, "grad_norm": 50.938419342041016, "learning_rate": 1.568e-06, "loss": 3.1747, "step": 3136 }, { "epoch": 0.983385579937304, "grad_norm": 37.803470611572266, "learning_rate": 1.5685e-06, "loss": 3.724, "step": 3137 }, { "epoch": 0.9836990595611286, "grad_norm": 45.604373931884766, "learning_rate": 1.5690000000000001e-06, "loss": 2.8654, "step": 3138 }, { "epoch": 0.984012539184953, "grad_norm": 29.251733779907227, "learning_rate": 1.5695000000000002e-06, "loss": 3.0197, "step": 3139 }, { "epoch": 0.9843260188087775, "grad_norm": 40.991294860839844, "learning_rate": 1.5700000000000002e-06, "loss": 3.0759, "step": 3140 }, { "epoch": 0.9846394984326019, "grad_norm": 39.70516586303711, "learning_rate": 1.5705e-06, "loss": 3.2381, "step": 3141 }, { "epoch": 0.9849529780564263, "grad_norm": 104.91124725341797, "learning_rate": 1.571e-06, "loss": 3.9789, "step": 3142 }, { "epoch": 0.9852664576802508, "grad_norm": 69.99864959716797, "learning_rate": 1.5715000000000003e-06, "loss": 5.4289, "step": 3143 }, { "epoch": 0.9855799373040752, "grad_norm": 47.31104278564453, "learning_rate": 1.5720000000000002e-06, "loss": 3.5801, "step": 3144 }, { "epoch": 0.9858934169278997, "grad_norm": 23.58669090270996, "learning_rate": 1.5725000000000002e-06, "loss": 3.2141, "step": 3145 }, { "epoch": 0.9862068965517241, "grad_norm": 60.75740432739258, "learning_rate": 1.573e-06, "loss": 3.1711, "step": 3146 }, { "epoch": 0.9865203761755486, "grad_norm": 35.55626678466797, "learning_rate": 1.5735e-06, "loss": 3.0043, "step": 3147 }, { "epoch": 0.986833855799373, "grad_norm": 56.99102783203125, "learning_rate": 1.5740000000000003e-06, "loss": 4.1672, "step": 3148 }, { "epoch": 0.9871473354231975, "grad_norm": 25.71376609802246, "learning_rate": 1.5745000000000001e-06, "loss": 2.92, "step": 3149 }, { "epoch": 0.987460815047022, "grad_norm": 31.668563842773438, "learning_rate": 1.5750000000000002e-06, "loss": 3.2671, "step": 3150 }, { "epoch": 0.9877742946708464, "grad_norm": 32.40145492553711, "learning_rate": 1.5755e-06, "loss": 3.1882, "step": 3151 }, { "epoch": 0.9880877742946709, "grad_norm": 33.56290054321289, "learning_rate": 1.576e-06, "loss": 2.97, "step": 3152 }, { "epoch": 0.9884012539184953, "grad_norm": 23.0459041595459, "learning_rate": 1.5765000000000003e-06, "loss": 2.8921, "step": 3153 }, { "epoch": 0.9887147335423198, "grad_norm": 31.948535919189453, "learning_rate": 1.577e-06, "loss": 3.348, "step": 3154 }, { "epoch": 0.9890282131661442, "grad_norm": 39.63069534301758, "learning_rate": 1.5775000000000001e-06, "loss": 2.9237, "step": 3155 }, { "epoch": 0.9893416927899686, "grad_norm": 32.34980392456055, "learning_rate": 1.5780000000000002e-06, "loss": 3.245, "step": 3156 }, { "epoch": 0.9896551724137931, "grad_norm": 41.89958953857422, "learning_rate": 1.5785e-06, "loss": 3.4284, "step": 3157 }, { "epoch": 0.9899686520376175, "grad_norm": 51.7044792175293, "learning_rate": 1.5790000000000003e-06, "loss": 3.0146, "step": 3158 }, { "epoch": 0.990282131661442, "grad_norm": 110.62555694580078, "learning_rate": 1.5795000000000003e-06, "loss": 3.7889, "step": 3159 }, { "epoch": 0.9905956112852664, "grad_norm": 42.60169982910156, "learning_rate": 1.5800000000000001e-06, "loss": 4.1996, "step": 3160 }, { "epoch": 0.990909090909091, "grad_norm": 45.62926483154297, "learning_rate": 1.5805000000000002e-06, "loss": 2.8792, "step": 3161 }, { "epoch": 0.9912225705329154, "grad_norm": 29.770870208740234, "learning_rate": 1.581e-06, "loss": 3.162, "step": 3162 }, { "epoch": 0.9915360501567398, "grad_norm": 40.25620651245117, "learning_rate": 1.5815000000000002e-06, "loss": 3.2687, "step": 3163 }, { "epoch": 0.9918495297805643, "grad_norm": 51.3452262878418, "learning_rate": 1.5820000000000003e-06, "loss": 3.6536, "step": 3164 }, { "epoch": 0.9921630094043887, "grad_norm": 42.561981201171875, "learning_rate": 1.5825e-06, "loss": 3.2372, "step": 3165 }, { "epoch": 0.9924764890282132, "grad_norm": 104.1058578491211, "learning_rate": 1.5830000000000001e-06, "loss": 3.6081, "step": 3166 }, { "epoch": 0.9927899686520376, "grad_norm": 33.43682098388672, "learning_rate": 1.5835e-06, "loss": 3.0076, "step": 3167 }, { "epoch": 0.993103448275862, "grad_norm": 31.840662002563477, "learning_rate": 1.5840000000000002e-06, "loss": 3.4222, "step": 3168 }, { "epoch": 0.9934169278996865, "grad_norm": 39.76411437988281, "learning_rate": 1.5845000000000002e-06, "loss": 3.4354, "step": 3169 }, { "epoch": 0.9937304075235109, "grad_norm": 45.577274322509766, "learning_rate": 1.585e-06, "loss": 4.2827, "step": 3170 }, { "epoch": 0.9940438871473354, "grad_norm": 40.2696647644043, "learning_rate": 1.5855000000000001e-06, "loss": 3.1225, "step": 3171 }, { "epoch": 0.9943573667711598, "grad_norm": 37.35997009277344, "learning_rate": 1.586e-06, "loss": 3.626, "step": 3172 }, { "epoch": 0.9946708463949844, "grad_norm": 29.363245010375977, "learning_rate": 1.5865000000000002e-06, "loss": 3.437, "step": 3173 }, { "epoch": 0.9949843260188088, "grad_norm": 30.479536056518555, "learning_rate": 1.5870000000000002e-06, "loss": 3.4304, "step": 3174 }, { "epoch": 0.9952978056426333, "grad_norm": 27.580543518066406, "learning_rate": 1.5875e-06, "loss": 3.2106, "step": 3175 }, { "epoch": 0.9956112852664577, "grad_norm": 39.4871711730957, "learning_rate": 1.588e-06, "loss": 3.465, "step": 3176 }, { "epoch": 0.9959247648902821, "grad_norm": 20.305816650390625, "learning_rate": 1.5885000000000001e-06, "loss": 2.8586, "step": 3177 }, { "epoch": 0.9962382445141066, "grad_norm": 54.459625244140625, "learning_rate": 1.5890000000000002e-06, "loss": 3.8607, "step": 3178 }, { "epoch": 0.996551724137931, "grad_norm": 45.21859359741211, "learning_rate": 1.5895000000000002e-06, "loss": 4.0433, "step": 3179 }, { "epoch": 0.9968652037617555, "grad_norm": 30.275541305541992, "learning_rate": 1.5900000000000002e-06, "loss": 3.1236, "step": 3180 }, { "epoch": 0.9971786833855799, "grad_norm": 24.845712661743164, "learning_rate": 1.5905e-06, "loss": 3.052, "step": 3181 }, { "epoch": 0.9974921630094044, "grad_norm": 47.068382263183594, "learning_rate": 1.591e-06, "loss": 4.1035, "step": 3182 }, { "epoch": 0.9978056426332288, "grad_norm": 86.9577865600586, "learning_rate": 1.5915000000000003e-06, "loss": 3.6548, "step": 3183 }, { "epoch": 0.9981191222570532, "grad_norm": 45.81187057495117, "learning_rate": 1.5920000000000002e-06, "loss": 4.0105, "step": 3184 }, { "epoch": 0.9984326018808778, "grad_norm": 54.567386627197266, "learning_rate": 1.5925000000000002e-06, "loss": 3.7583, "step": 3185 }, { "epoch": 0.9987460815047022, "grad_norm": 45.75572967529297, "learning_rate": 1.593e-06, "loss": 3.3029, "step": 3186 }, { "epoch": 0.9990595611285267, "grad_norm": 56.43427658081055, "learning_rate": 1.5935e-06, "loss": 2.8933, "step": 3187 }, { "epoch": 0.9993730407523511, "grad_norm": 41.16676712036133, "learning_rate": 1.594e-06, "loss": 3.0468, "step": 3188 }, { "epoch": 0.9996865203761756, "grad_norm": 38.499820709228516, "learning_rate": 1.5945000000000001e-06, "loss": 3.9996, "step": 3189 }, { "epoch": 1.0, "grad_norm": 20.5953426361084, "learning_rate": 1.5950000000000002e-06, "loss": 2.7745, "step": 3190 } ], "logging_steps": 1, "max_steps": 12760, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.717135139930112e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }