diff --git "a/results_longbunny2/trainer_state.json" "b/results_longbunny2/trainer_state.json" new file mode 100644--- /dev/null +++ "b/results_longbunny2/trainer_state.json" @@ -0,0 +1,79732 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999451110403653, + "eval_steps": 500, + "global_step": 11386, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 347.5937256852774, + "learning_rate": 2.9239766081871347e-08, + "loss": 2.557, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 403.586877686182, + "learning_rate": 5.847953216374269e-08, + "loss": 2.7679, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 152.32269164083206, + "learning_rate": 8.771929824561404e-08, + "loss": 3.1086, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 118.39141270793344, + "learning_rate": 1.1695906432748539e-07, + "loss": 2.9327, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 177.63423026272494, + "learning_rate": 1.4619883040935672e-07, + "loss": 3.222, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 111.87136737162852, + "learning_rate": 1.7543859649122808e-07, + "loss": 2.9428, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 267.44995179000966, + "learning_rate": 2.046783625730994e-07, + "loss": 2.7322, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 544.8964379717203, + "learning_rate": 2.3391812865497077e-07, + "loss": 3.0272, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 269.54615465644605, + "learning_rate": 2.6315789473684213e-07, + "loss": 2.7684, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 432.4701306795495, + "learning_rate": 2.9239766081871344e-07, + "loss": 2.8973, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 100.9224084143791, + "learning_rate": 3.216374269005848e-07, + "loss": 2.6408, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 123.3081578243168, + "learning_rate": 3.5087719298245616e-07, + "loss": 2.7623, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 160.42727350954908, + "learning_rate": 3.801169590643275e-07, + "loss": 2.7257, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 5.437772414002715, + "learning_rate": 4.093567251461988e-07, + "loss": 0.6067, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 103.6225982481011, + "learning_rate": 4.385964912280702e-07, + "loss": 2.3796, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 84.51212916019416, + "learning_rate": 4.6783625730994155e-07, + "loss": 2.4342, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 188.82951219172134, + "learning_rate": 4.970760233918129e-07, + "loss": 2.616, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 205.67802729861356, + "learning_rate": 5.263157894736843e-07, + "loss": 2.4399, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 85.03318343958462, + "learning_rate": 5.555555555555555e-07, + "loss": 1.7578, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 424.8792229344935, + "learning_rate": 5.847953216374269e-07, + "loss": 2.0997, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 70.92536195403804, + "learning_rate": 6.140350877192982e-07, + "loss": 1.8684, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 81.73667799627943, + "learning_rate": 6.432748538011696e-07, + "loss": 1.8637, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 167.192009630662, + "learning_rate": 6.72514619883041e-07, + "loss": 1.9511, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 77.47792988702194, + 
"learning_rate": 7.017543859649123e-07, + "loss": 1.7912, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 58.01236787397561, + "learning_rate": 7.309941520467837e-07, + "loss": 1.5989, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 47.913427714343584, + "learning_rate": 7.60233918128655e-07, + "loss": 1.4761, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 25.497775080804672, + "learning_rate": 7.894736842105263e-07, + "loss": 1.3667, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 67.61360688875955, + "learning_rate": 8.187134502923977e-07, + "loss": 1.285, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 38.745279269672544, + "learning_rate": 8.47953216374269e-07, + "loss": 1.3587, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 24.367073643717564, + "learning_rate": 8.771929824561404e-07, + "loss": 1.3687, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 51.09290684131207, + "learning_rate": 9.064327485380117e-07, + "loss": 1.4027, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 234.13118949374146, + "learning_rate": 9.356725146198831e-07, + "loss": 1.2843, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 37.42793312102058, + "learning_rate": 9.649122807017545e-07, + "loss": 1.2314, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 60.13018693821932, + "learning_rate": 9.941520467836258e-07, + "loss": 1.3765, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 41.75479820059971, + "learning_rate": 1.0233918128654972e-06, + "loss": 1.5821, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 22.62016812815393, + "learning_rate": 1.0526315789473685e-06, + "loss": 1.2895, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 47.70543950881478, + "learning_rate": 1.0818713450292399e-06, + "loss": 1.3365, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 26.894280148786937, + "learning_rate": 1.111111111111111e-06, + "loss": 1.1059, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 40.78328947296494, + "learning_rate": 1.1403508771929824e-06, + "loss": 1.2093, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 144.77942580970827, + "learning_rate": 1.1695906432748538e-06, + "loss": 1.4183, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 43.09983904149294, + "learning_rate": 1.1988304093567251e-06, + "loss": 1.1724, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 34.036222771257904, + "learning_rate": 1.2280701754385965e-06, + "loss": 1.2109, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 98.21381985375224, + "learning_rate": 1.2573099415204678e-06, + "loss": 1.5304, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 30.944403653061578, + "learning_rate": 1.2865497076023392e-06, + "loss": 1.1125, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 29.386768076164103, + "learning_rate": 1.3157894736842106e-06, + "loss": 1.1044, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 33.56483811026299, + "learning_rate": 1.345029239766082e-06, + "loss": 1.4484, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 50.974690733262506, + "learning_rate": 1.3742690058479533e-06, + "loss": 1.3919, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 30.18943952541861, + "learning_rate": 1.4035087719298246e-06, + "loss": 1.1426, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 5.66199792622841, + "learning_rate": 1.432748538011696e-06, + "loss": 0.5772, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 5.782237125806837, + "learning_rate": 1.4619883040935674e-06, + "loss": 0.5502, + "step": 50 + }, + { + "epoch": 0.0, + 
"grad_norm": 21.82170288255331, + "learning_rate": 1.4912280701754387e-06, + "loss": 1.2466, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 39.81435330527878, + "learning_rate": 1.52046783625731e-06, + "loss": 1.0742, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 72.13841272486157, + "learning_rate": 1.5497076023391812e-06, + "loss": 1.1555, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 32.9045594964733, + "learning_rate": 1.5789473684210526e-06, + "loss": 1.1046, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 29.78691215990834, + "learning_rate": 1.608187134502924e-06, + "loss": 1.1509, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 59.05785471185873, + "learning_rate": 1.6374269005847953e-06, + "loss": 1.0121, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 30.13634520605677, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.0299, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 153.45037276800068, + "learning_rate": 1.695906432748538e-06, + "loss": 1.1688, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 5.5973638524717355, + "learning_rate": 1.7251461988304094e-06, + "loss": 0.6377, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 43.52447095811426, + "learning_rate": 1.7543859649122807e-06, + "loss": 1.0731, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 34.985697783663966, + "learning_rate": 1.783625730994152e-06, + "loss": 0.9481, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 70.42864291692356, + "learning_rate": 1.8128654970760235e-06, + "loss": 1.0717, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 58.83754382251564, + "learning_rate": 1.8421052631578948e-06, + "loss": 0.8048, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 3.460095768069285, + "learning_rate": 1.8713450292397662e-06, + "loss": 0.504, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 89.13120457055453, + "learning_rate": 1.9005847953216375e-06, + "loss": 1.1585, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 4.881312993680846, + "learning_rate": 1.929824561403509e-06, + "loss": 0.4976, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 24.568993550842894, + "learning_rate": 1.9590643274853803e-06, + "loss": 1.13, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 35.699389935333, + "learning_rate": 1.9883040935672516e-06, + "loss": 1.251, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 143.1360477393452, + "learning_rate": 2.017543859649123e-06, + "loss": 1.0924, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 3.7832323140343584, + "learning_rate": 2.0467836257309943e-06, + "loss": 0.5061, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 103.30926776686007, + "learning_rate": 2.0760233918128657e-06, + "loss": 1.0461, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 57.49374602175974, + "learning_rate": 2.105263157894737e-06, + "loss": 1.0353, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 30.700911316840873, + "learning_rate": 2.1345029239766084e-06, + "loss": 1.0645, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 34.01630662114853, + "learning_rate": 2.1637426900584798e-06, + "loss": 1.1847, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 17.308242072724642, + "learning_rate": 2.192982456140351e-06, + "loss": 0.9895, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 63.75543579652859, + "learning_rate": 2.222222222222222e-06, + "loss": 1.0087, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 50.10614797950744, + "learning_rate": 2.2514619883040934e-06, + "loss": 
1.262, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 2.9174681503603495, + "learning_rate": 2.280701754385965e-06, + "loss": 0.4722, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 254.80141784852887, + "learning_rate": 2.309941520467836e-06, + "loss": 1.2064, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 151.47389813586358, + "learning_rate": 2.3391812865497075e-06, + "loss": 0.9558, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 34.75712205696785, + "learning_rate": 2.368421052631579e-06, + "loss": 1.0298, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 5.247126032298808, + "learning_rate": 2.3976608187134502e-06, + "loss": 0.6259, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 20.741968787422728, + "learning_rate": 2.4269005847953216e-06, + "loss": 0.903, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 44.47418786668146, + "learning_rate": 2.456140350877193e-06, + "loss": 1.1652, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 72.37054200934242, + "learning_rate": 2.4853801169590643e-06, + "loss": 0.9375, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 3.217364442775304, + "learning_rate": 2.5146198830409357e-06, + "loss": 0.4424, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 54.45201779204756, + "learning_rate": 2.5438596491228075e-06, + "loss": 1.0962, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 29.306059465677567, + "learning_rate": 2.5730994152046784e-06, + "loss": 1.1462, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 29.338962537035567, + "learning_rate": 2.60233918128655e-06, + "loss": 1.0723, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 19.295737230750465, + "learning_rate": 2.631578947368421e-06, + "loss": 0.8287, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 3.938608298682384, + "learning_rate": 2.660818713450293e-06, + "loss": 0.4787, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 80.36953411325989, + "learning_rate": 2.690058479532164e-06, + "loss": 1.0271, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 69.27703262012241, + "learning_rate": 2.7192982456140356e-06, + "loss": 1.122, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 60.20299897168858, + "learning_rate": 2.7485380116959066e-06, + "loss": 0.9177, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 22.92265663441918, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.9868, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 42.05365645995588, + "learning_rate": 2.8070175438596493e-06, + "loss": 0.9645, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 31.59134561315551, + "learning_rate": 2.8362573099415206e-06, + "loss": 0.9174, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 32.41586086585292, + "learning_rate": 2.865497076023392e-06, + "loss": 1.0521, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 119.81388458141831, + "learning_rate": 2.8947368421052634e-06, + "loss": 1.0001, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 23.774065100192885, + "learning_rate": 2.9239766081871347e-06, + "loss": 1.1142, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 21.944354384703846, + "learning_rate": 2.953216374269006e-06, + "loss": 1.0492, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 262.95626175091434, + "learning_rate": 2.9824561403508774e-06, + "loss": 1.0565, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 34.76788376402441, + "learning_rate": 3.011695906432749e-06, + "loss": 0.9616, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 
29.800869430477157, + "learning_rate": 3.04093567251462e-06, + "loss": 0.934, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 111.93910994240267, + "learning_rate": 3.0701754385964915e-06, + "loss": 1.0292, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 33.187658674953475, + "learning_rate": 3.0994152046783624e-06, + "loss": 1.06, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 22.52967988261398, + "learning_rate": 3.1286549707602342e-06, + "loss": 1.0127, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 23.427238630879696, + "learning_rate": 3.157894736842105e-06, + "loss": 0.8306, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 73.69121180858616, + "learning_rate": 3.187134502923977e-06, + "loss": 1.0529, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 49.008229259306944, + "learning_rate": 3.216374269005848e-06, + "loss": 1.0312, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 31.75672713089636, + "learning_rate": 3.2456140350877197e-06, + "loss": 0.9601, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 40.35099412310244, + "learning_rate": 3.2748538011695906e-06, + "loss": 1.2326, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 29.36280599225826, + "learning_rate": 3.3040935672514624e-06, + "loss": 1.0501, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 29.781901402524877, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.9325, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 49.933579236455934, + "learning_rate": 3.362573099415205e-06, + "loss": 0.9649, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 46.08523826382831, + "learning_rate": 3.391812865497076e-06, + "loss": 0.9053, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 21.028537531205025, + "learning_rate": 3.421052631578948e-06, + "loss": 0.9397, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 33.16514129523585, + "learning_rate": 3.4502923976608188e-06, + "loss": 0.9328, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 5.80087884491003, + "learning_rate": 3.4795321637426905e-06, + "loss": 0.5713, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 142.11620361275035, + "learning_rate": 3.5087719298245615e-06, + "loss": 0.9428, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 80.837462375817, + "learning_rate": 3.5380116959064333e-06, + "loss": 1.1717, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 24.49680535268009, + "learning_rate": 3.567251461988304e-06, + "loss": 0.9268, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 36.46264297638424, + "learning_rate": 3.596491228070176e-06, + "loss": 0.892, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 23.81032229767086, + "learning_rate": 3.625730994152047e-06, + "loss": 1.0896, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 61.304474655156376, + "learning_rate": 3.6549707602339187e-06, + "loss": 0.9501, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 62.025254417835676, + "learning_rate": 3.6842105263157896e-06, + "loss": 1.092, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 44.85415394258663, + "learning_rate": 3.713450292397661e-06, + "loss": 0.9556, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 37.4289983434137, + "learning_rate": 3.7426900584795324e-06, + "loss": 0.9763, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 38.58549413947484, + "learning_rate": 3.7719298245614037e-06, + "loss": 0.9647, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 4.7631620239664, + "learning_rate": 
3.801169590643275e-06, + "loss": 0.4723, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 26.28606795074874, + "learning_rate": 3.830409356725147e-06, + "loss": 0.915, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 39.20579233884729, + "learning_rate": 3.859649122807018e-06, + "loss": 0.9123, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 23.634031852882668, + "learning_rate": 3.88888888888889e-06, + "loss": 0.9603, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 23.26637612109773, + "learning_rate": 3.9181286549707605e-06, + "loss": 0.9511, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 37.70813758515853, + "learning_rate": 3.947368421052632e-06, + "loss": 1.1185, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 20.677110675089928, + "learning_rate": 3.976608187134503e-06, + "loss": 0.9056, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 25.880798312300588, + "learning_rate": 4.005847953216375e-06, + "loss": 0.8759, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 26.627334876401974, + "learning_rate": 4.035087719298246e-06, + "loss": 0.8987, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 5.721908664991347, + "learning_rate": 4.064327485380118e-06, + "loss": 0.5016, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 31.979173580580742, + "learning_rate": 4.093567251461989e-06, + "loss": 1.1034, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 205.81913964163513, + "learning_rate": 4.12280701754386e-06, + "loss": 1.0551, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 29.572311786679695, + "learning_rate": 4.152046783625731e-06, + "loss": 0.9124, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 47.62702367116973, + "learning_rate": 4.181286549707602e-06, + "loss": 1.0289, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 24.24601891261077, + "learning_rate": 4.210526315789474e-06, + "loss": 1.1226, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 32.92686053684915, + "learning_rate": 4.239766081871345e-06, + "loss": 1.1541, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 27.74226032710943, + "learning_rate": 4.269005847953217e-06, + "loss": 1.0127, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 31.004486864287514, + "learning_rate": 4.298245614035088e-06, + "loss": 0.8855, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 18.35488785037886, + "learning_rate": 4.3274853801169596e-06, + "loss": 0.9955, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 26.163164583561876, + "learning_rate": 4.3567251461988305e-06, + "loss": 1.0314, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 27.23520679546574, + "learning_rate": 4.385964912280702e-06, + "loss": 0.9521, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 259.3018666092869, + "learning_rate": 4.415204678362573e-06, + "loss": 0.9985, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 17.354622497197614, + "learning_rate": 4.444444444444444e-06, + "loss": 0.9782, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 24.208474322685788, + "learning_rate": 4.473684210526316e-06, + "loss": 1.0529, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 15.191272898087567, + "learning_rate": 4.502923976608187e-06, + "loss": 0.8985, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 21.755921480530027, + "learning_rate": 4.532163742690059e-06, + "loss": 0.9371, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 19.43717015423074, + "learning_rate": 4.56140350877193e-06, + "loss": 0.9239, + "step": 156 + 
}, + { + "epoch": 0.01, + "grad_norm": 19.244197720053883, + "learning_rate": 4.590643274853801e-06, + "loss": 0.985, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 3.722135876981497, + "learning_rate": 4.619883040935672e-06, + "loss": 0.4724, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 33.82426600465982, + "learning_rate": 4.649122807017544e-06, + "loss": 0.9282, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 16.06758297604639, + "learning_rate": 4.678362573099415e-06, + "loss": 0.8034, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 42.15572366273046, + "learning_rate": 4.707602339181287e-06, + "loss": 0.8755, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 17.037431270110005, + "learning_rate": 4.736842105263158e-06, + "loss": 0.9141, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 27.131245681270435, + "learning_rate": 4.7660818713450295e-06, + "loss": 1.136, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 31.325753149315293, + "learning_rate": 4.7953216374269005e-06, + "loss": 1.2187, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 14.268049213059903, + "learning_rate": 4.824561403508772e-06, + "loss": 0.9369, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 39.73297220536055, + "learning_rate": 4.853801169590643e-06, + "loss": 0.9078, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 30.554790541321438, + "learning_rate": 4.883040935672515e-06, + "loss": 0.9382, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 21.66552051583903, + "learning_rate": 4.912280701754386e-06, + "loss": 1.1743, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 21.293287015254723, + "learning_rate": 4.941520467836258e-06, + "loss": 0.8717, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 24.668088189047257, + "learning_rate": 4.970760233918129e-06, + "loss": 0.8511, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 76.47002011235834, + "learning_rate": 5e-06, + "loss": 1.0475, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 13.40757650425277, + "learning_rate": 5.029239766081871e-06, + "loss": 1.0157, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 30.40824156930903, + "learning_rate": 5.058479532163744e-06, + "loss": 0.9359, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 34.10955209153258, + "learning_rate": 5.087719298245615e-06, + "loss": 1.018, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 51.32920517713585, + "learning_rate": 5.116959064327486e-06, + "loss": 0.957, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 21.209897517603984, + "learning_rate": 5.146198830409357e-06, + "loss": 0.9154, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 4.172473421405253, + "learning_rate": 5.175438596491229e-06, + "loss": 0.4838, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 64.08643866900788, + "learning_rate": 5.2046783625731e-06, + "loss": 0.8908, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 17.624547493360122, + "learning_rate": 5.233918128654971e-06, + "loss": 0.9416, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 21.025952092391705, + "learning_rate": 5.263157894736842e-06, + "loss": 1.0143, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 22.40763413989325, + "learning_rate": 5.292397660818714e-06, + "loss": 0.9553, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 52.22483380265089, + "learning_rate": 5.321637426900586e-06, + "loss": 0.8894, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 18.98599685006593, + "learning_rate": 
5.350877192982457e-06, + "loss": 0.8045, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 31.684287560223492, + "learning_rate": 5.380116959064328e-06, + "loss": 0.8739, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 2.9634181180688266, + "learning_rate": 5.4093567251461994e-06, + "loss": 0.4822, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 34.47000181741338, + "learning_rate": 5.438596491228071e-06, + "loss": 0.9474, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 28.70956233391904, + "learning_rate": 5.467836257309942e-06, + "loss": 0.8793, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 15.429032872441155, + "learning_rate": 5.497076023391813e-06, + "loss": 1.0022, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 31.875921154986024, + "learning_rate": 5.526315789473685e-06, + "loss": 0.9734, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 14.558662009428511, + "learning_rate": 5.555555555555557e-06, + "loss": 0.8963, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 51.58354225567763, + "learning_rate": 5.584795321637428e-06, + "loss": 1.0847, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 23.520545975739616, + "learning_rate": 5.6140350877192985e-06, + "loss": 0.9916, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 74.56542121913049, + "learning_rate": 5.64327485380117e-06, + "loss": 0.9678, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 42.234051873167566, + "learning_rate": 5.672514619883041e-06, + "loss": 0.9595, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 38.09728445917059, + "learning_rate": 5.701754385964913e-06, + "loss": 0.8689, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 28.0924267079273, + "learning_rate": 5.730994152046784e-06, + "loss": 0.9288, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 21.69765172497942, + "learning_rate": 5.760233918128656e-06, + "loss": 0.7808, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 75.54241521336007, + "learning_rate": 5.789473684210527e-06, + "loss": 1.0944, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 14.441678720927921, + "learning_rate": 5.8187134502923985e-06, + "loss": 0.8964, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 39.70632793446906, + "learning_rate": 5.847953216374269e-06, + "loss": 0.8814, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 32.44484178896818, + "learning_rate": 5.877192982456141e-06, + "loss": 0.9676, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 161.06183462195972, + "learning_rate": 5.906432748538012e-06, + "loss": 0.8619, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 18.306028172753052, + "learning_rate": 5.935672514619883e-06, + "loss": 0.8952, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 48.40823789962648, + "learning_rate": 5.964912280701755e-06, + "loss": 1.0629, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 29.593588009663375, + "learning_rate": 5.994152046783627e-06, + "loss": 0.9797, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 17.298944088840926, + "learning_rate": 6.023391812865498e-06, + "loss": 0.9538, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 34.44989784221152, + "learning_rate": 6.0526315789473685e-06, + "loss": 0.7444, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 24.514553578079777, + "learning_rate": 6.08187134502924e-06, + "loss": 0.9314, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 14.797621542381581, + "learning_rate": 6.111111111111112e-06, + "loss": 0.831, + "step": 209 + 
}, + { + "epoch": 0.02, + "grad_norm": 33.35714057161018, + "learning_rate": 6.140350877192983e-06, + "loss": 1.0024, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 19.309868155678494, + "learning_rate": 6.169590643274854e-06, + "loss": 0.9301, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 53.16709921556343, + "learning_rate": 6.198830409356725e-06, + "loss": 1.0716, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 37.55323640341641, + "learning_rate": 6.2280701754385975e-06, + "loss": 1.0143, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 25.81949467503662, + "learning_rate": 6.2573099415204685e-06, + "loss": 0.9941, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 4.962069356585259, + "learning_rate": 6.286549707602339e-06, + "loss": 0.4387, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 15.161232493243567, + "learning_rate": 6.31578947368421e-06, + "loss": 1.0159, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 3.4795126027278758, + "learning_rate": 6.345029239766083e-06, + "loss": 0.4674, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 15.758556054555164, + "learning_rate": 6.374269005847954e-06, + "loss": 0.9646, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 20.655776923723238, + "learning_rate": 6.403508771929825e-06, + "loss": 1.0082, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 18.04544560320438, + "learning_rate": 6.432748538011696e-06, + "loss": 0.9234, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 14.396444401538753, + "learning_rate": 6.461988304093568e-06, + "loss": 0.9675, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 18.335808089323987, + "learning_rate": 6.491228070175439e-06, + "loss": 1.0302, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 27.659738170406317, + "learning_rate": 6.52046783625731e-06, + "loss": 1.0261, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 19.42546942734988, + "learning_rate": 6.549707602339181e-06, + "loss": 1.0016, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 13.174564732623717, + "learning_rate": 6.578947368421054e-06, + "loss": 0.9993, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 30.40350189255635, + "learning_rate": 6.608187134502925e-06, + "loss": 0.9824, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 20.75344429143543, + "learning_rate": 6.637426900584796e-06, + "loss": 0.8356, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 18.265024940841005, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8156, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 4.607067298624387, + "learning_rate": 6.695906432748539e-06, + "loss": 0.5521, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 16.50268342765512, + "learning_rate": 6.72514619883041e-06, + "loss": 1.1195, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 13.50053645721789, + "learning_rate": 6.754385964912281e-06, + "loss": 0.9066, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 24.63286361847981, + "learning_rate": 6.783625730994152e-06, + "loss": 0.9032, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 3.279606397012581, + "learning_rate": 6.812865497076025e-06, + "loss": 0.452, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 17.28468236767936, + "learning_rate": 6.842105263157896e-06, + "loss": 1.0925, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 14.507029369719588, + "learning_rate": 6.871345029239767e-06, + "loss": 0.8559, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 17.35984147412289, + 
"learning_rate": 6.9005847953216375e-06, + "loss": 1.0032, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 16.676188333140235, + "learning_rate": 6.92982456140351e-06, + "loss": 1.008, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 19.887430195675037, + "learning_rate": 6.959064327485381e-06, + "loss": 0.9947, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 16.18923220873402, + "learning_rate": 6.988304093567252e-06, + "loss": 0.9107, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 3.201216207031438, + "learning_rate": 7.017543859649123e-06, + "loss": 0.47, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 26.620813345340114, + "learning_rate": 7.046783625730995e-06, + "loss": 1.0465, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 22.897133021795586, + "learning_rate": 7.0760233918128665e-06, + "loss": 0.9335, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 11.224990635790947, + "learning_rate": 7.1052631578947375e-06, + "loss": 0.9401, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 16.303621048931127, + "learning_rate": 7.134502923976608e-06, + "loss": 0.979, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 21.111872849145477, + "learning_rate": 7.16374269005848e-06, + "loss": 0.9991, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 16.861331071118062, + "learning_rate": 7.192982456140352e-06, + "loss": 0.9506, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 16.295387820546917, + "learning_rate": 7.222222222222223e-06, + "loss": 0.8551, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 26.62542039363144, + "learning_rate": 7.251461988304094e-06, + "loss": 1.0387, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 13.535960144725623, + "learning_rate": 7.280701754385966e-06, + "loss": 0.8864, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 20.82763087689696, + "learning_rate": 7.309941520467837e-06, + "loss": 1.0549, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 18.012547777337197, + "learning_rate": 7.339181286549708e-06, + "loss": 0.9078, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 10.31968859517719, + "learning_rate": 7.368421052631579e-06, + "loss": 0.8658, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 14.214582294937006, + "learning_rate": 7.397660818713451e-06, + "loss": 0.9884, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 19.237354331280994, + "learning_rate": 7.426900584795322e-06, + "loss": 0.9573, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 13.623575273901368, + "learning_rate": 7.456140350877194e-06, + "loss": 0.9773, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 14.427150480324025, + "learning_rate": 7.485380116959065e-06, + "loss": 0.9628, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 29.820050223530824, + "learning_rate": 7.5146198830409365e-06, + "loss": 0.943, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 22.103328270139208, + "learning_rate": 7.5438596491228074e-06, + "loss": 0.9268, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 17.88318963574914, + "learning_rate": 7.573099415204679e-06, + "loss": 1.1121, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 15.439151637165446, + "learning_rate": 7.60233918128655e-06, + "loss": 1.0238, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 19.398138197780035, + "learning_rate": 7.631578947368423e-06, + "loss": 0.912, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 11.68567722488484, + "learning_rate": 7.660818713450294e-06, + "loss": 
0.968, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 15.054231872014928, + "learning_rate": 7.690058479532165e-06, + "loss": 0.9927, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 14.760107132210338, + "learning_rate": 7.719298245614036e-06, + "loss": 0.9228, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 16.135550937206332, + "learning_rate": 7.748538011695908e-06, + "loss": 1.1844, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 16.7402225940394, + "learning_rate": 7.77777777777778e-06, + "loss": 1.017, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 13.51698210012837, + "learning_rate": 7.80701754385965e-06, + "loss": 0.9391, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 15.081914793325444, + "learning_rate": 7.836257309941521e-06, + "loss": 0.7916, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 16.458128216618285, + "learning_rate": 7.865497076023394e-06, + "loss": 1.0756, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 30.945100078196024, + "learning_rate": 7.894736842105265e-06, + "loss": 0.9859, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 20.91346209803544, + "learning_rate": 7.923976608187136e-06, + "loss": 0.9346, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 16.88254907718061, + "learning_rate": 7.953216374269006e-06, + "loss": 0.917, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 15.246288603931946, + "learning_rate": 7.982456140350877e-06, + "loss": 0.9031, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 15.416174681421179, + "learning_rate": 8.01169590643275e-06, + "loss": 1.0778, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 13.497640749096963, + "learning_rate": 8.040935672514621e-06, + "loss": 1.0346, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 13.87386232087359, + "learning_rate": 8.070175438596492e-06, + "loss": 1.0557, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 19.52053384583888, + "learning_rate": 8.099415204678363e-06, + "loss": 0.9531, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 17.77599710744223, + "learning_rate": 8.128654970760235e-06, + "loss": 0.9527, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 15.829285816166703, + "learning_rate": 8.157894736842106e-06, + "loss": 0.9192, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 21.736553908148675, + "learning_rate": 8.187134502923977e-06, + "loss": 1.1461, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 18.28458106830523, + "learning_rate": 8.216374269005848e-06, + "loss": 1.0876, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 37.65877165267157, + "learning_rate": 8.24561403508772e-06, + "loss": 1.0267, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 17.90883716874341, + "learning_rate": 8.274853801169592e-06, + "loss": 0.8479, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 22.587593515396787, + "learning_rate": 8.304093567251463e-06, + "loss": 1.0318, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 17.846457780763195, + "learning_rate": 8.333333333333334e-06, + "loss": 0.8865, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 13.639509833023936, + "learning_rate": 8.362573099415205e-06, + "loss": 0.9498, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 20.306756632876457, + "learning_rate": 8.391812865497077e-06, + "loss": 0.8673, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 16.580029396252815, + "learning_rate": 8.421052631578948e-06, + "loss": 0.7814, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 
11.929309433987552, + "learning_rate": 8.45029239766082e-06, + "loss": 0.7782, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 11.482479568113547, + "learning_rate": 8.47953216374269e-06, + "loss": 0.9054, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 25.379865612234646, + "learning_rate": 8.508771929824563e-06, + "loss": 0.9475, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 15.34721082474685, + "learning_rate": 8.538011695906434e-06, + "loss": 1.0992, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 19.370005792918523, + "learning_rate": 8.567251461988305e-06, + "loss": 0.9548, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 16.131943007476043, + "learning_rate": 8.596491228070176e-06, + "loss": 0.9683, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 66.06760251399098, + "learning_rate": 8.625730994152046e-06, + "loss": 1.0695, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 14.261503149498367, + "learning_rate": 8.654970760233919e-06, + "loss": 0.8247, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 21.271554230002092, + "learning_rate": 8.68421052631579e-06, + "loss": 0.9943, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 594.611506748742, + "learning_rate": 8.713450292397661e-06, + "loss": 0.8933, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 16.963381376414336, + "learning_rate": 8.742690058479532e-06, + "loss": 0.8995, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 19.045302863360913, + "learning_rate": 8.771929824561405e-06, + "loss": 0.9594, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 25.524687345260535, + "learning_rate": 8.801169590643275e-06, + "loss": 1.0978, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 14.534786529404569, + "learning_rate": 8.830409356725146e-06, + "loss": 0.8955, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 19.158206654333714, + "learning_rate": 8.859649122807017e-06, + "loss": 0.9462, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 14.964071064945308, + "learning_rate": 8.888888888888888e-06, + "loss": 0.9142, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 14.02006376078565, + "learning_rate": 8.918128654970761e-06, + "loss": 0.9855, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 22.19151785524439, + "learning_rate": 8.947368421052632e-06, + "loss": 0.9755, + "step": 306 + }, + { + "epoch": 0.03, + "grad_norm": 11.549840170086135, + "learning_rate": 8.976608187134503e-06, + "loss": 0.9298, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 11.783158390433192, + "learning_rate": 9.005847953216374e-06, + "loss": 0.9725, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 29.424663531091614, + "learning_rate": 9.035087719298246e-06, + "loss": 1.0529, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 21.467433851969513, + "learning_rate": 9.064327485380117e-06, + "loss": 0.9381, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 12.46722382047058, + "learning_rate": 9.093567251461988e-06, + "loss": 0.8247, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 10.22435115719463, + "learning_rate": 9.12280701754386e-06, + "loss": 0.9224, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 19.69987861534263, + "learning_rate": 9.152046783625732e-06, + "loss": 1.0794, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 11.42846851637189, + "learning_rate": 9.181286549707603e-06, + "loss": 0.9899, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 12.065027815794812, + "learning_rate": 
9.210526315789474e-06, + "loss": 0.9313, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 15.099955020548315, + "learning_rate": 9.239766081871345e-06, + "loss": 0.8672, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 12.712910737688567, + "learning_rate": 9.269005847953217e-06, + "loss": 0.9563, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 20.070077941942014, + "learning_rate": 9.298245614035088e-06, + "loss": 0.9651, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 15.574491714190406, + "learning_rate": 9.327485380116959e-06, + "loss": 0.6866, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 11.790514032015707, + "learning_rate": 9.35672514619883e-06, + "loss": 0.9827, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 22.278289696057495, + "learning_rate": 9.385964912280703e-06, + "loss": 1.0351, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 11.681180154013362, + "learning_rate": 9.415204678362574e-06, + "loss": 0.9675, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 26.62179159467999, + "learning_rate": 9.444444444444445e-06, + "loss": 0.9632, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 11.22424637880163, + "learning_rate": 9.473684210526315e-06, + "loss": 0.9679, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 24.040299707278226, + "learning_rate": 9.502923976608188e-06, + "loss": 0.938, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 24.983319082619516, + "learning_rate": 9.532163742690059e-06, + "loss": 1.0038, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 15.63608920931394, + "learning_rate": 9.56140350877193e-06, + "loss": 0.7809, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 18.261108199139894, + "learning_rate": 9.590643274853801e-06, + "loss": 0.8831, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 20.98169060497951, + "learning_rate": 9.619883040935674e-06, + "loss": 1.0421, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 21.187451750045373, + "learning_rate": 9.649122807017545e-06, + "loss": 0.9732, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 20.322722398467985, + "learning_rate": 9.678362573099415e-06, + "loss": 0.9231, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 14.852803033331078, + "learning_rate": 9.707602339181286e-06, + "loss": 0.8609, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 14.948495049405677, + "learning_rate": 9.736842105263159e-06, + "loss": 0.9722, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 11.41518498602968, + "learning_rate": 9.76608187134503e-06, + "loss": 0.8541, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 9.3869986973662, + "learning_rate": 9.795321637426901e-06, + "loss": 0.9447, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 13.526348016080528, + "learning_rate": 9.824561403508772e-06, + "loss": 0.9789, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 16.84180982534175, + "learning_rate": 9.853801169590644e-06, + "loss": 0.8979, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 12.36088637724119, + "learning_rate": 9.883040935672515e-06, + "loss": 0.9554, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 11.473351393739403, + "learning_rate": 9.912280701754386e-06, + "loss": 0.9279, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 17.38254668761977, + "learning_rate": 9.941520467836257e-06, + "loss": 0.9557, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 10.419046536977977, + "learning_rate": 9.97076023391813e-06, + "loss": 0.9754, + "step": 341 + }, 
+ { + "epoch": 0.03, + "grad_norm": 3.916076988725924, + "learning_rate": 1e-05, + "loss": 0.4969, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 4.316355892256963, + "learning_rate": 9.999999797704159e-06, + "loss": 0.5768, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 18.544040743140467, + "learning_rate": 9.999999190816651e-06, + "loss": 1.0199, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 22.504592861869746, + "learning_rate": 9.999998179337526e-06, + "loss": 0.8711, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 12.914165880470676, + "learning_rate": 9.999996763266866e-06, + "loss": 0.9315, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 23.948350079027193, + "learning_rate": 9.999994942604782e-06, + "loss": 0.9162, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 12.623649584066795, + "learning_rate": 9.999992717351428e-06, + "loss": 0.9199, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 2.7775651163627493, + "learning_rate": 9.999990087506977e-06, + "loss": 0.4667, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 9.831582417328056, + "learning_rate": 9.999987053071647e-06, + "loss": 0.9744, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 5.145927342144499, + "learning_rate": 9.999983614045682e-06, + "loss": 0.5313, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 5.849471400160812, + "learning_rate": 9.99997977042936e-06, + "loss": 0.5461, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 10.21902897694262, + "learning_rate": 9.99997552222299e-06, + "loss": 1.1778, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 22.70785005628821, + "learning_rate": 9.99997086942692e-06, + "loss": 1.1101, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 14.137453019780468, + "learning_rate": 9.999965812041523e-06, + "loss": 1.0278, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 11.453721272738967, + "learning_rate": 9.99996035006721e-06, + "loss": 0.9677, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 14.166462580467506, + "learning_rate": 9.999954483504424e-06, + "loss": 0.7911, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 11.09947735216516, + "learning_rate": 9.999948212353637e-06, + "loss": 0.9765, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 8.845429118850534, + "learning_rate": 9.999941536615356e-06, + "loss": 0.8667, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 15.418506976148585, + "learning_rate": 9.999934456290125e-06, + "loss": 0.8879, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 12.427957099373284, + "learning_rate": 9.999926971378512e-06, + "loss": 0.8908, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 15.8136207242965, + "learning_rate": 9.99991908188113e-06, + "loss": 1.0374, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 10.261849644112246, + "learning_rate": 9.99991078779861e-06, + "loss": 0.902, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 10.934166047293228, + "learning_rate": 9.999902089131626e-06, + "loss": 0.9403, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 11.754492108490554, + "learning_rate": 9.999892985880882e-06, + "loss": 1.0237, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 11.09614326675651, + "learning_rate": 9.999883478047114e-06, + "loss": 0.9857, + "step": 366 + }, + { + "epoch": 0.03, + "grad_norm": 12.573469186302201, + "learning_rate": 9.999873565631092e-06, + "loss": 1.0981, + "step": 367 + }, + { + "epoch": 0.03, + "grad_norm": 8.953931431057953, + "learning_rate": 
9.999863248633619e-06, + "loss": 0.9433, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 10.558199966338279, + "learning_rate": 9.999852527055527e-06, + "loss": 1.0017, + "step": 369 + }, + { + "epoch": 0.03, + "grad_norm": 37.23718718259435, + "learning_rate": 9.999841400897688e-06, + "loss": 0.9376, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 12.183789093810033, + "learning_rate": 9.999829870160997e-06, + "loss": 0.9074, + "step": 371 + }, + { + "epoch": 0.03, + "grad_norm": 10.985023800340016, + "learning_rate": 9.999817934846391e-06, + "loss": 0.8484, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 16.11112323278431, + "learning_rate": 9.999805594954835e-06, + "loss": 0.9569, + "step": 373 + }, + { + "epoch": 0.03, + "grad_norm": 14.98465750897132, + "learning_rate": 9.999792850487326e-06, + "loss": 1.1107, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 10.904477998287799, + "learning_rate": 9.999779701444897e-06, + "loss": 0.959, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 15.401433254999763, + "learning_rate": 9.999766147828609e-06, + "loss": 1.0923, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 8.69414979498584, + "learning_rate": 9.999752189639563e-06, + "loss": 0.828, + "step": 377 + }, + { + "epoch": 0.03, + "grad_norm": 3.126221415416826, + "learning_rate": 9.999737826878887e-06, + "loss": 0.479, + "step": 378 + }, + { + "epoch": 0.03, + "grad_norm": 3.660239451180123, + "learning_rate": 9.99972305954774e-06, + "loss": 0.5115, + "step": 379 + }, + { + "epoch": 0.03, + "grad_norm": 11.47710308220475, + "learning_rate": 9.999707887647323e-06, + "loss": 0.9757, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 10.011200689556082, + "learning_rate": 9.99969231117886e-06, + "loss": 1.0236, + "step": 381 + }, + { + "epoch": 0.03, + "grad_norm": 11.725975358339701, + "learning_rate": 9.99967633014361e-06, + "loss": 0.9915, + "step": 382 + }, + { + "epoch": 0.03, + "grad_norm": 18.022132323715077, + "learning_rate": 9.999659944542868e-06, + "loss": 0.9569, + "step": 383 + }, + { + "epoch": 0.03, + "grad_norm": 19.68901364312359, + "learning_rate": 9.999643154377961e-06, + "loss": 0.9076, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 10.313031302079557, + "learning_rate": 9.999625959650245e-06, + "loss": 0.8476, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 13.760722602461014, + "learning_rate": 9.999608360361114e-06, + "loss": 0.9933, + "step": 386 + }, + { + "epoch": 0.03, + "grad_norm": 12.126714357826472, + "learning_rate": 9.999590356511991e-06, + "loss": 1.002, + "step": 387 + }, + { + "epoch": 0.03, + "grad_norm": 3.3170990867578065, + "learning_rate": 9.999571948104333e-06, + "loss": 0.5763, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 12.689253381596986, + "learning_rate": 9.999553135139627e-06, + "loss": 1.0134, + "step": 389 + }, + { + "epoch": 0.03, + "grad_norm": 20.01417038763532, + "learning_rate": 9.9995339176194e-06, + "loss": 1.0565, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 30.590754653996584, + "learning_rate": 9.999514295545203e-06, + "loss": 1.0641, + "step": 391 + }, + { + "epoch": 0.03, + "grad_norm": 13.01198890308173, + "learning_rate": 9.999494268918626e-06, + "loss": 0.9521, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 12.301218901607184, + "learning_rate": 9.99947383774129e-06, + "loss": 1.1198, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 16.994611568343533, + "learning_rate": 9.999453002014847e-06, + "loss": 0.996, + "step": 394 + }, + { + 
"epoch": 0.03, + "grad_norm": 36.1031819777587, + "learning_rate": 9.999431761740983e-06, + "loss": 0.8881, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 11.602406821888342, + "learning_rate": 9.999410116921414e-06, + "loss": 0.9067, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 14.098397969775164, + "learning_rate": 9.9993880675579e-06, + "loss": 0.8867, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 14.123200236886278, + "learning_rate": 9.999365613652216e-06, + "loss": 0.9576, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 11.565215386753188, + "learning_rate": 9.999342755206183e-06, + "loss": 0.794, + "step": 399 + }, + { + "epoch": 0.04, + "grad_norm": 11.18042366315869, + "learning_rate": 9.99931949222165e-06, + "loss": 0.9288, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 10.780090739450134, + "learning_rate": 9.9992958247005e-06, + "loss": 1.0028, + "step": 401 + }, + { + "epoch": 0.04, + "grad_norm": 13.362851357464121, + "learning_rate": 9.999271752644649e-06, + "loss": 0.9009, + "step": 402 + }, + { + "epoch": 0.04, + "grad_norm": 2.7038591715324203, + "learning_rate": 9.999247276056043e-06, + "loss": 0.4641, + "step": 403 + }, + { + "epoch": 0.04, + "grad_norm": 14.914755373367987, + "learning_rate": 9.999222394936663e-06, + "loss": 0.9564, + "step": 404 + }, + { + "epoch": 0.04, + "grad_norm": 12.588372789802238, + "learning_rate": 9.999197109288522e-06, + "loss": 0.9436, + "step": 405 + }, + { + "epoch": 0.04, + "grad_norm": 7.806466387992163, + "learning_rate": 9.999171419113667e-06, + "loss": 0.8655, + "step": 406 + }, + { + "epoch": 0.04, + "grad_norm": 11.607313492855923, + "learning_rate": 9.999145324414176e-06, + "loss": 0.8504, + "step": 407 + }, + { + "epoch": 0.04, + "grad_norm": 13.743679284666454, + "learning_rate": 9.999118825192162e-06, + "loss": 1.0151, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 13.484855520790566, + "learning_rate": 9.999091921449768e-06, + "loss": 0.9413, + "step": 409 + }, + { + "epoch": 0.04, + "grad_norm": 3.3650446483480465, + "learning_rate": 9.99906461318917e-06, + "loss": 0.5374, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 10.000669243787367, + "learning_rate": 9.999036900412581e-06, + "loss": 0.9725, + "step": 411 + }, + { + "epoch": 0.04, + "grad_norm": 9.48631616718109, + "learning_rate": 9.999008783122242e-06, + "loss": 0.9111, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 13.30028588158057, + "learning_rate": 9.998980261320426e-06, + "loss": 0.6649, + "step": 413 + }, + { + "epoch": 0.04, + "grad_norm": 3.3699627245227757, + "learning_rate": 9.998951335009443e-06, + "loss": 0.5016, + "step": 414 + }, + { + "epoch": 0.04, + "grad_norm": 10.633315958148037, + "learning_rate": 9.998922004191634e-06, + "loss": 0.9279, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 3.8869630668965436, + "learning_rate": 9.99889226886937e-06, + "loss": 0.4399, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 14.25574904137687, + "learning_rate": 9.998862129045059e-06, + "loss": 0.8298, + "step": 417 + }, + { + "epoch": 0.04, + "grad_norm": 12.871688150672481, + "learning_rate": 9.998831584721141e-06, + "loss": 0.9181, + "step": 418 + }, + { + "epoch": 0.04, + "grad_norm": 17.312870928848035, + "learning_rate": 9.998800635900085e-06, + "loss": 0.7515, + "step": 419 + }, + { + "epoch": 0.04, + "grad_norm": 15.615415059052058, + "learning_rate": 9.998769282584398e-06, + "loss": 1.0537, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 19.260840857503503, + 
"learning_rate": 9.998737524776616e-06, + "loss": 0.8745, + "step": 421 + }, + { + "epoch": 0.04, + "grad_norm": 11.095425593982634, + "learning_rate": 9.998705362479307e-06, + "loss": 0.9018, + "step": 422 + }, + { + "epoch": 0.04, + "grad_norm": 2.838429372639107, + "learning_rate": 9.998672795695076e-06, + "loss": 0.4207, + "step": 423 + }, + { + "epoch": 0.04, + "grad_norm": 64.75298264929914, + "learning_rate": 9.998639824426557e-06, + "loss": 0.9228, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 13.25536977671555, + "learning_rate": 9.998606448676418e-06, + "loss": 0.8943, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 10.735970736679286, + "learning_rate": 9.99857266844736e-06, + "loss": 1.0534, + "step": 426 + }, + { + "epoch": 0.04, + "grad_norm": 10.07530213231149, + "learning_rate": 9.998538483742115e-06, + "loss": 0.9668, + "step": 427 + }, + { + "epoch": 0.04, + "grad_norm": 9.217844237929599, + "learning_rate": 9.998503894563453e-06, + "loss": 0.9805, + "step": 428 + }, + { + "epoch": 0.04, + "grad_norm": 11.32816429113021, + "learning_rate": 9.998468900914168e-06, + "loss": 0.8297, + "step": 429 + }, + { + "epoch": 0.04, + "grad_norm": 26.69515034396377, + "learning_rate": 9.998433502797097e-06, + "loss": 0.9382, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 9.453613925802912, + "learning_rate": 9.9983977002151e-06, + "loss": 0.9256, + "step": 431 + }, + { + "epoch": 0.04, + "grad_norm": 17.464469200446, + "learning_rate": 9.998361493171073e-06, + "loss": 0.9355, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 16.706076103509854, + "learning_rate": 9.998324881667951e-06, + "loss": 0.7422, + "step": 433 + }, + { + "epoch": 0.04, + "grad_norm": 9.46464256290843, + "learning_rate": 9.998287865708694e-06, + "loss": 0.872, + "step": 434 + }, + { + "epoch": 0.04, + "grad_norm": 22.63969590180243, + "learning_rate": 9.998250445296297e-06, + "loss": 0.8706, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 16.60577341916446, + "learning_rate": 9.998212620433787e-06, + "loss": 0.775, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 15.0124468929472, + "learning_rate": 9.998174391124227e-06, + "loss": 0.9336, + "step": 437 + }, + { + "epoch": 0.04, + "grad_norm": 20.01563466852367, + "learning_rate": 9.998135757370709e-06, + "loss": 0.9511, + "step": 438 + }, + { + "epoch": 0.04, + "grad_norm": 25.989923623568004, + "learning_rate": 9.99809671917636e-06, + "loss": 1.0495, + "step": 439 + }, + { + "epoch": 0.04, + "grad_norm": 2.534621324173973, + "learning_rate": 9.998057276544337e-06, + "loss": 0.4601, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 12.348203065867596, + "learning_rate": 9.998017429477834e-06, + "loss": 0.9682, + "step": 441 + }, + { + "epoch": 0.04, + "grad_norm": 11.136124398753177, + "learning_rate": 9.997977177980074e-06, + "loss": 1.0074, + "step": 442 + }, + { + "epoch": 0.04, + "grad_norm": 10.168713427478275, + "learning_rate": 9.997936522054315e-06, + "loss": 0.9949, + "step": 443 + }, + { + "epoch": 0.04, + "grad_norm": 9.697006705482629, + "learning_rate": 9.997895461703845e-06, + "loss": 1.1212, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 13.257968621086562, + "learning_rate": 9.99785399693199e-06, + "loss": 0.8136, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 16.17231993041416, + "learning_rate": 9.997812127742102e-06, + "loss": 0.9623, + "step": 446 + }, + { + "epoch": 0.04, + "grad_norm": 17.615760056586044, + "learning_rate": 9.99776985413757e-06, + "loss": 1.064, + "step": 447 + 
}, + { + "epoch": 0.04, + "grad_norm": 14.426174556704447, + "learning_rate": 9.997727176121814e-06, + "loss": 0.9694, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 56.7772607100907, + "learning_rate": 9.997684093698289e-06, + "loss": 0.9898, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 11.107433283310645, + "learning_rate": 9.99764060687048e-06, + "loss": 0.9833, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 11.942015346155975, + "learning_rate": 9.997596715641906e-06, + "loss": 0.9344, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 16.966033656753822, + "learning_rate": 9.99755242001612e-06, + "loss": 0.9935, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 5.9982723556243815, + "learning_rate": 9.997507719996704e-06, + "loss": 0.5836, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 13.2421862072448, + "learning_rate": 9.997462615587276e-06, + "loss": 0.9634, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 16.506952437386552, + "learning_rate": 9.997417106791486e-06, + "loss": 0.889, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 3.6229601061714387, + "learning_rate": 9.997371193613018e-06, + "loss": 0.5265, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 22.458689180366576, + "learning_rate": 9.997324876055585e-06, + "loss": 0.8414, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 16.225570109118035, + "learning_rate": 9.997278154122935e-06, + "loss": 0.8185, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 12.311641845154428, + "learning_rate": 9.99723102781885e-06, + "loss": 0.7916, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 13.12750775732185, + "learning_rate": 9.997183497147142e-06, + "loss": 1.0285, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 13.098798498146724, + "learning_rate": 9.997135562111659e-06, + "loss": 1.0198, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 11.649701491737458, + "learning_rate": 9.997087222716279e-06, + "loss": 0.9644, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 13.455387992743187, + "learning_rate": 9.99703847896491e-06, + "loss": 1.0024, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 13.788192056801371, + "learning_rate": 9.996989330861504e-06, + "loss": 1.1517, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 15.249656986698499, + "learning_rate": 9.99693977841003e-06, + "loss": 1.1086, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 14.681418252048383, + "learning_rate": 9.996889821614502e-06, + "loss": 0.9304, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 25.327967647461236, + "learning_rate": 9.996839460478963e-06, + "loss": 0.8753, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 13.809995437678387, + "learning_rate": 9.996788695007485e-06, + "loss": 0.7747, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 13.804179095211259, + "learning_rate": 9.996737525204176e-06, + "loss": 0.948, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 17.442112452884825, + "learning_rate": 9.996685951073182e-06, + "loss": 0.88, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 19.14402084957918, + "learning_rate": 9.99663397261867e-06, + "loss": 0.8989, + "step": 471 + }, + { + "epoch": 0.04, + "grad_norm": 11.579950794042414, + "learning_rate": 9.996581589844849e-06, + "loss": 0.8188, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 13.17197052893533, + "learning_rate": 9.996528802755957e-06, + "loss": 0.8237, + "step": 473 + }, + { + "epoch": 0.04, + "grad_norm": 12.010846350744082, + 
"learning_rate": 9.996475611356265e-06, + "loss": 0.8859, + "step": 474 + }, + { + "epoch": 0.04, + "grad_norm": 33.63399577406676, + "learning_rate": 9.996422015650079e-06, + "loss": 0.9442, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 11.968097725988711, + "learning_rate": 9.996368015641733e-06, + "loss": 0.8374, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 15.222419225935363, + "learning_rate": 9.996313611335598e-06, + "loss": 1.0135, + "step": 477 + }, + { + "epoch": 0.04, + "grad_norm": 14.275287842280694, + "learning_rate": 9.996258802736078e-06, + "loss": 0.9576, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 13.14885132258905, + "learning_rate": 9.996203589847606e-06, + "loss": 0.8646, + "step": 479 + }, + { + "epoch": 0.04, + "grad_norm": 13.893558346899539, + "learning_rate": 9.99614797267465e-06, + "loss": 0.814, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 10.397696981874008, + "learning_rate": 9.996091951221711e-06, + "loss": 1.168, + "step": 481 + }, + { + "epoch": 0.04, + "grad_norm": 14.916177984137919, + "learning_rate": 9.996035525493321e-06, + "loss": 0.8698, + "step": 482 + }, + { + "epoch": 0.04, + "grad_norm": 9.619460556617327, + "learning_rate": 9.995978695494049e-06, + "loss": 0.8032, + "step": 483 + }, + { + "epoch": 0.04, + "grad_norm": 15.549548820712252, + "learning_rate": 9.99592146122849e-06, + "loss": 0.7908, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 13.16548145261841, + "learning_rate": 9.995863822701278e-06, + "loss": 0.8834, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 4.06759300484798, + "learning_rate": 9.995805779917074e-06, + "loss": 0.5546, + "step": 486 + }, + { + "epoch": 0.04, + "grad_norm": 12.575185323241026, + "learning_rate": 9.995747332880577e-06, + "loss": 1.0323, + "step": 487 + }, + { + "epoch": 0.04, + "grad_norm": 3.704489478859196, + "learning_rate": 9.995688481596515e-06, + "loss": 0.5063, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 10.844059588724626, + "learning_rate": 9.99562922606965e-06, + "loss": 0.7902, + "step": 489 + }, + { + "epoch": 0.04, + "grad_norm": 11.770889976326602, + "learning_rate": 9.99556956630478e-06, + "loss": 0.7965, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 13.175353355865413, + "learning_rate": 9.99550950230673e-06, + "loss": 0.9023, + "step": 491 + }, + { + "epoch": 0.04, + "grad_norm": 13.702089978035794, + "learning_rate": 9.99544903408036e-06, + "loss": 0.9868, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 17.974259670209968, + "learning_rate": 9.995388161630564e-06, + "loss": 0.9238, + "step": 493 + }, + { + "epoch": 0.04, + "grad_norm": 10.459415048860263, + "learning_rate": 9.995326884962268e-06, + "loss": 0.808, + "step": 494 + }, + { + "epoch": 0.04, + "grad_norm": 2.5027610789495527, + "learning_rate": 9.995265204080429e-06, + "loss": 0.4693, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 2.9787951894779403, + "learning_rate": 9.99520311899004e-06, + "loss": 0.4714, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 15.092906957628502, + "learning_rate": 9.995140629696122e-06, + "loss": 0.8676, + "step": 497 + }, + { + "epoch": 0.04, + "grad_norm": 13.320236787134112, + "learning_rate": 9.995077736203735e-06, + "loss": 0.916, + "step": 498 + }, + { + "epoch": 0.04, + "grad_norm": 15.319085784491172, + "learning_rate": 9.995014438517964e-06, + "loss": 0.9416, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 13.731238046850548, + "learning_rate": 9.994950736643936e-06, + "loss": 0.852, + 
"step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 11.509928326433355, + "learning_rate": 9.994886630586801e-06, + "loss": 0.8905, + "step": 501 + }, + { + "epoch": 0.04, + "grad_norm": 14.932593473358232, + "learning_rate": 9.99482212035175e-06, + "loss": 0.9603, + "step": 502 + }, + { + "epoch": 0.04, + "grad_norm": 26.681998951956793, + "learning_rate": 9.994757205944001e-06, + "loss": 0.8266, + "step": 503 + }, + { + "epoch": 0.04, + "grad_norm": 2.9142651028715996, + "learning_rate": 9.994691887368807e-06, + "loss": 0.4521, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 27.916121268082918, + "learning_rate": 9.994626164631453e-06, + "loss": 0.804, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 15.848587079137301, + "learning_rate": 9.99456003773726e-06, + "loss": 0.909, + "step": 506 + }, + { + "epoch": 0.04, + "grad_norm": 11.097120401232045, + "learning_rate": 9.994493506691577e-06, + "loss": 0.9443, + "step": 507 + }, + { + "epoch": 0.04, + "grad_norm": 11.733472739734744, + "learning_rate": 9.994426571499785e-06, + "loss": 0.7571, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 9.170128210417275, + "learning_rate": 9.994359232167304e-06, + "loss": 0.9227, + "step": 509 + }, + { + "epoch": 0.04, + "grad_norm": 15.230188269606336, + "learning_rate": 9.99429148869958e-06, + "loss": 0.9645, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 13.563999162339986, + "learning_rate": 9.994223341102097e-06, + "loss": 1.022, + "step": 511 + }, + { + "epoch": 0.04, + "grad_norm": 13.163033003155503, + "learning_rate": 9.994154789380369e-06, + "loss": 0.8308, + "step": 512 + }, + { + "epoch": 0.05, + "grad_norm": 7.234291845112643, + "learning_rate": 9.994085833539943e-06, + "loss": 0.8342, + "step": 513 + }, + { + "epoch": 0.05, + "grad_norm": 14.220542270763152, + "learning_rate": 9.994016473586399e-06, + "loss": 1.003, + "step": 514 + }, + { + "epoch": 0.05, + "grad_norm": 9.934777629757008, + "learning_rate": 9.993946709525348e-06, + "loss": 0.8723, + "step": 515 + }, + { + "epoch": 0.05, + "grad_norm": 11.026419246556003, + "learning_rate": 9.993876541362437e-06, + "loss": 0.93, + "step": 516 + }, + { + "epoch": 0.05, + "grad_norm": 20.204496461490333, + "learning_rate": 9.993805969103342e-06, + "loss": 1.022, + "step": 517 + }, + { + "epoch": 0.05, + "grad_norm": 10.76948598760552, + "learning_rate": 9.993734992753777e-06, + "loss": 0.8551, + "step": 518 + }, + { + "epoch": 0.05, + "grad_norm": 9.691260582065334, + "learning_rate": 9.993663612319482e-06, + "loss": 0.8672, + "step": 519 + }, + { + "epoch": 0.05, + "grad_norm": 16.41650924397765, + "learning_rate": 9.993591827806234e-06, + "loss": 0.8623, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 13.185428761921964, + "learning_rate": 9.993519639219841e-06, + "loss": 0.9466, + "step": 521 + }, + { + "epoch": 0.05, + "grad_norm": 15.556337140994449, + "learning_rate": 9.993447046566146e-06, + "loss": 0.9799, + "step": 522 + }, + { + "epoch": 0.05, + "grad_norm": 13.689739105702825, + "learning_rate": 9.993374049851022e-06, + "loss": 0.9621, + "step": 523 + }, + { + "epoch": 0.05, + "grad_norm": 21.538992867147442, + "learning_rate": 9.993300649080375e-06, + "loss": 0.8762, + "step": 524 + }, + { + "epoch": 0.05, + "grad_norm": 20.930034281586025, + "learning_rate": 9.993226844260147e-06, + "loss": 1.0648, + "step": 525 + }, + { + "epoch": 0.05, + "grad_norm": 16.96801961390727, + "learning_rate": 9.993152635396309e-06, + "loss": 0.8405, + "step": 526 + }, + { + "epoch": 0.05, + "grad_norm": 
12.151161798463646, + "learning_rate": 9.993078022494861e-06, + "loss": 1.0026, + "step": 527 + }, + { + "epoch": 0.05, + "grad_norm": 9.674204461124443, + "learning_rate": 9.99300300556185e-06, + "loss": 0.8727, + "step": 528 + }, + { + "epoch": 0.05, + "grad_norm": 15.85101238230783, + "learning_rate": 9.992927584603339e-06, + "loss": 0.8999, + "step": 529 + }, + { + "epoch": 0.05, + "grad_norm": 20.145579518601952, + "learning_rate": 9.992851759625434e-06, + "loss": 0.9696, + "step": 530 + }, + { + "epoch": 0.05, + "grad_norm": 12.583195177219237, + "learning_rate": 9.99277553063427e-06, + "loss": 0.8363, + "step": 531 + }, + { + "epoch": 0.05, + "grad_norm": 23.20667031059216, + "learning_rate": 9.992698897636013e-06, + "loss": 0.8527, + "step": 532 + }, + { + "epoch": 0.05, + "grad_norm": 10.828232397240638, + "learning_rate": 9.992621860636868e-06, + "loss": 0.8172, + "step": 533 + }, + { + "epoch": 0.05, + "grad_norm": 11.705013904969334, + "learning_rate": 9.992544419643066e-06, + "loss": 0.9211, + "step": 534 + }, + { + "epoch": 0.05, + "grad_norm": 11.269540127942081, + "learning_rate": 9.992466574660875e-06, + "loss": 1.0038, + "step": 535 + }, + { + "epoch": 0.05, + "grad_norm": 19.52071398940795, + "learning_rate": 9.992388325696594e-06, + "loss": 0.7495, + "step": 536 + }, + { + "epoch": 0.05, + "grad_norm": 9.928549173134623, + "learning_rate": 9.992309672756552e-06, + "loss": 0.8495, + "step": 537 + }, + { + "epoch": 0.05, + "grad_norm": 20.068097189207048, + "learning_rate": 9.992230615847116e-06, + "loss": 1.0642, + "step": 538 + }, + { + "epoch": 0.05, + "grad_norm": 17.651752278952028, + "learning_rate": 9.992151154974684e-06, + "loss": 0.9756, + "step": 539 + }, + { + "epoch": 0.05, + "grad_norm": 15.875330060754152, + "learning_rate": 9.992071290145684e-06, + "loss": 1.0129, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 21.573484649069677, + "learning_rate": 9.991991021366578e-06, + "loss": 0.9115, + "step": 541 + }, + { + "epoch": 0.05, + "grad_norm": 16.208347200830364, + "learning_rate": 9.991910348643864e-06, + "loss": 0.9051, + "step": 542 + }, + { + "epoch": 0.05, + "grad_norm": 14.488906391217387, + "learning_rate": 9.991829271984067e-06, + "loss": 0.8963, + "step": 543 + }, + { + "epoch": 0.05, + "grad_norm": 14.205617457297002, + "learning_rate": 9.99174779139375e-06, + "loss": 0.9739, + "step": 544 + }, + { + "epoch": 0.05, + "grad_norm": 15.65283696724939, + "learning_rate": 9.991665906879503e-06, + "loss": 0.9977, + "step": 545 + }, + { + "epoch": 0.05, + "grad_norm": 13.437347494980587, + "learning_rate": 9.991583618447958e-06, + "loss": 0.9692, + "step": 546 + }, + { + "epoch": 0.05, + "grad_norm": 3.716662892111003, + "learning_rate": 9.991500926105766e-06, + "loss": 0.4711, + "step": 547 + }, + { + "epoch": 0.05, + "grad_norm": 12.835685037404312, + "learning_rate": 9.991417829859622e-06, + "loss": 0.9778, + "step": 548 + }, + { + "epoch": 0.05, + "grad_norm": 12.010387769635996, + "learning_rate": 9.991334329716252e-06, + "loss": 0.7919, + "step": 549 + }, + { + "epoch": 0.05, + "grad_norm": 23.57838807187828, + "learning_rate": 9.99125042568241e-06, + "loss": 0.8002, + "step": 550 + }, + { + "epoch": 0.05, + "grad_norm": 4.920543879629488, + "learning_rate": 9.991166117764885e-06, + "loss": 0.6272, + "step": 551 + }, + { + "epoch": 0.05, + "grad_norm": 20.178597440942312, + "learning_rate": 9.991081405970503e-06, + "loss": 0.9273, + "step": 552 + }, + { + "epoch": 0.05, + "grad_norm": 17.9091939990453, + "learning_rate": 
9.990996290306111e-06, + "loss": 0.9256, + "step": 553 + }, + { + "epoch": 0.05, + "grad_norm": 15.073369247085727, + "learning_rate": 9.990910770778606e-06, + "loss": 0.9624, + "step": 554 + }, + { + "epoch": 0.05, + "grad_norm": 15.341870279407816, + "learning_rate": 9.990824847394901e-06, + "loss": 0.8574, + "step": 555 + }, + { + "epoch": 0.05, + "grad_norm": 4.456539811911177, + "learning_rate": 9.990738520161952e-06, + "loss": 0.496, + "step": 556 + }, + { + "epoch": 0.05, + "grad_norm": 23.533097161971646, + "learning_rate": 9.990651789086742e-06, + "loss": 0.9151, + "step": 557 + }, + { + "epoch": 0.05, + "grad_norm": 25.49988196864749, + "learning_rate": 9.990564654176293e-06, + "loss": 1.0767, + "step": 558 + }, + { + "epoch": 0.05, + "grad_norm": 11.923466632575169, + "learning_rate": 9.990477115437652e-06, + "loss": 0.9052, + "step": 559 + }, + { + "epoch": 0.05, + "grad_norm": 12.440532203188539, + "learning_rate": 9.990389172877905e-06, + "loss": 0.9511, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 9.069041879716853, + "learning_rate": 9.990300826504167e-06, + "loss": 0.7893, + "step": 561 + }, + { + "epoch": 0.05, + "grad_norm": 10.339049379825534, + "learning_rate": 9.990212076323587e-06, + "loss": 0.8881, + "step": 562 + }, + { + "epoch": 0.05, + "grad_norm": 16.00689007528694, + "learning_rate": 9.990122922343346e-06, + "loss": 0.9562, + "step": 563 + }, + { + "epoch": 0.05, + "grad_norm": 12.877073255789243, + "learning_rate": 9.99003336457066e-06, + "loss": 0.9676, + "step": 564 + }, + { + "epoch": 0.05, + "grad_norm": 16.341689823556475, + "learning_rate": 9.989943403012774e-06, + "loss": 0.9074, + "step": 565 + }, + { + "epoch": 0.05, + "grad_norm": 19.81913083530869, + "learning_rate": 9.989853037676966e-06, + "loss": 0.9849, + "step": 566 + }, + { + "epoch": 0.05, + "grad_norm": 13.991152165118345, + "learning_rate": 9.989762268570552e-06, + "loss": 1.0674, + "step": 567 + }, + { + "epoch": 0.05, + "grad_norm": 12.605895689555261, + "learning_rate": 9.989671095700877e-06, + "loss": 0.7661, + "step": 568 + }, + { + "epoch": 0.05, + "grad_norm": 18.097240515701394, + "learning_rate": 9.989579519075316e-06, + "loss": 0.9042, + "step": 569 + }, + { + "epoch": 0.05, + "grad_norm": 15.192655396376129, + "learning_rate": 9.98948753870128e-06, + "loss": 0.9103, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 16.023310349854682, + "learning_rate": 9.98939515458621e-06, + "loss": 0.9949, + "step": 571 + }, + { + "epoch": 0.05, + "grad_norm": 15.434936867785114, + "learning_rate": 9.989302366737585e-06, + "loss": 0.9608, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 26.359095499824072, + "learning_rate": 9.989209175162912e-06, + "loss": 0.8963, + "step": 573 + }, + { + "epoch": 0.05, + "grad_norm": 10.505817007716075, + "learning_rate": 9.989115579869733e-06, + "loss": 0.9557, + "step": 574 + }, + { + "epoch": 0.05, + "grad_norm": 8.80004045383503, + "learning_rate": 9.989021580865618e-06, + "loss": 0.9324, + "step": 575 + }, + { + "epoch": 0.05, + "grad_norm": 16.523181478158204, + "learning_rate": 9.988927178158177e-06, + "loss": 0.9518, + "step": 576 + }, + { + "epoch": 0.05, + "grad_norm": 11.955439812989725, + "learning_rate": 9.988832371755047e-06, + "loss": 0.8223, + "step": 577 + }, + { + "epoch": 0.05, + "grad_norm": 11.102875463876208, + "learning_rate": 9.988737161663898e-06, + "loss": 0.7923, + "step": 578 + }, + { + "epoch": 0.05, + "grad_norm": 21.731822710638667, + "learning_rate": 9.988641547892439e-06, + "loss": 0.8023, + "step": 579 
+ }, + { + "epoch": 0.05, + "grad_norm": 12.114380207643892, + "learning_rate": 9.988545530448402e-06, + "loss": 0.9777, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 11.583647089930022, + "learning_rate": 9.98844910933956e-06, + "loss": 0.9386, + "step": 581 + }, + { + "epoch": 0.05, + "grad_norm": 15.727860922813214, + "learning_rate": 9.988352284573713e-06, + "loss": 0.8416, + "step": 582 + }, + { + "epoch": 0.05, + "grad_norm": 10.825122886651625, + "learning_rate": 9.988255056158699e-06, + "loss": 0.9509, + "step": 583 + }, + { + "epoch": 0.05, + "grad_norm": 17.82104202959595, + "learning_rate": 9.988157424102381e-06, + "loss": 0.8805, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 10.964033794515117, + "learning_rate": 9.988059388412663e-06, + "loss": 0.8058, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 10.37878452803886, + "learning_rate": 9.987960949097475e-06, + "loss": 0.8148, + "step": 586 + }, + { + "epoch": 0.05, + "grad_norm": 10.329013600099147, + "learning_rate": 9.987862106164784e-06, + "loss": 0.8521, + "step": 587 + }, + { + "epoch": 0.05, + "grad_norm": 15.697615402706738, + "learning_rate": 9.98776285962259e-06, + "loss": 0.8732, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 30.069888623459352, + "learning_rate": 9.987663209478922e-06, + "loss": 0.8824, + "step": 589 + }, + { + "epoch": 0.05, + "grad_norm": 17.457190564074494, + "learning_rate": 9.987563155741843e-06, + "loss": 0.846, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 8.992612554380731, + "learning_rate": 9.98746269841945e-06, + "loss": 0.7683, + "step": 591 + }, + { + "epoch": 0.05, + "grad_norm": 12.106250380422074, + "learning_rate": 9.987361837519871e-06, + "loss": 0.931, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 10.614057374535111, + "learning_rate": 9.987260573051268e-06, + "loss": 0.887, + "step": 593 + }, + { + "epoch": 0.05, + "grad_norm": 9.73539116146794, + "learning_rate": 9.987158905021836e-06, + "loss": 0.9827, + "step": 594 + }, + { + "epoch": 0.05, + "grad_norm": 11.824421811574805, + "learning_rate": 9.9870568334398e-06, + "loss": 0.9238, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 10.184200215162216, + "learning_rate": 9.986954358313423e-06, + "loss": 0.9144, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 10.621662457550755, + "learning_rate": 9.986851479650994e-06, + "loss": 1.039, + "step": 597 + }, + { + "epoch": 0.05, + "grad_norm": 15.825734582465987, + "learning_rate": 9.986748197460837e-06, + "loss": 0.9418, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 14.809347232525601, + "learning_rate": 9.986644511751312e-06, + "loss": 0.9909, + "step": 599 + }, + { + "epoch": 0.05, + "grad_norm": 7.220710464103994, + "learning_rate": 9.986540422530808e-06, + "loss": 0.8556, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 20.583890324160254, + "learning_rate": 9.986435929807747e-06, + "loss": 0.8556, + "step": 601 + }, + { + "epoch": 0.05, + "grad_norm": 20.34902414642568, + "learning_rate": 9.986331033590587e-06, + "loss": 0.8743, + "step": 602 + }, + { + "epoch": 0.05, + "grad_norm": 3.771480550494476, + "learning_rate": 9.98622573388781e-06, + "loss": 0.4943, + "step": 603 + }, + { + "epoch": 0.05, + "grad_norm": 10.974537980682264, + "learning_rate": 9.986120030707945e-06, + "loss": 0.8287, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 9.243084552873105, + "learning_rate": 9.98601392405954e-06, + "loss": 0.9754, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 11.518204356718785, + 
"learning_rate": 9.98590741395118e-06, + "loss": 0.9637, + "step": 606 + }, + { + "epoch": 0.05, + "grad_norm": 8.339558542253027, + "learning_rate": 9.985800500391487e-06, + "loss": 0.83, + "step": 607 + }, + { + "epoch": 0.05, + "grad_norm": 7.6512554375967445, + "learning_rate": 9.985693183389112e-06, + "loss": 0.9978, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 33.24879327298668, + "learning_rate": 9.985585462952736e-06, + "loss": 0.9178, + "step": 609 + }, + { + "epoch": 0.05, + "grad_norm": 23.49399616378401, + "learning_rate": 9.985477339091078e-06, + "loss": 0.8931, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 12.406973941363415, + "learning_rate": 9.985368811812887e-06, + "loss": 0.8799, + "step": 611 + }, + { + "epoch": 0.05, + "grad_norm": 14.202666164502553, + "learning_rate": 9.985259881126945e-06, + "loss": 0.7972, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 9.924258084927127, + "learning_rate": 9.985150547042067e-06, + "loss": 0.9591, + "step": 613 + }, + { + "epoch": 0.05, + "grad_norm": 12.278909414003056, + "learning_rate": 9.985040809567097e-06, + "loss": 0.9439, + "step": 614 + }, + { + "epoch": 0.05, + "grad_norm": 15.440034360929573, + "learning_rate": 9.984930668710918e-06, + "loss": 0.929, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 29.155251437778148, + "learning_rate": 9.98482012448244e-06, + "loss": 0.872, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 12.969131058685957, + "learning_rate": 9.98470917689061e-06, + "loss": 1.0513, + "step": 617 + }, + { + "epoch": 0.05, + "grad_norm": 13.356460074295462, + "learning_rate": 9.984597825944405e-06, + "loss": 0.905, + "step": 618 + }, + { + "epoch": 0.05, + "grad_norm": 14.216547414137882, + "learning_rate": 9.984486071652835e-06, + "loss": 0.9517, + "step": 619 + }, + { + "epoch": 0.05, + "grad_norm": 4.136498112932582, + "learning_rate": 9.984373914024946e-06, + "loss": 0.5661, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 13.16731585110145, + "learning_rate": 9.984261353069808e-06, + "loss": 0.8897, + "step": 621 + }, + { + "epoch": 0.05, + "grad_norm": 16.588534631231514, + "learning_rate": 9.984148388796533e-06, + "loss": 1.0036, + "step": 622 + }, + { + "epoch": 0.05, + "grad_norm": 8.492352031483975, + "learning_rate": 9.98403502121426e-06, + "loss": 0.8701, + "step": 623 + }, + { + "epoch": 0.05, + "grad_norm": 17.638780319395334, + "learning_rate": 9.983921250332167e-06, + "loss": 0.8011, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 10.861088977090542, + "learning_rate": 9.983807076159453e-06, + "loss": 0.9405, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 28.055322958600883, + "learning_rate": 9.983692498705361e-06, + "loss": 1.0167, + "step": 626 + }, + { + "epoch": 0.06, + "grad_norm": 9.849973301944171, + "learning_rate": 9.983577517979164e-06, + "loss": 0.8849, + "step": 627 + }, + { + "epoch": 0.06, + "grad_norm": 10.637281291900665, + "learning_rate": 9.983462133990163e-06, + "loss": 0.9836, + "step": 628 + }, + { + "epoch": 0.06, + "grad_norm": 14.99150367344402, + "learning_rate": 9.983346346747695e-06, + "loss": 1.1147, + "step": 629 + }, + { + "epoch": 0.06, + "grad_norm": 10.31701861727662, + "learning_rate": 9.983230156261133e-06, + "loss": 0.7877, + "step": 630 + }, + { + "epoch": 0.06, + "grad_norm": 15.935362419051229, + "learning_rate": 9.983113562539873e-06, + "loss": 0.9517, + "step": 631 + }, + { + "epoch": 0.06, + "grad_norm": 16.73216295090134, + "learning_rate": 9.982996565593352e-06, + "loss": 0.9551, + 
"step": 632 + }, + { + "epoch": 0.06, + "grad_norm": 16.68105548326393, + "learning_rate": 9.982879165431038e-06, + "loss": 0.9064, + "step": 633 + }, + { + "epoch": 0.06, + "grad_norm": 10.82015568215065, + "learning_rate": 9.982761362062431e-06, + "loss": 0.9294, + "step": 634 + }, + { + "epoch": 0.06, + "grad_norm": 16.191558279806895, + "learning_rate": 9.982643155497064e-06, + "loss": 0.9343, + "step": 635 + }, + { + "epoch": 0.06, + "grad_norm": 9.872316210618044, + "learning_rate": 9.9825245457445e-06, + "loss": 0.8424, + "step": 636 + }, + { + "epoch": 0.06, + "grad_norm": 11.159028062813036, + "learning_rate": 9.982405532814339e-06, + "loss": 0.9007, + "step": 637 + }, + { + "epoch": 0.06, + "grad_norm": 10.08707095949704, + "learning_rate": 9.982286116716208e-06, + "loss": 0.8371, + "step": 638 + }, + { + "epoch": 0.06, + "grad_norm": 15.924123893575306, + "learning_rate": 9.982166297459775e-06, + "loss": 0.8572, + "step": 639 + }, + { + "epoch": 0.06, + "grad_norm": 12.846301373777735, + "learning_rate": 9.982046075054732e-06, + "loss": 1.0767, + "step": 640 + }, + { + "epoch": 0.06, + "grad_norm": 16.724250135120084, + "learning_rate": 9.981925449510805e-06, + "loss": 0.8431, + "step": 641 + }, + { + "epoch": 0.06, + "grad_norm": 9.245004483510732, + "learning_rate": 9.98180442083776e-06, + "loss": 0.7497, + "step": 642 + }, + { + "epoch": 0.06, + "grad_norm": 2.6642274685836815, + "learning_rate": 9.981682989045387e-06, + "loss": 0.4589, + "step": 643 + }, + { + "epoch": 0.06, + "grad_norm": 10.039936290196104, + "learning_rate": 9.981561154143513e-06, + "loss": 0.8145, + "step": 644 + }, + { + "epoch": 0.06, + "grad_norm": 9.979222948153625, + "learning_rate": 9.981438916141999e-06, + "loss": 0.8312, + "step": 645 + }, + { + "epoch": 0.06, + "grad_norm": 24.574040742478278, + "learning_rate": 9.981316275050732e-06, + "loss": 1.0577, + "step": 646 + }, + { + "epoch": 0.06, + "grad_norm": 18.67754787389612, + "learning_rate": 9.981193230879637e-06, + "loss": 1.0287, + "step": 647 + }, + { + "epoch": 0.06, + "grad_norm": 16.003998446065786, + "learning_rate": 9.981069783638676e-06, + "loss": 0.8992, + "step": 648 + }, + { + "epoch": 0.06, + "grad_norm": 10.092749245768777, + "learning_rate": 9.980945933337828e-06, + "loss": 0.9038, + "step": 649 + }, + { + "epoch": 0.06, + "grad_norm": 9.710779661247493, + "learning_rate": 9.980821679987125e-06, + "loss": 0.8752, + "step": 650 + }, + { + "epoch": 0.06, + "grad_norm": 8.427703941068467, + "learning_rate": 9.980697023596614e-06, + "loss": 0.9288, + "step": 651 + }, + { + "epoch": 0.06, + "grad_norm": 6.139449966761725, + "learning_rate": 9.980571964176387e-06, + "loss": 0.549, + "step": 652 + }, + { + "epoch": 0.06, + "grad_norm": 10.11250369646969, + "learning_rate": 9.980446501736559e-06, + "loss": 0.962, + "step": 653 + }, + { + "epoch": 0.06, + "grad_norm": 2.7318950270452937, + "learning_rate": 9.980320636287285e-06, + "loss": 0.5141, + "step": 654 + }, + { + "epoch": 0.06, + "grad_norm": 11.402801399280984, + "learning_rate": 9.980194367838752e-06, + "loss": 0.8462, + "step": 655 + }, + { + "epoch": 0.06, + "grad_norm": 12.78675876809027, + "learning_rate": 9.980067696401173e-06, + "loss": 0.9493, + "step": 656 + }, + { + "epoch": 0.06, + "grad_norm": 8.295081111300377, + "learning_rate": 9.979940621984801e-06, + "loss": 0.8914, + "step": 657 + }, + { + "epoch": 0.06, + "grad_norm": 11.313835289691786, + "learning_rate": 9.979813144599916e-06, + "loss": 0.7857, + "step": 658 + }, + { + "epoch": 0.06, + "grad_norm": 
9.26874379495282, + "learning_rate": 9.979685264256835e-06, + "loss": 0.8555, + "step": 659 + }, + { + "epoch": 0.06, + "grad_norm": 33.75306316104988, + "learning_rate": 9.979556980965907e-06, + "loss": 0.8505, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 13.087346268953329, + "learning_rate": 9.979428294737509e-06, + "loss": 0.8147, + "step": 661 + }, + { + "epoch": 0.06, + "grad_norm": 8.999707156370848, + "learning_rate": 9.979299205582059e-06, + "loss": 0.8568, + "step": 662 + }, + { + "epoch": 0.06, + "grad_norm": 15.650555783930372, + "learning_rate": 9.979169713509997e-06, + "loss": 0.9494, + "step": 663 + }, + { + "epoch": 0.06, + "grad_norm": 13.653294917647898, + "learning_rate": 9.979039818531805e-06, + "loss": 0.8302, + "step": 664 + }, + { + "epoch": 0.06, + "grad_norm": 7.542479372801486, + "learning_rate": 9.978909520657995e-06, + "loss": 0.8842, + "step": 665 + }, + { + "epoch": 0.06, + "grad_norm": 11.784907632745789, + "learning_rate": 9.978778819899109e-06, + "loss": 1.0062, + "step": 666 + }, + { + "epoch": 0.06, + "grad_norm": 12.12609693015052, + "learning_rate": 9.97864771626572e-06, + "loss": 0.8737, + "step": 667 + }, + { + "epoch": 0.06, + "grad_norm": 8.058701204682322, + "learning_rate": 9.978516209768441e-06, + "loss": 0.8071, + "step": 668 + }, + { + "epoch": 0.06, + "grad_norm": 15.04688658247992, + "learning_rate": 9.978384300417911e-06, + "loss": 0.8286, + "step": 669 + }, + { + "epoch": 0.06, + "grad_norm": 9.230892013157233, + "learning_rate": 9.978251988224805e-06, + "loss": 0.8839, + "step": 670 + }, + { + "epoch": 0.06, + "grad_norm": 7.296206782312859, + "learning_rate": 9.978119273199829e-06, + "loss": 0.8921, + "step": 671 + }, + { + "epoch": 0.06, + "grad_norm": 34.97511756074449, + "learning_rate": 9.97798615535372e-06, + "loss": 0.904, + "step": 672 + }, + { + "epoch": 0.06, + "grad_norm": 12.001017183381228, + "learning_rate": 9.977852634697254e-06, + "loss": 0.9346, + "step": 673 + }, + { + "epoch": 0.06, + "grad_norm": 13.073931227429524, + "learning_rate": 9.977718711241233e-06, + "loss": 0.9443, + "step": 674 + }, + { + "epoch": 0.06, + "grad_norm": 16.64403567996628, + "learning_rate": 9.977584384996492e-06, + "loss": 0.9392, + "step": 675 + }, + { + "epoch": 0.06, + "grad_norm": 7.51209026533266, + "learning_rate": 9.977449655973905e-06, + "loss": 0.888, + "step": 676 + }, + { + "epoch": 0.06, + "grad_norm": 11.994624182031274, + "learning_rate": 9.977314524184371e-06, + "loss": 0.9343, + "step": 677 + }, + { + "epoch": 0.06, + "grad_norm": 12.652379395952675, + "learning_rate": 9.977178989638823e-06, + "loss": 0.8458, + "step": 678 + }, + { + "epoch": 0.06, + "grad_norm": 11.55862924640319, + "learning_rate": 9.977043052348232e-06, + "loss": 0.9324, + "step": 679 + }, + { + "epoch": 0.06, + "grad_norm": 9.779703415634389, + "learning_rate": 9.976906712323594e-06, + "loss": 0.8463, + "step": 680 + }, + { + "epoch": 0.06, + "grad_norm": 10.721783776026042, + "learning_rate": 9.976769969575947e-06, + "loss": 1.0065, + "step": 681 + }, + { + "epoch": 0.06, + "grad_norm": 9.985367905981322, + "learning_rate": 9.97663282411635e-06, + "loss": 1.0339, + "step": 682 + }, + { + "epoch": 0.06, + "grad_norm": 34.4848817825998, + "learning_rate": 9.976495275955904e-06, + "loss": 0.9457, + "step": 683 + }, + { + "epoch": 0.06, + "grad_norm": 2.6373071057700024, + "learning_rate": 9.976357325105739e-06, + "loss": 0.4864, + "step": 684 + }, + { + "epoch": 0.06, + "grad_norm": 18.37909638352772, + "learning_rate": 9.976218971577015e-06, + 
"loss": 0.839, + "step": 685 + }, + { + "epoch": 0.06, + "grad_norm": 13.851517889792289, + "learning_rate": 9.97608021538093e-06, + "loss": 0.8026, + "step": 686 + }, + { + "epoch": 0.06, + "grad_norm": 9.526097866941312, + "learning_rate": 9.975941056528712e-06, + "loss": 0.9535, + "step": 687 + }, + { + "epoch": 0.06, + "grad_norm": 9.000124473198792, + "learning_rate": 9.97580149503162e-06, + "loss": 0.9039, + "step": 688 + }, + { + "epoch": 0.06, + "grad_norm": 10.9301325358747, + "learning_rate": 9.97566153090095e-06, + "loss": 1.0032, + "step": 689 + }, + { + "epoch": 0.06, + "grad_norm": 11.72792921085119, + "learning_rate": 9.975521164148021e-06, + "loss": 0.8537, + "step": 690 + }, + { + "epoch": 0.06, + "grad_norm": 14.037781254154973, + "learning_rate": 9.9753803947842e-06, + "loss": 0.943, + "step": 691 + }, + { + "epoch": 0.06, + "grad_norm": 16.627421465370176, + "learning_rate": 9.97523922282087e-06, + "loss": 0.9605, + "step": 692 + }, + { + "epoch": 0.06, + "grad_norm": 12.1565556110977, + "learning_rate": 9.97509764826946e-06, + "loss": 0.8702, + "step": 693 + }, + { + "epoch": 0.06, + "grad_norm": 17.318240664046662, + "learning_rate": 9.974955671141425e-06, + "loss": 0.8542, + "step": 694 + }, + { + "epoch": 0.06, + "grad_norm": 13.045179693526743, + "learning_rate": 9.97481329144825e-06, + "loss": 0.8642, + "step": 695 + }, + { + "epoch": 0.06, + "grad_norm": 14.128217614257043, + "learning_rate": 9.97467050920146e-06, + "loss": 0.9713, + "step": 696 + }, + { + "epoch": 0.06, + "grad_norm": 12.631764777023255, + "learning_rate": 9.974527324412609e-06, + "loss": 0.9188, + "step": 697 + }, + { + "epoch": 0.06, + "grad_norm": 15.447648826790227, + "learning_rate": 9.974383737093279e-06, + "loss": 0.9487, + "step": 698 + }, + { + "epoch": 0.06, + "grad_norm": 10.247747816648664, + "learning_rate": 9.974239747255092e-06, + "loss": 0.916, + "step": 699 + }, + { + "epoch": 0.06, + "grad_norm": 12.548577634767128, + "learning_rate": 9.9740953549097e-06, + "loss": 0.8261, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 11.119176531325559, + "learning_rate": 9.973950560068786e-06, + "loss": 1.0298, + "step": 701 + }, + { + "epoch": 0.06, + "grad_norm": 14.638005516919913, + "learning_rate": 9.973805362744065e-06, + "loss": 0.8636, + "step": 702 + }, + { + "epoch": 0.06, + "grad_norm": 9.166965188299132, + "learning_rate": 9.973659762947287e-06, + "loss": 0.9452, + "step": 703 + }, + { + "epoch": 0.06, + "grad_norm": 22.95727613787519, + "learning_rate": 9.973513760690236e-06, + "loss": 0.8333, + "step": 704 + }, + { + "epoch": 0.06, + "grad_norm": 4.777933246705784, + "learning_rate": 9.973367355984724e-06, + "loss": 0.4996, + "step": 705 + }, + { + "epoch": 0.06, + "grad_norm": 12.237257701343554, + "learning_rate": 9.973220548842599e-06, + "loss": 0.9185, + "step": 706 + }, + { + "epoch": 0.06, + "grad_norm": 22.14382327314812, + "learning_rate": 9.973073339275739e-06, + "loss": 0.8908, + "step": 707 + }, + { + "epoch": 0.06, + "grad_norm": 10.813365101338928, + "learning_rate": 9.972925727296055e-06, + "loss": 0.8803, + "step": 708 + }, + { + "epoch": 0.06, + "grad_norm": 9.584720996917019, + "learning_rate": 9.972777712915496e-06, + "loss": 0.8481, + "step": 709 + }, + { + "epoch": 0.06, + "grad_norm": 9.143603716460369, + "learning_rate": 9.972629296146035e-06, + "loss": 1.1241, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 7.92018457302724, + "learning_rate": 9.972480476999682e-06, + "loss": 0.8421, + "step": 711 + }, + { + "epoch": 0.06, + "grad_norm": 
13.324374867522236, + "learning_rate": 9.97233125548848e-06, + "loss": 0.9158, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 3.4147111949534734, + "learning_rate": 9.972181631624507e-06, + "loss": 0.4851, + "step": 713 + }, + { + "epoch": 0.06, + "grad_norm": 14.93811560390718, + "learning_rate": 9.972031605419864e-06, + "loss": 0.9293, + "step": 714 + }, + { + "epoch": 0.06, + "grad_norm": 3.045833828566081, + "learning_rate": 9.971881176886695e-06, + "loss": 0.5193, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 9.884809258796455, + "learning_rate": 9.971730346037172e-06, + "loss": 0.6804, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 9.070158952742759, + "learning_rate": 9.971579112883499e-06, + "loss": 0.958, + "step": 717 + }, + { + "epoch": 0.06, + "grad_norm": 9.210188823622477, + "learning_rate": 9.971427477437913e-06, + "loss": 0.9906, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 15.217905162228057, + "learning_rate": 9.971275439712686e-06, + "loss": 0.8443, + "step": 719 + }, + { + "epoch": 0.06, + "grad_norm": 9.397193581339598, + "learning_rate": 9.971122999720121e-06, + "loss": 0.8826, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 12.468630893302148, + "learning_rate": 9.97097015747255e-06, + "loss": 0.9209, + "step": 721 + }, + { + "epoch": 0.06, + "grad_norm": 11.579443036623005, + "learning_rate": 9.970816912982345e-06, + "loss": 0.8381, + "step": 722 + }, + { + "epoch": 0.06, + "grad_norm": 11.216904152548173, + "learning_rate": 9.970663266261901e-06, + "loss": 0.8873, + "step": 723 + }, + { + "epoch": 0.06, + "grad_norm": 17.119885634397125, + "learning_rate": 9.970509217323655e-06, + "loss": 0.9072, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 2.8742278837758897, + "learning_rate": 9.97035476618007e-06, + "loss": 0.5437, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 11.965699597939622, + "learning_rate": 9.970199912843649e-06, + "loss": 0.8872, + "step": 726 + }, + { + "epoch": 0.06, + "grad_norm": 11.427518571626461, + "learning_rate": 9.970044657326913e-06, + "loss": 0.923, + "step": 727 + }, + { + "epoch": 0.06, + "grad_norm": 18.923686073975237, + "learning_rate": 9.969888999642434e-06, + "loss": 0.9011, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 22.43573538289388, + "learning_rate": 9.969732939802802e-06, + "loss": 0.8862, + "step": 729 + }, + { + "epoch": 0.06, + "grad_norm": 18.94886909613432, + "learning_rate": 9.96957647782065e-06, + "loss": 0.8985, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 13.688244107094958, + "learning_rate": 9.969419613708635e-06, + "loss": 0.8893, + "step": 731 + }, + { + "epoch": 0.06, + "grad_norm": 11.645620869150562, + "learning_rate": 9.969262347479451e-06, + "loss": 0.7417, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 11.809498886420496, + "learning_rate": 9.969104679145823e-06, + "loss": 0.952, + "step": 733 + }, + { + "epoch": 0.06, + "grad_norm": 17.18144777192554, + "learning_rate": 9.968946608720512e-06, + "loss": 0.869, + "step": 734 + }, + { + "epoch": 0.06, + "grad_norm": 10.144629391324663, + "learning_rate": 9.968788136216304e-06, + "loss": 1.0153, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 10.245491036890316, + "learning_rate": 9.968629261646027e-06, + "loss": 0.9569, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 12.447243076193141, + "learning_rate": 9.968469985022534e-06, + "loss": 0.9187, + "step": 737 + }, + { + "epoch": 0.06, + "grad_norm": 12.626043391217978, + "learning_rate": 9.968310306358715e-06, 
+ "loss": 0.9588, + "step": 738 + }, + { + "epoch": 0.06, + "grad_norm": 3.148280035634793, + "learning_rate": 9.96815022566749e-06, + "loss": 0.451, + "step": 739 + }, + { + "epoch": 0.06, + "grad_norm": 9.654752072921914, + "learning_rate": 9.967989742961814e-06, + "loss": 0.7977, + "step": 740 + }, + { + "epoch": 0.07, + "grad_norm": 11.642919766185761, + "learning_rate": 9.967828858254671e-06, + "loss": 0.8656, + "step": 741 + }, + { + "epoch": 0.07, + "grad_norm": 10.196677299230217, + "learning_rate": 9.96766757155908e-06, + "loss": 0.8882, + "step": 742 + }, + { + "epoch": 0.07, + "grad_norm": 23.151163403489214, + "learning_rate": 9.967505882888094e-06, + "loss": 0.8655, + "step": 743 + }, + { + "epoch": 0.07, + "grad_norm": 10.867861831184912, + "learning_rate": 9.967343792254792e-06, + "loss": 0.9284, + "step": 744 + }, + { + "epoch": 0.07, + "grad_norm": 13.816986385679625, + "learning_rate": 9.967181299672294e-06, + "loss": 0.8218, + "step": 745 + }, + { + "epoch": 0.07, + "grad_norm": 8.450029884335539, + "learning_rate": 9.96701840515375e-06, + "loss": 0.8323, + "step": 746 + }, + { + "epoch": 0.07, + "grad_norm": 14.40731236020346, + "learning_rate": 9.966855108712338e-06, + "loss": 0.8494, + "step": 747 + }, + { + "epoch": 0.07, + "grad_norm": 11.622991421937098, + "learning_rate": 9.96669141036127e-06, + "loss": 1.0201, + "step": 748 + }, + { + "epoch": 0.07, + "grad_norm": 12.68326945040358, + "learning_rate": 9.966527310113798e-06, + "loss": 0.803, + "step": 749 + }, + { + "epoch": 0.07, + "grad_norm": 9.509454815316461, + "learning_rate": 9.966362807983196e-06, + "loss": 1.0976, + "step": 750 + }, + { + "epoch": 0.07, + "grad_norm": 60.85266957251158, + "learning_rate": 9.966197903982777e-06, + "loss": 0.8499, + "step": 751 + }, + { + "epoch": 0.07, + "grad_norm": 11.429879334954224, + "learning_rate": 9.966032598125885e-06, + "loss": 0.9055, + "step": 752 + }, + { + "epoch": 0.07, + "grad_norm": 11.57826550302891, + "learning_rate": 9.965866890425895e-06, + "loss": 0.9768, + "step": 753 + }, + { + "epoch": 0.07, + "grad_norm": 8.314311456861768, + "learning_rate": 9.965700780896217e-06, + "loss": 0.8353, + "step": 754 + }, + { + "epoch": 0.07, + "grad_norm": 16.369204998525312, + "learning_rate": 9.965534269550291e-06, + "loss": 0.7915, + "step": 755 + }, + { + "epoch": 0.07, + "grad_norm": 3.316126495446106, + "learning_rate": 9.965367356401592e-06, + "loss": 0.5111, + "step": 756 + }, + { + "epoch": 0.07, + "grad_norm": 17.471537606609438, + "learning_rate": 9.965200041463626e-06, + "loss": 0.977, + "step": 757 + }, + { + "epoch": 0.07, + "grad_norm": 14.428142426080745, + "learning_rate": 9.965032324749933e-06, + "loss": 0.7289, + "step": 758 + }, + { + "epoch": 0.07, + "grad_norm": 11.425143133172874, + "learning_rate": 9.964864206274081e-06, + "loss": 0.8897, + "step": 759 + }, + { + "epoch": 0.07, + "grad_norm": 9.693184357244883, + "learning_rate": 9.964695686049676e-06, + "loss": 0.7934, + "step": 760 + }, + { + "epoch": 0.07, + "grad_norm": 10.23800374999055, + "learning_rate": 9.964526764090357e-06, + "loss": 0.7769, + "step": 761 + }, + { + "epoch": 0.07, + "grad_norm": 12.263459545204334, + "learning_rate": 9.964357440409788e-06, + "loss": 0.9087, + "step": 762 + }, + { + "epoch": 0.07, + "grad_norm": 3.607144591060809, + "learning_rate": 9.964187715021673e-06, + "loss": 0.5249, + "step": 763 + }, + { + "epoch": 0.07, + "grad_norm": 9.116209911924708, + "learning_rate": 9.964017587939748e-06, + "loss": 0.7882, + "step": 764 + }, + { + "epoch": 0.07, + 
"grad_norm": 14.908337301161415, + "learning_rate": 9.963847059177774e-06, + "loss": 0.7946, + "step": 765 + }, + { + "epoch": 0.07, + "grad_norm": 10.565344487243234, + "learning_rate": 9.963676128749554e-06, + "loss": 0.8763, + "step": 766 + }, + { + "epoch": 0.07, + "grad_norm": 22.939338747642058, + "learning_rate": 9.963504796668918e-06, + "loss": 0.9848, + "step": 767 + }, + { + "epoch": 0.07, + "grad_norm": 9.938362194043929, + "learning_rate": 9.963333062949732e-06, + "loss": 0.8611, + "step": 768 + }, + { + "epoch": 0.07, + "grad_norm": 11.934000751877056, + "learning_rate": 9.963160927605888e-06, + "loss": 0.9683, + "step": 769 + }, + { + "epoch": 0.07, + "grad_norm": 17.16643097341052, + "learning_rate": 9.96298839065132e-06, + "loss": 0.8923, + "step": 770 + }, + { + "epoch": 0.07, + "grad_norm": 13.263528759137674, + "learning_rate": 9.962815452099985e-06, + "loss": 0.9364, + "step": 771 + }, + { + "epoch": 0.07, + "grad_norm": 10.106282413249211, + "learning_rate": 9.96264211196588e-06, + "loss": 0.8541, + "step": 772 + }, + { + "epoch": 0.07, + "grad_norm": 11.426514939813826, + "learning_rate": 9.96246837026303e-06, + "loss": 0.9194, + "step": 773 + }, + { + "epoch": 0.07, + "grad_norm": 27.19767497071208, + "learning_rate": 9.962294227005494e-06, + "loss": 0.9121, + "step": 774 + }, + { + "epoch": 0.07, + "grad_norm": 16.336959965391415, + "learning_rate": 9.962119682207364e-06, + "loss": 0.9702, + "step": 775 + }, + { + "epoch": 0.07, + "grad_norm": 12.818975598967768, + "learning_rate": 9.961944735882762e-06, + "loss": 1.0659, + "step": 776 + }, + { + "epoch": 0.07, + "grad_norm": 18.45232654232473, + "learning_rate": 9.961769388045846e-06, + "loss": 0.9291, + "step": 777 + }, + { + "epoch": 0.07, + "grad_norm": 13.69968415179356, + "learning_rate": 9.961593638710805e-06, + "loss": 0.7493, + "step": 778 + }, + { + "epoch": 0.07, + "grad_norm": 3.2824403505956417, + "learning_rate": 9.961417487891861e-06, + "loss": 0.537, + "step": 779 + }, + { + "epoch": 0.07, + "grad_norm": 10.988119157185617, + "learning_rate": 9.961240935603264e-06, + "loss": 0.7433, + "step": 780 + }, + { + "epoch": 0.07, + "grad_norm": 14.345185883993423, + "learning_rate": 9.961063981859307e-06, + "loss": 0.8537, + "step": 781 + }, + { + "epoch": 0.07, + "grad_norm": 9.363394643772713, + "learning_rate": 9.960886626674302e-06, + "loss": 0.809, + "step": 782 + }, + { + "epoch": 0.07, + "grad_norm": 16.057759874573325, + "learning_rate": 9.960708870062604e-06, + "loss": 0.9384, + "step": 783 + }, + { + "epoch": 0.07, + "grad_norm": 9.970687803823957, + "learning_rate": 9.960530712038597e-06, + "loss": 0.7209, + "step": 784 + }, + { + "epoch": 0.07, + "grad_norm": 11.622307042987789, + "learning_rate": 9.960352152616695e-06, + "loss": 0.7443, + "step": 785 + }, + { + "epoch": 0.07, + "grad_norm": 10.006409746334747, + "learning_rate": 9.960173191811347e-06, + "loss": 0.9454, + "step": 786 + }, + { + "epoch": 0.07, + "grad_norm": 13.067850960863385, + "learning_rate": 9.959993829637037e-06, + "loss": 0.9166, + "step": 787 + }, + { + "epoch": 0.07, + "grad_norm": 9.583382520578855, + "learning_rate": 9.959814066108276e-06, + "loss": 0.8529, + "step": 788 + }, + { + "epoch": 0.07, + "grad_norm": 15.992968804301332, + "learning_rate": 9.95963390123961e-06, + "loss": 0.9372, + "step": 789 + }, + { + "epoch": 0.07, + "grad_norm": 9.681690711718945, + "learning_rate": 9.959453335045622e-06, + "loss": 0.8839, + "step": 790 + }, + { + "epoch": 0.07, + "grad_norm": 16.043346299643304, + "learning_rate": 
9.959272367540916e-06, + "loss": 1.0008, + "step": 791 + }, + { + "epoch": 0.07, + "grad_norm": 8.572337164417537, + "learning_rate": 9.959090998740141e-06, + "loss": 0.9119, + "step": 792 + }, + { + "epoch": 0.07, + "grad_norm": 4.106405364587775, + "learning_rate": 9.95890922865797e-06, + "loss": 0.4965, + "step": 793 + }, + { + "epoch": 0.07, + "grad_norm": 9.750599836251746, + "learning_rate": 9.958727057309115e-06, + "loss": 0.8351, + "step": 794 + }, + { + "epoch": 0.07, + "grad_norm": 10.530717538796388, + "learning_rate": 9.958544484708314e-06, + "loss": 1.0007, + "step": 795 + }, + { + "epoch": 0.07, + "grad_norm": 9.285437084025967, + "learning_rate": 9.958361510870342e-06, + "loss": 0.7657, + "step": 796 + }, + { + "epoch": 0.07, + "grad_norm": 4.618462240774491, + "learning_rate": 9.958178135810004e-06, + "loss": 0.4823, + "step": 797 + }, + { + "epoch": 0.07, + "grad_norm": 12.242857995533848, + "learning_rate": 9.957994359542137e-06, + "loss": 0.9048, + "step": 798 + }, + { + "epoch": 0.07, + "grad_norm": 11.391161844234732, + "learning_rate": 9.957810182081616e-06, + "loss": 1.0183, + "step": 799 + }, + { + "epoch": 0.07, + "grad_norm": 9.672980395018998, + "learning_rate": 9.957625603443342e-06, + "loss": 0.8178, + "step": 800 + }, + { + "epoch": 0.07, + "grad_norm": 4.203090567876935, + "learning_rate": 9.95744062364225e-06, + "loss": 0.5152, + "step": 801 + }, + { + "epoch": 0.07, + "grad_norm": 9.410978544807948, + "learning_rate": 9.957255242693309e-06, + "loss": 0.8062, + "step": 802 + }, + { + "epoch": 0.07, + "grad_norm": 9.959518070907853, + "learning_rate": 9.95706946061152e-06, + "loss": 0.8275, + "step": 803 + }, + { + "epoch": 0.07, + "grad_norm": 14.912865391461608, + "learning_rate": 9.956883277411914e-06, + "loss": 1.0109, + "step": 804 + }, + { + "epoch": 0.07, + "grad_norm": 9.103350386313283, + "learning_rate": 9.956696693109563e-06, + "loss": 0.749, + "step": 805 + }, + { + "epoch": 0.07, + "grad_norm": 6.148124603912607, + "learning_rate": 9.956509707719556e-06, + "loss": 0.9101, + "step": 806 + }, + { + "epoch": 0.07, + "grad_norm": 12.434799471188628, + "learning_rate": 9.95632232125703e-06, + "loss": 0.8516, + "step": 807 + }, + { + "epoch": 0.07, + "grad_norm": 6.259246782313333, + "learning_rate": 9.956134533737147e-06, + "loss": 0.8637, + "step": 808 + }, + { + "epoch": 0.07, + "grad_norm": 14.269340848451177, + "learning_rate": 9.955946345175101e-06, + "loss": 0.9068, + "step": 809 + }, + { + "epoch": 0.07, + "grad_norm": 23.149642174188873, + "learning_rate": 9.95575775558612e-06, + "loss": 0.8714, + "step": 810 + }, + { + "epoch": 0.07, + "grad_norm": 11.150149764774255, + "learning_rate": 9.955568764985464e-06, + "loss": 0.9207, + "step": 811 + }, + { + "epoch": 0.07, + "grad_norm": 13.78923684470478, + "learning_rate": 9.955379373388429e-06, + "loss": 0.8213, + "step": 812 + }, + { + "epoch": 0.07, + "grad_norm": 2.6651229782215573, + "learning_rate": 9.955189580810337e-06, + "loss": 0.4677, + "step": 813 + }, + { + "epoch": 0.07, + "grad_norm": 9.462739211212114, + "learning_rate": 9.954999387266546e-06, + "loss": 0.8368, + "step": 814 + }, + { + "epoch": 0.07, + "grad_norm": 2.9158415995700544, + "learning_rate": 9.954808792772447e-06, + "loss": 0.5092, + "step": 815 + }, + { + "epoch": 0.07, + "grad_norm": 8.418955506090782, + "learning_rate": 9.954617797343463e-06, + "loss": 0.8517, + "step": 816 + }, + { + "epoch": 0.07, + "grad_norm": 10.808930669840215, + "learning_rate": 9.954426400995049e-06, + "loss": 0.7804, + "step": 817 + }, + { + 
"epoch": 0.07, + "grad_norm": 9.576383436131106, + "learning_rate": 9.954234603742691e-06, + "loss": 0.9444, + "step": 818 + }, + { + "epoch": 0.07, + "grad_norm": 10.034997385140562, + "learning_rate": 9.95404240560191e-06, + "loss": 0.9552, + "step": 819 + }, + { + "epoch": 0.07, + "grad_norm": 14.513620290569301, + "learning_rate": 9.953849806588258e-06, + "loss": 1.002, + "step": 820 + }, + { + "epoch": 0.07, + "grad_norm": 10.683678146530621, + "learning_rate": 9.95365680671732e-06, + "loss": 0.9292, + "step": 821 + }, + { + "epoch": 0.07, + "grad_norm": 8.690705100436654, + "learning_rate": 9.953463406004714e-06, + "loss": 0.8466, + "step": 822 + }, + { + "epoch": 0.07, + "grad_norm": 13.850383711158889, + "learning_rate": 9.953269604466088e-06, + "loss": 0.9111, + "step": 823 + }, + { + "epoch": 0.07, + "grad_norm": 13.538719708241702, + "learning_rate": 9.953075402117124e-06, + "loss": 0.977, + "step": 824 + }, + { + "epoch": 0.07, + "grad_norm": 8.287302147771692, + "learning_rate": 9.952880798973539e-06, + "loss": 0.8602, + "step": 825 + }, + { + "epoch": 0.07, + "grad_norm": 8.206227169908802, + "learning_rate": 9.952685795051078e-06, + "loss": 0.847, + "step": 826 + }, + { + "epoch": 0.07, + "grad_norm": 16.05493364293321, + "learning_rate": 9.95249039036552e-06, + "loss": 0.8984, + "step": 827 + }, + { + "epoch": 0.07, + "grad_norm": 22.013174348820886, + "learning_rate": 9.952294584932678e-06, + "loss": 0.9815, + "step": 828 + }, + { + "epoch": 0.07, + "grad_norm": 3.379327509472447, + "learning_rate": 9.952098378768396e-06, + "loss": 0.5795, + "step": 829 + }, + { + "epoch": 0.07, + "grad_norm": 7.784352653786437, + "learning_rate": 9.951901771888553e-06, + "loss": 0.895, + "step": 830 + }, + { + "epoch": 0.07, + "grad_norm": 9.521622934137985, + "learning_rate": 9.951704764309053e-06, + "loss": 0.7479, + "step": 831 + }, + { + "epoch": 0.07, + "grad_norm": 10.851856286244637, + "learning_rate": 9.95150735604584e-06, + "loss": 0.9747, + "step": 832 + }, + { + "epoch": 0.07, + "grad_norm": 11.031017599828411, + "learning_rate": 9.95130954711489e-06, + "loss": 0.9458, + "step": 833 + }, + { + "epoch": 0.07, + "grad_norm": 12.10221875222526, + "learning_rate": 9.951111337532206e-06, + "loss": 0.858, + "step": 834 + }, + { + "epoch": 0.07, + "grad_norm": 12.913846390761568, + "learning_rate": 9.950912727313828e-06, + "loss": 0.8523, + "step": 835 + }, + { + "epoch": 0.07, + "grad_norm": 16.42477490122651, + "learning_rate": 9.950713716475829e-06, + "loss": 0.9088, + "step": 836 + }, + { + "epoch": 0.07, + "grad_norm": 27.028218221339785, + "learning_rate": 9.95051430503431e-06, + "loss": 0.9065, + "step": 837 + }, + { + "epoch": 0.07, + "grad_norm": 11.455328971068887, + "learning_rate": 9.950314493005409e-06, + "loss": 0.8293, + "step": 838 + }, + { + "epoch": 0.07, + "grad_norm": 15.15347847651865, + "learning_rate": 9.950114280405293e-06, + "loss": 0.8416, + "step": 839 + }, + { + "epoch": 0.07, + "grad_norm": 10.23684899883355, + "learning_rate": 9.949913667250163e-06, + "loss": 0.8507, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 2.8133093427932203, + "learning_rate": 9.949712653556255e-06, + "loss": 0.5198, + "step": 841 + }, + { + "epoch": 0.07, + "grad_norm": 12.053778228223457, + "learning_rate": 9.94951123933983e-06, + "loss": 0.8186, + "step": 842 + }, + { + "epoch": 0.07, + "grad_norm": 10.529611908218245, + "learning_rate": 9.94930942461719e-06, + "loss": 0.9879, + "step": 843 + }, + { + "epoch": 0.07, + "grad_norm": 10.077953842602453, + "learning_rate": 
9.949107209404664e-06, + "loss": 0.9397, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 12.18880463881648, + "learning_rate": 9.948904593718615e-06, + "loss": 1.031, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 11.863946413241822, + "learning_rate": 9.948701577575439e-06, + "loss": 0.8385, + "step": 846 + }, + { + "epoch": 0.07, + "grad_norm": 9.47363052529147, + "learning_rate": 9.948498160991562e-06, + "loss": 0.8853, + "step": 847 + }, + { + "epoch": 0.07, + "grad_norm": 12.661082929085444, + "learning_rate": 9.948294343983446e-06, + "loss": 0.9378, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 8.467509038932908, + "learning_rate": 9.948090126567583e-06, + "loss": 0.9787, + "step": 849 + }, + { + "epoch": 0.07, + "grad_norm": 11.868586813435527, + "learning_rate": 9.947885508760495e-06, + "loss": 0.7937, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 11.108472776123715, + "learning_rate": 9.947680490578745e-06, + "loss": 0.9761, + "step": 851 + }, + { + "epoch": 0.07, + "grad_norm": 13.405469911473556, + "learning_rate": 9.94747507203892e-06, + "loss": 0.9423, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 11.03719388567328, + "learning_rate": 9.947269253157639e-06, + "loss": 0.8364, + "step": 853 + }, + { + "epoch": 0.08, + "grad_norm": 4.110480097353059, + "learning_rate": 9.947063033951561e-06, + "loss": 0.5285, + "step": 854 + }, + { + "epoch": 0.08, + "grad_norm": 11.750068587690173, + "learning_rate": 9.94685641443737e-06, + "loss": 0.8593, + "step": 855 + }, + { + "epoch": 0.08, + "grad_norm": 24.550044736674874, + "learning_rate": 9.94664939463179e-06, + "loss": 0.7679, + "step": 856 + }, + { + "epoch": 0.08, + "grad_norm": 21.22251266331987, + "learning_rate": 9.946441974551566e-06, + "loss": 0.8649, + "step": 857 + }, + { + "epoch": 0.08, + "grad_norm": 8.814278918493025, + "learning_rate": 9.946234154213488e-06, + "loss": 0.9201, + "step": 858 + }, + { + "epoch": 0.08, + "grad_norm": 12.155104160756027, + "learning_rate": 9.94602593363437e-06, + "loss": 0.8078, + "step": 859 + }, + { + "epoch": 0.08, + "grad_norm": 11.347079927073729, + "learning_rate": 9.945817312831057e-06, + "loss": 0.819, + "step": 860 + }, + { + "epoch": 0.08, + "grad_norm": 10.900475084958723, + "learning_rate": 9.945608291820437e-06, + "loss": 0.7295, + "step": 861 + }, + { + "epoch": 0.08, + "grad_norm": 28.09708939494232, + "learning_rate": 9.94539887061942e-06, + "loss": 0.8722, + "step": 862 + }, + { + "epoch": 0.08, + "grad_norm": 13.305018417680271, + "learning_rate": 9.945189049244951e-06, + "loss": 0.8846, + "step": 863 + }, + { + "epoch": 0.08, + "grad_norm": 3.5156920928589783, + "learning_rate": 9.944978827714013e-06, + "loss": 0.5217, + "step": 864 + }, + { + "epoch": 0.08, + "grad_norm": 18.80318122347223, + "learning_rate": 9.944768206043613e-06, + "loss": 0.8301, + "step": 865 + }, + { + "epoch": 0.08, + "grad_norm": 19.970035177730548, + "learning_rate": 9.944557184250793e-06, + "loss": 0.9345, + "step": 866 + }, + { + "epoch": 0.08, + "grad_norm": 11.713040319762273, + "learning_rate": 9.944345762352631e-06, + "loss": 0.7251, + "step": 867 + }, + { + "epoch": 0.08, + "grad_norm": 9.669007831302386, + "learning_rate": 9.944133940366236e-06, + "loss": 0.8831, + "step": 868 + }, + { + "epoch": 0.08, + "grad_norm": 10.948786947233312, + "learning_rate": 9.943921718308746e-06, + "loss": 0.8545, + "step": 869 + }, + { + "epoch": 0.08, + "grad_norm": 12.081786616577705, + "learning_rate": 9.943709096197334e-06, + "loss": 0.8315, + "step": 870 + }, + 
{ + "epoch": 0.08, + "grad_norm": 13.22503075361401, + "learning_rate": 9.943496074049206e-06, + "loss": 0.8071, + "step": 871 + }, + { + "epoch": 0.08, + "grad_norm": 10.041295307136247, + "learning_rate": 9.943282651881599e-06, + "loss": 0.8367, + "step": 872 + }, + { + "epoch": 0.08, + "grad_norm": 11.150385555882348, + "learning_rate": 9.943068829711781e-06, + "loss": 0.8823, + "step": 873 + }, + { + "epoch": 0.08, + "grad_norm": 10.319486248388827, + "learning_rate": 9.942854607557058e-06, + "loss": 1.0307, + "step": 874 + }, + { + "epoch": 0.08, + "grad_norm": 7.072356747031662, + "learning_rate": 9.942639985434762e-06, + "loss": 0.797, + "step": 875 + }, + { + "epoch": 0.08, + "grad_norm": 39.7692914161999, + "learning_rate": 9.942424963362259e-06, + "loss": 0.9342, + "step": 876 + }, + { + "epoch": 0.08, + "grad_norm": 9.137034893247106, + "learning_rate": 9.942209541356948e-06, + "loss": 0.8659, + "step": 877 + }, + { + "epoch": 0.08, + "grad_norm": 17.34303134601401, + "learning_rate": 9.941993719436263e-06, + "loss": 0.7167, + "step": 878 + }, + { + "epoch": 0.08, + "grad_norm": 12.195729206227494, + "learning_rate": 9.941777497617666e-06, + "loss": 0.7135, + "step": 879 + }, + { + "epoch": 0.08, + "grad_norm": 13.59629970393336, + "learning_rate": 9.941560875918655e-06, + "loss": 0.9221, + "step": 880 + }, + { + "epoch": 0.08, + "grad_norm": 9.564297583686754, + "learning_rate": 9.941343854356757e-06, + "loss": 0.8128, + "step": 881 + }, + { + "epoch": 0.08, + "grad_norm": 11.580066837829389, + "learning_rate": 9.941126432949536e-06, + "loss": 0.8781, + "step": 882 + }, + { + "epoch": 0.08, + "grad_norm": 10.687125583400558, + "learning_rate": 9.94090861171458e-06, + "loss": 1.077, + "step": 883 + }, + { + "epoch": 0.08, + "grad_norm": 11.702947530351869, + "learning_rate": 9.94069039066952e-06, + "loss": 0.8727, + "step": 884 + }, + { + "epoch": 0.08, + "grad_norm": 10.681506838476606, + "learning_rate": 9.940471769832011e-06, + "loss": 0.6872, + "step": 885 + }, + { + "epoch": 0.08, + "grad_norm": 13.592997592878751, + "learning_rate": 9.940252749219747e-06, + "loss": 0.9285, + "step": 886 + }, + { + "epoch": 0.08, + "grad_norm": 8.488103205621746, + "learning_rate": 9.940033328850445e-06, + "loss": 0.9254, + "step": 887 + }, + { + "epoch": 0.08, + "grad_norm": 10.802226713636303, + "learning_rate": 9.939813508741866e-06, + "loss": 0.7946, + "step": 888 + }, + { + "epoch": 0.08, + "grad_norm": 8.521901510308062, + "learning_rate": 9.939593288911793e-06, + "loss": 0.8288, + "step": 889 + }, + { + "epoch": 0.08, + "grad_norm": 3.084712279248298, + "learning_rate": 9.93937266937805e-06, + "loss": 0.5164, + "step": 890 + }, + { + "epoch": 0.08, + "grad_norm": 9.434464981036253, + "learning_rate": 9.939151650158483e-06, + "loss": 1.0206, + "step": 891 + }, + { + "epoch": 0.08, + "grad_norm": 14.119174182994094, + "learning_rate": 9.938930231270982e-06, + "loss": 0.967, + "step": 892 + }, + { + "epoch": 0.08, + "grad_norm": 2.824334817119243, + "learning_rate": 9.938708412733463e-06, + "loss": 0.4483, + "step": 893 + }, + { + "epoch": 0.08, + "grad_norm": 9.58685137478499, + "learning_rate": 9.938486194563874e-06, + "loss": 0.7568, + "step": 894 + }, + { + "epoch": 0.08, + "grad_norm": 10.198816139336431, + "learning_rate": 9.938263576780195e-06, + "loss": 0.7675, + "step": 895 + }, + { + "epoch": 0.08, + "grad_norm": 10.776912509890169, + "learning_rate": 9.938040559400445e-06, + "loss": 0.7389, + "step": 896 + }, + { + "epoch": 0.08, + "grad_norm": 16.31671846539181, + 
"learning_rate": 9.937817142442666e-06, + "loss": 0.9574, + "step": 897 + }, + { + "epoch": 0.08, + "grad_norm": 8.149818308431707, + "learning_rate": 9.937593325924936e-06, + "loss": 0.8699, + "step": 898 + }, + { + "epoch": 0.08, + "grad_norm": 14.781930315954469, + "learning_rate": 9.937369109865368e-06, + "loss": 0.8197, + "step": 899 + }, + { + "epoch": 0.08, + "grad_norm": 10.293454291305178, + "learning_rate": 9.937144494282104e-06, + "loss": 1.0076, + "step": 900 + }, + { + "epoch": 0.08, + "grad_norm": 10.269322398236133, + "learning_rate": 9.936919479193321e-06, + "loss": 0.9372, + "step": 901 + }, + { + "epoch": 0.08, + "grad_norm": 3.764235030267584, + "learning_rate": 9.936694064617228e-06, + "loss": 0.4642, + "step": 902 + }, + { + "epoch": 0.08, + "grad_norm": 11.482865723602647, + "learning_rate": 9.93646825057206e-06, + "loss": 0.7819, + "step": 903 + }, + { + "epoch": 0.08, + "grad_norm": 16.514003388718788, + "learning_rate": 9.936242037076092e-06, + "loss": 0.9133, + "step": 904 + }, + { + "epoch": 0.08, + "grad_norm": 12.086257412417888, + "learning_rate": 9.936015424147632e-06, + "loss": 0.7903, + "step": 905 + }, + { + "epoch": 0.08, + "grad_norm": 10.329174810362522, + "learning_rate": 9.935788411805011e-06, + "loss": 0.9304, + "step": 906 + }, + { + "epoch": 0.08, + "grad_norm": 7.366480171975912, + "learning_rate": 9.935561000066604e-06, + "loss": 0.8835, + "step": 907 + }, + { + "epoch": 0.08, + "grad_norm": 10.027304026952802, + "learning_rate": 9.935333188950812e-06, + "loss": 0.8874, + "step": 908 + }, + { + "epoch": 0.08, + "grad_norm": 10.93623142346918, + "learning_rate": 9.935104978476067e-06, + "loss": 0.8137, + "step": 909 + }, + { + "epoch": 0.08, + "grad_norm": 16.137423076616226, + "learning_rate": 9.934876368660836e-06, + "loss": 0.9317, + "step": 910 + }, + { + "epoch": 0.08, + "grad_norm": 9.934101749106093, + "learning_rate": 9.934647359523619e-06, + "loss": 0.8309, + "step": 911 + }, + { + "epoch": 0.08, + "grad_norm": 15.733517070532649, + "learning_rate": 9.934417951082945e-06, + "loss": 0.9912, + "step": 912 + }, + { + "epoch": 0.08, + "grad_norm": 13.382114735196163, + "learning_rate": 9.934188143357378e-06, + "loss": 0.9097, + "step": 913 + }, + { + "epoch": 0.08, + "grad_norm": 13.972969402734357, + "learning_rate": 9.933957936365515e-06, + "loss": 0.8558, + "step": 914 + }, + { + "epoch": 0.08, + "grad_norm": 17.213789413206168, + "learning_rate": 9.933727330125982e-06, + "loss": 1.0148, + "step": 915 + }, + { + "epoch": 0.08, + "grad_norm": 11.335480076843377, + "learning_rate": 9.93349632465744e-06, + "loss": 0.9363, + "step": 916 + }, + { + "epoch": 0.08, + "grad_norm": 6.9449605480402115, + "learning_rate": 9.933264919978583e-06, + "loss": 0.8028, + "step": 917 + }, + { + "epoch": 0.08, + "grad_norm": 16.279842741609176, + "learning_rate": 9.933033116108135e-06, + "loss": 0.9009, + "step": 918 + }, + { + "epoch": 0.08, + "grad_norm": 14.79715649406054, + "learning_rate": 9.932800913064852e-06, + "loss": 1.0012, + "step": 919 + }, + { + "epoch": 0.08, + "grad_norm": 14.859730172686252, + "learning_rate": 9.932568310867524e-06, + "loss": 0.9944, + "step": 920 + }, + { + "epoch": 0.08, + "grad_norm": 16.06554463629298, + "learning_rate": 9.932335309534976e-06, + "loss": 0.8997, + "step": 921 + }, + { + "epoch": 0.08, + "grad_norm": 2.601702766393243, + "learning_rate": 9.932101909086056e-06, + "loss": 0.4881, + "step": 922 + }, + { + "epoch": 0.08, + "grad_norm": 17.055075032351365, + "learning_rate": 9.931868109539654e-06, + "loss": 0.9, 
+ "step": 923 + }, + { + "epoch": 0.08, + "grad_norm": 10.322031801333845, + "learning_rate": 9.931633910914688e-06, + "loss": 0.9066, + "step": 924 + }, + { + "epoch": 0.08, + "grad_norm": 21.043897093576536, + "learning_rate": 9.931399313230112e-06, + "loss": 0.9113, + "step": 925 + }, + { + "epoch": 0.08, + "grad_norm": 10.85208870507991, + "learning_rate": 9.931164316504905e-06, + "loss": 0.9033, + "step": 926 + }, + { + "epoch": 0.08, + "grad_norm": 10.686305602143022, + "learning_rate": 9.930928920758083e-06, + "loss": 0.9602, + "step": 927 + }, + { + "epoch": 0.08, + "grad_norm": 10.781219953955667, + "learning_rate": 9.930693126008698e-06, + "loss": 0.8585, + "step": 928 + }, + { + "epoch": 0.08, + "grad_norm": 13.864044528715018, + "learning_rate": 9.930456932275825e-06, + "loss": 0.9402, + "step": 929 + }, + { + "epoch": 0.08, + "grad_norm": 13.741037788505073, + "learning_rate": 9.930220339578576e-06, + "loss": 0.8362, + "step": 930 + }, + { + "epoch": 0.08, + "grad_norm": 9.175127270115464, + "learning_rate": 9.929983347936101e-06, + "loss": 0.7752, + "step": 931 + }, + { + "epoch": 0.08, + "grad_norm": 14.629407777733052, + "learning_rate": 9.929745957367573e-06, + "loss": 1.028, + "step": 932 + }, + { + "epoch": 0.08, + "grad_norm": 8.33887250721653, + "learning_rate": 9.929508167892204e-06, + "loss": 0.8053, + "step": 933 + }, + { + "epoch": 0.08, + "grad_norm": 9.088595070304937, + "learning_rate": 9.929269979529233e-06, + "loss": 0.9086, + "step": 934 + }, + { + "epoch": 0.08, + "grad_norm": 14.83703412270857, + "learning_rate": 9.929031392297935e-06, + "loss": 0.9607, + "step": 935 + }, + { + "epoch": 0.08, + "grad_norm": 12.35326611889575, + "learning_rate": 9.928792406217615e-06, + "loss": 0.9868, + "step": 936 + }, + { + "epoch": 0.08, + "grad_norm": 22.271983725555177, + "learning_rate": 9.928553021307612e-06, + "loss": 0.7369, + "step": 937 + }, + { + "epoch": 0.08, + "grad_norm": 9.684518028763677, + "learning_rate": 9.928313237587297e-06, + "loss": 0.8436, + "step": 938 + }, + { + "epoch": 0.08, + "grad_norm": 9.97977636539667, + "learning_rate": 9.928073055076073e-06, + "loss": 0.8653, + "step": 939 + }, + { + "epoch": 0.08, + "grad_norm": 67.6163101070056, + "learning_rate": 9.927832473793376e-06, + "loss": 1.0219, + "step": 940 + }, + { + "epoch": 0.08, + "grad_norm": 22.744624939326126, + "learning_rate": 9.92759149375867e-06, + "loss": 0.8768, + "step": 941 + }, + { + "epoch": 0.08, + "grad_norm": 16.050392903112876, + "learning_rate": 9.927350114991456e-06, + "loss": 0.9487, + "step": 942 + }, + { + "epoch": 0.08, + "grad_norm": 12.685293851050337, + "learning_rate": 9.92710833751127e-06, + "loss": 0.8118, + "step": 943 + }, + { + "epoch": 0.08, + "grad_norm": 18.15446008014423, + "learning_rate": 9.926866161337672e-06, + "loss": 0.7783, + "step": 944 + }, + { + "epoch": 0.08, + "grad_norm": 8.229738010172007, + "learning_rate": 9.92662358649026e-06, + "loss": 0.8147, + "step": 945 + }, + { + "epoch": 0.08, + "grad_norm": 15.190308159941702, + "learning_rate": 9.926380612988661e-06, + "loss": 1.008, + "step": 946 + }, + { + "epoch": 0.08, + "grad_norm": 10.030240096559634, + "learning_rate": 9.926137240852539e-06, + "loss": 0.8376, + "step": 947 + }, + { + "epoch": 0.08, + "grad_norm": 12.989932764722393, + "learning_rate": 9.925893470101583e-06, + "loss": 1.0356, + "step": 948 + }, + { + "epoch": 0.08, + "grad_norm": 10.869922123437933, + "learning_rate": 9.925649300755524e-06, + "loss": 0.8676, + "step": 949 + }, + { + "epoch": 0.08, + "grad_norm": 
3.139471169547921, + "learning_rate": 9.925404732834116e-06, + "loss": 0.4972, + "step": 950 + }, + { + "epoch": 0.08, + "grad_norm": 11.049985805752195, + "learning_rate": 9.92515976635715e-06, + "loss": 1.0503, + "step": 951 + }, + { + "epoch": 0.08, + "grad_norm": 9.553285519603644, + "learning_rate": 9.92491440134445e-06, + "loss": 1.0252, + "step": 952 + }, + { + "epoch": 0.08, + "grad_norm": 15.410563330308566, + "learning_rate": 9.924668637815866e-06, + "loss": 0.9744, + "step": 953 + }, + { + "epoch": 0.08, + "grad_norm": 9.65076739743317, + "learning_rate": 9.92442247579129e-06, + "loss": 0.7822, + "step": 954 + }, + { + "epoch": 0.08, + "grad_norm": 15.700383107733016, + "learning_rate": 9.924175915290637e-06, + "loss": 1.0919, + "step": 955 + }, + { + "epoch": 0.08, + "grad_norm": 3.4779733880255037, + "learning_rate": 9.92392895633386e-06, + "loss": 0.4668, + "step": 956 + }, + { + "epoch": 0.08, + "grad_norm": 11.051492500455785, + "learning_rate": 9.923681598940943e-06, + "loss": 0.8694, + "step": 957 + }, + { + "epoch": 0.08, + "grad_norm": 4.686323501963412, + "learning_rate": 9.9234338431319e-06, + "loss": 0.5581, + "step": 958 + }, + { + "epoch": 0.08, + "grad_norm": 23.208948537648336, + "learning_rate": 9.923185688926783e-06, + "loss": 0.8415, + "step": 959 + }, + { + "epoch": 0.08, + "grad_norm": 8.231245049473818, + "learning_rate": 9.922937136345668e-06, + "loss": 0.6199, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 3.4755614928885072, + "learning_rate": 9.922688185408666e-06, + "loss": 0.5043, + "step": 961 + }, + { + "epoch": 0.08, + "grad_norm": 14.389781426889087, + "learning_rate": 9.922438836135928e-06, + "loss": 0.8372, + "step": 962 + }, + { + "epoch": 0.08, + "grad_norm": 31.059490359409807, + "learning_rate": 9.922189088547624e-06, + "loss": 0.7856, + "step": 963 + }, + { + "epoch": 0.08, + "grad_norm": 10.917091215769576, + "learning_rate": 9.92193894266397e-06, + "loss": 1.0667, + "step": 964 + }, + { + "epoch": 0.08, + "grad_norm": 11.98691905940743, + "learning_rate": 9.921688398505202e-06, + "loss": 0.9779, + "step": 965 + }, + { + "epoch": 0.08, + "grad_norm": 12.92805445259288, + "learning_rate": 9.921437456091596e-06, + "loss": 0.9076, + "step": 966 + }, + { + "epoch": 0.08, + "grad_norm": 13.185673823689545, + "learning_rate": 9.92118611544346e-06, + "loss": 0.5882, + "step": 967 + }, + { + "epoch": 0.09, + "grad_norm": 12.186606671802943, + "learning_rate": 9.920934376581126e-06, + "loss": 0.8302, + "step": 968 + }, + { + "epoch": 0.09, + "grad_norm": 11.930723445965311, + "learning_rate": 9.920682239524968e-06, + "loss": 0.9175, + "step": 969 + }, + { + "epoch": 0.09, + "grad_norm": 20.02420821818242, + "learning_rate": 9.920429704295392e-06, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.09, + "grad_norm": 13.531626592731914, + "learning_rate": 9.920176770912825e-06, + "loss": 0.869, + "step": 971 + }, + { + "epoch": 0.09, + "grad_norm": 11.210072479369748, + "learning_rate": 9.91992343939774e-06, + "loss": 0.9315, + "step": 972 + }, + { + "epoch": 0.09, + "grad_norm": 13.060211726143757, + "learning_rate": 9.919669709770634e-06, + "loss": 0.9931, + "step": 973 + }, + { + "epoch": 0.09, + "grad_norm": 15.056180682037496, + "learning_rate": 9.919415582052037e-06, + "loss": 0.8859, + "step": 974 + }, + { + "epoch": 0.09, + "grad_norm": 13.801049977923142, + "learning_rate": 9.919161056262517e-06, + "loss": 0.9499, + "step": 975 + }, + { + "epoch": 0.09, + "grad_norm": 14.480772033156354, + "learning_rate": 9.918906132422663e-06, + 
"loss": 1.0048, + "step": 976 + }, + { + "epoch": 0.09, + "grad_norm": 17.664162834927776, + "learning_rate": 9.91865081055311e-06, + "loss": 1.0367, + "step": 977 + }, + { + "epoch": 0.09, + "grad_norm": 2.9162696522450613, + "learning_rate": 9.918395090674514e-06, + "loss": 0.4953, + "step": 978 + }, + { + "epoch": 0.09, + "grad_norm": 10.22089397541415, + "learning_rate": 9.91813897280757e-06, + "loss": 0.8855, + "step": 979 + }, + { + "epoch": 0.09, + "grad_norm": 9.99517684282474, + "learning_rate": 9.917882456972999e-06, + "loss": 0.8267, + "step": 980 + }, + { + "epoch": 0.09, + "grad_norm": 11.589565663065116, + "learning_rate": 9.917625543191561e-06, + "loss": 0.8294, + "step": 981 + }, + { + "epoch": 0.09, + "grad_norm": 8.184368611060156, + "learning_rate": 9.917368231484045e-06, + "loss": 0.805, + "step": 982 + }, + { + "epoch": 0.09, + "grad_norm": 10.738241150497025, + "learning_rate": 9.91711052187127e-06, + "loss": 0.9254, + "step": 983 + }, + { + "epoch": 0.09, + "grad_norm": 14.792967580325586, + "learning_rate": 9.916852414374092e-06, + "loss": 0.7818, + "step": 984 + }, + { + "epoch": 0.09, + "grad_norm": 5.396206981011578, + "learning_rate": 9.916593909013394e-06, + "loss": 0.5432, + "step": 985 + }, + { + "epoch": 0.09, + "grad_norm": 10.013425259667484, + "learning_rate": 9.916335005810096e-06, + "loss": 0.9344, + "step": 986 + }, + { + "epoch": 0.09, + "grad_norm": 9.866987759371495, + "learning_rate": 9.916075704785148e-06, + "loss": 1.0138, + "step": 987 + }, + { + "epoch": 0.09, + "grad_norm": 9.213712102288875, + "learning_rate": 9.91581600595953e-06, + "loss": 0.8421, + "step": 988 + }, + { + "epoch": 0.09, + "grad_norm": 17.18393203612241, + "learning_rate": 9.91555590935426e-06, + "loss": 0.9646, + "step": 989 + }, + { + "epoch": 0.09, + "grad_norm": 8.773575008845182, + "learning_rate": 9.91529541499038e-06, + "loss": 0.8188, + "step": 990 + }, + { + "epoch": 0.09, + "grad_norm": 13.303819923634572, + "learning_rate": 9.915034522888972e-06, + "loss": 0.8597, + "step": 991 + }, + { + "epoch": 0.09, + "grad_norm": 14.809093215555999, + "learning_rate": 9.914773233071146e-06, + "loss": 1.017, + "step": 992 + }, + { + "epoch": 0.09, + "grad_norm": 15.46943541467123, + "learning_rate": 9.914511545558046e-06, + "loss": 0.8434, + "step": 993 + }, + { + "epoch": 0.09, + "grad_norm": 9.788951391313958, + "learning_rate": 9.914249460370846e-06, + "loss": 0.8754, + "step": 994 + }, + { + "epoch": 0.09, + "grad_norm": 2.429483574584447, + "learning_rate": 9.913986977530753e-06, + "loss": 0.4586, + "step": 995 + }, + { + "epoch": 0.09, + "grad_norm": 13.88613938011523, + "learning_rate": 9.91372409705901e-06, + "loss": 1.0159, + "step": 996 + }, + { + "epoch": 0.09, + "grad_norm": 9.189451541578157, + "learning_rate": 9.913460818976885e-06, + "loss": 0.7174, + "step": 997 + }, + { + "epoch": 0.09, + "grad_norm": 11.757648339149185, + "learning_rate": 9.913197143305685e-06, + "loss": 0.9757, + "step": 998 + }, + { + "epoch": 0.09, + "grad_norm": 13.112228104159989, + "learning_rate": 9.912933070066743e-06, + "loss": 0.7632, + "step": 999 + }, + { + "epoch": 0.09, + "grad_norm": 60.663789500063444, + "learning_rate": 9.912668599281433e-06, + "loss": 0.7915, + "step": 1000 + }, + { + "epoch": 0.09, + "grad_norm": 15.686354267754595, + "learning_rate": 9.912403730971148e-06, + "loss": 0.9654, + "step": 1001 + }, + { + "epoch": 0.09, + "grad_norm": 41.88238591530792, + "learning_rate": 9.912138465157325e-06, + "loss": 0.8293, + "step": 1002 + }, + { + "epoch": 0.09, + 
"grad_norm": 3.951100409267719, + "learning_rate": 9.911872801861428e-06, + "loss": 0.5615, + "step": 1003 + }, + { + "epoch": 0.09, + "grad_norm": 11.380109994855882, + "learning_rate": 9.911606741104955e-06, + "loss": 0.8165, + "step": 1004 + }, + { + "epoch": 0.09, + "grad_norm": 70.1118158369615, + "learning_rate": 9.911340282909437e-06, + "loss": 0.8587, + "step": 1005 + }, + { + "epoch": 0.09, + "grad_norm": 17.747315944059377, + "learning_rate": 9.91107342729643e-06, + "loss": 1.1259, + "step": 1006 + }, + { + "epoch": 0.09, + "grad_norm": 12.615765247641292, + "learning_rate": 9.91080617428753e-06, + "loss": 0.9842, + "step": 1007 + }, + { + "epoch": 0.09, + "grad_norm": 18.476193237697423, + "learning_rate": 9.910538523904366e-06, + "loss": 0.7371, + "step": 1008 + }, + { + "epoch": 0.09, + "grad_norm": 12.017268075184571, + "learning_rate": 9.910270476168589e-06, + "loss": 0.8787, + "step": 1009 + }, + { + "epoch": 0.09, + "grad_norm": 23.912489780867883, + "learning_rate": 9.910002031101895e-06, + "loss": 0.8393, + "step": 1010 + }, + { + "epoch": 0.09, + "grad_norm": 10.613813752331726, + "learning_rate": 9.909733188726005e-06, + "loss": 0.8967, + "step": 1011 + }, + { + "epoch": 0.09, + "grad_norm": 12.913662691782672, + "learning_rate": 9.909463949062671e-06, + "loss": 0.9616, + "step": 1012 + }, + { + "epoch": 0.09, + "grad_norm": 11.852133696241893, + "learning_rate": 9.909194312133681e-06, + "loss": 0.8075, + "step": 1013 + }, + { + "epoch": 0.09, + "grad_norm": 12.74929023867983, + "learning_rate": 9.908924277960855e-06, + "loss": 0.7741, + "step": 1014 + }, + { + "epoch": 0.09, + "grad_norm": 12.952161731042885, + "learning_rate": 9.90865384656604e-06, + "loss": 1.079, + "step": 1015 + }, + { + "epoch": 0.09, + "grad_norm": 11.072637345940507, + "learning_rate": 9.908383017971119e-06, + "loss": 0.8872, + "step": 1016 + }, + { + "epoch": 0.09, + "grad_norm": 26.545434181868572, + "learning_rate": 9.908111792198012e-06, + "loss": 0.9474, + "step": 1017 + }, + { + "epoch": 0.09, + "grad_norm": 16.134589089983116, + "learning_rate": 9.907840169268662e-06, + "loss": 0.7613, + "step": 1018 + }, + { + "epoch": 0.09, + "grad_norm": 2.329905132733442, + "learning_rate": 9.907568149205049e-06, + "loss": 0.4715, + "step": 1019 + }, + { + "epoch": 0.09, + "grad_norm": 13.424277643260298, + "learning_rate": 9.907295732029186e-06, + "loss": 0.8012, + "step": 1020 + }, + { + "epoch": 0.09, + "grad_norm": 12.48422531717434, + "learning_rate": 9.907022917763114e-06, + "loss": 0.9653, + "step": 1021 + }, + { + "epoch": 0.09, + "grad_norm": 9.140057743172182, + "learning_rate": 9.90674970642891e-06, + "loss": 0.8889, + "step": 1022 + }, + { + "epoch": 0.09, + "grad_norm": 7.770217378636645, + "learning_rate": 9.90647609804868e-06, + "loss": 0.7706, + "step": 1023 + }, + { + "epoch": 0.09, + "grad_norm": 14.528579437675631, + "learning_rate": 9.90620209264457e-06, + "loss": 0.8785, + "step": 1024 + }, + { + "epoch": 0.09, + "grad_norm": 9.706879472104422, + "learning_rate": 9.905927690238743e-06, + "loss": 0.9332, + "step": 1025 + }, + { + "epoch": 0.09, + "grad_norm": 11.013317877924726, + "learning_rate": 9.905652890853412e-06, + "loss": 0.8373, + "step": 1026 + }, + { + "epoch": 0.09, + "grad_norm": 2.8466294139424644, + "learning_rate": 9.905377694510806e-06, + "loss": 0.5282, + "step": 1027 + }, + { + "epoch": 0.09, + "grad_norm": 10.67954301863694, + "learning_rate": 9.905102101233198e-06, + "loss": 0.8998, + "step": 1028 + }, + { + "epoch": 0.09, + "grad_norm": 9.25163240294979, + 
"learning_rate": 9.904826111042889e-06, + "loss": 0.7212, + "step": 1029 + }, + { + "epoch": 0.09, + "grad_norm": 9.570470251369347, + "learning_rate": 9.904549723962206e-06, + "loss": 0.8396, + "step": 1030 + }, + { + "epoch": 0.09, + "grad_norm": 3.1932819544531243, + "learning_rate": 9.90427294001352e-06, + "loss": 0.54, + "step": 1031 + }, + { + "epoch": 0.09, + "grad_norm": 11.25328155226562, + "learning_rate": 9.903995759219226e-06, + "loss": 1.0439, + "step": 1032 + }, + { + "epoch": 0.09, + "grad_norm": 18.380395856194752, + "learning_rate": 9.903718181601752e-06, + "loss": 0.9592, + "step": 1033 + }, + { + "epoch": 0.09, + "grad_norm": 3.337282777596122, + "learning_rate": 9.90344020718356e-06, + "loss": 0.5524, + "step": 1034 + }, + { + "epoch": 0.09, + "grad_norm": 14.878716032767883, + "learning_rate": 9.90316183598714e-06, + "loss": 0.8709, + "step": 1035 + }, + { + "epoch": 0.09, + "grad_norm": 7.54740762218563, + "learning_rate": 9.902883068035023e-06, + "loss": 1.0638, + "step": 1036 + }, + { + "epoch": 0.09, + "grad_norm": 15.093088268314128, + "learning_rate": 9.902603903349763e-06, + "loss": 0.8805, + "step": 1037 + }, + { + "epoch": 0.09, + "grad_norm": 29.46225737282382, + "learning_rate": 9.90232434195395e-06, + "loss": 0.6896, + "step": 1038 + }, + { + "epoch": 0.09, + "grad_norm": 4.151329132567571, + "learning_rate": 9.902044383870207e-06, + "loss": 0.533, + "step": 1039 + }, + { + "epoch": 0.09, + "grad_norm": 14.026669552958916, + "learning_rate": 9.901764029121186e-06, + "loss": 1.0351, + "step": 1040 + }, + { + "epoch": 0.09, + "grad_norm": 10.625537707504252, + "learning_rate": 9.901483277729573e-06, + "loss": 0.8265, + "step": 1041 + }, + { + "epoch": 0.09, + "grad_norm": 12.640441335522908, + "learning_rate": 9.901202129718086e-06, + "loss": 0.8079, + "step": 1042 + }, + { + "epoch": 0.09, + "grad_norm": 13.908236376647183, + "learning_rate": 9.900920585109477e-06, + "loss": 0.9109, + "step": 1043 + }, + { + "epoch": 0.09, + "grad_norm": 15.071791586457591, + "learning_rate": 9.900638643926526e-06, + "loss": 0.978, + "step": 1044 + }, + { + "epoch": 0.09, + "grad_norm": 20.167043522966093, + "learning_rate": 9.900356306192047e-06, + "loss": 0.8825, + "step": 1045 + }, + { + "epoch": 0.09, + "grad_norm": 25.20684238178467, + "learning_rate": 9.900073571928887e-06, + "loss": 1.1038, + "step": 1046 + }, + { + "epoch": 0.09, + "grad_norm": 14.719448268802088, + "learning_rate": 9.899790441159925e-06, + "loss": 0.9223, + "step": 1047 + }, + { + "epoch": 0.09, + "grad_norm": 13.02867032281717, + "learning_rate": 9.89950691390807e-06, + "loss": 0.9245, + "step": 1048 + }, + { + "epoch": 0.09, + "grad_norm": 11.207273398746786, + "learning_rate": 9.899222990196266e-06, + "loss": 0.8771, + "step": 1049 + }, + { + "epoch": 0.09, + "grad_norm": 6.6916829066300405, + "learning_rate": 9.898938670047486e-06, + "loss": 0.8311, + "step": 1050 + }, + { + "epoch": 0.09, + "grad_norm": 11.370069245340689, + "learning_rate": 9.89865395348474e-06, + "loss": 0.9968, + "step": 1051 + }, + { + "epoch": 0.09, + "grad_norm": 14.892989794124093, + "learning_rate": 9.898368840531062e-06, + "loss": 1.0424, + "step": 1052 + }, + { + "epoch": 0.09, + "grad_norm": 14.62296877045043, + "learning_rate": 9.898083331209526e-06, + "loss": 0.8089, + "step": 1053 + }, + { + "epoch": 0.09, + "grad_norm": 2.1704385922071068, + "learning_rate": 9.897797425543236e-06, + "loss": 0.4943, + "step": 1054 + }, + { + "epoch": 0.09, + "grad_norm": 16.13648729735238, + "learning_rate": 9.897511123555325e-06, 
+ "loss": 0.9432, + "step": 1055 + }, + { + "epoch": 0.09, + "grad_norm": 11.67297419985204, + "learning_rate": 9.89722442526896e-06, + "loss": 0.762, + "step": 1056 + }, + { + "epoch": 0.09, + "grad_norm": 2.8182461371854446, + "learning_rate": 9.896937330707341e-06, + "loss": 0.4916, + "step": 1057 + }, + { + "epoch": 0.09, + "grad_norm": 7.529559180510383, + "learning_rate": 9.8966498398937e-06, + "loss": 0.7743, + "step": 1058 + }, + { + "epoch": 0.09, + "grad_norm": 10.690941501669792, + "learning_rate": 9.896361952851297e-06, + "loss": 0.8958, + "step": 1059 + }, + { + "epoch": 0.09, + "grad_norm": 9.5524389612174, + "learning_rate": 9.89607366960343e-06, + "loss": 0.9749, + "step": 1060 + }, + { + "epoch": 0.09, + "grad_norm": 10.537732068700352, + "learning_rate": 9.895784990173427e-06, + "loss": 0.9651, + "step": 1061 + }, + { + "epoch": 0.09, + "grad_norm": 10.33564892162035, + "learning_rate": 9.895495914584645e-06, + "loss": 0.7147, + "step": 1062 + }, + { + "epoch": 0.09, + "grad_norm": 11.370057173462152, + "learning_rate": 9.895206442860476e-06, + "loss": 0.7685, + "step": 1063 + }, + { + "epoch": 0.09, + "grad_norm": 8.837094662464718, + "learning_rate": 9.894916575024347e-06, + "loss": 0.9486, + "step": 1064 + }, + { + "epoch": 0.09, + "grad_norm": 16.623980018529824, + "learning_rate": 9.894626311099709e-06, + "loss": 0.9901, + "step": 1065 + }, + { + "epoch": 0.09, + "grad_norm": 8.302962155792343, + "learning_rate": 9.894335651110053e-06, + "loss": 0.7574, + "step": 1066 + }, + { + "epoch": 0.09, + "grad_norm": 10.249344751861715, + "learning_rate": 9.894044595078895e-06, + "loss": 1.0146, + "step": 1067 + }, + { + "epoch": 0.09, + "grad_norm": 10.199724656376688, + "learning_rate": 9.893753143029792e-06, + "loss": 0.8315, + "step": 1068 + }, + { + "epoch": 0.09, + "grad_norm": 11.199135094776283, + "learning_rate": 9.893461294986323e-06, + "loss": 0.9206, + "step": 1069 + }, + { + "epoch": 0.09, + "grad_norm": 19.969537748030795, + "learning_rate": 9.893169050972107e-06, + "loss": 0.9809, + "step": 1070 + }, + { + "epoch": 0.09, + "grad_norm": 10.968306303471255, + "learning_rate": 9.89287641101079e-06, + "loss": 0.9627, + "step": 1071 + }, + { + "epoch": 0.09, + "grad_norm": 10.697667100061825, + "learning_rate": 9.892583375126053e-06, + "loss": 0.8945, + "step": 1072 + }, + { + "epoch": 0.09, + "grad_norm": 9.2018567208818, + "learning_rate": 9.892289943341608e-06, + "loss": 0.9306, + "step": 1073 + }, + { + "epoch": 0.09, + "grad_norm": 11.115598069652812, + "learning_rate": 9.891996115681199e-06, + "loss": 1.0043, + "step": 1074 + }, + { + "epoch": 0.09, + "grad_norm": 2.548698709094164, + "learning_rate": 9.891701892168602e-06, + "loss": 0.5042, + "step": 1075 + }, + { + "epoch": 0.09, + "grad_norm": 15.980612917197588, + "learning_rate": 9.891407272827624e-06, + "loss": 0.9868, + "step": 1076 + }, + { + "epoch": 0.09, + "grad_norm": 3.014378943165734, + "learning_rate": 9.891112257682105e-06, + "loss": 0.4886, + "step": 1077 + }, + { + "epoch": 0.09, + "grad_norm": 12.568532848305685, + "learning_rate": 9.890816846755919e-06, + "loss": 0.7275, + "step": 1078 + }, + { + "epoch": 0.09, + "grad_norm": 3.288517506920509, + "learning_rate": 9.89052104007297e-06, + "loss": 0.52, + "step": 1079 + }, + { + "epoch": 0.09, + "grad_norm": 9.346684464259278, + "learning_rate": 9.890224837657192e-06, + "loss": 0.9401, + "step": 1080 + }, + { + "epoch": 0.09, + "grad_norm": 6.521534026863548, + "learning_rate": 9.889928239532555e-06, + "loss": 0.5617, + "step": 1081 + }, + { + 
"epoch": 0.1, + "grad_norm": 11.58356544884218, + "learning_rate": 9.889631245723061e-06, + "loss": 1.0599, + "step": 1082 + }, + { + "epoch": 0.1, + "grad_norm": 11.695823405493025, + "learning_rate": 9.889333856252737e-06, + "loss": 0.8994, + "step": 1083 + }, + { + "epoch": 0.1, + "grad_norm": 9.849263488266201, + "learning_rate": 9.889036071145653e-06, + "loss": 0.9515, + "step": 1084 + }, + { + "epoch": 0.1, + "grad_norm": 9.358974397424646, + "learning_rate": 9.8887378904259e-06, + "loss": 0.7351, + "step": 1085 + }, + { + "epoch": 0.1, + "grad_norm": 7.178361737440624, + "learning_rate": 9.88843931411761e-06, + "loss": 0.9344, + "step": 1086 + }, + { + "epoch": 0.1, + "grad_norm": 8.48806549288028, + "learning_rate": 9.888140342244944e-06, + "loss": 1.0601, + "step": 1087 + }, + { + "epoch": 0.1, + "grad_norm": 3.3067423202278188, + "learning_rate": 9.88784097483209e-06, + "loss": 0.4862, + "step": 1088 + }, + { + "epoch": 0.1, + "grad_norm": 8.684741070677308, + "learning_rate": 9.887541211903276e-06, + "loss": 0.8212, + "step": 1089 + }, + { + "epoch": 0.1, + "grad_norm": 39.22159128111506, + "learning_rate": 9.887241053482756e-06, + "loss": 0.8048, + "step": 1090 + }, + { + "epoch": 0.1, + "grad_norm": 12.237633404882095, + "learning_rate": 9.886940499594821e-06, + "loss": 0.8296, + "step": 1091 + }, + { + "epoch": 0.1, + "grad_norm": 10.707402440890391, + "learning_rate": 9.886639550263788e-06, + "loss": 0.8631, + "step": 1092 + }, + { + "epoch": 0.1, + "grad_norm": 8.921519530893724, + "learning_rate": 9.886338205514013e-06, + "loss": 0.8558, + "step": 1093 + }, + { + "epoch": 0.1, + "grad_norm": 11.152595601443558, + "learning_rate": 9.886036465369878e-06, + "loss": 0.8248, + "step": 1094 + }, + { + "epoch": 0.1, + "grad_norm": 15.439190108434248, + "learning_rate": 9.885734329855798e-06, + "loss": 0.9681, + "step": 1095 + }, + { + "epoch": 0.1, + "grad_norm": 19.158118364438483, + "learning_rate": 9.885431798996225e-06, + "loss": 0.7813, + "step": 1096 + }, + { + "epoch": 0.1, + "grad_norm": 9.76716914189321, + "learning_rate": 9.885128872815637e-06, + "loss": 0.8235, + "step": 1097 + }, + { + "epoch": 0.1, + "grad_norm": 13.096757598985931, + "learning_rate": 9.884825551338545e-06, + "loss": 0.8995, + "step": 1098 + }, + { + "epoch": 0.1, + "grad_norm": 10.58175656089541, + "learning_rate": 9.884521834589497e-06, + "loss": 0.7764, + "step": 1099 + }, + { + "epoch": 0.1, + "grad_norm": 8.656064201170674, + "learning_rate": 9.884217722593066e-06, + "loss": 0.7528, + "step": 1100 + }, + { + "epoch": 0.1, + "grad_norm": 7.826430454132687, + "learning_rate": 9.883913215373862e-06, + "loss": 0.7133, + "step": 1101 + }, + { + "epoch": 0.1, + "grad_norm": 11.327404016092474, + "learning_rate": 9.883608312956524e-06, + "loss": 0.7648, + "step": 1102 + }, + { + "epoch": 0.1, + "grad_norm": 9.257757299602197, + "learning_rate": 9.883303015365725e-06, + "loss": 0.8574, + "step": 1103 + }, + { + "epoch": 0.1, + "grad_norm": 23.29640692377761, + "learning_rate": 9.88299732262617e-06, + "loss": 0.9122, + "step": 1104 + }, + { + "epoch": 0.1, + "grad_norm": 19.779572559101492, + "learning_rate": 9.882691234762591e-06, + "loss": 0.9215, + "step": 1105 + }, + { + "epoch": 0.1, + "grad_norm": 9.193521681961451, + "learning_rate": 9.882384751799762e-06, + "loss": 0.889, + "step": 1106 + }, + { + "epoch": 0.1, + "grad_norm": 18.358814145755495, + "learning_rate": 9.882077873762478e-06, + "loss": 0.8349, + "step": 1107 + }, + { + "epoch": 0.1, + "grad_norm": 13.678229892840182, + "learning_rate": 
9.881770600675577e-06, + "loss": 0.9022, + "step": 1108 + }, + { + "epoch": 0.1, + "grad_norm": 10.248419098530164, + "learning_rate": 9.881462932563916e-06, + "loss": 0.9379, + "step": 1109 + }, + { + "epoch": 0.1, + "grad_norm": 13.635424436997022, + "learning_rate": 9.881154869452397e-06, + "loss": 0.8176, + "step": 1110 + }, + { + "epoch": 0.1, + "grad_norm": 12.792937124605663, + "learning_rate": 9.880846411365945e-06, + "loss": 1.0764, + "step": 1111 + }, + { + "epoch": 0.1, + "grad_norm": 18.513327387900556, + "learning_rate": 9.880537558329518e-06, + "loss": 0.776, + "step": 1112 + }, + { + "epoch": 0.1, + "grad_norm": 11.131011409230123, + "learning_rate": 9.880228310368112e-06, + "loss": 0.8322, + "step": 1113 + }, + { + "epoch": 0.1, + "grad_norm": 9.313048618741997, + "learning_rate": 9.879918667506748e-06, + "loss": 0.7238, + "step": 1114 + }, + { + "epoch": 0.1, + "grad_norm": 12.170250972692699, + "learning_rate": 9.879608629770483e-06, + "loss": 0.875, + "step": 1115 + }, + { + "epoch": 0.1, + "grad_norm": 11.571083909654726, + "learning_rate": 9.879298197184406e-06, + "loss": 0.8937, + "step": 1116 + }, + { + "epoch": 0.1, + "grad_norm": 38.081136888099486, + "learning_rate": 9.878987369773633e-06, + "loss": 1.0033, + "step": 1117 + }, + { + "epoch": 0.1, + "grad_norm": 11.641901638671417, + "learning_rate": 9.87867614756332e-06, + "loss": 0.8184, + "step": 1118 + }, + { + "epoch": 0.1, + "grad_norm": 11.336392659628375, + "learning_rate": 9.878364530578646e-06, + "loss": 0.7357, + "step": 1119 + }, + { + "epoch": 0.1, + "grad_norm": 7.399040408129952, + "learning_rate": 9.87805251884483e-06, + "loss": 0.7241, + "step": 1120 + }, + { + "epoch": 0.1, + "grad_norm": 44.29416086404231, + "learning_rate": 9.877740112387118e-06, + "loss": 0.8631, + "step": 1121 + }, + { + "epoch": 0.1, + "grad_norm": 11.252754222358938, + "learning_rate": 9.877427311230791e-06, + "loss": 0.7626, + "step": 1122 + }, + { + "epoch": 0.1, + "grad_norm": 11.17575023640585, + "learning_rate": 9.877114115401159e-06, + "loss": 0.7438, + "step": 1123 + }, + { + "epoch": 0.1, + "grad_norm": 10.413825811866689, + "learning_rate": 9.876800524923565e-06, + "loss": 0.8975, + "step": 1124 + }, + { + "epoch": 0.1, + "grad_norm": 9.337436783894153, + "learning_rate": 9.876486539823384e-06, + "loss": 0.8483, + "step": 1125 + }, + { + "epoch": 0.1, + "grad_norm": 13.278346993113676, + "learning_rate": 9.876172160126024e-06, + "loss": 0.8313, + "step": 1126 + }, + { + "epoch": 0.1, + "grad_norm": 9.894661024262327, + "learning_rate": 9.875857385856923e-06, + "loss": 0.8923, + "step": 1127 + }, + { + "epoch": 0.1, + "grad_norm": 11.454248352642598, + "learning_rate": 9.875542217041556e-06, + "loss": 0.7053, + "step": 1128 + }, + { + "epoch": 0.1, + "grad_norm": 14.184289296266838, + "learning_rate": 9.875226653705422e-06, + "loss": 0.8525, + "step": 1129 + }, + { + "epoch": 0.1, + "grad_norm": 29.900036151055815, + "learning_rate": 9.874910695874053e-06, + "loss": 0.9567, + "step": 1130 + }, + { + "epoch": 0.1, + "grad_norm": 13.797530363238403, + "learning_rate": 9.874594343573023e-06, + "loss": 0.8871, + "step": 1131 + }, + { + "epoch": 0.1, + "grad_norm": 14.053842057986724, + "learning_rate": 9.874277596827926e-06, + "loss": 0.8499, + "step": 1132 + }, + { + "epoch": 0.1, + "grad_norm": 8.093793678216844, + "learning_rate": 9.873960455664396e-06, + "loss": 0.9226, + "step": 1133 + }, + { + "epoch": 0.1, + "grad_norm": 9.238088843807082, + "learning_rate": 9.87364292010809e-06, + "loss": 0.6781, + "step": 1134 + 
}, + { + "epoch": 0.1, + "grad_norm": 13.409979238437073, + "learning_rate": 9.873324990184707e-06, + "loss": 0.8435, + "step": 1135 + }, + { + "epoch": 0.1, + "grad_norm": 28.796795710313134, + "learning_rate": 9.873006665919973e-06, + "loss": 0.9127, + "step": 1136 + }, + { + "epoch": 0.1, + "grad_norm": 8.954165591906186, + "learning_rate": 9.872687947339645e-06, + "loss": 0.7368, + "step": 1137 + }, + { + "epoch": 0.1, + "grad_norm": 9.338668191703325, + "learning_rate": 9.872368834469514e-06, + "loss": 0.9343, + "step": 1138 + }, + { + "epoch": 0.1, + "grad_norm": 9.741045340014896, + "learning_rate": 9.8720493273354e-06, + "loss": 0.743, + "step": 1139 + }, + { + "epoch": 0.1, + "grad_norm": 11.910984438805464, + "learning_rate": 9.87172942596316e-06, + "loss": 0.994, + "step": 1140 + }, + { + "epoch": 0.1, + "grad_norm": 14.656119828484721, + "learning_rate": 9.87140913037868e-06, + "loss": 0.8594, + "step": 1141 + }, + { + "epoch": 0.1, + "grad_norm": 6.896080099714091, + "learning_rate": 9.871088440607874e-06, + "loss": 1.1158, + "step": 1142 + }, + { + "epoch": 0.1, + "grad_norm": 8.173029471861533, + "learning_rate": 9.870767356676696e-06, + "loss": 0.7658, + "step": 1143 + }, + { + "epoch": 0.1, + "grad_norm": 9.603950189705355, + "learning_rate": 9.870445878611124e-06, + "loss": 0.8032, + "step": 1144 + }, + { + "epoch": 0.1, + "grad_norm": 7.904614127876022, + "learning_rate": 9.870124006437172e-06, + "loss": 0.8472, + "step": 1145 + }, + { + "epoch": 0.1, + "grad_norm": 25.74999315967226, + "learning_rate": 9.869801740180889e-06, + "loss": 0.8656, + "step": 1146 + }, + { + "epoch": 0.1, + "grad_norm": 8.401854517527157, + "learning_rate": 9.869479079868348e-06, + "loss": 0.8472, + "step": 1147 + }, + { + "epoch": 0.1, + "grad_norm": 10.091229112315434, + "learning_rate": 9.86915602552566e-06, + "loss": 0.9142, + "step": 1148 + }, + { + "epoch": 0.1, + "grad_norm": 9.258829465555554, + "learning_rate": 9.868832577178966e-06, + "loss": 0.7578, + "step": 1149 + }, + { + "epoch": 0.1, + "grad_norm": 12.92690721787567, + "learning_rate": 9.868508734854439e-06, + "loss": 0.8737, + "step": 1150 + }, + { + "epoch": 0.1, + "grad_norm": 14.620819120905262, + "learning_rate": 9.868184498578283e-06, + "loss": 0.8638, + "step": 1151 + }, + { + "epoch": 0.1, + "grad_norm": 17.00502104073314, + "learning_rate": 9.867859868376736e-06, + "loss": 0.9227, + "step": 1152 + }, + { + "epoch": 0.1, + "grad_norm": 10.587905617903003, + "learning_rate": 9.867534844276066e-06, + "loss": 0.7481, + "step": 1153 + }, + { + "epoch": 0.1, + "grad_norm": 10.736442775518734, + "learning_rate": 9.867209426302572e-06, + "loss": 0.8873, + "step": 1154 + }, + { + "epoch": 0.1, + "grad_norm": 12.945913079219086, + "learning_rate": 9.86688361448259e-06, + "loss": 1.0019, + "step": 1155 + }, + { + "epoch": 0.1, + "grad_norm": 10.758205693376349, + "learning_rate": 9.866557408842479e-06, + "loss": 0.8185, + "step": 1156 + }, + { + "epoch": 0.1, + "grad_norm": 17.36149529407485, + "learning_rate": 9.866230809408637e-06, + "loss": 0.7948, + "step": 1157 + }, + { + "epoch": 0.1, + "grad_norm": 17.06253922273076, + "learning_rate": 9.865903816207494e-06, + "loss": 0.8882, + "step": 1158 + }, + { + "epoch": 0.1, + "grad_norm": 8.95640254656765, + "learning_rate": 9.865576429265508e-06, + "loss": 0.9119, + "step": 1159 + }, + { + "epoch": 0.1, + "grad_norm": 10.476174498807064, + "learning_rate": 9.86524864860917e-06, + "loss": 0.9869, + "step": 1160 + }, + { + "epoch": 0.1, + "grad_norm": 11.762509786938715, + 
"learning_rate": 9.864920474265005e-06, + "loss": 0.7669, + "step": 1161 + }, + { + "epoch": 0.1, + "grad_norm": 8.505708909506053, + "learning_rate": 9.864591906259569e-06, + "loss": 0.9611, + "step": 1162 + }, + { + "epoch": 0.1, + "grad_norm": 14.886595198648477, + "learning_rate": 9.864262944619444e-06, + "loss": 0.7421, + "step": 1163 + }, + { + "epoch": 0.1, + "grad_norm": 9.963159522130537, + "learning_rate": 9.863933589371257e-06, + "loss": 0.8562, + "step": 1164 + }, + { + "epoch": 0.1, + "grad_norm": 11.051472187267384, + "learning_rate": 9.86360384054165e-06, + "loss": 0.9148, + "step": 1165 + }, + { + "epoch": 0.1, + "grad_norm": 8.419685234297956, + "learning_rate": 9.863273698157315e-06, + "loss": 0.8825, + "step": 1166 + }, + { + "epoch": 0.1, + "grad_norm": 11.589167792225043, + "learning_rate": 9.86294316224496e-06, + "loss": 0.8134, + "step": 1167 + }, + { + "epoch": 0.1, + "grad_norm": 10.657879870932195, + "learning_rate": 9.862612232831335e-06, + "loss": 0.9689, + "step": 1168 + }, + { + "epoch": 0.1, + "grad_norm": 19.673581545940525, + "learning_rate": 9.862280909943213e-06, + "loss": 0.9426, + "step": 1169 + }, + { + "epoch": 0.1, + "grad_norm": 9.247030745290138, + "learning_rate": 9.86194919360741e-06, + "loss": 0.9984, + "step": 1170 + }, + { + "epoch": 0.1, + "grad_norm": 2.672449360774323, + "learning_rate": 9.861617083850767e-06, + "loss": 0.4735, + "step": 1171 + }, + { + "epoch": 0.1, + "grad_norm": 11.366329062840444, + "learning_rate": 9.861284580700155e-06, + "loss": 0.7528, + "step": 1172 + }, + { + "epoch": 0.1, + "grad_norm": 15.520620090964485, + "learning_rate": 9.860951684182481e-06, + "loss": 1.0425, + "step": 1173 + }, + { + "epoch": 0.1, + "grad_norm": 7.881211097510561, + "learning_rate": 9.860618394324682e-06, + "loss": 0.8675, + "step": 1174 + }, + { + "epoch": 0.1, + "grad_norm": 9.886414368725909, + "learning_rate": 9.860284711153728e-06, + "loss": 0.851, + "step": 1175 + }, + { + "epoch": 0.1, + "grad_norm": 25.398096458968478, + "learning_rate": 9.859950634696621e-06, + "loss": 0.9811, + "step": 1176 + }, + { + "epoch": 0.1, + "grad_norm": 7.821147526544886, + "learning_rate": 9.859616164980392e-06, + "loss": 0.8174, + "step": 1177 + }, + { + "epoch": 0.1, + "grad_norm": 7.883125078279172, + "learning_rate": 9.859281302032107e-06, + "loss": 1.0447, + "step": 1178 + }, + { + "epoch": 0.1, + "grad_norm": 13.473443483745807, + "learning_rate": 9.858946045878861e-06, + "loss": 0.9964, + "step": 1179 + }, + { + "epoch": 0.1, + "grad_norm": 8.615216561393892, + "learning_rate": 9.858610396547785e-06, + "loss": 0.7387, + "step": 1180 + }, + { + "epoch": 0.1, + "grad_norm": 10.151931547884805, + "learning_rate": 9.858274354066036e-06, + "loss": 0.8863, + "step": 1181 + }, + { + "epoch": 0.1, + "grad_norm": 9.710727404594964, + "learning_rate": 9.857937918460809e-06, + "loss": 0.9659, + "step": 1182 + }, + { + "epoch": 0.1, + "grad_norm": 8.040463862742849, + "learning_rate": 9.857601089759324e-06, + "loss": 0.8958, + "step": 1183 + }, + { + "epoch": 0.1, + "grad_norm": 18.671377516331948, + "learning_rate": 9.85726386798884e-06, + "loss": 0.9269, + "step": 1184 + }, + { + "epoch": 0.1, + "grad_norm": 3.1870778004692495, + "learning_rate": 9.856926253176645e-06, + "loss": 0.5396, + "step": 1185 + }, + { + "epoch": 0.1, + "grad_norm": 11.137861664100859, + "learning_rate": 9.856588245350056e-06, + "loss": 1.0272, + "step": 1186 + }, + { + "epoch": 0.1, + "grad_norm": 14.759144549656627, + "learning_rate": 9.856249844536424e-06, + "loss": 0.8542, + 
"step": 1187 + }, + { + "epoch": 0.1, + "grad_norm": 8.939261220349358, + "learning_rate": 9.855911050763133e-06, + "loss": 0.9172, + "step": 1188 + }, + { + "epoch": 0.1, + "grad_norm": 10.986987651501208, + "learning_rate": 9.855571864057598e-06, + "loss": 1.0313, + "step": 1189 + }, + { + "epoch": 0.1, + "grad_norm": 7.509261258493609, + "learning_rate": 9.855232284447263e-06, + "loss": 0.8045, + "step": 1190 + }, + { + "epoch": 0.1, + "grad_norm": 15.606724481110941, + "learning_rate": 9.854892311959608e-06, + "loss": 0.8955, + "step": 1191 + }, + { + "epoch": 0.1, + "grad_norm": 9.20485289576695, + "learning_rate": 9.854551946622144e-06, + "loss": 0.9401, + "step": 1192 + }, + { + "epoch": 0.1, + "grad_norm": 10.59992358068928, + "learning_rate": 9.85421118846241e-06, + "loss": 0.7617, + "step": 1193 + }, + { + "epoch": 0.1, + "grad_norm": 10.815439488794508, + "learning_rate": 9.853870037507983e-06, + "loss": 0.9986, + "step": 1194 + }, + { + "epoch": 0.1, + "grad_norm": 11.400106864637348, + "learning_rate": 9.853528493786466e-06, + "loss": 0.9404, + "step": 1195 + }, + { + "epoch": 0.11, + "grad_norm": 7.883960456884642, + "learning_rate": 9.853186557325496e-06, + "loss": 0.9387, + "step": 1196 + }, + { + "epoch": 0.11, + "grad_norm": 9.628447998785692, + "learning_rate": 9.852844228152743e-06, + "loss": 0.924, + "step": 1197 + }, + { + "epoch": 0.11, + "grad_norm": 24.43541299030219, + "learning_rate": 9.852501506295908e-06, + "loss": 0.8167, + "step": 1198 + }, + { + "epoch": 0.11, + "grad_norm": 10.553121952546078, + "learning_rate": 9.852158391782722e-06, + "loss": 0.8372, + "step": 1199 + }, + { + "epoch": 0.11, + "grad_norm": 9.369440588558746, + "learning_rate": 9.85181488464095e-06, + "loss": 0.8665, + "step": 1200 + }, + { + "epoch": 0.11, + "grad_norm": 12.170184422829116, + "learning_rate": 9.851470984898388e-06, + "loss": 0.9335, + "step": 1201 + }, + { + "epoch": 0.11, + "grad_norm": 11.37512747560385, + "learning_rate": 9.851126692582865e-06, + "loss": 0.9273, + "step": 1202 + }, + { + "epoch": 0.11, + "grad_norm": 2.609114757508655, + "learning_rate": 9.85078200772224e-06, + "loss": 0.481, + "step": 1203 + }, + { + "epoch": 0.11, + "grad_norm": 10.99440052321079, + "learning_rate": 9.8504369303444e-06, + "loss": 0.797, + "step": 1204 + }, + { + "epoch": 0.11, + "grad_norm": 11.59042837569248, + "learning_rate": 9.850091460477276e-06, + "loss": 0.8861, + "step": 1205 + }, + { + "epoch": 0.11, + "grad_norm": 11.860035763918846, + "learning_rate": 9.849745598148817e-06, + "loss": 0.9293, + "step": 1206 + }, + { + "epoch": 0.11, + "grad_norm": 2.920605486372513, + "learning_rate": 9.849399343387011e-06, + "loss": 0.4651, + "step": 1207 + }, + { + "epoch": 0.11, + "grad_norm": 16.035084108278088, + "learning_rate": 9.849052696219879e-06, + "loss": 0.8979, + "step": 1208 + }, + { + "epoch": 0.11, + "grad_norm": 10.290808469729543, + "learning_rate": 9.848705656675465e-06, + "loss": 0.7835, + "step": 1209 + }, + { + "epoch": 0.11, + "grad_norm": 5.163740902861153, + "learning_rate": 9.848358224781857e-06, + "loss": 0.5437, + "step": 1210 + }, + { + "epoch": 0.11, + "grad_norm": 9.189371368577962, + "learning_rate": 9.848010400567167e-06, + "loss": 0.8165, + "step": 1211 + }, + { + "epoch": 0.11, + "grad_norm": 10.687282142942351, + "learning_rate": 9.84766218405954e-06, + "loss": 0.8015, + "step": 1212 + }, + { + "epoch": 0.11, + "grad_norm": 10.199517535484649, + "learning_rate": 9.84731357528715e-06, + "loss": 0.9773, + "step": 1213 + }, + { + "epoch": 0.11, + 
"grad_norm": 10.905429763295665, + "learning_rate": 9.84696457427821e-06, + "loss": 1.0129, + "step": 1214 + }, + { + "epoch": 0.11, + "grad_norm": 13.319189624275019, + "learning_rate": 9.846615181060959e-06, + "loss": 0.9443, + "step": 1215 + }, + { + "epoch": 0.11, + "grad_norm": 11.182161715294868, + "learning_rate": 9.84626539566367e-06, + "loss": 0.7339, + "step": 1216 + }, + { + "epoch": 0.11, + "grad_norm": 20.835338897266183, + "learning_rate": 9.845915218114646e-06, + "loss": 0.9609, + "step": 1217 + }, + { + "epoch": 0.11, + "grad_norm": 11.592109208518739, + "learning_rate": 9.845564648442223e-06, + "loss": 0.9448, + "step": 1218 + }, + { + "epoch": 0.11, + "grad_norm": 11.350448773864722, + "learning_rate": 9.84521368667477e-06, + "loss": 0.9225, + "step": 1219 + }, + { + "epoch": 0.11, + "grad_norm": 9.91987966240964, + "learning_rate": 9.844862332840684e-06, + "loss": 0.8621, + "step": 1220 + }, + { + "epoch": 0.11, + "grad_norm": 11.799569809265869, + "learning_rate": 9.844510586968397e-06, + "loss": 0.8349, + "step": 1221 + }, + { + "epoch": 0.11, + "grad_norm": 2.783765164990666, + "learning_rate": 9.844158449086372e-06, + "loss": 0.4903, + "step": 1222 + }, + { + "epoch": 0.11, + "grad_norm": 9.01493033773193, + "learning_rate": 9.843805919223105e-06, + "loss": 0.9838, + "step": 1223 + }, + { + "epoch": 0.11, + "grad_norm": 10.329748194470637, + "learning_rate": 9.843452997407117e-06, + "loss": 0.7091, + "step": 1224 + }, + { + "epoch": 0.11, + "grad_norm": 9.330882775702007, + "learning_rate": 9.84309968366697e-06, + "loss": 0.7246, + "step": 1225 + }, + { + "epoch": 0.11, + "grad_norm": 8.630455434570784, + "learning_rate": 9.842745978031254e-06, + "loss": 0.7918, + "step": 1226 + }, + { + "epoch": 0.11, + "grad_norm": 8.835686682088252, + "learning_rate": 9.842391880528589e-06, + "loss": 0.8202, + "step": 1227 + }, + { + "epoch": 0.11, + "grad_norm": 3.322264598093885, + "learning_rate": 9.842037391187626e-06, + "loss": 0.5528, + "step": 1228 + }, + { + "epoch": 0.11, + "grad_norm": 11.987691161716512, + "learning_rate": 9.841682510037054e-06, + "loss": 0.8286, + "step": 1229 + }, + { + "epoch": 0.11, + "grad_norm": 16.17400703451786, + "learning_rate": 9.841327237105585e-06, + "loss": 0.9718, + "step": 1230 + }, + { + "epoch": 0.11, + "grad_norm": 10.367597059062883, + "learning_rate": 9.84097157242197e-06, + "loss": 0.8266, + "step": 1231 + }, + { + "epoch": 0.11, + "grad_norm": 16.38007832445046, + "learning_rate": 9.840615516014988e-06, + "loss": 0.8978, + "step": 1232 + }, + { + "epoch": 0.11, + "grad_norm": 9.666646686955296, + "learning_rate": 9.84025906791345e-06, + "loss": 0.8602, + "step": 1233 + }, + { + "epoch": 0.11, + "grad_norm": 9.551511063706942, + "learning_rate": 9.839902228146201e-06, + "loss": 0.9615, + "step": 1234 + }, + { + "epoch": 0.11, + "grad_norm": 16.148748009506292, + "learning_rate": 9.839544996742112e-06, + "loss": 0.7928, + "step": 1235 + }, + { + "epoch": 0.11, + "grad_norm": 7.842373110773571, + "learning_rate": 9.839187373730095e-06, + "loss": 0.7279, + "step": 1236 + }, + { + "epoch": 0.11, + "grad_norm": 8.958602040218706, + "learning_rate": 9.838829359139084e-06, + "loss": 0.8499, + "step": 1237 + }, + { + "epoch": 0.11, + "grad_norm": 13.288722291470382, + "learning_rate": 9.83847095299805e-06, + "loss": 0.8354, + "step": 1238 + }, + { + "epoch": 0.11, + "grad_norm": 15.728688072571185, + "learning_rate": 9.838112155335994e-06, + "loss": 0.8356, + "step": 1239 + }, + { + "epoch": 0.11, + "grad_norm": 47.32360419058596, + 
"learning_rate": 9.837752966181951e-06, + "loss": 0.8815, + "step": 1240 + }, + { + "epoch": 0.11, + "grad_norm": 7.6652141429043645, + "learning_rate": 9.837393385564985e-06, + "loss": 1.0356, + "step": 1241 + }, + { + "epoch": 0.11, + "grad_norm": 14.460504434961026, + "learning_rate": 9.837033413514191e-06, + "loss": 0.7301, + "step": 1242 + }, + { + "epoch": 0.11, + "grad_norm": 8.139159564474808, + "learning_rate": 9.836673050058703e-06, + "loss": 0.7001, + "step": 1243 + }, + { + "epoch": 0.11, + "grad_norm": 22.273344229118834, + "learning_rate": 9.836312295227674e-06, + "loss": 1.0149, + "step": 1244 + }, + { + "epoch": 0.11, + "grad_norm": 19.396290708589696, + "learning_rate": 9.835951149050302e-06, + "loss": 0.9269, + "step": 1245 + }, + { + "epoch": 0.11, + "grad_norm": 11.065928620001827, + "learning_rate": 9.835589611555805e-06, + "loss": 0.8793, + "step": 1246 + }, + { + "epoch": 0.11, + "grad_norm": 17.435832346161334, + "learning_rate": 9.83522768277344e-06, + "loss": 0.8405, + "step": 1247 + }, + { + "epoch": 0.11, + "grad_norm": 20.792458966128205, + "learning_rate": 9.834865362732495e-06, + "loss": 0.8945, + "step": 1248 + }, + { + "epoch": 0.11, + "grad_norm": 9.479941648717793, + "learning_rate": 9.834502651462287e-06, + "loss": 0.9712, + "step": 1249 + }, + { + "epoch": 0.11, + "grad_norm": 12.685420591052269, + "learning_rate": 9.834139548992165e-06, + "loss": 0.8922, + "step": 1250 + }, + { + "epoch": 0.11, + "grad_norm": 10.426180480549121, + "learning_rate": 9.833776055351514e-06, + "loss": 0.7742, + "step": 1251 + }, + { + "epoch": 0.11, + "grad_norm": 11.102259058014981, + "learning_rate": 9.833412170569743e-06, + "loss": 0.9458, + "step": 1252 + }, + { + "epoch": 0.11, + "grad_norm": 30.04309374316441, + "learning_rate": 9.8330478946763e-06, + "loss": 0.9638, + "step": 1253 + }, + { + "epoch": 0.11, + "grad_norm": 8.118901481222665, + "learning_rate": 9.83268322770066e-06, + "loss": 0.8377, + "step": 1254 + }, + { + "epoch": 0.11, + "grad_norm": 9.700702167129851, + "learning_rate": 9.832318169672334e-06, + "loss": 1.0072, + "step": 1255 + }, + { + "epoch": 0.11, + "grad_norm": 16.56780027439878, + "learning_rate": 9.831952720620858e-06, + "loss": 0.9146, + "step": 1256 + }, + { + "epoch": 0.11, + "grad_norm": 3.0393219138605785, + "learning_rate": 9.831586880575807e-06, + "loss": 0.5534, + "step": 1257 + }, + { + "epoch": 0.11, + "grad_norm": 10.731830724532802, + "learning_rate": 9.831220649566782e-06, + "loss": 0.8818, + "step": 1258 + }, + { + "epoch": 0.11, + "grad_norm": 12.625652560267273, + "learning_rate": 9.830854027623418e-06, + "loss": 0.8704, + "step": 1259 + }, + { + "epoch": 0.11, + "grad_norm": 11.348296734336728, + "learning_rate": 9.830487014775381e-06, + "loss": 0.8156, + "step": 1260 + }, + { + "epoch": 0.11, + "grad_norm": 33.75764104446829, + "learning_rate": 9.830119611052371e-06, + "loss": 0.904, + "step": 1261 + }, + { + "epoch": 0.11, + "grad_norm": 8.799906814139554, + "learning_rate": 9.829751816484116e-06, + "loss": 0.9116, + "step": 1262 + }, + { + "epoch": 0.11, + "grad_norm": 14.533455156415203, + "learning_rate": 9.829383631100378e-06, + "loss": 0.8814, + "step": 1263 + }, + { + "epoch": 0.11, + "grad_norm": 11.265037168750817, + "learning_rate": 9.82901505493095e-06, + "loss": 0.9036, + "step": 1264 + }, + { + "epoch": 0.11, + "grad_norm": 9.14382446474652, + "learning_rate": 9.828646088005656e-06, + "loss": 0.819, + "step": 1265 + }, + { + "epoch": 0.11, + "grad_norm": 12.267523640362345, + "learning_rate": 
9.828276730354353e-06, + "loss": 0.8341, + "step": 1266 + }, + { + "epoch": 0.11, + "grad_norm": 11.169343298134422, + "learning_rate": 9.82790698200693e-06, + "loss": 0.8325, + "step": 1267 + }, + { + "epoch": 0.11, + "grad_norm": 16.3561534138126, + "learning_rate": 9.827536842993302e-06, + "loss": 1.0025, + "step": 1268 + }, + { + "epoch": 0.11, + "grad_norm": 14.313721124335567, + "learning_rate": 9.827166313343423e-06, + "loss": 0.8523, + "step": 1269 + }, + { + "epoch": 0.11, + "grad_norm": 13.544619281293027, + "learning_rate": 9.826795393087278e-06, + "loss": 0.8673, + "step": 1270 + }, + { + "epoch": 0.11, + "grad_norm": 10.654756651890676, + "learning_rate": 9.826424082254877e-06, + "loss": 0.9367, + "step": 1271 + }, + { + "epoch": 0.11, + "grad_norm": 10.544451326792936, + "learning_rate": 9.826052380876267e-06, + "loss": 0.7833, + "step": 1272 + }, + { + "epoch": 0.11, + "grad_norm": 4.0073040204439625, + "learning_rate": 9.825680288981528e-06, + "loss": 0.565, + "step": 1273 + }, + { + "epoch": 0.11, + "grad_norm": 13.601700760155872, + "learning_rate": 9.825307806600766e-06, + "loss": 0.9034, + "step": 1274 + }, + { + "epoch": 0.11, + "grad_norm": 16.11936318117849, + "learning_rate": 9.824934933764123e-06, + "loss": 0.835, + "step": 1275 + }, + { + "epoch": 0.11, + "grad_norm": 11.642878577424273, + "learning_rate": 9.824561670501771e-06, + "loss": 0.8455, + "step": 1276 + }, + { + "epoch": 0.11, + "grad_norm": 10.771350464884053, + "learning_rate": 9.824188016843915e-06, + "loss": 0.8928, + "step": 1277 + }, + { + "epoch": 0.11, + "grad_norm": 21.267950844670803, + "learning_rate": 9.823813972820787e-06, + "loss": 0.823, + "step": 1278 + }, + { + "epoch": 0.11, + "grad_norm": 7.982361747824831, + "learning_rate": 9.823439538462658e-06, + "loss": 0.862, + "step": 1279 + }, + { + "epoch": 0.11, + "grad_norm": 11.894667460029709, + "learning_rate": 9.823064713799824e-06, + "loss": 0.7511, + "step": 1280 + }, + { + "epoch": 0.11, + "grad_norm": 10.601843626947327, + "learning_rate": 9.822689498862617e-06, + "loss": 0.7352, + "step": 1281 + }, + { + "epoch": 0.11, + "grad_norm": 10.014196045754113, + "learning_rate": 9.822313893681396e-06, + "loss": 0.9771, + "step": 1282 + }, + { + "epoch": 0.11, + "grad_norm": 15.78682028163692, + "learning_rate": 9.821937898286558e-06, + "loss": 0.9817, + "step": 1283 + }, + { + "epoch": 0.11, + "grad_norm": 13.248366540791569, + "learning_rate": 9.821561512708526e-06, + "loss": 0.8104, + "step": 1284 + }, + { + "epoch": 0.11, + "grad_norm": 9.847190598324891, + "learning_rate": 9.821184736977756e-06, + "loss": 0.7059, + "step": 1285 + }, + { + "epoch": 0.11, + "grad_norm": 12.308191901591131, + "learning_rate": 9.820807571124738e-06, + "loss": 0.9204, + "step": 1286 + }, + { + "epoch": 0.11, + "grad_norm": 33.52971031148486, + "learning_rate": 9.82043001517999e-06, + "loss": 0.7552, + "step": 1287 + }, + { + "epoch": 0.11, + "grad_norm": 10.296874935365818, + "learning_rate": 9.820052069174062e-06, + "loss": 0.8574, + "step": 1288 + }, + { + "epoch": 0.11, + "grad_norm": 18.595531354726173, + "learning_rate": 9.81967373313754e-06, + "loss": 0.7165, + "step": 1289 + }, + { + "epoch": 0.11, + "grad_norm": 16.28282682550622, + "learning_rate": 9.819295007101035e-06, + "loss": 0.8309, + "step": 1290 + }, + { + "epoch": 0.11, + "grad_norm": 13.704760249971955, + "learning_rate": 9.818915891095196e-06, + "loss": 0.8183, + "step": 1291 + }, + { + "epoch": 0.11, + "grad_norm": 16.489972524373588, + "learning_rate": 9.818536385150698e-06, + "loss": 
0.8612, + "step": 1292 + }, + { + "epoch": 0.11, + "grad_norm": 12.123514786043467, + "learning_rate": 9.81815648929825e-06, + "loss": 0.7817, + "step": 1293 + }, + { + "epoch": 0.11, + "grad_norm": 2.5147893075620873, + "learning_rate": 9.817776203568596e-06, + "loss": 0.4367, + "step": 1294 + }, + { + "epoch": 0.11, + "grad_norm": 17.358601131475773, + "learning_rate": 9.817395527992504e-06, + "loss": 0.9427, + "step": 1295 + }, + { + "epoch": 0.11, + "grad_norm": 8.240158119495861, + "learning_rate": 9.81701446260078e-06, + "loss": 0.7662, + "step": 1296 + }, + { + "epoch": 0.11, + "grad_norm": 32.80861559731321, + "learning_rate": 9.816633007424258e-06, + "loss": 0.9084, + "step": 1297 + }, + { + "epoch": 0.11, + "grad_norm": 13.18799943990535, + "learning_rate": 9.816251162493803e-06, + "loss": 0.7998, + "step": 1298 + }, + { + "epoch": 0.11, + "grad_norm": 21.13835257577565, + "learning_rate": 9.81586892784032e-06, + "loss": 0.8239, + "step": 1299 + }, + { + "epoch": 0.11, + "grad_norm": 14.856377816478423, + "learning_rate": 9.81548630349473e-06, + "loss": 0.8365, + "step": 1300 + }, + { + "epoch": 0.11, + "grad_norm": 9.73890093690426, + "learning_rate": 9.815103289488001e-06, + "loss": 0.8132, + "step": 1301 + }, + { + "epoch": 0.11, + "grad_norm": 14.053421792561055, + "learning_rate": 9.814719885851122e-06, + "loss": 0.9751, + "step": 1302 + }, + { + "epoch": 0.11, + "grad_norm": 12.594753326742156, + "learning_rate": 9.814336092615118e-06, + "loss": 0.769, + "step": 1303 + }, + { + "epoch": 0.11, + "grad_norm": 11.732096963382176, + "learning_rate": 9.813951909811049e-06, + "loss": 0.8282, + "step": 1304 + }, + { + "epoch": 0.11, + "grad_norm": 44.57722827687913, + "learning_rate": 9.813567337469996e-06, + "loss": 0.9473, + "step": 1305 + }, + { + "epoch": 0.11, + "grad_norm": 13.097577815396518, + "learning_rate": 9.81318237562308e-06, + "loss": 0.6892, + "step": 1306 + }, + { + "epoch": 0.11, + "grad_norm": 17.6013033810495, + "learning_rate": 9.812797024301455e-06, + "loss": 0.8429, + "step": 1307 + }, + { + "epoch": 0.11, + "grad_norm": 14.886216703866616, + "learning_rate": 9.812411283536301e-06, + "loss": 0.8581, + "step": 1308 + }, + { + "epoch": 0.11, + "grad_norm": 10.687107650043918, + "learning_rate": 9.812025153358829e-06, + "loss": 0.8012, + "step": 1309 + }, + { + "epoch": 0.12, + "grad_norm": 12.511345757432608, + "learning_rate": 9.811638633800287e-06, + "loss": 0.8908, + "step": 1310 + }, + { + "epoch": 0.12, + "grad_norm": 9.810478879507814, + "learning_rate": 9.81125172489195e-06, + "loss": 1.0339, + "step": 1311 + }, + { + "epoch": 0.12, + "grad_norm": 9.540848637780982, + "learning_rate": 9.810864426665125e-06, + "loss": 0.7598, + "step": 1312 + }, + { + "epoch": 0.12, + "grad_norm": 9.290243572778598, + "learning_rate": 9.810476739151156e-06, + "loss": 0.909, + "step": 1313 + }, + { + "epoch": 0.12, + "grad_norm": 11.938018342767199, + "learning_rate": 9.81008866238141e-06, + "loss": 0.877, + "step": 1314 + }, + { + "epoch": 0.12, + "grad_norm": 14.387142864055674, + "learning_rate": 9.809700196387291e-06, + "loss": 0.7372, + "step": 1315 + }, + { + "epoch": 0.12, + "grad_norm": 14.351300700315587, + "learning_rate": 9.809311341200232e-06, + "loss": 0.8672, + "step": 1316 + }, + { + "epoch": 0.12, + "grad_norm": 16.484470003720034, + "learning_rate": 9.8089220968517e-06, + "loss": 0.9863, + "step": 1317 + }, + { + "epoch": 0.12, + "grad_norm": 10.902311572700329, + "learning_rate": 9.808532463373187e-06, + "loss": 0.7594, + "step": 1318 + }, + { + "epoch": 
0.12, + "grad_norm": 17.717569415565606, + "learning_rate": 9.80814244079623e-06, + "loss": 0.9442, + "step": 1319 + }, + { + "epoch": 0.12, + "grad_norm": 12.117190705158261, + "learning_rate": 9.807752029152384e-06, + "loss": 0.9595, + "step": 1320 + }, + { + "epoch": 0.12, + "grad_norm": 9.81214071389616, + "learning_rate": 9.807361228473241e-06, + "loss": 0.8214, + "step": 1321 + }, + { + "epoch": 0.12, + "grad_norm": 14.231793189793581, + "learning_rate": 9.806970038790425e-06, + "loss": 0.8762, + "step": 1322 + }, + { + "epoch": 0.12, + "grad_norm": 3.8452565197308703, + "learning_rate": 9.806578460135587e-06, + "loss": 0.5258, + "step": 1323 + }, + { + "epoch": 0.12, + "grad_norm": 13.036430374901302, + "learning_rate": 9.806186492540417e-06, + "loss": 0.9678, + "step": 1324 + }, + { + "epoch": 0.12, + "grad_norm": 2.858549955275364, + "learning_rate": 9.805794136036631e-06, + "loss": 0.4136, + "step": 1325 + }, + { + "epoch": 0.12, + "grad_norm": 12.442896295209913, + "learning_rate": 9.805401390655975e-06, + "loss": 1.0176, + "step": 1326 + }, + { + "epoch": 0.12, + "grad_norm": 20.139539501631464, + "learning_rate": 9.805008256430235e-06, + "loss": 0.8411, + "step": 1327 + }, + { + "epoch": 0.12, + "grad_norm": 15.313306229813263, + "learning_rate": 9.804614733391218e-06, + "loss": 0.9081, + "step": 1328 + }, + { + "epoch": 0.12, + "grad_norm": 13.612946460898863, + "learning_rate": 9.804220821570769e-06, + "loss": 0.7563, + "step": 1329 + }, + { + "epoch": 0.12, + "grad_norm": 23.014201516571887, + "learning_rate": 9.803826521000762e-06, + "loss": 0.8684, + "step": 1330 + }, + { + "epoch": 0.12, + "grad_norm": 17.929827898399914, + "learning_rate": 9.803431831713104e-06, + "loss": 0.866, + "step": 1331 + }, + { + "epoch": 0.12, + "grad_norm": 13.266092025496562, + "learning_rate": 9.803036753739733e-06, + "loss": 0.8853, + "step": 1332 + }, + { + "epoch": 0.12, + "grad_norm": 2.5822099501760536, + "learning_rate": 9.802641287112618e-06, + "loss": 0.4789, + "step": 1333 + }, + { + "epoch": 0.12, + "grad_norm": 12.605937986202141, + "learning_rate": 9.802245431863757e-06, + "loss": 0.8362, + "step": 1334 + }, + { + "epoch": 0.12, + "grad_norm": 11.238829334913909, + "learning_rate": 9.801849188025184e-06, + "loss": 0.9733, + "step": 1335 + }, + { + "epoch": 0.12, + "grad_norm": 12.283732778875475, + "learning_rate": 9.801452555628963e-06, + "loss": 0.8419, + "step": 1336 + }, + { + "epoch": 0.12, + "grad_norm": 9.144456866551506, + "learning_rate": 9.801055534707189e-06, + "loss": 0.8675, + "step": 1337 + }, + { + "epoch": 0.12, + "grad_norm": 20.593680828031907, + "learning_rate": 9.800658125291985e-06, + "loss": 1.1121, + "step": 1338 + }, + { + "epoch": 0.12, + "grad_norm": 15.667796267247772, + "learning_rate": 9.800260327415511e-06, + "loss": 0.9683, + "step": 1339 + }, + { + "epoch": 0.12, + "grad_norm": 10.50981376230451, + "learning_rate": 9.799862141109956e-06, + "loss": 0.9065, + "step": 1340 + }, + { + "epoch": 0.12, + "grad_norm": 16.533656569305784, + "learning_rate": 9.799463566407544e-06, + "loss": 0.7428, + "step": 1341 + }, + { + "epoch": 0.12, + "grad_norm": 16.406681575332016, + "learning_rate": 9.79906460334052e-06, + "loss": 0.7967, + "step": 1342 + }, + { + "epoch": 0.12, + "grad_norm": 10.732680547550913, + "learning_rate": 9.798665251941172e-06, + "loss": 0.9118, + "step": 1343 + }, + { + "epoch": 0.12, + "grad_norm": 9.028035921191677, + "learning_rate": 9.798265512241816e-06, + "loss": 0.7515, + "step": 1344 + }, + { + "epoch": 0.12, + "grad_norm": 
8.987006147582031, + "learning_rate": 9.797865384274795e-06, + "loss": 0.9629, + "step": 1345 + }, + { + "epoch": 0.12, + "grad_norm": 15.959802473988038, + "learning_rate": 9.797464868072489e-06, + "loss": 0.921, + "step": 1346 + }, + { + "epoch": 0.12, + "grad_norm": 18.98723070590631, + "learning_rate": 9.797063963667304e-06, + "loss": 0.8537, + "step": 1347 + }, + { + "epoch": 0.12, + "grad_norm": 12.434032207779103, + "learning_rate": 9.796662671091683e-06, + "loss": 0.8983, + "step": 1348 + }, + { + "epoch": 0.12, + "grad_norm": 10.910304827170174, + "learning_rate": 9.796260990378098e-06, + "loss": 0.9857, + "step": 1349 + }, + { + "epoch": 0.12, + "grad_norm": 7.765293508662017, + "learning_rate": 9.795858921559052e-06, + "loss": 0.7944, + "step": 1350 + }, + { + "epoch": 0.12, + "grad_norm": 2.46202333927441, + "learning_rate": 9.795456464667078e-06, + "loss": 0.4646, + "step": 1351 + }, + { + "epoch": 0.12, + "grad_norm": 8.147051851207273, + "learning_rate": 9.795053619734746e-06, + "loss": 0.8896, + "step": 1352 + }, + { + "epoch": 0.12, + "grad_norm": 21.318146241770368, + "learning_rate": 9.79465038679465e-06, + "loss": 0.9087, + "step": 1353 + }, + { + "epoch": 0.12, + "grad_norm": 11.99260238904977, + "learning_rate": 9.794246765879421e-06, + "loss": 0.803, + "step": 1354 + }, + { + "epoch": 0.12, + "grad_norm": 8.571979119724723, + "learning_rate": 9.793842757021718e-06, + "loss": 1.0708, + "step": 1355 + }, + { + "epoch": 0.12, + "grad_norm": 10.399428801200006, + "learning_rate": 9.793438360254233e-06, + "loss": 0.8829, + "step": 1356 + }, + { + "epoch": 0.12, + "grad_norm": 19.135170562270517, + "learning_rate": 9.793033575609689e-06, + "loss": 0.8585, + "step": 1357 + }, + { + "epoch": 0.12, + "grad_norm": 23.31859500519548, + "learning_rate": 9.792628403120842e-06, + "loss": 0.8883, + "step": 1358 + }, + { + "epoch": 0.12, + "grad_norm": 11.476953785968334, + "learning_rate": 9.792222842820476e-06, + "loss": 0.8029, + "step": 1359 + }, + { + "epoch": 0.12, + "grad_norm": 11.011608288409567, + "learning_rate": 9.79181689474141e-06, + "loss": 1.0414, + "step": 1360 + }, + { + "epoch": 0.12, + "grad_norm": 19.109344754392332, + "learning_rate": 9.79141055891649e-06, + "loss": 0.9216, + "step": 1361 + }, + { + "epoch": 0.12, + "grad_norm": 19.57012523809222, + "learning_rate": 9.791003835378599e-06, + "loss": 0.9331, + "step": 1362 + }, + { + "epoch": 0.12, + "grad_norm": 18.127377240515568, + "learning_rate": 9.790596724160645e-06, + "loss": 0.8994, + "step": 1363 + }, + { + "epoch": 0.12, + "grad_norm": 9.084423457828786, + "learning_rate": 9.790189225295576e-06, + "loss": 0.8509, + "step": 1364 + }, + { + "epoch": 0.12, + "grad_norm": 17.1405964029816, + "learning_rate": 9.789781338816362e-06, + "loss": 0.7472, + "step": 1365 + }, + { + "epoch": 0.12, + "grad_norm": 11.629449044634667, + "learning_rate": 9.789373064756009e-06, + "loss": 0.8055, + "step": 1366 + }, + { + "epoch": 0.12, + "grad_norm": 11.163240514149148, + "learning_rate": 9.788964403147554e-06, + "loss": 0.9688, + "step": 1367 + }, + { + "epoch": 0.12, + "grad_norm": 8.989540135838787, + "learning_rate": 9.788555354024064e-06, + "loss": 0.761, + "step": 1368 + }, + { + "epoch": 0.12, + "grad_norm": 11.695899950962076, + "learning_rate": 9.788145917418643e-06, + "loss": 0.9199, + "step": 1369 + }, + { + "epoch": 0.12, + "grad_norm": 16.216709696462072, + "learning_rate": 9.787736093364418e-06, + "loss": 0.9006, + "step": 1370 + }, + { + "epoch": 0.12, + "grad_norm": 21.24786371776563, + "learning_rate": 
9.787325881894552e-06, + "loss": 0.8907, + "step": 1371 + }, + { + "epoch": 0.12, + "grad_norm": 14.450519262961693, + "learning_rate": 9.78691528304224e-06, + "loss": 0.8217, + "step": 1372 + }, + { + "epoch": 0.12, + "grad_norm": 31.015772876465984, + "learning_rate": 9.786504296840704e-06, + "loss": 0.8548, + "step": 1373 + }, + { + "epoch": 0.12, + "grad_norm": 11.4142144529359, + "learning_rate": 9.786092923323203e-06, + "loss": 0.819, + "step": 1374 + }, + { + "epoch": 0.12, + "grad_norm": 10.016130290469295, + "learning_rate": 9.785681162523024e-06, + "loss": 0.885, + "step": 1375 + }, + { + "epoch": 0.12, + "grad_norm": 17.452427040554117, + "learning_rate": 9.785269014473487e-06, + "loss": 0.862, + "step": 1376 + }, + { + "epoch": 0.12, + "grad_norm": 11.173948551329689, + "learning_rate": 9.784856479207941e-06, + "loss": 0.8491, + "step": 1377 + }, + { + "epoch": 0.12, + "grad_norm": 8.862694929561993, + "learning_rate": 9.784443556759766e-06, + "loss": 0.7345, + "step": 1378 + }, + { + "epoch": 0.12, + "grad_norm": 10.514542408798581, + "learning_rate": 9.784030247162378e-06, + "loss": 0.7971, + "step": 1379 + }, + { + "epoch": 0.12, + "grad_norm": 17.151425280399636, + "learning_rate": 9.78361655044922e-06, + "loss": 0.9494, + "step": 1380 + }, + { + "epoch": 0.12, + "grad_norm": 3.256392865546168, + "learning_rate": 9.78320246665377e-06, + "loss": 0.4484, + "step": 1381 + }, + { + "epoch": 0.12, + "grad_norm": 10.62515903720135, + "learning_rate": 9.782787995809531e-06, + "loss": 0.9902, + "step": 1382 + }, + { + "epoch": 0.12, + "grad_norm": 11.465899437420404, + "learning_rate": 9.782373137950043e-06, + "loss": 0.9487, + "step": 1383 + }, + { + "epoch": 0.12, + "grad_norm": 10.10648726865799, + "learning_rate": 9.781957893108876e-06, + "loss": 0.7903, + "step": 1384 + }, + { + "epoch": 0.12, + "grad_norm": 10.629550796563718, + "learning_rate": 9.78154226131963e-06, + "loss": 0.8712, + "step": 1385 + }, + { + "epoch": 0.12, + "grad_norm": 17.631799552040125, + "learning_rate": 9.78112624261594e-06, + "loss": 0.7848, + "step": 1386 + }, + { + "epoch": 0.12, + "grad_norm": 13.261004424299118, + "learning_rate": 9.780709837031464e-06, + "loss": 0.9313, + "step": 1387 + }, + { + "epoch": 0.12, + "grad_norm": 9.032398029533208, + "learning_rate": 9.780293044599903e-06, + "loss": 0.7912, + "step": 1388 + }, + { + "epoch": 0.12, + "grad_norm": 8.633556716894521, + "learning_rate": 9.779875865354979e-06, + "loss": 0.746, + "step": 1389 + }, + { + "epoch": 0.12, + "grad_norm": 13.573637606055277, + "learning_rate": 9.779458299330453e-06, + "loss": 0.8856, + "step": 1390 + }, + { + "epoch": 0.12, + "grad_norm": 14.830672024650811, + "learning_rate": 9.779040346560108e-06, + "loss": 0.8324, + "step": 1391 + }, + { + "epoch": 0.12, + "grad_norm": 10.993773058639764, + "learning_rate": 9.778622007077772e-06, + "loss": 1.0278, + "step": 1392 + }, + { + "epoch": 0.12, + "grad_norm": 8.489574523010301, + "learning_rate": 9.778203280917288e-06, + "loss": 0.8481, + "step": 1393 + }, + { + "epoch": 0.12, + "grad_norm": 13.1922233009389, + "learning_rate": 9.777784168112544e-06, + "loss": 0.9095, + "step": 1394 + }, + { + "epoch": 0.12, + "grad_norm": 3.134871110513443, + "learning_rate": 9.777364668697455e-06, + "loss": 0.5878, + "step": 1395 + }, + { + "epoch": 0.12, + "grad_norm": 10.662691452363328, + "learning_rate": 9.776944782705962e-06, + "loss": 0.8725, + "step": 1396 + }, + { + "epoch": 0.12, + "grad_norm": 14.321755171121156, + "learning_rate": 9.776524510172043e-06, + "loss": 
0.8351, + "step": 1397 + }, + { + "epoch": 0.12, + "grad_norm": 14.611816590451138, + "learning_rate": 9.776103851129706e-06, + "loss": 0.8963, + "step": 1398 + }, + { + "epoch": 0.12, + "grad_norm": 12.513543347920518, + "learning_rate": 9.77568280561299e-06, + "loss": 0.8872, + "step": 1399 + }, + { + "epoch": 0.12, + "grad_norm": 9.948266916299918, + "learning_rate": 9.775261373655965e-06, + "loss": 1.0956, + "step": 1400 + }, + { + "epoch": 0.12, + "grad_norm": 10.160004956965075, + "learning_rate": 9.774839555292735e-06, + "loss": 0.9287, + "step": 1401 + }, + { + "epoch": 0.12, + "grad_norm": 13.427196137407712, + "learning_rate": 9.774417350557429e-06, + "loss": 0.8084, + "step": 1402 + }, + { + "epoch": 0.12, + "grad_norm": 8.703202991994138, + "learning_rate": 9.773994759484212e-06, + "loss": 0.8161, + "step": 1403 + }, + { + "epoch": 0.12, + "grad_norm": 9.254287163734093, + "learning_rate": 9.77357178210728e-06, + "loss": 0.9863, + "step": 1404 + }, + { + "epoch": 0.12, + "grad_norm": 14.14746087922323, + "learning_rate": 9.773148418460862e-06, + "loss": 0.8558, + "step": 1405 + }, + { + "epoch": 0.12, + "grad_norm": 7.471173085154003, + "learning_rate": 9.772724668579212e-06, + "loss": 0.8339, + "step": 1406 + }, + { + "epoch": 0.12, + "grad_norm": 13.697737449021293, + "learning_rate": 9.772300532496622e-06, + "loss": 0.8296, + "step": 1407 + }, + { + "epoch": 0.12, + "grad_norm": 6.6172795781959515, + "learning_rate": 9.77187601024741e-06, + "loss": 0.9322, + "step": 1408 + }, + { + "epoch": 0.12, + "grad_norm": 10.211787270009925, + "learning_rate": 9.77145110186593e-06, + "loss": 0.8014, + "step": 1409 + }, + { + "epoch": 0.12, + "grad_norm": 9.37505475161521, + "learning_rate": 9.771025807386562e-06, + "loss": 0.9301, + "step": 1410 + }, + { + "epoch": 0.12, + "grad_norm": 6.3717057236256, + "learning_rate": 9.770600126843724e-06, + "loss": 0.9091, + "step": 1411 + }, + { + "epoch": 0.12, + "grad_norm": 17.150761981674037, + "learning_rate": 9.770174060271858e-06, + "loss": 0.971, + "step": 1412 + }, + { + "epoch": 0.12, + "grad_norm": 9.146233232065574, + "learning_rate": 9.769747607705442e-06, + "loss": 0.8486, + "step": 1413 + }, + { + "epoch": 0.12, + "grad_norm": 35.02265407861344, + "learning_rate": 9.769320769178984e-06, + "loss": 0.7801, + "step": 1414 + }, + { + "epoch": 0.12, + "grad_norm": 8.786424035761078, + "learning_rate": 9.768893544727022e-06, + "loss": 0.7614, + "step": 1415 + }, + { + "epoch": 0.12, + "grad_norm": 9.411764884865802, + "learning_rate": 9.768465934384129e-06, + "loss": 0.7542, + "step": 1416 + }, + { + "epoch": 0.12, + "grad_norm": 14.52783616603983, + "learning_rate": 9.768037938184902e-06, + "loss": 1.0029, + "step": 1417 + }, + { + "epoch": 0.12, + "grad_norm": 11.108163466047941, + "learning_rate": 9.767609556163977e-06, + "loss": 0.806, + "step": 1418 + }, + { + "epoch": 0.12, + "grad_norm": 12.468954269632246, + "learning_rate": 9.767180788356018e-06, + "loss": 0.7522, + "step": 1419 + }, + { + "epoch": 0.12, + "grad_norm": 7.387924977174727, + "learning_rate": 9.766751634795719e-06, + "loss": 0.7839, + "step": 1420 + }, + { + "epoch": 0.12, + "grad_norm": 9.10082428097885, + "learning_rate": 9.766322095517807e-06, + "loss": 0.9456, + "step": 1421 + }, + { + "epoch": 0.12, + "grad_norm": 8.123476764184108, + "learning_rate": 9.765892170557039e-06, + "loss": 0.8524, + "step": 1422 + }, + { + "epoch": 0.12, + "grad_norm": 2.167718896687399, + "learning_rate": 9.765461859948204e-06, + "loss": 0.4656, + "step": 1423 + }, + { + "epoch": 
0.13, + "grad_norm": 10.014694188167079, + "learning_rate": 9.765031163726123e-06, + "loss": 0.7697, + "step": 1424 + }, + { + "epoch": 0.13, + "grad_norm": 16.999659240396017, + "learning_rate": 9.764600081925646e-06, + "loss": 0.8662, + "step": 1425 + }, + { + "epoch": 0.13, + "grad_norm": 11.96629433910608, + "learning_rate": 9.764168614581654e-06, + "loss": 1.0531, + "step": 1426 + }, + { + "epoch": 0.13, + "grad_norm": 7.8740404739186545, + "learning_rate": 9.763736761729065e-06, + "loss": 0.9263, + "step": 1427 + }, + { + "epoch": 0.13, + "grad_norm": 20.361906501214875, + "learning_rate": 9.763304523402822e-06, + "loss": 0.8525, + "step": 1428 + }, + { + "epoch": 0.13, + "grad_norm": 10.021259795509149, + "learning_rate": 9.762871899637898e-06, + "loss": 0.8956, + "step": 1429 + }, + { + "epoch": 0.13, + "grad_norm": 9.46186415299663, + "learning_rate": 9.762438890469304e-06, + "loss": 1.0867, + "step": 1430 + }, + { + "epoch": 0.13, + "grad_norm": 10.0156758425286, + "learning_rate": 9.762005495932076e-06, + "loss": 0.7792, + "step": 1431 + }, + { + "epoch": 0.13, + "grad_norm": 14.88342589965678, + "learning_rate": 9.761571716061285e-06, + "loss": 0.7005, + "step": 1432 + }, + { + "epoch": 0.13, + "grad_norm": 17.892311880909993, + "learning_rate": 9.761137550892031e-06, + "loss": 0.7863, + "step": 1433 + }, + { + "epoch": 0.13, + "grad_norm": 12.652093920177414, + "learning_rate": 9.760703000459445e-06, + "loss": 0.8633, + "step": 1434 + }, + { + "epoch": 0.13, + "grad_norm": 9.57048352044436, + "learning_rate": 9.760268064798694e-06, + "loss": 0.8086, + "step": 1435 + }, + { + "epoch": 0.13, + "grad_norm": 10.151407908744508, + "learning_rate": 9.759832743944969e-06, + "loss": 0.8262, + "step": 1436 + }, + { + "epoch": 0.13, + "grad_norm": 8.423594443329797, + "learning_rate": 9.759397037933493e-06, + "loss": 0.8276, + "step": 1437 + }, + { + "epoch": 0.13, + "grad_norm": 11.192014421366713, + "learning_rate": 9.758960946799528e-06, + "loss": 0.8042, + "step": 1438 + }, + { + "epoch": 0.13, + "grad_norm": 23.549374330645353, + "learning_rate": 9.758524470578358e-06, + "loss": 0.8664, + "step": 1439 + }, + { + "epoch": 0.13, + "grad_norm": 16.594525280855155, + "learning_rate": 9.758087609305304e-06, + "loss": 0.9321, + "step": 1440 + }, + { + "epoch": 0.13, + "grad_norm": 11.864758500698644, + "learning_rate": 9.757650363015715e-06, + "loss": 0.9397, + "step": 1441 + }, + { + "epoch": 0.13, + "grad_norm": 13.963183109927474, + "learning_rate": 9.757212731744973e-06, + "loss": 0.7619, + "step": 1442 + }, + { + "epoch": 0.13, + "grad_norm": 11.536757356834714, + "learning_rate": 9.75677471552849e-06, + "loss": 0.8868, + "step": 1443 + }, + { + "epoch": 0.13, + "grad_norm": 9.15647201747513, + "learning_rate": 9.756336314401712e-06, + "loss": 1.0159, + "step": 1444 + }, + { + "epoch": 0.13, + "grad_norm": 11.654446097256447, + "learning_rate": 9.755897528400107e-06, + "loss": 0.7954, + "step": 1445 + }, + { + "epoch": 0.13, + "grad_norm": 20.889066078125, + "learning_rate": 9.755458357559187e-06, + "loss": 0.9171, + "step": 1446 + }, + { + "epoch": 0.13, + "grad_norm": 12.773355480761948, + "learning_rate": 9.755018801914487e-06, + "loss": 0.9118, + "step": 1447 + }, + { + "epoch": 0.13, + "grad_norm": 13.396089146742543, + "learning_rate": 9.754578861501574e-06, + "loss": 0.7741, + "step": 1448 + }, + { + "epoch": 0.13, + "grad_norm": 9.577343954975037, + "learning_rate": 9.754138536356047e-06, + "loss": 0.8995, + "step": 1449 + }, + { + "epoch": 0.13, + "grad_norm": 
7.765329790667569, + "learning_rate": 9.753697826513542e-06, + "loss": 0.7522, + "step": 1450 + }, + { + "epoch": 0.13, + "grad_norm": 11.113743663007904, + "learning_rate": 9.753256732009713e-06, + "loss": 0.9557, + "step": 1451 + }, + { + "epoch": 0.13, + "grad_norm": 10.295430779433573, + "learning_rate": 9.752815252880257e-06, + "loss": 0.9392, + "step": 1452 + }, + { + "epoch": 0.13, + "grad_norm": 15.87372668937085, + "learning_rate": 9.752373389160896e-06, + "loss": 0.8449, + "step": 1453 + }, + { + "epoch": 0.13, + "grad_norm": 15.996916815386363, + "learning_rate": 9.751931140887387e-06, + "loss": 0.9479, + "step": 1454 + }, + { + "epoch": 0.13, + "grad_norm": 8.983598425021633, + "learning_rate": 9.751488508095514e-06, + "loss": 0.7672, + "step": 1455 + }, + { + "epoch": 0.13, + "grad_norm": 15.160525486932805, + "learning_rate": 9.751045490821093e-06, + "loss": 0.8971, + "step": 1456 + }, + { + "epoch": 0.13, + "grad_norm": 10.48369379679316, + "learning_rate": 9.750602089099977e-06, + "loss": 1.0875, + "step": 1457 + }, + { + "epoch": 0.13, + "grad_norm": 11.20852784562168, + "learning_rate": 9.75015830296804e-06, + "loss": 0.9396, + "step": 1458 + }, + { + "epoch": 0.13, + "grad_norm": 11.128051024663955, + "learning_rate": 9.749714132461195e-06, + "loss": 0.8426, + "step": 1459 + }, + { + "epoch": 0.13, + "grad_norm": 10.176375900216494, + "learning_rate": 9.749269577615382e-06, + "loss": 1.0309, + "step": 1460 + }, + { + "epoch": 0.13, + "grad_norm": 11.692095946162022, + "learning_rate": 9.748824638466576e-06, + "loss": 1.0789, + "step": 1461 + }, + { + "epoch": 0.13, + "grad_norm": 9.21035964817288, + "learning_rate": 9.748379315050778e-06, + "loss": 0.6825, + "step": 1462 + }, + { + "epoch": 0.13, + "grad_norm": 9.768761284691337, + "learning_rate": 9.747933607404027e-06, + "loss": 0.8441, + "step": 1463 + }, + { + "epoch": 0.13, + "grad_norm": 2.7034479971879692, + "learning_rate": 9.747487515562384e-06, + "loss": 0.4771, + "step": 1464 + }, + { + "epoch": 0.13, + "grad_norm": 17.480102936531484, + "learning_rate": 9.74704103956195e-06, + "loss": 0.74, + "step": 1465 + }, + { + "epoch": 0.13, + "grad_norm": 9.750384432627703, + "learning_rate": 9.74659417943885e-06, + "loss": 0.7918, + "step": 1466 + }, + { + "epoch": 0.13, + "grad_norm": 10.647890866225731, + "learning_rate": 9.746146935229246e-06, + "loss": 0.8678, + "step": 1467 + }, + { + "epoch": 0.13, + "grad_norm": 8.553772376070132, + "learning_rate": 9.745699306969325e-06, + "loss": 0.8532, + "step": 1468 + }, + { + "epoch": 0.13, + "grad_norm": 11.939649148824687, + "learning_rate": 9.745251294695311e-06, + "loss": 0.8738, + "step": 1469 + }, + { + "epoch": 0.13, + "grad_norm": 10.450284380890634, + "learning_rate": 9.744802898443457e-06, + "loss": 0.7781, + "step": 1470 + }, + { + "epoch": 0.13, + "grad_norm": 11.355790867408059, + "learning_rate": 9.744354118250043e-06, + "loss": 1.0441, + "step": 1471 + }, + { + "epoch": 0.13, + "grad_norm": 11.885056896209113, + "learning_rate": 9.743904954151386e-06, + "loss": 0.7237, + "step": 1472 + }, + { + "epoch": 0.13, + "grad_norm": 4.040688569451749, + "learning_rate": 9.743455406183831e-06, + "loss": 0.5424, + "step": 1473 + }, + { + "epoch": 0.13, + "grad_norm": 18.911149611776793, + "learning_rate": 9.743005474383755e-06, + "loss": 0.9781, + "step": 1474 + }, + { + "epoch": 0.13, + "grad_norm": 3.3629843144301197, + "learning_rate": 9.742555158787567e-06, + "loss": 0.5325, + "step": 1475 + }, + { + "epoch": 0.13, + "grad_norm": 10.287353876031803, + 
"learning_rate": 9.742104459431703e-06, + "loss": 0.9668, + "step": 1476 + }, + { + "epoch": 0.13, + "grad_norm": 21.023808749752103, + "learning_rate": 9.741653376352634e-06, + "loss": 0.8165, + "step": 1477 + }, + { + "epoch": 0.13, + "grad_norm": 9.775435163340907, + "learning_rate": 9.741201909586861e-06, + "loss": 0.8113, + "step": 1478 + }, + { + "epoch": 0.13, + "grad_norm": 10.612958248981581, + "learning_rate": 9.740750059170918e-06, + "loss": 0.8743, + "step": 1479 + }, + { + "epoch": 0.13, + "grad_norm": 11.296089672151185, + "learning_rate": 9.740297825141364e-06, + "loss": 0.977, + "step": 1480 + }, + { + "epoch": 0.13, + "grad_norm": 10.274222879246894, + "learning_rate": 9.739845207534798e-06, + "loss": 0.8346, + "step": 1481 + }, + { + "epoch": 0.13, + "grad_norm": 11.690232051976466, + "learning_rate": 9.739392206387838e-06, + "loss": 0.7695, + "step": 1482 + }, + { + "epoch": 0.13, + "grad_norm": 3.5052392010500397, + "learning_rate": 9.738938821737147e-06, + "loss": 0.5292, + "step": 1483 + }, + { + "epoch": 0.13, + "grad_norm": 14.184640132364608, + "learning_rate": 9.738485053619409e-06, + "loss": 0.9531, + "step": 1484 + }, + { + "epoch": 0.13, + "grad_norm": 18.66769068000441, + "learning_rate": 9.73803090207134e-06, + "loss": 0.9057, + "step": 1485 + }, + { + "epoch": 0.13, + "grad_norm": 3.1229339885257206, + "learning_rate": 9.737576367129694e-06, + "loss": 0.5361, + "step": 1486 + }, + { + "epoch": 0.13, + "grad_norm": 7.781408760414643, + "learning_rate": 9.73712144883125e-06, + "loss": 0.8365, + "step": 1487 + }, + { + "epoch": 0.13, + "grad_norm": 10.242162806399081, + "learning_rate": 9.736666147212817e-06, + "loss": 0.8475, + "step": 1488 + }, + { + "epoch": 0.13, + "grad_norm": 3.2554775752862537, + "learning_rate": 9.736210462311237e-06, + "loss": 0.5152, + "step": 1489 + }, + { + "epoch": 0.13, + "grad_norm": 9.837386081894813, + "learning_rate": 9.735754394163387e-06, + "loss": 0.621, + "step": 1490 + }, + { + "epoch": 0.13, + "grad_norm": 13.745701144175857, + "learning_rate": 9.735297942806168e-06, + "loss": 0.946, + "step": 1491 + }, + { + "epoch": 0.13, + "grad_norm": 9.025025885709963, + "learning_rate": 9.734841108276516e-06, + "loss": 0.8996, + "step": 1492 + }, + { + "epoch": 0.13, + "grad_norm": 11.433171871290101, + "learning_rate": 9.734383890611398e-06, + "loss": 0.9307, + "step": 1493 + }, + { + "epoch": 0.13, + "grad_norm": 6.525021425223875, + "learning_rate": 9.73392628984781e-06, + "loss": 0.7346, + "step": 1494 + }, + { + "epoch": 0.13, + "grad_norm": 12.040187113871461, + "learning_rate": 9.733468306022782e-06, + "loss": 0.9699, + "step": 1495 + }, + { + "epoch": 0.13, + "grad_norm": 9.46678536216209, + "learning_rate": 9.733009939173372e-06, + "loss": 0.8304, + "step": 1496 + }, + { + "epoch": 0.13, + "grad_norm": 9.028513667853563, + "learning_rate": 9.73255118933667e-06, + "loss": 0.8954, + "step": 1497 + }, + { + "epoch": 0.13, + "grad_norm": 7.7094127588465176, + "learning_rate": 9.7320920565498e-06, + "loss": 0.8075, + "step": 1498 + }, + { + "epoch": 0.13, + "grad_norm": 10.599399653118441, + "learning_rate": 9.73163254084991e-06, + "loss": 0.7551, + "step": 1499 + }, + { + "epoch": 0.13, + "grad_norm": 16.595120552416024, + "learning_rate": 9.731172642274185e-06, + "loss": 0.8734, + "step": 1500 + }, + { + "epoch": 0.13, + "grad_norm": 7.811017994831574, + "learning_rate": 9.730712360859842e-06, + "loss": 0.8727, + "step": 1501 + }, + { + "epoch": 0.13, + "grad_norm": 8.944855700189638, + "learning_rate": 9.730251696644123e-06, 
+ "loss": 0.7891, + "step": 1502 + }, + { + "epoch": 0.13, + "grad_norm": 2.809734492065169, + "learning_rate": 9.729790649664305e-06, + "loss": 0.5499, + "step": 1503 + }, + { + "epoch": 0.13, + "grad_norm": 4.481561841485667, + "learning_rate": 9.729329219957695e-06, + "loss": 0.5699, + "step": 1504 + }, + { + "epoch": 0.13, + "grad_norm": 10.365920202903354, + "learning_rate": 9.728867407561631e-06, + "loss": 0.8912, + "step": 1505 + }, + { + "epoch": 0.13, + "grad_norm": 11.416067780244424, + "learning_rate": 9.728405212513484e-06, + "loss": 0.8103, + "step": 1506 + }, + { + "epoch": 0.13, + "grad_norm": 13.06894748126, + "learning_rate": 9.72794263485065e-06, + "loss": 0.8529, + "step": 1507 + }, + { + "epoch": 0.13, + "grad_norm": 7.55725023400478, + "learning_rate": 9.727479674610565e-06, + "loss": 0.7415, + "step": 1508 + }, + { + "epoch": 0.13, + "grad_norm": 12.467054065631894, + "learning_rate": 9.727016331830687e-06, + "loss": 0.8142, + "step": 1509 + }, + { + "epoch": 0.13, + "grad_norm": 10.693225742628913, + "learning_rate": 9.726552606548511e-06, + "loss": 0.9509, + "step": 1510 + }, + { + "epoch": 0.13, + "grad_norm": 86.34210748788145, + "learning_rate": 9.72608849880156e-06, + "loss": 0.8558, + "step": 1511 + }, + { + "epoch": 0.13, + "grad_norm": 12.088178868445954, + "learning_rate": 9.725624008627391e-06, + "loss": 0.7286, + "step": 1512 + }, + { + "epoch": 0.13, + "grad_norm": 20.19700066763562, + "learning_rate": 9.725159136063584e-06, + "loss": 0.8184, + "step": 1513 + }, + { + "epoch": 0.13, + "grad_norm": 8.1210968774506, + "learning_rate": 9.724693881147763e-06, + "loss": 0.9053, + "step": 1514 + }, + { + "epoch": 0.13, + "grad_norm": 11.536382215352434, + "learning_rate": 9.724228243917571e-06, + "loss": 0.8736, + "step": 1515 + }, + { + "epoch": 0.13, + "grad_norm": 12.7889687956442, + "learning_rate": 9.723762224410687e-06, + "loss": 1.0222, + "step": 1516 + }, + { + "epoch": 0.13, + "grad_norm": 7.156999418881954, + "learning_rate": 9.723295822664823e-06, + "loss": 0.8767, + "step": 1517 + }, + { + "epoch": 0.13, + "grad_norm": 8.775147470316321, + "learning_rate": 9.722829038717717e-06, + "loss": 0.8955, + "step": 1518 + }, + { + "epoch": 0.13, + "grad_norm": 9.657998737498422, + "learning_rate": 9.722361872607142e-06, + "loss": 0.8517, + "step": 1519 + }, + { + "epoch": 0.13, + "grad_norm": 13.778716234018987, + "learning_rate": 9.721894324370899e-06, + "loss": 0.8955, + "step": 1520 + }, + { + "epoch": 0.13, + "grad_norm": 11.017333917445919, + "learning_rate": 9.721426394046821e-06, + "loss": 1.0194, + "step": 1521 + }, + { + "epoch": 0.13, + "grad_norm": 10.767058304362655, + "learning_rate": 9.720958081672772e-06, + "loss": 0.9808, + "step": 1522 + }, + { + "epoch": 0.13, + "grad_norm": 11.816234282467477, + "learning_rate": 9.720489387286651e-06, + "loss": 0.8089, + "step": 1523 + }, + { + "epoch": 0.13, + "grad_norm": 6.020256441445317, + "learning_rate": 9.720020310926379e-06, + "loss": 0.5472, + "step": 1524 + }, + { + "epoch": 0.13, + "grad_norm": 17.917361367412287, + "learning_rate": 9.719550852629915e-06, + "loss": 0.9213, + "step": 1525 + }, + { + "epoch": 0.13, + "grad_norm": 10.651816523563133, + "learning_rate": 9.719081012435248e-06, + "loss": 0.7968, + "step": 1526 + }, + { + "epoch": 0.13, + "grad_norm": 11.001939353908922, + "learning_rate": 9.718610790380393e-06, + "loss": 0.7946, + "step": 1527 + }, + { + "epoch": 0.13, + "grad_norm": 10.48707486838972, + "learning_rate": 9.718140186503403e-06, + "loss": 0.8585, + "step": 1528 + }, + 
{ + "epoch": 0.13, + "grad_norm": 10.473668551162856, + "learning_rate": 9.717669200842358e-06, + "loss": 0.8244, + "step": 1529 + }, + { + "epoch": 0.13, + "grad_norm": 10.514494330486938, + "learning_rate": 9.717197833435367e-06, + "loss": 0.7568, + "step": 1530 + }, + { + "epoch": 0.13, + "grad_norm": 16.417815965128348, + "learning_rate": 9.716726084320576e-06, + "loss": 0.8369, + "step": 1531 + }, + { + "epoch": 0.13, + "grad_norm": 11.209489580722506, + "learning_rate": 9.716253953536155e-06, + "loss": 0.7304, + "step": 1532 + }, + { + "epoch": 0.13, + "grad_norm": 9.71383065581734, + "learning_rate": 9.71578144112031e-06, + "loss": 0.9801, + "step": 1533 + }, + { + "epoch": 0.13, + "grad_norm": 11.175850879179958, + "learning_rate": 9.715308547111273e-06, + "loss": 0.9222, + "step": 1534 + }, + { + "epoch": 0.13, + "grad_norm": 9.521077268075096, + "learning_rate": 9.714835271547312e-06, + "loss": 0.9185, + "step": 1535 + }, + { + "epoch": 0.13, + "grad_norm": 7.547470644891006, + "learning_rate": 9.714361614466726e-06, + "loss": 0.8738, + "step": 1536 + }, + { + "epoch": 0.13, + "grad_norm": 16.671252065991823, + "learning_rate": 9.713887575907838e-06, + "loss": 0.9701, + "step": 1537 + }, + { + "epoch": 0.14, + "grad_norm": 3.2031868010229347, + "learning_rate": 9.713413155909009e-06, + "loss": 0.5174, + "step": 1538 + }, + { + "epoch": 0.14, + "grad_norm": 10.972174698655444, + "learning_rate": 9.712938354508627e-06, + "loss": 0.8417, + "step": 1539 + }, + { + "epoch": 0.14, + "grad_norm": 3.1985079659252147, + "learning_rate": 9.712463171745115e-06, + "loss": 0.5291, + "step": 1540 + }, + { + "epoch": 0.14, + "grad_norm": 10.175665249391393, + "learning_rate": 9.71198760765692e-06, + "loss": 0.8016, + "step": 1541 + }, + { + "epoch": 0.14, + "grad_norm": 9.466886285583548, + "learning_rate": 9.711511662282527e-06, + "loss": 0.8327, + "step": 1542 + }, + { + "epoch": 0.14, + "grad_norm": 9.116838078788932, + "learning_rate": 9.711035335660447e-06, + "loss": 0.8443, + "step": 1543 + }, + { + "epoch": 0.14, + "grad_norm": 12.957880676797311, + "learning_rate": 9.710558627829225e-06, + "loss": 0.6843, + "step": 1544 + }, + { + "epoch": 0.14, + "grad_norm": 7.913940180813275, + "learning_rate": 9.710081538827434e-06, + "loss": 1.0037, + "step": 1545 + }, + { + "epoch": 0.14, + "grad_norm": 2.6300143768551845, + "learning_rate": 9.70960406869368e-06, + "loss": 0.5014, + "step": 1546 + }, + { + "epoch": 0.14, + "grad_norm": 10.847908134104975, + "learning_rate": 9.709126217466598e-06, + "loss": 0.8737, + "step": 1547 + }, + { + "epoch": 0.14, + "grad_norm": 7.098333110550973, + "learning_rate": 9.708647985184855e-06, + "loss": 0.8793, + "step": 1548 + }, + { + "epoch": 0.14, + "grad_norm": 9.90143233735622, + "learning_rate": 9.70816937188715e-06, + "loss": 0.8954, + "step": 1549 + }, + { + "epoch": 0.14, + "grad_norm": 7.357591868966643, + "learning_rate": 9.70769037761221e-06, + "loss": 0.9413, + "step": 1550 + }, + { + "epoch": 0.14, + "grad_norm": 12.295086374326027, + "learning_rate": 9.707211002398798e-06, + "loss": 0.8399, + "step": 1551 + }, + { + "epoch": 0.14, + "grad_norm": 8.204469046434676, + "learning_rate": 9.706731246285701e-06, + "loss": 0.6777, + "step": 1552 + }, + { + "epoch": 0.14, + "grad_norm": 11.136159362260138, + "learning_rate": 9.706251109311741e-06, + "loss": 0.8801, + "step": 1553 + }, + { + "epoch": 0.14, + "grad_norm": 8.020307452892167, + "learning_rate": 9.705770591515768e-06, + "loss": 0.9297, + "step": 1554 + }, + { + "epoch": 0.14, + "grad_norm": 
9.336401584732823, + "learning_rate": 9.705289692936668e-06, + "loss": 1.0342, + "step": 1555 + }, + { + "epoch": 0.14, + "grad_norm": 8.779090874956678, + "learning_rate": 9.704808413613353e-06, + "loss": 0.8866, + "step": 1556 + }, + { + "epoch": 0.14, + "grad_norm": 132.99769443229263, + "learning_rate": 9.704326753584767e-06, + "loss": 0.835, + "step": 1557 + }, + { + "epoch": 0.14, + "grad_norm": 10.5955191049981, + "learning_rate": 9.703844712889883e-06, + "loss": 0.7529, + "step": 1558 + }, + { + "epoch": 0.14, + "grad_norm": 11.717093093077663, + "learning_rate": 9.703362291567713e-06, + "loss": 0.9057, + "step": 1559 + }, + { + "epoch": 0.14, + "grad_norm": 15.14835646919394, + "learning_rate": 9.702879489657288e-06, + "loss": 0.7845, + "step": 1560 + }, + { + "epoch": 0.14, + "grad_norm": 8.201050975210265, + "learning_rate": 9.702396307197677e-06, + "loss": 0.7877, + "step": 1561 + }, + { + "epoch": 0.14, + "grad_norm": 12.012449247604543, + "learning_rate": 9.70191274422798e-06, + "loss": 0.8207, + "step": 1562 + }, + { + "epoch": 0.14, + "grad_norm": 11.152007842079724, + "learning_rate": 9.701428800787325e-06, + "loss": 0.8257, + "step": 1563 + }, + { + "epoch": 0.14, + "grad_norm": 15.060150230561353, + "learning_rate": 9.700944476914872e-06, + "loss": 0.8486, + "step": 1564 + }, + { + "epoch": 0.14, + "grad_norm": 8.105158057266605, + "learning_rate": 9.70045977264981e-06, + "loss": 0.8669, + "step": 1565 + }, + { + "epoch": 0.14, + "grad_norm": 8.21159875170685, + "learning_rate": 9.699974688031362e-06, + "loss": 0.802, + "step": 1566 + }, + { + "epoch": 0.14, + "grad_norm": 7.849177520008691, + "learning_rate": 9.69948922309878e-06, + "loss": 0.8878, + "step": 1567 + }, + { + "epoch": 0.14, + "grad_norm": 3.9115852074331303, + "learning_rate": 9.69900337789135e-06, + "loss": 0.5329, + "step": 1568 + }, + { + "epoch": 0.14, + "grad_norm": 9.454078394072235, + "learning_rate": 9.698517152448382e-06, + "loss": 0.9012, + "step": 1569 + }, + { + "epoch": 0.14, + "grad_norm": 9.192399294432253, + "learning_rate": 9.69803054680922e-06, + "loss": 0.7427, + "step": 1570 + }, + { + "epoch": 0.14, + "grad_norm": 8.404662806132215, + "learning_rate": 9.697543561013242e-06, + "loss": 0.7426, + "step": 1571 + }, + { + "epoch": 0.14, + "grad_norm": 3.441620952914897, + "learning_rate": 9.697056195099854e-06, + "loss": 0.5359, + "step": 1572 + }, + { + "epoch": 0.14, + "grad_norm": 8.047105777238498, + "learning_rate": 9.696568449108492e-06, + "loss": 1.0046, + "step": 1573 + }, + { + "epoch": 0.14, + "grad_norm": 8.55052055736079, + "learning_rate": 9.696080323078621e-06, + "loss": 0.8207, + "step": 1574 + }, + { + "epoch": 0.14, + "grad_norm": 8.107921330015381, + "learning_rate": 9.695591817049744e-06, + "loss": 0.8646, + "step": 1575 + }, + { + "epoch": 0.14, + "grad_norm": 19.15533710713631, + "learning_rate": 9.695102931061386e-06, + "loss": 0.8992, + "step": 1576 + }, + { + "epoch": 0.14, + "grad_norm": 9.351413398159764, + "learning_rate": 9.69461366515311e-06, + "loss": 0.989, + "step": 1577 + }, + { + "epoch": 0.14, + "grad_norm": 3.305690855377652, + "learning_rate": 9.694124019364506e-06, + "loss": 0.52, + "step": 1578 + }, + { + "epoch": 0.14, + "grad_norm": 13.957568710358446, + "learning_rate": 9.693633993735195e-06, + "loss": 0.8656, + "step": 1579 + }, + { + "epoch": 0.14, + "grad_norm": 3.561865217203923, + "learning_rate": 9.693143588304826e-06, + "loss": 0.5163, + "step": 1580 + }, + { + "epoch": 0.14, + "grad_norm": 9.371306022510897, + "learning_rate": 
9.692652803113084e-06, + "loss": 0.8386, + "step": 1581 + }, + { + "epoch": 0.14, + "grad_norm": 3.553065885437377, + "learning_rate": 9.692161638199686e-06, + "loss": 0.6083, + "step": 1582 + }, + { + "epoch": 0.14, + "grad_norm": 11.277315445427647, + "learning_rate": 9.691670093604373e-06, + "loss": 0.8309, + "step": 1583 + }, + { + "epoch": 0.14, + "grad_norm": 13.092318809469868, + "learning_rate": 9.691178169366919e-06, + "loss": 1.0025, + "step": 1584 + }, + { + "epoch": 0.14, + "grad_norm": 17.693223700621708, + "learning_rate": 9.690685865527132e-06, + "loss": 0.8369, + "step": 1585 + }, + { + "epoch": 0.14, + "grad_norm": 11.00760847209193, + "learning_rate": 9.690193182124845e-06, + "loss": 0.9004, + "step": 1586 + }, + { + "epoch": 0.14, + "grad_norm": 9.687307778413022, + "learning_rate": 9.68970011919993e-06, + "loss": 0.8745, + "step": 1587 + }, + { + "epoch": 0.14, + "grad_norm": 8.351674366212002, + "learning_rate": 9.689206676792281e-06, + "loss": 0.8438, + "step": 1588 + }, + { + "epoch": 0.14, + "grad_norm": 14.759742387893624, + "learning_rate": 9.688712854941828e-06, + "loss": 0.8073, + "step": 1589 + }, + { + "epoch": 0.14, + "grad_norm": 9.81955744563574, + "learning_rate": 9.68821865368853e-06, + "loss": 0.8834, + "step": 1590 + }, + { + "epoch": 0.14, + "grad_norm": 11.719371135229538, + "learning_rate": 9.687724073072378e-06, + "loss": 0.84, + "step": 1591 + }, + { + "epoch": 0.14, + "grad_norm": 12.839396673010551, + "learning_rate": 9.68722911313339e-06, + "loss": 0.8436, + "step": 1592 + }, + { + "epoch": 0.14, + "grad_norm": 2.5842728878016703, + "learning_rate": 9.68673377391162e-06, + "loss": 0.5064, + "step": 1593 + }, + { + "epoch": 0.14, + "grad_norm": 9.943142670883754, + "learning_rate": 9.686238055447147e-06, + "loss": 0.82, + "step": 1594 + }, + { + "epoch": 0.14, + "grad_norm": 12.683431735317876, + "learning_rate": 9.68574195778009e-06, + "loss": 0.7495, + "step": 1595 + }, + { + "epoch": 0.14, + "grad_norm": 12.408027994652755, + "learning_rate": 9.685245480950584e-06, + "loss": 0.758, + "step": 1596 + }, + { + "epoch": 0.14, + "grad_norm": 11.204513684465635, + "learning_rate": 9.68474862499881e-06, + "loss": 0.934, + "step": 1597 + }, + { + "epoch": 0.14, + "grad_norm": 6.735827322566873, + "learning_rate": 9.684251389964967e-06, + "loss": 0.7777, + "step": 1598 + }, + { + "epoch": 0.14, + "grad_norm": 12.544941742016869, + "learning_rate": 9.683753775889297e-06, + "loss": 0.8283, + "step": 1599 + }, + { + "epoch": 0.14, + "grad_norm": 29.116557516638164, + "learning_rate": 9.68325578281206e-06, + "loss": 0.9092, + "step": 1600 + }, + { + "epoch": 0.14, + "grad_norm": 14.176574913445867, + "learning_rate": 9.682757410773556e-06, + "loss": 0.8421, + "step": 1601 + }, + { + "epoch": 0.14, + "grad_norm": 13.655969801593166, + "learning_rate": 9.682258659814111e-06, + "loss": 0.9495, + "step": 1602 + }, + { + "epoch": 0.14, + "grad_norm": 8.612570376644491, + "learning_rate": 9.681759529974084e-06, + "loss": 0.9212, + "step": 1603 + }, + { + "epoch": 0.14, + "grad_norm": 12.949503859122036, + "learning_rate": 9.681260021293863e-06, + "loss": 0.9025, + "step": 1604 + }, + { + "epoch": 0.14, + "grad_norm": 9.988399158072134, + "learning_rate": 9.68076013381387e-06, + "loss": 0.9056, + "step": 1605 + }, + { + "epoch": 0.14, + "grad_norm": 10.649961236307277, + "learning_rate": 9.680259867574552e-06, + "loss": 0.741, + "step": 1606 + }, + { + "epoch": 0.14, + "grad_norm": 11.599740506235554, + "learning_rate": 9.679759222616389e-06, + "loss": 0.7871, + 
"step": 1607 + }, + { + "epoch": 0.14, + "grad_norm": 15.41412018114325, + "learning_rate": 9.679258198979897e-06, + "loss": 0.8652, + "step": 1608 + }, + { + "epoch": 0.14, + "grad_norm": 3.1684307601757276, + "learning_rate": 9.678756796705613e-06, + "loss": 0.5142, + "step": 1609 + }, + { + "epoch": 0.14, + "grad_norm": 7.9181984514043915, + "learning_rate": 9.678255015834112e-06, + "loss": 0.6803, + "step": 1610 + }, + { + "epoch": 0.14, + "grad_norm": 9.038981254651722, + "learning_rate": 9.677752856405998e-06, + "loss": 0.9146, + "step": 1611 + }, + { + "epoch": 0.14, + "grad_norm": 13.578429709417417, + "learning_rate": 9.677250318461904e-06, + "loss": 0.894, + "step": 1612 + }, + { + "epoch": 0.14, + "grad_norm": 2.4365510324958732, + "learning_rate": 9.676747402042493e-06, + "loss": 0.5809, + "step": 1613 + }, + { + "epoch": 0.14, + "grad_norm": 12.7944771870375, + "learning_rate": 9.676244107188463e-06, + "loss": 0.7606, + "step": 1614 + }, + { + "epoch": 0.14, + "grad_norm": 9.562737360583254, + "learning_rate": 9.675740433940538e-06, + "loss": 0.8622, + "step": 1615 + }, + { + "epoch": 0.14, + "grad_norm": 7.648976209229807, + "learning_rate": 9.675236382339475e-06, + "loss": 0.897, + "step": 1616 + }, + { + "epoch": 0.14, + "grad_norm": 11.811629584883967, + "learning_rate": 9.674731952426059e-06, + "loss": 0.9359, + "step": 1617 + }, + { + "epoch": 0.14, + "grad_norm": 12.8640133655927, + "learning_rate": 9.67422714424111e-06, + "loss": 0.8978, + "step": 1618 + }, + { + "epoch": 0.14, + "grad_norm": 14.672181128297977, + "learning_rate": 9.673721957825478e-06, + "loss": 0.9268, + "step": 1619 + }, + { + "epoch": 0.14, + "grad_norm": 85.40338505466335, + "learning_rate": 9.673216393220038e-06, + "loss": 0.9228, + "step": 1620 + }, + { + "epoch": 0.14, + "grad_norm": 9.854354038102404, + "learning_rate": 9.6727104504657e-06, + "loss": 0.8176, + "step": 1621 + }, + { + "epoch": 0.14, + "grad_norm": 8.378820017965154, + "learning_rate": 9.672204129603406e-06, + "loss": 0.9284, + "step": 1622 + }, + { + "epoch": 0.14, + "grad_norm": 8.010001618444003, + "learning_rate": 9.671697430674125e-06, + "loss": 0.8139, + "step": 1623 + }, + { + "epoch": 0.14, + "grad_norm": 37.732028870918136, + "learning_rate": 9.67119035371886e-06, + "loss": 0.9285, + "step": 1624 + }, + { + "epoch": 0.14, + "grad_norm": 10.275597330465697, + "learning_rate": 9.67068289877864e-06, + "loss": 0.9032, + "step": 1625 + }, + { + "epoch": 0.14, + "grad_norm": 12.628142689384822, + "learning_rate": 9.670175065894531e-06, + "loss": 0.837, + "step": 1626 + }, + { + "epoch": 0.14, + "grad_norm": 9.929031376275983, + "learning_rate": 9.669666855107623e-06, + "loss": 0.8809, + "step": 1627 + }, + { + "epoch": 0.14, + "grad_norm": 15.351139671919965, + "learning_rate": 9.66915826645904e-06, + "loss": 0.8651, + "step": 1628 + }, + { + "epoch": 0.14, + "grad_norm": 7.4360939860982915, + "learning_rate": 9.668649299989939e-06, + "loss": 0.8718, + "step": 1629 + }, + { + "epoch": 0.14, + "grad_norm": 7.442416545572839, + "learning_rate": 9.668139955741501e-06, + "loss": 0.9957, + "step": 1630 + }, + { + "epoch": 0.14, + "grad_norm": 13.935145100568002, + "learning_rate": 9.667630233754943e-06, + "loss": 0.8548, + "step": 1631 + }, + { + "epoch": 0.14, + "grad_norm": 16.627311829886747, + "learning_rate": 9.667120134071511e-06, + "loss": 0.7433, + "step": 1632 + }, + { + "epoch": 0.14, + "grad_norm": 10.735855612322391, + "learning_rate": 9.666609656732482e-06, + "loss": 0.7473, + "step": 1633 + }, + { + "epoch": 0.14, + 
"grad_norm": 5.329539335887738, + "learning_rate": 9.66609880177916e-06, + "loss": 0.4902, + "step": 1634 + }, + { + "epoch": 0.14, + "grad_norm": 9.23768632586643, + "learning_rate": 9.665587569252885e-06, + "loss": 0.8303, + "step": 1635 + }, + { + "epoch": 0.14, + "grad_norm": 13.140785270366154, + "learning_rate": 9.665075959195027e-06, + "loss": 1.0088, + "step": 1636 + }, + { + "epoch": 0.14, + "grad_norm": 9.084910338218378, + "learning_rate": 9.66456397164698e-06, + "loss": 0.8995, + "step": 1637 + }, + { + "epoch": 0.14, + "grad_norm": 21.411526361494133, + "learning_rate": 9.664051606650176e-06, + "loss": 0.9592, + "step": 1638 + }, + { + "epoch": 0.14, + "grad_norm": 8.539383255583429, + "learning_rate": 9.663538864246075e-06, + "loss": 0.8026, + "step": 1639 + }, + { + "epoch": 0.14, + "grad_norm": 3.4303009079800284, + "learning_rate": 9.663025744476167e-06, + "loss": 0.5546, + "step": 1640 + }, + { + "epoch": 0.14, + "grad_norm": 18.250973175303145, + "learning_rate": 9.66251224738197e-06, + "loss": 0.788, + "step": 1641 + }, + { + "epoch": 0.14, + "grad_norm": 19.068607369061496, + "learning_rate": 9.661998373005039e-06, + "loss": 0.8916, + "step": 1642 + }, + { + "epoch": 0.14, + "grad_norm": 7.2766348316554055, + "learning_rate": 9.661484121386955e-06, + "loss": 0.8955, + "step": 1643 + }, + { + "epoch": 0.14, + "grad_norm": 8.28906640970531, + "learning_rate": 9.660969492569328e-06, + "loss": 0.8879, + "step": 1644 + }, + { + "epoch": 0.14, + "grad_norm": 10.899270991678598, + "learning_rate": 9.660454486593805e-06, + "loss": 0.8296, + "step": 1645 + }, + { + "epoch": 0.14, + "grad_norm": 7.405982604684956, + "learning_rate": 9.659939103502056e-06, + "loss": 0.8603, + "step": 1646 + }, + { + "epoch": 0.14, + "grad_norm": 3.031368212059826, + "learning_rate": 9.659423343335787e-06, + "loss": 0.5066, + "step": 1647 + }, + { + "epoch": 0.14, + "grad_norm": 9.64508148132272, + "learning_rate": 9.65890720613673e-06, + "loss": 0.9177, + "step": 1648 + }, + { + "epoch": 0.14, + "grad_norm": 9.104344445157011, + "learning_rate": 9.658390691946652e-06, + "loss": 0.7282, + "step": 1649 + }, + { + "epoch": 0.14, + "grad_norm": 13.252753519101836, + "learning_rate": 9.65787380080735e-06, + "loss": 0.8635, + "step": 1650 + }, + { + "epoch": 0.14, + "grad_norm": 11.789952009096686, + "learning_rate": 9.657356532760647e-06, + "loss": 0.8993, + "step": 1651 + }, + { + "epoch": 0.15, + "grad_norm": 11.841345786370137, + "learning_rate": 9.6568388878484e-06, + "loss": 0.8492, + "step": 1652 + }, + { + "epoch": 0.15, + "grad_norm": 18.520727961749532, + "learning_rate": 9.656320866112496e-06, + "loss": 0.9108, + "step": 1653 + }, + { + "epoch": 0.15, + "grad_norm": 10.54224979732224, + "learning_rate": 9.655802467594854e-06, + "loss": 0.7934, + "step": 1654 + }, + { + "epoch": 0.15, + "grad_norm": 8.934658876313671, + "learning_rate": 9.65528369233742e-06, + "loss": 0.9371, + "step": 1655 + }, + { + "epoch": 0.15, + "grad_norm": 19.02780663737474, + "learning_rate": 9.654764540382174e-06, + "loss": 0.8748, + "step": 1656 + }, + { + "epoch": 0.15, + "grad_norm": 13.364298866886612, + "learning_rate": 9.654245011771123e-06, + "loss": 0.8402, + "step": 1657 + }, + { + "epoch": 0.15, + "grad_norm": 11.514534082708444, + "learning_rate": 9.65372510654631e-06, + "loss": 0.9299, + "step": 1658 + }, + { + "epoch": 0.15, + "grad_norm": 7.417587354017337, + "learning_rate": 9.6532048247498e-06, + "loss": 1.0127, + "step": 1659 + }, + { + "epoch": 0.15, + "grad_norm": 8.994759699911027, + 
"learning_rate": 9.652684166423696e-06, + "loss": 0.8119, + "step": 1660 + }, + { + "epoch": 0.15, + "grad_norm": 8.349176613526861, + "learning_rate": 9.65216313161013e-06, + "loss": 0.8746, + "step": 1661 + }, + { + "epoch": 0.15, + "grad_norm": 11.426902769880904, + "learning_rate": 9.651641720351262e-06, + "loss": 0.8252, + "step": 1662 + }, + { + "epoch": 0.15, + "grad_norm": 8.61697759664849, + "learning_rate": 9.651119932689283e-06, + "loss": 0.9167, + "step": 1663 + }, + { + "epoch": 0.15, + "grad_norm": 23.96381470720035, + "learning_rate": 9.650597768666415e-06, + "loss": 0.8504, + "step": 1664 + }, + { + "epoch": 0.15, + "grad_norm": 9.233803395934032, + "learning_rate": 9.650075228324911e-06, + "loss": 0.7643, + "step": 1665 + }, + { + "epoch": 0.15, + "grad_norm": 19.611605802688853, + "learning_rate": 9.649552311707056e-06, + "loss": 0.7638, + "step": 1666 + }, + { + "epoch": 0.15, + "grad_norm": 3.4141026402680246, + "learning_rate": 9.649029018855161e-06, + "loss": 0.5927, + "step": 1667 + }, + { + "epoch": 0.15, + "grad_norm": 2.6691429083174016, + "learning_rate": 9.648505349811573e-06, + "loss": 0.5047, + "step": 1668 + }, + { + "epoch": 0.15, + "grad_norm": 11.831991391843383, + "learning_rate": 9.647981304618663e-06, + "loss": 0.8181, + "step": 1669 + }, + { + "epoch": 0.15, + "grad_norm": 9.71739311554245, + "learning_rate": 9.647456883318838e-06, + "loss": 0.9077, + "step": 1670 + }, + { + "epoch": 0.15, + "grad_norm": 5.988754399107101, + "learning_rate": 9.646932085954532e-06, + "loss": 0.8583, + "step": 1671 + }, + { + "epoch": 0.15, + "grad_norm": 14.129972070710293, + "learning_rate": 9.646406912568211e-06, + "loss": 0.8188, + "step": 1672 + }, + { + "epoch": 0.15, + "grad_norm": 8.974631262262857, + "learning_rate": 9.645881363202371e-06, + "loss": 0.7797, + "step": 1673 + }, + { + "epoch": 0.15, + "grad_norm": 6.988669753285932, + "learning_rate": 9.645355437899541e-06, + "loss": 0.9337, + "step": 1674 + }, + { + "epoch": 0.15, + "grad_norm": 16.996174724766473, + "learning_rate": 9.644829136702275e-06, + "loss": 0.84, + "step": 1675 + }, + { + "epoch": 0.15, + "grad_norm": 14.550377631018522, + "learning_rate": 9.644302459653161e-06, + "loss": 0.9085, + "step": 1676 + }, + { + "epoch": 0.15, + "grad_norm": 11.699981004113505, + "learning_rate": 9.643775406794819e-06, + "loss": 0.7458, + "step": 1677 + }, + { + "epoch": 0.15, + "grad_norm": 2.754370546929426, + "learning_rate": 9.643247978169895e-06, + "loss": 0.5341, + "step": 1678 + }, + { + "epoch": 0.15, + "grad_norm": 16.994456069643384, + "learning_rate": 9.642720173821067e-06, + "loss": 0.8289, + "step": 1679 + }, + { + "epoch": 0.15, + "grad_norm": 2.8834584289129097, + "learning_rate": 9.642191993791047e-06, + "loss": 0.5556, + "step": 1680 + }, + { + "epoch": 0.15, + "grad_norm": 8.770274659309683, + "learning_rate": 9.641663438122573e-06, + "loss": 0.7908, + "step": 1681 + }, + { + "epoch": 0.15, + "grad_norm": 8.539759215159425, + "learning_rate": 9.641134506858413e-06, + "loss": 0.8228, + "step": 1682 + }, + { + "epoch": 0.15, + "grad_norm": 10.289156282073824, + "learning_rate": 9.64060520004137e-06, + "loss": 0.9602, + "step": 1683 + }, + { + "epoch": 0.15, + "grad_norm": 13.185961402368779, + "learning_rate": 9.640075517714272e-06, + "loss": 1.038, + "step": 1684 + }, + { + "epoch": 0.15, + "grad_norm": 9.299308497705093, + "learning_rate": 9.639545459919984e-06, + "loss": 0.8548, + "step": 1685 + }, + { + "epoch": 0.15, + "grad_norm": 10.002825683802303, + "learning_rate": 
9.639015026701392e-06, + "loss": 0.8081, + "step": 1686 + }, + { + "epoch": 0.15, + "grad_norm": 27.585440919262957, + "learning_rate": 9.638484218101423e-06, + "loss": 0.767, + "step": 1687 + }, + { + "epoch": 0.15, + "grad_norm": 13.906646657389548, + "learning_rate": 9.637953034163025e-06, + "loss": 0.8609, + "step": 1688 + }, + { + "epoch": 0.15, + "grad_norm": 17.332821877068195, + "learning_rate": 9.637421474929182e-06, + "loss": 0.8168, + "step": 1689 + }, + { + "epoch": 0.15, + "grad_norm": 7.777475088480843, + "learning_rate": 9.63688954044291e-06, + "loss": 0.9437, + "step": 1690 + }, + { + "epoch": 0.15, + "grad_norm": 8.199464313831411, + "learning_rate": 9.636357230747247e-06, + "loss": 0.8936, + "step": 1691 + }, + { + "epoch": 0.15, + "grad_norm": 9.236073689144252, + "learning_rate": 9.635824545885271e-06, + "loss": 0.8736, + "step": 1692 + }, + { + "epoch": 0.15, + "grad_norm": 7.375332423806438, + "learning_rate": 9.635291485900082e-06, + "loss": 0.8169, + "step": 1693 + }, + { + "epoch": 0.15, + "grad_norm": 10.720636196949965, + "learning_rate": 9.634758050834818e-06, + "loss": 0.9901, + "step": 1694 + }, + { + "epoch": 0.15, + "grad_norm": 3.2866510017811965, + "learning_rate": 9.634224240732641e-06, + "loss": 0.5928, + "step": 1695 + }, + { + "epoch": 0.15, + "grad_norm": 2.687156631164978, + "learning_rate": 9.633690055636746e-06, + "loss": 0.5237, + "step": 1696 + }, + { + "epoch": 0.15, + "grad_norm": 11.775278517039002, + "learning_rate": 9.633155495590361e-06, + "loss": 0.7579, + "step": 1697 + }, + { + "epoch": 0.15, + "grad_norm": 9.706260164704817, + "learning_rate": 9.63262056063674e-06, + "loss": 0.8456, + "step": 1698 + }, + { + "epoch": 0.15, + "grad_norm": 8.842513394893198, + "learning_rate": 9.632085250819169e-06, + "loss": 0.8613, + "step": 1699 + }, + { + "epoch": 0.15, + "grad_norm": 10.868245572269531, + "learning_rate": 9.631549566180966e-06, + "loss": 1.0385, + "step": 1700 + }, + { + "epoch": 0.15, + "grad_norm": 23.234397264711067, + "learning_rate": 9.631013506765474e-06, + "loss": 0.6977, + "step": 1701 + }, + { + "epoch": 0.15, + "grad_norm": 7.298705434968005, + "learning_rate": 9.630477072616073e-06, + "loss": 0.8812, + "step": 1702 + }, + { + "epoch": 0.15, + "grad_norm": 2.6390607526495917, + "learning_rate": 9.62994026377617e-06, + "loss": 0.4528, + "step": 1703 + }, + { + "epoch": 0.15, + "grad_norm": 8.93136335757573, + "learning_rate": 9.629403080289204e-06, + "loss": 0.7663, + "step": 1704 + }, + { + "epoch": 0.15, + "grad_norm": 8.6307589662779, + "learning_rate": 9.628865522198639e-06, + "loss": 0.8809, + "step": 1705 + }, + { + "epoch": 0.15, + "grad_norm": 8.267012538490238, + "learning_rate": 9.628327589547977e-06, + "loss": 0.9089, + "step": 1706 + }, + { + "epoch": 0.15, + "grad_norm": 8.433384105767846, + "learning_rate": 9.627789282380743e-06, + "loss": 0.8389, + "step": 1707 + }, + { + "epoch": 0.15, + "grad_norm": 8.676785539535826, + "learning_rate": 9.627250600740501e-06, + "loss": 0.9591, + "step": 1708 + }, + { + "epoch": 0.15, + "grad_norm": 12.804496180487483, + "learning_rate": 9.626711544670837e-06, + "loss": 0.9829, + "step": 1709 + }, + { + "epoch": 0.15, + "grad_norm": 8.57688357987847, + "learning_rate": 9.62617211421537e-06, + "loss": 0.7885, + "step": 1710 + }, + { + "epoch": 0.15, + "grad_norm": 10.147707258110403, + "learning_rate": 9.625632309417752e-06, + "loss": 0.958, + "step": 1711 + }, + { + "epoch": 0.15, + "grad_norm": 10.776954627400542, + "learning_rate": 9.62509213032166e-06, + "loss": 0.8233, + 
"step": 1712 + }, + { + "epoch": 0.15, + "grad_norm": 9.16996165907354, + "learning_rate": 9.624551576970807e-06, + "loss": 0.9162, + "step": 1713 + }, + { + "epoch": 0.15, + "grad_norm": 10.251716034880051, + "learning_rate": 9.624010649408933e-06, + "loss": 0.8422, + "step": 1714 + }, + { + "epoch": 0.15, + "grad_norm": 8.162348340306007, + "learning_rate": 9.623469347679808e-06, + "loss": 0.9052, + "step": 1715 + }, + { + "epoch": 0.15, + "grad_norm": 4.108538398872051, + "learning_rate": 9.622927671827236e-06, + "loss": 0.5436, + "step": 1716 + }, + { + "epoch": 0.15, + "grad_norm": 3.503200430296931, + "learning_rate": 9.622385621895046e-06, + "loss": 0.6533, + "step": 1717 + }, + { + "epoch": 0.15, + "grad_norm": 9.398566905990513, + "learning_rate": 9.621843197927101e-06, + "loss": 0.9872, + "step": 1718 + }, + { + "epoch": 0.15, + "grad_norm": 7.104262096737294, + "learning_rate": 9.621300399967291e-06, + "loss": 0.8926, + "step": 1719 + }, + { + "epoch": 0.15, + "grad_norm": 8.17604088217697, + "learning_rate": 9.62075722805954e-06, + "loss": 0.8761, + "step": 1720 + }, + { + "epoch": 0.15, + "grad_norm": 31.689162324439636, + "learning_rate": 9.6202136822478e-06, + "loss": 0.9709, + "step": 1721 + }, + { + "epoch": 0.15, + "grad_norm": 14.007764001875355, + "learning_rate": 9.619669762576057e-06, + "loss": 0.9258, + "step": 1722 + }, + { + "epoch": 0.15, + "grad_norm": 12.000445152501694, + "learning_rate": 9.61912546908832e-06, + "loss": 0.9396, + "step": 1723 + }, + { + "epoch": 0.15, + "grad_norm": 9.284115729529724, + "learning_rate": 9.618580801828632e-06, + "loss": 0.8309, + "step": 1724 + }, + { + "epoch": 0.15, + "grad_norm": 9.92868533076868, + "learning_rate": 9.61803576084107e-06, + "loss": 0.7039, + "step": 1725 + }, + { + "epoch": 0.15, + "grad_norm": 10.812845074605832, + "learning_rate": 9.617490346169733e-06, + "loss": 0.8363, + "step": 1726 + }, + { + "epoch": 0.15, + "grad_norm": 11.784133450382006, + "learning_rate": 9.61694455785876e-06, + "loss": 0.9302, + "step": 1727 + }, + { + "epoch": 0.15, + "grad_norm": 9.277474120280287, + "learning_rate": 9.616398395952313e-06, + "loss": 0.9076, + "step": 1728 + }, + { + "epoch": 0.15, + "grad_norm": 7.067480271235322, + "learning_rate": 9.615851860494588e-06, + "loss": 0.7949, + "step": 1729 + }, + { + "epoch": 0.15, + "grad_norm": 5.418059165995625, + "learning_rate": 9.615304951529806e-06, + "loss": 0.5047, + "step": 1730 + }, + { + "epoch": 0.15, + "grad_norm": 9.45830285560476, + "learning_rate": 9.614757669102225e-06, + "loss": 0.8225, + "step": 1731 + }, + { + "epoch": 0.15, + "grad_norm": 6.547358081965311, + "learning_rate": 9.614210013256131e-06, + "loss": 0.7288, + "step": 1732 + }, + { + "epoch": 0.15, + "grad_norm": 7.4970657105130964, + "learning_rate": 9.613661984035838e-06, + "loss": 0.8926, + "step": 1733 + }, + { + "epoch": 0.15, + "grad_norm": 16.86097548876894, + "learning_rate": 9.61311358148569e-06, + "loss": 0.7844, + "step": 1734 + }, + { + "epoch": 0.15, + "grad_norm": 13.949371980461954, + "learning_rate": 9.612564805650068e-06, + "loss": 0.8531, + "step": 1735 + }, + { + "epoch": 0.15, + "grad_norm": 7.3201357461783685, + "learning_rate": 9.612015656573372e-06, + "loss": 0.7444, + "step": 1736 + }, + { + "epoch": 0.15, + "grad_norm": 9.684815164595426, + "learning_rate": 9.61146613430004e-06, + "loss": 0.856, + "step": 1737 + }, + { + "epoch": 0.15, + "grad_norm": 11.614193416015622, + "learning_rate": 9.610916238874542e-06, + "loss": 0.8968, + "step": 1738 + }, + { + "epoch": 0.15, + 
"grad_norm": 6.824277978292564, + "learning_rate": 9.610365970341369e-06, + "loss": 1.035, + "step": 1739 + }, + { + "epoch": 0.15, + "grad_norm": 8.093965992452114, + "learning_rate": 9.609815328745052e-06, + "loss": 0.868, + "step": 1740 + }, + { + "epoch": 0.15, + "grad_norm": 7.321570670632434, + "learning_rate": 9.609264314130147e-06, + "loss": 0.8373, + "step": 1741 + }, + { + "epoch": 0.15, + "grad_norm": 11.801392423823161, + "learning_rate": 9.60871292654124e-06, + "loss": 0.8354, + "step": 1742 + }, + { + "epoch": 0.15, + "grad_norm": 20.068780368603594, + "learning_rate": 9.60816116602295e-06, + "loss": 0.8829, + "step": 1743 + }, + { + "epoch": 0.15, + "grad_norm": 7.408528904628242, + "learning_rate": 9.607609032619922e-06, + "loss": 0.7587, + "step": 1744 + }, + { + "epoch": 0.15, + "grad_norm": 8.665747696358915, + "learning_rate": 9.607056526376837e-06, + "loss": 0.8626, + "step": 1745 + }, + { + "epoch": 0.15, + "grad_norm": 8.133612690926096, + "learning_rate": 9.606503647338401e-06, + "loss": 0.9591, + "step": 1746 + }, + { + "epoch": 0.15, + "grad_norm": 9.957540567846271, + "learning_rate": 9.605950395549352e-06, + "loss": 0.9259, + "step": 1747 + }, + { + "epoch": 0.15, + "grad_norm": 11.255198429023821, + "learning_rate": 9.60539677105446e-06, + "loss": 0.8611, + "step": 1748 + }, + { + "epoch": 0.15, + "grad_norm": 10.187882284493591, + "learning_rate": 9.60484277389852e-06, + "loss": 0.954, + "step": 1749 + }, + { + "epoch": 0.15, + "grad_norm": 8.718811198554345, + "learning_rate": 9.604288404126362e-06, + "loss": 0.9877, + "step": 1750 + }, + { + "epoch": 0.15, + "grad_norm": 9.12394389884533, + "learning_rate": 9.603733661782848e-06, + "loss": 0.6985, + "step": 1751 + }, + { + "epoch": 0.15, + "grad_norm": 14.841137727377287, + "learning_rate": 9.603178546912863e-06, + "loss": 0.8784, + "step": 1752 + }, + { + "epoch": 0.15, + "grad_norm": 12.102145703930038, + "learning_rate": 9.602623059561327e-06, + "loss": 1.0167, + "step": 1753 + }, + { + "epoch": 0.15, + "grad_norm": 9.237267904041882, + "learning_rate": 9.602067199773189e-06, + "loss": 0.8488, + "step": 1754 + }, + { + "epoch": 0.15, + "grad_norm": 8.36232182635563, + "learning_rate": 9.601510967593425e-06, + "loss": 0.8777, + "step": 1755 + }, + { + "epoch": 0.15, + "grad_norm": 2.2375148666992866, + "learning_rate": 9.600954363067052e-06, + "loss": 0.4636, + "step": 1756 + }, + { + "epoch": 0.15, + "grad_norm": 14.02525162470183, + "learning_rate": 9.600397386239105e-06, + "loss": 0.8543, + "step": 1757 + }, + { + "epoch": 0.15, + "grad_norm": 10.536233028994475, + "learning_rate": 9.599840037154652e-06, + "loss": 0.7898, + "step": 1758 + }, + { + "epoch": 0.15, + "grad_norm": 2.6206179480468323, + "learning_rate": 9.599282315858795e-06, + "loss": 0.494, + "step": 1759 + }, + { + "epoch": 0.15, + "grad_norm": 10.639837996674487, + "learning_rate": 9.598724222396665e-06, + "loss": 0.7104, + "step": 1760 + }, + { + "epoch": 0.15, + "grad_norm": 10.52469823017753, + "learning_rate": 9.598165756813418e-06, + "loss": 0.808, + "step": 1761 + }, + { + "epoch": 0.15, + "grad_norm": 13.434629796831222, + "learning_rate": 9.59760691915425e-06, + "loss": 0.7385, + "step": 1762 + }, + { + "epoch": 0.15, + "grad_norm": 13.74587342406383, + "learning_rate": 9.597047709464375e-06, + "loss": 0.8732, + "step": 1763 + }, + { + "epoch": 0.15, + "grad_norm": 3.1400720525979624, + "learning_rate": 9.596488127789048e-06, + "loss": 0.4992, + "step": 1764 + }, + { + "epoch": 0.16, + "grad_norm": 2.919302124198621, + 
"learning_rate": 9.595928174173546e-06, + "loss": 0.4556, + "step": 1765 + }, + { + "epoch": 0.16, + "grad_norm": 81.46724264939431, + "learning_rate": 9.595367848663182e-06, + "loss": 0.8579, + "step": 1766 + }, + { + "epoch": 0.16, + "grad_norm": 10.293448477245244, + "learning_rate": 9.594807151303294e-06, + "loss": 0.8772, + "step": 1767 + }, + { + "epoch": 0.16, + "grad_norm": 7.108826945840187, + "learning_rate": 9.594246082139256e-06, + "loss": 0.8264, + "step": 1768 + }, + { + "epoch": 0.16, + "grad_norm": 9.997405147054053, + "learning_rate": 9.593684641216468e-06, + "loss": 0.7296, + "step": 1769 + }, + { + "epoch": 0.16, + "grad_norm": 11.38202201995398, + "learning_rate": 9.59312282858036e-06, + "loss": 0.8381, + "step": 1770 + }, + { + "epoch": 0.16, + "grad_norm": 2.341223054751116, + "learning_rate": 9.59256064427639e-06, + "loss": 0.438, + "step": 1771 + }, + { + "epoch": 0.16, + "grad_norm": 7.149175667179453, + "learning_rate": 9.591998088350055e-06, + "loss": 0.8214, + "step": 1772 + }, + { + "epoch": 0.16, + "grad_norm": 13.816676573192636, + "learning_rate": 9.591435160846871e-06, + "loss": 0.8451, + "step": 1773 + }, + { + "epoch": 0.16, + "grad_norm": 9.616367819080494, + "learning_rate": 9.590871861812392e-06, + "loss": 0.8203, + "step": 1774 + }, + { + "epoch": 0.16, + "grad_norm": 12.387124270477251, + "learning_rate": 9.590308191292198e-06, + "loss": 0.7041, + "step": 1775 + }, + { + "epoch": 0.16, + "grad_norm": 16.67170150932227, + "learning_rate": 9.589744149331903e-06, + "loss": 0.8286, + "step": 1776 + }, + { + "epoch": 0.16, + "grad_norm": 7.261486511566004, + "learning_rate": 9.589179735977145e-06, + "loss": 0.808, + "step": 1777 + }, + { + "epoch": 0.16, + "grad_norm": 13.37977029689356, + "learning_rate": 9.588614951273596e-06, + "loss": 1.0329, + "step": 1778 + }, + { + "epoch": 0.16, + "grad_norm": 7.666485438121056, + "learning_rate": 9.588049795266957e-06, + "loss": 0.9144, + "step": 1779 + }, + { + "epoch": 0.16, + "grad_norm": 12.36228727395337, + "learning_rate": 9.587484268002961e-06, + "loss": 1.062, + "step": 1780 + }, + { + "epoch": 0.16, + "grad_norm": 10.972744725398474, + "learning_rate": 9.58691836952737e-06, + "loss": 0.7837, + "step": 1781 + }, + { + "epoch": 0.16, + "grad_norm": 10.109973798745335, + "learning_rate": 9.586352099885973e-06, + "loss": 0.9881, + "step": 1782 + }, + { + "epoch": 0.16, + "grad_norm": 7.72923842660645, + "learning_rate": 9.585785459124595e-06, + "loss": 0.8636, + "step": 1783 + }, + { + "epoch": 0.16, + "grad_norm": 3.086308648499624, + "learning_rate": 9.585218447289084e-06, + "loss": 0.4938, + "step": 1784 + }, + { + "epoch": 0.16, + "grad_norm": 9.230025005919961, + "learning_rate": 9.584651064425323e-06, + "loss": 0.8518, + "step": 1785 + }, + { + "epoch": 0.16, + "grad_norm": 17.032460151431238, + "learning_rate": 9.584083310579227e-06, + "loss": 0.768, + "step": 1786 + }, + { + "epoch": 0.16, + "grad_norm": 12.534754142461965, + "learning_rate": 9.583515185796733e-06, + "loss": 1.0429, + "step": 1787 + }, + { + "epoch": 0.16, + "grad_norm": 9.333917314497858, + "learning_rate": 9.582946690123814e-06, + "loss": 0.9676, + "step": 1788 + }, + { + "epoch": 0.16, + "grad_norm": 8.968346393687916, + "learning_rate": 9.582377823606471e-06, + "loss": 0.8072, + "step": 1789 + }, + { + "epoch": 0.16, + "grad_norm": 9.745769971672056, + "learning_rate": 9.581808586290738e-06, + "loss": 0.7564, + "step": 1790 + }, + { + "epoch": 0.16, + "grad_norm": 2.905801793581582, + "learning_rate": 9.581238978222675e-06, + 
"loss": 0.5056, + "step": 1791 + }, + { + "epoch": 0.16, + "grad_norm": 11.93447969210894, + "learning_rate": 9.580668999448376e-06, + "loss": 0.9384, + "step": 1792 + }, + { + "epoch": 0.16, + "grad_norm": 19.211446861234712, + "learning_rate": 9.58009865001396e-06, + "loss": 0.8479, + "step": 1793 + }, + { + "epoch": 0.16, + "grad_norm": 2.8836057706539635, + "learning_rate": 9.579527929965581e-06, + "loss": 0.4617, + "step": 1794 + }, + { + "epoch": 0.16, + "grad_norm": 9.490020850200432, + "learning_rate": 9.578956839349419e-06, + "loss": 0.993, + "step": 1795 + }, + { + "epoch": 0.16, + "grad_norm": 14.790349871680464, + "learning_rate": 9.578385378211685e-06, + "loss": 0.871, + "step": 1796 + }, + { + "epoch": 0.16, + "grad_norm": 16.20938997144054, + "learning_rate": 9.577813546598625e-06, + "loss": 0.9862, + "step": 1797 + }, + { + "epoch": 0.16, + "grad_norm": 11.043030926649259, + "learning_rate": 9.577241344556505e-06, + "loss": 0.7608, + "step": 1798 + }, + { + "epoch": 0.16, + "grad_norm": 10.496055052428467, + "learning_rate": 9.57666877213163e-06, + "loss": 0.8111, + "step": 1799 + }, + { + "epoch": 0.16, + "grad_norm": 14.099466501112897, + "learning_rate": 9.576095829370331e-06, + "loss": 0.8835, + "step": 1800 + }, + { + "epoch": 0.16, + "grad_norm": 10.974009151074975, + "learning_rate": 9.575522516318969e-06, + "loss": 0.9564, + "step": 1801 + }, + { + "epoch": 0.16, + "grad_norm": 11.805238811186543, + "learning_rate": 9.574948833023935e-06, + "loss": 1.0033, + "step": 1802 + }, + { + "epoch": 0.16, + "grad_norm": 2.8308204253266975, + "learning_rate": 9.574374779531653e-06, + "loss": 0.5025, + "step": 1803 + }, + { + "epoch": 0.16, + "grad_norm": 8.268819914913538, + "learning_rate": 9.573800355888573e-06, + "loss": 0.7705, + "step": 1804 + }, + { + "epoch": 0.16, + "grad_norm": 12.970531311117652, + "learning_rate": 9.573225562141174e-06, + "loss": 0.9466, + "step": 1805 + }, + { + "epoch": 0.16, + "grad_norm": 7.734560058479369, + "learning_rate": 9.572650398335974e-06, + "loss": 0.8333, + "step": 1806 + }, + { + "epoch": 0.16, + "grad_norm": 8.99058869724867, + "learning_rate": 9.572074864519507e-06, + "loss": 0.7476, + "step": 1807 + }, + { + "epoch": 0.16, + "grad_norm": 20.009893176527534, + "learning_rate": 9.571498960738348e-06, + "loss": 0.9347, + "step": 1808 + }, + { + "epoch": 0.16, + "grad_norm": 8.590852413051856, + "learning_rate": 9.570922687039096e-06, + "loss": 0.8509, + "step": 1809 + }, + { + "epoch": 0.16, + "grad_norm": 2.2499834276396773, + "learning_rate": 9.570346043468385e-06, + "loss": 0.4652, + "step": 1810 + }, + { + "epoch": 0.16, + "grad_norm": 10.093901047345904, + "learning_rate": 9.569769030072874e-06, + "loss": 0.7087, + "step": 1811 + }, + { + "epoch": 0.16, + "grad_norm": 13.03534252963153, + "learning_rate": 9.569191646899255e-06, + "loss": 0.942, + "step": 1812 + }, + { + "epoch": 0.16, + "grad_norm": 13.143957395952047, + "learning_rate": 9.568613893994247e-06, + "loss": 0.8024, + "step": 1813 + }, + { + "epoch": 0.16, + "grad_norm": 8.895792902426253, + "learning_rate": 9.568035771404604e-06, + "loss": 0.8527, + "step": 1814 + }, + { + "epoch": 0.16, + "grad_norm": 16.96494786158479, + "learning_rate": 9.567457279177104e-06, + "loss": 0.9195, + "step": 1815 + }, + { + "epoch": 0.16, + "grad_norm": 18.12722460890916, + "learning_rate": 9.566878417358559e-06, + "loss": 0.8552, + "step": 1816 + }, + { + "epoch": 0.16, + "grad_norm": 11.8445229292053, + "learning_rate": 9.566299185995808e-06, + "loss": 0.9846, + "step": 1817 + }, + 
{ + "epoch": 0.16, + "grad_norm": 10.770161194161986, + "learning_rate": 9.565719585135724e-06, + "loss": 0.8089, + "step": 1818 + }, + { + "epoch": 0.16, + "grad_norm": 9.7935801636221, + "learning_rate": 9.565139614825205e-06, + "loss": 0.9211, + "step": 1819 + }, + { + "epoch": 0.16, + "grad_norm": 10.434125801070207, + "learning_rate": 9.564559275111183e-06, + "loss": 0.8947, + "step": 1820 + }, + { + "epoch": 0.16, + "grad_norm": 10.138509987150938, + "learning_rate": 9.563978566040618e-06, + "loss": 0.858, + "step": 1821 + }, + { + "epoch": 0.16, + "grad_norm": 12.972399294791025, + "learning_rate": 9.563397487660496e-06, + "loss": 0.9575, + "step": 1822 + }, + { + "epoch": 0.16, + "grad_norm": 3.146835295491101, + "learning_rate": 9.562816040017841e-06, + "loss": 0.5137, + "step": 1823 + }, + { + "epoch": 0.16, + "grad_norm": 20.6245464658513, + "learning_rate": 9.562234223159703e-06, + "loss": 0.935, + "step": 1824 + }, + { + "epoch": 0.16, + "grad_norm": 3.559266023949258, + "learning_rate": 9.56165203713316e-06, + "loss": 0.515, + "step": 1825 + }, + { + "epoch": 0.16, + "grad_norm": 10.351177210898223, + "learning_rate": 9.561069481985324e-06, + "loss": 0.7916, + "step": 1826 + }, + { + "epoch": 0.16, + "grad_norm": 8.671642817109252, + "learning_rate": 9.56048655776333e-06, + "loss": 0.7723, + "step": 1827 + }, + { + "epoch": 0.16, + "grad_norm": 14.393892898984024, + "learning_rate": 9.55990326451435e-06, + "loss": 0.9873, + "step": 1828 + }, + { + "epoch": 0.16, + "grad_norm": 10.887746344699556, + "learning_rate": 9.559319602285585e-06, + "loss": 0.75, + "step": 1829 + }, + { + "epoch": 0.16, + "grad_norm": 8.096863860260155, + "learning_rate": 9.55873557112426e-06, + "loss": 0.7831, + "step": 1830 + }, + { + "epoch": 0.16, + "grad_norm": 9.2056021603441, + "learning_rate": 9.558151171077635e-06, + "loss": 0.8652, + "step": 1831 + }, + { + "epoch": 0.16, + "grad_norm": 15.039880435286845, + "learning_rate": 9.557566402193e-06, + "loss": 0.9497, + "step": 1832 + }, + { + "epoch": 0.16, + "grad_norm": 8.89561637062633, + "learning_rate": 9.556981264517673e-06, + "loss": 0.9047, + "step": 1833 + }, + { + "epoch": 0.16, + "grad_norm": 2.8890819886452577, + "learning_rate": 9.556395758099003e-06, + "loss": 0.4836, + "step": 1834 + }, + { + "epoch": 0.16, + "grad_norm": 9.352458636549741, + "learning_rate": 9.555809882984367e-06, + "loss": 1.0113, + "step": 1835 + }, + { + "epoch": 0.16, + "grad_norm": 7.6528583580606835, + "learning_rate": 9.555223639221172e-06, + "loss": 0.9856, + "step": 1836 + }, + { + "epoch": 0.16, + "grad_norm": 11.004711801855619, + "learning_rate": 9.55463702685686e-06, + "loss": 0.7946, + "step": 1837 + }, + { + "epoch": 0.16, + "grad_norm": 12.393967313108943, + "learning_rate": 9.554050045938893e-06, + "loss": 0.8588, + "step": 1838 + }, + { + "epoch": 0.16, + "grad_norm": 8.26685931543141, + "learning_rate": 9.553462696514774e-06, + "loss": 0.8153, + "step": 1839 + }, + { + "epoch": 0.16, + "grad_norm": 8.513859922933545, + "learning_rate": 9.552874978632028e-06, + "loss": 0.8419, + "step": 1840 + }, + { + "epoch": 0.16, + "grad_norm": 8.052707773469379, + "learning_rate": 9.552286892338209e-06, + "loss": 0.8662, + "step": 1841 + }, + { + "epoch": 0.16, + "grad_norm": 11.7715787952574, + "learning_rate": 9.551698437680909e-06, + "loss": 0.8468, + "step": 1842 + }, + { + "epoch": 0.16, + "grad_norm": 10.229239568164866, + "learning_rate": 9.551109614707743e-06, + "loss": 0.6733, + "step": 1843 + }, + { + "epoch": 0.16, + "grad_norm": 
12.625783151669577, + "learning_rate": 9.550520423466359e-06, + "loss": 0.8197, + "step": 1844 + }, + { + "epoch": 0.16, + "grad_norm": 10.233415023523131, + "learning_rate": 9.549930864004428e-06, + "loss": 0.9539, + "step": 1845 + }, + { + "epoch": 0.16, + "grad_norm": 7.831101883101232, + "learning_rate": 9.549340936369663e-06, + "loss": 0.7105, + "step": 1846 + }, + { + "epoch": 0.16, + "grad_norm": 12.582989347519181, + "learning_rate": 9.548750640609796e-06, + "loss": 0.8774, + "step": 1847 + }, + { + "epoch": 0.16, + "grad_norm": 11.469072771182438, + "learning_rate": 9.548159976772593e-06, + "loss": 0.8349, + "step": 1848 + }, + { + "epoch": 0.16, + "grad_norm": 14.677665175400714, + "learning_rate": 9.54756894490585e-06, + "loss": 1.0353, + "step": 1849 + }, + { + "epoch": 0.16, + "grad_norm": 14.183688188373834, + "learning_rate": 9.546977545057393e-06, + "loss": 1.0415, + "step": 1850 + }, + { + "epoch": 0.16, + "grad_norm": 19.791518618109425, + "learning_rate": 9.546385777275078e-06, + "loss": 0.847, + "step": 1851 + }, + { + "epoch": 0.16, + "grad_norm": 9.702828447744917, + "learning_rate": 9.545793641606787e-06, + "loss": 0.7217, + "step": 1852 + }, + { + "epoch": 0.16, + "grad_norm": 8.613780727439693, + "learning_rate": 9.545201138100435e-06, + "loss": 0.8038, + "step": 1853 + }, + { + "epoch": 0.16, + "grad_norm": 8.202624578929269, + "learning_rate": 9.544608266803967e-06, + "loss": 0.8548, + "step": 1854 + }, + { + "epoch": 0.16, + "grad_norm": 10.114122495704558, + "learning_rate": 9.54401502776536e-06, + "loss": 0.7696, + "step": 1855 + }, + { + "epoch": 0.16, + "grad_norm": 12.41497250178447, + "learning_rate": 9.543421421032615e-06, + "loss": 0.8748, + "step": 1856 + }, + { + "epoch": 0.16, + "grad_norm": 12.09852274533471, + "learning_rate": 9.542827446653766e-06, + "loss": 1.0266, + "step": 1857 + }, + { + "epoch": 0.16, + "grad_norm": 9.715556694109788, + "learning_rate": 9.542233104676877e-06, + "loss": 0.728, + "step": 1858 + }, + { + "epoch": 0.16, + "grad_norm": 8.669484704167107, + "learning_rate": 9.541638395150039e-06, + "loss": 0.7401, + "step": 1859 + }, + { + "epoch": 0.16, + "grad_norm": 12.047335994121996, + "learning_rate": 9.541043318121379e-06, + "loss": 0.9442, + "step": 1860 + }, + { + "epoch": 0.16, + "grad_norm": 10.660044587960575, + "learning_rate": 9.540447873639046e-06, + "loss": 0.8555, + "step": 1861 + }, + { + "epoch": 0.16, + "grad_norm": 14.155921710679726, + "learning_rate": 9.539852061751223e-06, + "loss": 0.9293, + "step": 1862 + }, + { + "epoch": 0.16, + "grad_norm": 9.702930440263938, + "learning_rate": 9.539255882506124e-06, + "loss": 0.9422, + "step": 1863 + }, + { + "epoch": 0.16, + "grad_norm": 6.5877940076357895, + "learning_rate": 9.53865933595199e-06, + "loss": 0.8517, + "step": 1864 + }, + { + "epoch": 0.16, + "grad_norm": 9.164616867274217, + "learning_rate": 9.538062422137091e-06, + "loss": 0.8724, + "step": 1865 + }, + { + "epoch": 0.16, + "grad_norm": 11.448670076032842, + "learning_rate": 9.53746514110973e-06, + "loss": 0.9143, + "step": 1866 + }, + { + "epoch": 0.16, + "grad_norm": 4.045447372011019, + "learning_rate": 9.536867492918238e-06, + "loss": 0.5851, + "step": 1867 + }, + { + "epoch": 0.16, + "grad_norm": 3.675503161986628, + "learning_rate": 9.536269477610975e-06, + "loss": 0.5883, + "step": 1868 + }, + { + "epoch": 0.16, + "grad_norm": 7.186689327837782, + "learning_rate": 9.535671095236332e-06, + "loss": 0.8246, + "step": 1869 + }, + { + "epoch": 0.16, + "grad_norm": 2.9556334261861164, + 
"learning_rate": 9.535072345842727e-06, + "loss": 0.5878, + "step": 1870 + }, + { + "epoch": 0.16, + "grad_norm": 9.210639285067042, + "learning_rate": 9.534473229478613e-06, + "loss": 0.747, + "step": 1871 + }, + { + "epoch": 0.16, + "grad_norm": 10.134175761261135, + "learning_rate": 9.533873746192468e-06, + "loss": 0.9536, + "step": 1872 + }, + { + "epoch": 0.16, + "grad_norm": 13.684457737338658, + "learning_rate": 9.5332738960328e-06, + "loss": 0.9522, + "step": 1873 + }, + { + "epoch": 0.16, + "grad_norm": 13.28198811142954, + "learning_rate": 9.53267367904815e-06, + "loss": 0.963, + "step": 1874 + }, + { + "epoch": 0.16, + "grad_norm": 11.858634703014166, + "learning_rate": 9.532073095287086e-06, + "loss": 0.9567, + "step": 1875 + }, + { + "epoch": 0.16, + "grad_norm": 7.056665574647344, + "learning_rate": 9.531472144798204e-06, + "loss": 0.9192, + "step": 1876 + }, + { + "epoch": 0.16, + "grad_norm": 9.619790445993605, + "learning_rate": 9.530870827630135e-06, + "loss": 0.8121, + "step": 1877 + }, + { + "epoch": 0.16, + "grad_norm": 10.866222473417832, + "learning_rate": 9.530269143831537e-06, + "loss": 0.8804, + "step": 1878 + }, + { + "epoch": 0.17, + "grad_norm": 7.214362746299606, + "learning_rate": 9.529667093451093e-06, + "loss": 0.819, + "step": 1879 + }, + { + "epoch": 0.17, + "grad_norm": 10.150744251587275, + "learning_rate": 9.529064676537522e-06, + "loss": 0.8607, + "step": 1880 + }, + { + "epoch": 0.17, + "grad_norm": 11.704545516892317, + "learning_rate": 9.528461893139573e-06, + "loss": 0.8026, + "step": 1881 + }, + { + "epoch": 0.17, + "grad_norm": 7.590844165431582, + "learning_rate": 9.52785874330602e-06, + "loss": 0.9133, + "step": 1882 + }, + { + "epoch": 0.17, + "grad_norm": 15.234878131570172, + "learning_rate": 9.52725522708567e-06, + "loss": 0.9565, + "step": 1883 + }, + { + "epoch": 0.17, + "grad_norm": 19.195162125202565, + "learning_rate": 9.526651344527355e-06, + "loss": 0.8209, + "step": 1884 + }, + { + "epoch": 0.17, + "grad_norm": 9.580472876879565, + "learning_rate": 9.526047095679946e-06, + "loss": 0.8769, + "step": 1885 + }, + { + "epoch": 0.17, + "grad_norm": 9.038856096192978, + "learning_rate": 9.525442480592332e-06, + "loss": 1.0635, + "step": 1886 + }, + { + "epoch": 0.17, + "grad_norm": 6.742735201400987, + "learning_rate": 9.524837499313442e-06, + "loss": 0.99, + "step": 1887 + }, + { + "epoch": 0.17, + "grad_norm": 3.6959083606700753, + "learning_rate": 9.524232151892227e-06, + "loss": 0.4942, + "step": 1888 + }, + { + "epoch": 0.17, + "grad_norm": 17.3239343666445, + "learning_rate": 9.523626438377673e-06, + "loss": 0.77, + "step": 1889 + }, + { + "epoch": 0.17, + "grad_norm": 9.962158118505972, + "learning_rate": 9.523020358818793e-06, + "loss": 0.8971, + "step": 1890 + }, + { + "epoch": 0.17, + "grad_norm": 11.25273814899385, + "learning_rate": 9.522413913264628e-06, + "loss": 0.9149, + "step": 1891 + }, + { + "epoch": 0.17, + "grad_norm": 12.318274177450116, + "learning_rate": 9.521807101764251e-06, + "loss": 0.8582, + "step": 1892 + }, + { + "epoch": 0.17, + "grad_norm": 19.759762198737256, + "learning_rate": 9.521199924366766e-06, + "loss": 0.8501, + "step": 1893 + }, + { + "epoch": 0.17, + "grad_norm": 8.405085336471405, + "learning_rate": 9.520592381121304e-06, + "loss": 0.8678, + "step": 1894 + }, + { + "epoch": 0.17, + "grad_norm": 13.064108352111377, + "learning_rate": 9.519984472077027e-06, + "loss": 1.0, + "step": 1895 + }, + { + "epoch": 0.17, + "grad_norm": 8.788931070101931, + "learning_rate": 9.519376197283123e-06, + 
"loss": 0.7792, + "step": 1896 + }, + { + "epoch": 0.17, + "grad_norm": 13.42470590743495, + "learning_rate": 9.518767556788815e-06, + "loss": 0.7315, + "step": 1897 + }, + { + "epoch": 0.17, + "grad_norm": 6.104997620955966, + "learning_rate": 9.518158550643354e-06, + "loss": 0.7234, + "step": 1898 + }, + { + "epoch": 0.17, + "grad_norm": 9.70607318778392, + "learning_rate": 9.517549178896017e-06, + "loss": 0.7727, + "step": 1899 + }, + { + "epoch": 0.17, + "grad_norm": 23.861945727297865, + "learning_rate": 9.516939441596118e-06, + "loss": 0.8721, + "step": 1900 + }, + { + "epoch": 0.17, + "grad_norm": 8.410103379494533, + "learning_rate": 9.51632933879299e-06, + "loss": 0.7704, + "step": 1901 + }, + { + "epoch": 0.17, + "grad_norm": 8.587983207246493, + "learning_rate": 9.515718870536006e-06, + "loss": 0.8972, + "step": 1902 + }, + { + "epoch": 0.17, + "grad_norm": 10.062299154806434, + "learning_rate": 9.515108036874561e-06, + "loss": 1.0089, + "step": 1903 + }, + { + "epoch": 0.17, + "grad_norm": 3.0894142334279993, + "learning_rate": 9.514496837858085e-06, + "loss": 0.5597, + "step": 1904 + }, + { + "epoch": 0.17, + "grad_norm": 17.22450430516105, + "learning_rate": 9.513885273536037e-06, + "loss": 0.8571, + "step": 1905 + }, + { + "epoch": 0.17, + "grad_norm": 8.966596932950912, + "learning_rate": 9.513273343957896e-06, + "loss": 0.8233, + "step": 1906 + }, + { + "epoch": 0.17, + "grad_norm": 13.650003329116592, + "learning_rate": 9.512661049173188e-06, + "loss": 0.8954, + "step": 1907 + }, + { + "epoch": 0.17, + "grad_norm": 12.327304252514605, + "learning_rate": 9.512048389231452e-06, + "loss": 0.9161, + "step": 1908 + }, + { + "epoch": 0.17, + "grad_norm": 9.737116334731748, + "learning_rate": 9.511435364182267e-06, + "loss": 0.8636, + "step": 1909 + }, + { + "epoch": 0.17, + "grad_norm": 8.822540866115718, + "learning_rate": 9.510821974075234e-06, + "loss": 0.801, + "step": 1910 + }, + { + "epoch": 0.17, + "grad_norm": 10.227656841458638, + "learning_rate": 9.510208218959993e-06, + "loss": 0.7818, + "step": 1911 + }, + { + "epoch": 0.17, + "grad_norm": 8.184906925539874, + "learning_rate": 9.509594098886206e-06, + "loss": 0.8128, + "step": 1912 + }, + { + "epoch": 0.17, + "grad_norm": 54.722056364987814, + "learning_rate": 9.508979613903563e-06, + "loss": 0.9089, + "step": 1913 + }, + { + "epoch": 0.17, + "grad_norm": 8.525531363304186, + "learning_rate": 9.508364764061791e-06, + "loss": 0.8015, + "step": 1914 + }, + { + "epoch": 0.17, + "grad_norm": 9.70049683857033, + "learning_rate": 9.507749549410641e-06, + "loss": 0.7773, + "step": 1915 + }, + { + "epoch": 0.17, + "grad_norm": 2.5701639129512768, + "learning_rate": 9.507133969999898e-06, + "loss": 0.4849, + "step": 1916 + }, + { + "epoch": 0.17, + "grad_norm": 21.394487436374007, + "learning_rate": 9.50651802587937e-06, + "loss": 0.8542, + "step": 1917 + }, + { + "epoch": 0.17, + "grad_norm": 24.633770561655087, + "learning_rate": 9.5059017170989e-06, + "loss": 0.9088, + "step": 1918 + }, + { + "epoch": 0.17, + "grad_norm": 7.245296977816, + "learning_rate": 9.505285043708356e-06, + "loss": 0.6847, + "step": 1919 + }, + { + "epoch": 0.17, + "grad_norm": 9.321210798231242, + "learning_rate": 9.504668005757642e-06, + "loss": 0.7507, + "step": 1920 + }, + { + "epoch": 0.17, + "grad_norm": 3.687194599051561, + "learning_rate": 9.504050603296687e-06, + "loss": 0.5708, + "step": 1921 + }, + { + "epoch": 0.17, + "grad_norm": 15.000815594071689, + "learning_rate": 9.503432836375449e-06, + "loss": 0.7603, + "step": 1922 + }, + { + 
"epoch": 0.17, + "grad_norm": 12.973929476872359, + "learning_rate": 9.502814705043916e-06, + "loss": 0.8394, + "step": 1923 + }, + { + "epoch": 0.17, + "grad_norm": 10.77787869726184, + "learning_rate": 9.502196209352109e-06, + "loss": 0.952, + "step": 1924 + }, + { + "epoch": 0.17, + "grad_norm": 10.299458602254798, + "learning_rate": 9.50157734935007e-06, + "loss": 0.7757, + "step": 1925 + }, + { + "epoch": 0.17, + "grad_norm": 17.887092921708415, + "learning_rate": 9.500958125087882e-06, + "loss": 0.8305, + "step": 1926 + }, + { + "epoch": 0.17, + "grad_norm": 2.2692342619431347, + "learning_rate": 9.50033853661565e-06, + "loss": 0.5161, + "step": 1927 + }, + { + "epoch": 0.17, + "grad_norm": 9.931964246327716, + "learning_rate": 9.49971858398351e-06, + "loss": 0.7896, + "step": 1928 + }, + { + "epoch": 0.17, + "grad_norm": 10.26166585131245, + "learning_rate": 9.499098267241627e-06, + "loss": 0.852, + "step": 1929 + }, + { + "epoch": 0.17, + "grad_norm": 11.045298838711663, + "learning_rate": 9.498477586440194e-06, + "loss": 0.8031, + "step": 1930 + }, + { + "epoch": 0.17, + "grad_norm": 12.948288882125262, + "learning_rate": 9.49785654162944e-06, + "loss": 0.7552, + "step": 1931 + }, + { + "epoch": 0.17, + "grad_norm": 11.23625389297248, + "learning_rate": 9.497235132859615e-06, + "loss": 0.8896, + "step": 1932 + }, + { + "epoch": 0.17, + "grad_norm": 8.18379160436524, + "learning_rate": 9.496613360181003e-06, + "loss": 0.9304, + "step": 1933 + }, + { + "epoch": 0.17, + "grad_norm": 13.491748680027534, + "learning_rate": 9.49599122364392e-06, + "loss": 0.751, + "step": 1934 + }, + { + "epoch": 0.17, + "grad_norm": 7.953711121925118, + "learning_rate": 9.495368723298704e-06, + "loss": 0.7883, + "step": 1935 + }, + { + "epoch": 0.17, + "grad_norm": 9.446008842768354, + "learning_rate": 9.49474585919573e-06, + "loss": 0.8425, + "step": 1936 + }, + { + "epoch": 0.17, + "grad_norm": 8.806698680367717, + "learning_rate": 9.494122631385397e-06, + "loss": 0.9184, + "step": 1937 + }, + { + "epoch": 0.17, + "grad_norm": 23.529536612674956, + "learning_rate": 9.493499039918137e-06, + "loss": 0.8869, + "step": 1938 + }, + { + "epoch": 0.17, + "grad_norm": 8.262013943564964, + "learning_rate": 9.492875084844409e-06, + "loss": 0.831, + "step": 1939 + }, + { + "epoch": 0.17, + "grad_norm": 7.258568907033134, + "learning_rate": 9.492250766214702e-06, + "loss": 0.8146, + "step": 1940 + }, + { + "epoch": 0.17, + "grad_norm": 10.49134354197279, + "learning_rate": 9.491626084079535e-06, + "loss": 0.8622, + "step": 1941 + }, + { + "epoch": 0.17, + "grad_norm": 11.667724707542565, + "learning_rate": 9.491001038489457e-06, + "loss": 0.9825, + "step": 1942 + }, + { + "epoch": 0.17, + "grad_norm": 3.1044487484093914, + "learning_rate": 9.490375629495045e-06, + "loss": 0.5722, + "step": 1943 + }, + { + "epoch": 0.17, + "grad_norm": 7.897827898299262, + "learning_rate": 9.489749857146908e-06, + "loss": 0.8769, + "step": 1944 + }, + { + "epoch": 0.17, + "grad_norm": 9.971556010391659, + "learning_rate": 9.489123721495678e-06, + "loss": 0.9314, + "step": 1945 + }, + { + "epoch": 0.17, + "grad_norm": 10.008573249578996, + "learning_rate": 9.488497222592028e-06, + "loss": 0.783, + "step": 1946 + }, + { + "epoch": 0.17, + "grad_norm": 10.957805751412122, + "learning_rate": 9.487870360486647e-06, + "loss": 0.8995, + "step": 1947 + }, + { + "epoch": 0.17, + "grad_norm": 9.757068005214977, + "learning_rate": 9.487243135230259e-06, + "loss": 0.7867, + "step": 1948 + }, + { + "epoch": 0.17, + "grad_norm": 
3.0078796316855234, + "learning_rate": 9.486615546873625e-06, + "loss": 0.4001, + "step": 1949 + }, + { + "epoch": 0.17, + "grad_norm": 17.391232208461993, + "learning_rate": 9.485987595467521e-06, + "loss": 0.9211, + "step": 1950 + }, + { + "epoch": 0.17, + "grad_norm": 15.72584831943996, + "learning_rate": 9.485359281062764e-06, + "loss": 0.8568, + "step": 1951 + }, + { + "epoch": 0.17, + "grad_norm": 13.090142937192159, + "learning_rate": 9.484730603710195e-06, + "loss": 0.9498, + "step": 1952 + }, + { + "epoch": 0.17, + "grad_norm": 9.494675510349033, + "learning_rate": 9.484101563460686e-06, + "loss": 0.8593, + "step": 1953 + }, + { + "epoch": 0.17, + "grad_norm": 9.258509168602737, + "learning_rate": 9.483472160365136e-06, + "loss": 0.8234, + "step": 1954 + }, + { + "epoch": 0.17, + "grad_norm": 12.656415145663546, + "learning_rate": 9.482842394474477e-06, + "loss": 0.7453, + "step": 1955 + }, + { + "epoch": 0.17, + "grad_norm": 11.243602250455304, + "learning_rate": 9.48221226583967e-06, + "loss": 0.8702, + "step": 1956 + }, + { + "epoch": 0.17, + "grad_norm": 3.1174968039414255, + "learning_rate": 9.4815817745117e-06, + "loss": 0.545, + "step": 1957 + }, + { + "epoch": 0.17, + "grad_norm": 3.6659348088576706, + "learning_rate": 9.480950920541588e-06, + "loss": 0.5495, + "step": 1958 + }, + { + "epoch": 0.17, + "grad_norm": 9.588560653121771, + "learning_rate": 9.480319703980382e-06, + "loss": 0.7729, + "step": 1959 + }, + { + "epoch": 0.17, + "grad_norm": 21.514320894221246, + "learning_rate": 9.479688124879159e-06, + "loss": 0.9434, + "step": 1960 + }, + { + "epoch": 0.17, + "grad_norm": 14.113846141106812, + "learning_rate": 9.47905618328902e-06, + "loss": 0.7891, + "step": 1961 + }, + { + "epoch": 0.17, + "grad_norm": 2.3955598393807294, + "learning_rate": 9.478423879261111e-06, + "loss": 0.4627, + "step": 1962 + }, + { + "epoch": 0.17, + "grad_norm": 13.835338916703705, + "learning_rate": 9.477791212846587e-06, + "loss": 0.8927, + "step": 1963 + }, + { + "epoch": 0.17, + "grad_norm": 12.931786526402048, + "learning_rate": 9.47715818409665e-06, + "loss": 0.9734, + "step": 1964 + }, + { + "epoch": 0.17, + "grad_norm": 6.169773534194531, + "learning_rate": 9.476524793062519e-06, + "loss": 0.6889, + "step": 1965 + }, + { + "epoch": 0.17, + "grad_norm": 10.417915358619512, + "learning_rate": 9.475891039795447e-06, + "loss": 0.9363, + "step": 1966 + }, + { + "epoch": 0.17, + "grad_norm": 9.480594920931052, + "learning_rate": 9.475256924346717e-06, + "loss": 0.8805, + "step": 1967 + }, + { + "epoch": 0.17, + "grad_norm": 9.600577934137085, + "learning_rate": 9.47462244676764e-06, + "loss": 1.0079, + "step": 1968 + }, + { + "epoch": 0.17, + "grad_norm": 10.812446258622831, + "learning_rate": 9.473987607109561e-06, + "loss": 0.8846, + "step": 1969 + }, + { + "epoch": 0.17, + "grad_norm": 8.792282000581533, + "learning_rate": 9.473352405423845e-06, + "loss": 0.9704, + "step": 1970 + }, + { + "epoch": 0.17, + "grad_norm": 11.79996437468882, + "learning_rate": 9.472716841761894e-06, + "loss": 1.0194, + "step": 1971 + }, + { + "epoch": 0.17, + "grad_norm": 8.092586457853242, + "learning_rate": 9.472080916175134e-06, + "loss": 0.8067, + "step": 1972 + }, + { + "epoch": 0.17, + "grad_norm": 12.787617834694977, + "learning_rate": 9.471444628715027e-06, + "loss": 0.703, + "step": 1973 + }, + { + "epoch": 0.17, + "grad_norm": 10.346802564949336, + "learning_rate": 9.470807979433059e-06, + "loss": 0.8276, + "step": 1974 + }, + { + "epoch": 0.17, + "grad_norm": 20.029832557408774, + 
"learning_rate": 9.470170968380745e-06, + "loss": 0.9792, + "step": 1975 + }, + { + "epoch": 0.17, + "grad_norm": 3.42900385081751, + "learning_rate": 9.469533595609633e-06, + "loss": 0.5243, + "step": 1976 + }, + { + "epoch": 0.17, + "grad_norm": 11.278107162078921, + "learning_rate": 9.468895861171297e-06, + "loss": 0.8709, + "step": 1977 + }, + { + "epoch": 0.17, + "grad_norm": 10.530538426264666, + "learning_rate": 9.46825776511734e-06, + "loss": 0.7691, + "step": 1978 + }, + { + "epoch": 0.17, + "grad_norm": 19.55782772060852, + "learning_rate": 9.4676193074994e-06, + "loss": 0.7178, + "step": 1979 + }, + { + "epoch": 0.17, + "grad_norm": 3.6848925739350586, + "learning_rate": 9.466980488369136e-06, + "loss": 0.5181, + "step": 1980 + }, + { + "epoch": 0.17, + "grad_norm": 11.706926455333374, + "learning_rate": 9.466341307778239e-06, + "loss": 0.858, + "step": 1981 + }, + { + "epoch": 0.17, + "grad_norm": 12.724714273749905, + "learning_rate": 9.465701765778436e-06, + "loss": 0.8832, + "step": 1982 + }, + { + "epoch": 0.17, + "grad_norm": 25.29049097202628, + "learning_rate": 9.465061862421473e-06, + "loss": 0.8783, + "step": 1983 + }, + { + "epoch": 0.17, + "grad_norm": 9.049809194534859, + "learning_rate": 9.464421597759131e-06, + "loss": 0.7838, + "step": 1984 + }, + { + "epoch": 0.17, + "grad_norm": 9.494503044197529, + "learning_rate": 9.463780971843221e-06, + "loss": 0.8049, + "step": 1985 + }, + { + "epoch": 0.17, + "grad_norm": 7.844725590929383, + "learning_rate": 9.463139984725577e-06, + "loss": 0.8463, + "step": 1986 + }, + { + "epoch": 0.17, + "grad_norm": 9.151979268887368, + "learning_rate": 9.462498636458072e-06, + "loss": 0.9074, + "step": 1987 + }, + { + "epoch": 0.17, + "grad_norm": 2.809217821130951, + "learning_rate": 9.4618569270926e-06, + "loss": 0.5195, + "step": 1988 + }, + { + "epoch": 0.17, + "grad_norm": 10.706813724940272, + "learning_rate": 9.461214856681086e-06, + "loss": 0.7559, + "step": 1989 + }, + { + "epoch": 0.17, + "grad_norm": 6.2957148225039665, + "learning_rate": 9.460572425275489e-06, + "loss": 0.5291, + "step": 1990 + }, + { + "epoch": 0.17, + "grad_norm": 11.63807738302976, + "learning_rate": 9.459929632927789e-06, + "loss": 0.7729, + "step": 1991 + }, + { + "epoch": 0.17, + "grad_norm": 72.08395269191348, + "learning_rate": 9.459286479690002e-06, + "loss": 0.9409, + "step": 1992 + }, + { + "epoch": 0.18, + "grad_norm": 2.394219286472051, + "learning_rate": 9.458642965614173e-06, + "loss": 0.4822, + "step": 1993 + }, + { + "epoch": 0.18, + "grad_norm": 11.978533655784402, + "learning_rate": 9.45799909075237e-06, + "loss": 0.816, + "step": 1994 + }, + { + "epoch": 0.18, + "grad_norm": 20.351256867483066, + "learning_rate": 9.457354855156696e-06, + "loss": 0.7823, + "step": 1995 + }, + { + "epoch": 0.18, + "grad_norm": 7.291717021987908, + "learning_rate": 9.45671025887928e-06, + "loss": 0.774, + "step": 1996 + }, + { + "epoch": 0.18, + "grad_norm": 16.773565963405897, + "learning_rate": 9.456065301972286e-06, + "loss": 0.9012, + "step": 1997 + }, + { + "epoch": 0.18, + "grad_norm": 11.926543084091199, + "learning_rate": 9.455419984487898e-06, + "loss": 0.8816, + "step": 1998 + }, + { + "epoch": 0.18, + "grad_norm": 11.106126066586624, + "learning_rate": 9.454774306478338e-06, + "loss": 0.8899, + "step": 1999 + }, + { + "epoch": 0.18, + "grad_norm": 4.329822844057424, + "learning_rate": 9.45412826799585e-06, + "loss": 0.5764, + "step": 2000 + }, + { + "epoch": 0.18, + "grad_norm": 11.078156913178232, + "learning_rate": 9.453481869092712e-06, + 
"loss": 0.8119, + "step": 2001 + }, + { + "epoch": 0.18, + "grad_norm": 8.79836111649755, + "learning_rate": 9.45283510982123e-06, + "loss": 0.8519, + "step": 2002 + }, + { + "epoch": 0.18, + "grad_norm": 16.747097452951905, + "learning_rate": 9.452187990233737e-06, + "loss": 0.9065, + "step": 2003 + }, + { + "epoch": 0.18, + "grad_norm": 16.73283448601366, + "learning_rate": 9.451540510382596e-06, + "loss": 0.8988, + "step": 2004 + }, + { + "epoch": 0.18, + "grad_norm": 7.660220165416274, + "learning_rate": 9.450892670320205e-06, + "loss": 0.8235, + "step": 2005 + }, + { + "epoch": 0.18, + "grad_norm": 11.035397761438762, + "learning_rate": 9.45024447009898e-06, + "loss": 1.0228, + "step": 2006 + }, + { + "epoch": 0.18, + "grad_norm": 8.64064092489348, + "learning_rate": 9.449595909771376e-06, + "loss": 0.9215, + "step": 2007 + }, + { + "epoch": 0.18, + "grad_norm": 8.296039463370473, + "learning_rate": 9.448946989389873e-06, + "loss": 0.739, + "step": 2008 + }, + { + "epoch": 0.18, + "grad_norm": 13.030794597851889, + "learning_rate": 9.44829770900698e-06, + "loss": 0.9503, + "step": 2009 + }, + { + "epoch": 0.18, + "grad_norm": 10.118394885306701, + "learning_rate": 9.447648068675235e-06, + "loss": 0.7967, + "step": 2010 + }, + { + "epoch": 0.18, + "grad_norm": 10.616196559306488, + "learning_rate": 9.446998068447207e-06, + "loss": 0.7747, + "step": 2011 + }, + { + "epoch": 0.18, + "grad_norm": 11.651256914164168, + "learning_rate": 9.446347708375492e-06, + "loss": 0.8531, + "step": 2012 + }, + { + "epoch": 0.18, + "grad_norm": 7.089900510191287, + "learning_rate": 9.445696988512717e-06, + "loss": 0.8613, + "step": 2013 + }, + { + "epoch": 0.18, + "grad_norm": 8.490426201680023, + "learning_rate": 9.445045908911536e-06, + "loss": 0.8204, + "step": 2014 + }, + { + "epoch": 0.18, + "grad_norm": 8.606601100590652, + "learning_rate": 9.444394469624637e-06, + "loss": 0.9294, + "step": 2015 + }, + { + "epoch": 0.18, + "grad_norm": 8.632124818145078, + "learning_rate": 9.443742670704727e-06, + "loss": 0.718, + "step": 2016 + }, + { + "epoch": 0.18, + "grad_norm": 6.879062018726935, + "learning_rate": 9.443090512204552e-06, + "loss": 0.7773, + "step": 2017 + }, + { + "epoch": 0.18, + "grad_norm": 14.766810752040541, + "learning_rate": 9.442437994176886e-06, + "loss": 0.9026, + "step": 2018 + }, + { + "epoch": 0.18, + "grad_norm": 12.770378738814763, + "learning_rate": 9.441785116674525e-06, + "loss": 0.913, + "step": 2019 + }, + { + "epoch": 0.18, + "grad_norm": 9.788837112321321, + "learning_rate": 9.441131879750301e-06, + "loss": 0.7595, + "step": 2020 + }, + { + "epoch": 0.18, + "grad_norm": 15.168518153765604, + "learning_rate": 9.440478283457072e-06, + "loss": 0.7788, + "step": 2021 + }, + { + "epoch": 0.18, + "grad_norm": 20.134122361759783, + "learning_rate": 9.439824327847729e-06, + "loss": 0.9004, + "step": 2022 + }, + { + "epoch": 0.18, + "grad_norm": 8.257389510954708, + "learning_rate": 9.439170012975185e-06, + "loss": 0.8688, + "step": 2023 + }, + { + "epoch": 0.18, + "grad_norm": 6.998005496018507, + "learning_rate": 9.438515338892389e-06, + "loss": 0.7263, + "step": 2024 + }, + { + "epoch": 0.18, + "grad_norm": 67.6941945268716, + "learning_rate": 9.437860305652314e-06, + "loss": 0.8196, + "step": 2025 + }, + { + "epoch": 0.18, + "grad_norm": 10.686377641695687, + "learning_rate": 9.437204913307964e-06, + "loss": 0.7095, + "step": 2026 + }, + { + "epoch": 0.18, + "grad_norm": 19.86788515718401, + "learning_rate": 9.436549161912377e-06, + "loss": 0.9647, + "step": 2027 + }, + { + 
"epoch": 0.18, + "grad_norm": 8.907288658936517, + "learning_rate": 9.435893051518607e-06, + "loss": 0.6808, + "step": 2028 + }, + { + "epoch": 0.18, + "grad_norm": 8.001058643158453, + "learning_rate": 9.435236582179754e-06, + "loss": 0.8273, + "step": 2029 + }, + { + "epoch": 0.18, + "grad_norm": 11.129319724427875, + "learning_rate": 9.434579753948931e-06, + "loss": 0.7766, + "step": 2030 + }, + { + "epoch": 0.18, + "grad_norm": 20.156947810424985, + "learning_rate": 9.433922566879292e-06, + "loss": 0.956, + "step": 2031 + }, + { + "epoch": 0.18, + "grad_norm": 7.5980967932318695, + "learning_rate": 9.433265021024016e-06, + "loss": 0.901, + "step": 2032 + }, + { + "epoch": 0.18, + "grad_norm": 8.935767607675123, + "learning_rate": 9.432607116436308e-06, + "loss": 0.8264, + "step": 2033 + }, + { + "epoch": 0.18, + "grad_norm": 13.948571875395698, + "learning_rate": 9.431948853169404e-06, + "loss": 0.8435, + "step": 2034 + }, + { + "epoch": 0.18, + "grad_norm": 12.128102045028395, + "learning_rate": 9.431290231276573e-06, + "loss": 0.8653, + "step": 2035 + }, + { + "epoch": 0.18, + "grad_norm": 11.11002256842064, + "learning_rate": 9.430631250811107e-06, + "loss": 0.72, + "step": 2036 + }, + { + "epoch": 0.18, + "grad_norm": 8.579936695520555, + "learning_rate": 9.42997191182633e-06, + "loss": 0.9527, + "step": 2037 + }, + { + "epoch": 0.18, + "grad_norm": 10.340210782797296, + "learning_rate": 9.429312214375593e-06, + "loss": 0.8427, + "step": 2038 + }, + { + "epoch": 0.18, + "grad_norm": 13.272347464345382, + "learning_rate": 9.42865215851228e-06, + "loss": 0.9945, + "step": 2039 + }, + { + "epoch": 0.18, + "grad_norm": 9.807151755125904, + "learning_rate": 9.427991744289803e-06, + "loss": 0.8584, + "step": 2040 + }, + { + "epoch": 0.18, + "grad_norm": 8.655893570775838, + "learning_rate": 9.427330971761599e-06, + "loss": 0.8964, + "step": 2041 + }, + { + "epoch": 0.18, + "grad_norm": 12.769796226918352, + "learning_rate": 9.426669840981137e-06, + "loss": 0.7178, + "step": 2042 + }, + { + "epoch": 0.18, + "grad_norm": 13.609619308424424, + "learning_rate": 9.426008352001913e-06, + "loss": 0.7085, + "step": 2043 + }, + { + "epoch": 0.18, + "grad_norm": 12.119257159603116, + "learning_rate": 9.425346504877458e-06, + "loss": 0.8823, + "step": 2044 + }, + { + "epoch": 0.18, + "grad_norm": 13.774159857632615, + "learning_rate": 9.424684299661325e-06, + "loss": 0.8987, + "step": 2045 + }, + { + "epoch": 0.18, + "grad_norm": 7.272202462699099, + "learning_rate": 9.424021736407096e-06, + "loss": 0.8053, + "step": 2046 + }, + { + "epoch": 0.18, + "grad_norm": 16.843033454952003, + "learning_rate": 9.42335881516839e-06, + "loss": 0.8543, + "step": 2047 + }, + { + "epoch": 0.18, + "grad_norm": 7.251240142046912, + "learning_rate": 9.422695535998844e-06, + "loss": 0.8392, + "step": 2048 + }, + { + "epoch": 0.18, + "grad_norm": 6.78558291663663, + "learning_rate": 9.422031898952134e-06, + "loss": 0.8058, + "step": 2049 + }, + { + "epoch": 0.18, + "grad_norm": 10.12595421936278, + "learning_rate": 9.421367904081957e-06, + "loss": 0.7783, + "step": 2050 + }, + { + "epoch": 0.18, + "grad_norm": 10.80672545663344, + "learning_rate": 9.420703551442045e-06, + "loss": 0.7912, + "step": 2051 + }, + { + "epoch": 0.18, + "grad_norm": 5.981203857469831, + "learning_rate": 9.420038841086154e-06, + "loss": 0.7463, + "step": 2052 + }, + { + "epoch": 0.18, + "grad_norm": 24.5801698413141, + "learning_rate": 9.419373773068075e-06, + "loss": 0.8588, + "step": 2053 + }, + { + "epoch": 0.18, + "grad_norm": 
6.955746972733649, + "learning_rate": 9.41870834744162e-06, + "loss": 0.866, + "step": 2054 + }, + { + "epoch": 0.18, + "grad_norm": 17.45485670924518, + "learning_rate": 9.418042564260634e-06, + "loss": 0.9276, + "step": 2055 + }, + { + "epoch": 0.18, + "grad_norm": 12.269149306432935, + "learning_rate": 9.417376423578993e-06, + "loss": 0.8056, + "step": 2056 + }, + { + "epoch": 0.18, + "grad_norm": 10.076591568022764, + "learning_rate": 9.4167099254506e-06, + "loss": 0.7553, + "step": 2057 + }, + { + "epoch": 0.18, + "grad_norm": 10.550895194223376, + "learning_rate": 9.416043069929389e-06, + "loss": 0.8657, + "step": 2058 + }, + { + "epoch": 0.18, + "grad_norm": 2.7588366878478303, + "learning_rate": 9.415375857069315e-06, + "loss": 0.5559, + "step": 2059 + }, + { + "epoch": 0.18, + "grad_norm": 20.2802560751358, + "learning_rate": 9.414708286924374e-06, + "loss": 0.8875, + "step": 2060 + }, + { + "epoch": 0.18, + "grad_norm": 14.557267205993025, + "learning_rate": 9.41404035954858e-06, + "loss": 0.8192, + "step": 2061 + }, + { + "epoch": 0.18, + "grad_norm": 2.6652531794716787, + "learning_rate": 9.413372074995984e-06, + "loss": 0.5416, + "step": 2062 + }, + { + "epoch": 0.18, + "grad_norm": 8.493181721669133, + "learning_rate": 9.41270343332066e-06, + "loss": 0.7298, + "step": 2063 + }, + { + "epoch": 0.18, + "grad_norm": 13.939592708868101, + "learning_rate": 9.412034434576715e-06, + "loss": 0.8312, + "step": 2064 + }, + { + "epoch": 0.18, + "grad_norm": 10.697098521858814, + "learning_rate": 9.411365078818281e-06, + "loss": 0.8025, + "step": 2065 + }, + { + "epoch": 0.18, + "grad_norm": 6.860295323832623, + "learning_rate": 9.410695366099525e-06, + "loss": 0.8115, + "step": 2066 + }, + { + "epoch": 0.18, + "grad_norm": 9.41983111021645, + "learning_rate": 9.410025296474637e-06, + "loss": 0.6963, + "step": 2067 + }, + { + "epoch": 0.18, + "grad_norm": 20.93753036116568, + "learning_rate": 9.409354869997836e-06, + "loss": 0.8389, + "step": 2068 + }, + { + "epoch": 0.18, + "grad_norm": 17.088239462857526, + "learning_rate": 9.408684086723375e-06, + "loss": 0.8914, + "step": 2069 + }, + { + "epoch": 0.18, + "grad_norm": 9.032440739661313, + "learning_rate": 9.40801294670553e-06, + "loss": 0.6946, + "step": 2070 + }, + { + "epoch": 0.18, + "grad_norm": 13.023827285043813, + "learning_rate": 9.407341449998607e-06, + "loss": 0.94, + "step": 2071 + }, + { + "epoch": 0.18, + "grad_norm": 5.08037442024396, + "learning_rate": 9.40666959665695e-06, + "loss": 0.5818, + "step": 2072 + }, + { + "epoch": 0.18, + "grad_norm": 20.0102789043504, + "learning_rate": 9.405997386734918e-06, + "loss": 0.8539, + "step": 2073 + }, + { + "epoch": 0.18, + "grad_norm": 12.193866775688047, + "learning_rate": 9.405324820286905e-06, + "loss": 0.9532, + "step": 2074 + }, + { + "epoch": 0.18, + "grad_norm": 9.795148342916747, + "learning_rate": 9.404651897367336e-06, + "loss": 0.7964, + "step": 2075 + }, + { + "epoch": 0.18, + "grad_norm": 7.833644488032409, + "learning_rate": 9.403978618030663e-06, + "loss": 0.7994, + "step": 2076 + }, + { + "epoch": 0.18, + "grad_norm": 8.61764253141244, + "learning_rate": 9.403304982331364e-06, + "loss": 0.8588, + "step": 2077 + }, + { + "epoch": 0.18, + "grad_norm": 8.779121957227927, + "learning_rate": 9.402630990323952e-06, + "loss": 0.7633, + "step": 2078 + }, + { + "epoch": 0.18, + "grad_norm": 8.683734524296996, + "learning_rate": 9.401956642062963e-06, + "loss": 0.7985, + "step": 2079 + }, + { + "epoch": 0.18, + "grad_norm": 10.0188153655808, + "learning_rate": 
9.401281937602966e-06, + "loss": 0.8331, + "step": 2080 + }, + { + "epoch": 0.18, + "grad_norm": 2.82767930580247, + "learning_rate": 9.400606876998555e-06, + "loss": 0.4882, + "step": 2081 + }, + { + "epoch": 0.18, + "grad_norm": 8.09535939142021, + "learning_rate": 9.399931460304354e-06, + "loss": 0.8575, + "step": 2082 + }, + { + "epoch": 0.18, + "grad_norm": 10.983342410261999, + "learning_rate": 9.39925568757502e-06, + "loss": 0.7947, + "step": 2083 + }, + { + "epoch": 0.18, + "grad_norm": 11.376275562376186, + "learning_rate": 9.398579558865235e-06, + "loss": 0.7485, + "step": 2084 + }, + { + "epoch": 0.18, + "grad_norm": 7.821369897263677, + "learning_rate": 9.397903074229705e-06, + "loss": 0.8946, + "step": 2085 + }, + { + "epoch": 0.18, + "grad_norm": 12.612552428670288, + "learning_rate": 9.397226233723175e-06, + "loss": 1.066, + "step": 2086 + }, + { + "epoch": 0.18, + "grad_norm": 9.162636113962783, + "learning_rate": 9.396549037400416e-06, + "loss": 0.8503, + "step": 2087 + }, + { + "epoch": 0.18, + "grad_norm": 11.773438158694734, + "learning_rate": 9.395871485316218e-06, + "loss": 0.8884, + "step": 2088 + }, + { + "epoch": 0.18, + "grad_norm": 8.126876902719898, + "learning_rate": 9.395193577525414e-06, + "loss": 0.8622, + "step": 2089 + }, + { + "epoch": 0.18, + "grad_norm": 13.098323845100357, + "learning_rate": 9.394515314082855e-06, + "loss": 0.7435, + "step": 2090 + }, + { + "epoch": 0.18, + "grad_norm": 12.219257050558678, + "learning_rate": 9.393836695043429e-06, + "loss": 0.783, + "step": 2091 + }, + { + "epoch": 0.18, + "grad_norm": 19.685817423178047, + "learning_rate": 9.393157720462043e-06, + "loss": 0.7725, + "step": 2092 + }, + { + "epoch": 0.18, + "grad_norm": 13.363371720727972, + "learning_rate": 9.392478390393645e-06, + "loss": 0.9245, + "step": 2093 + }, + { + "epoch": 0.18, + "grad_norm": 15.019696463091297, + "learning_rate": 9.3917987048932e-06, + "loss": 0.9166, + "step": 2094 + }, + { + "epoch": 0.18, + "grad_norm": 13.882871573736649, + "learning_rate": 9.39111866401571e-06, + "loss": 0.9818, + "step": 2095 + }, + { + "epoch": 0.18, + "grad_norm": 7.4533902876215254, + "learning_rate": 9.390438267816202e-06, + "loss": 0.8336, + "step": 2096 + }, + { + "epoch": 0.18, + "grad_norm": 8.781346853105852, + "learning_rate": 9.389757516349733e-06, + "loss": 0.836, + "step": 2097 + }, + { + "epoch": 0.18, + "grad_norm": 10.951785685658082, + "learning_rate": 9.389076409671387e-06, + "loss": 0.8646, + "step": 2098 + }, + { + "epoch": 0.18, + "grad_norm": 7.147373444683978, + "learning_rate": 9.388394947836278e-06, + "loss": 0.8691, + "step": 2099 + }, + { + "epoch": 0.18, + "grad_norm": 11.800937164195156, + "learning_rate": 9.387713130899552e-06, + "loss": 0.9, + "step": 2100 + }, + { + "epoch": 0.18, + "grad_norm": 13.023037048012936, + "learning_rate": 9.387030958916376e-06, + "loss": 0.9321, + "step": 2101 + }, + { + "epoch": 0.18, + "grad_norm": 6.816102387217727, + "learning_rate": 9.386348431941953e-06, + "loss": 0.8879, + "step": 2102 + }, + { + "epoch": 0.18, + "grad_norm": 13.975493821848039, + "learning_rate": 9.38566555003151e-06, + "loss": 0.7922, + "step": 2103 + }, + { + "epoch": 0.18, + "grad_norm": 19.678370505882448, + "learning_rate": 9.384982313240309e-06, + "loss": 0.9357, + "step": 2104 + }, + { + "epoch": 0.18, + "grad_norm": 9.108590438689626, + "learning_rate": 9.38429872162363e-06, + "loss": 0.8565, + "step": 2105 + }, + { + "epoch": 0.18, + "grad_norm": 16.73012239182173, + "learning_rate": 9.383614775236794e-06, + "loss": 0.9485, + 
"step": 2106 + }, + { + "epoch": 0.19, + "grad_norm": 6.507754191082198, + "learning_rate": 9.382930474135139e-06, + "loss": 0.8444, + "step": 2107 + }, + { + "epoch": 0.19, + "grad_norm": 8.920489633182383, + "learning_rate": 9.382245818374043e-06, + "loss": 0.8519, + "step": 2108 + }, + { + "epoch": 0.19, + "grad_norm": 7.69067720996172, + "learning_rate": 9.381560808008905e-06, + "loss": 0.7176, + "step": 2109 + }, + { + "epoch": 0.19, + "grad_norm": 9.015015397610991, + "learning_rate": 9.380875443095154e-06, + "loss": 0.9331, + "step": 2110 + }, + { + "epoch": 0.19, + "grad_norm": 7.616894902392034, + "learning_rate": 9.380189723688249e-06, + "loss": 0.8093, + "step": 2111 + }, + { + "epoch": 0.19, + "grad_norm": 8.174766121538816, + "learning_rate": 9.379503649843678e-06, + "loss": 0.8219, + "step": 2112 + }, + { + "epoch": 0.19, + "grad_norm": 2.521869975300993, + "learning_rate": 9.378817221616955e-06, + "loss": 0.4624, + "step": 2113 + }, + { + "epoch": 0.19, + "grad_norm": 9.823959340786285, + "learning_rate": 9.378130439063629e-06, + "loss": 0.823, + "step": 2114 + }, + { + "epoch": 0.19, + "grad_norm": 9.23805924169817, + "learning_rate": 9.377443302239269e-06, + "loss": 0.7076, + "step": 2115 + }, + { + "epoch": 0.19, + "grad_norm": 10.264473847378039, + "learning_rate": 9.376755811199481e-06, + "loss": 0.8499, + "step": 2116 + }, + { + "epoch": 0.19, + "grad_norm": 9.905899051150291, + "learning_rate": 9.37606796599989e-06, + "loss": 0.901, + "step": 2117 + }, + { + "epoch": 0.19, + "grad_norm": 8.934087749510242, + "learning_rate": 9.375379766696159e-06, + "loss": 0.8085, + "step": 2118 + }, + { + "epoch": 0.19, + "grad_norm": 13.42751672682644, + "learning_rate": 9.374691213343976e-06, + "loss": 0.8663, + "step": 2119 + }, + { + "epoch": 0.19, + "grad_norm": 10.22249692359095, + "learning_rate": 9.374002305999057e-06, + "loss": 1.0589, + "step": 2120 + }, + { + "epoch": 0.19, + "grad_norm": 11.08377811867864, + "learning_rate": 9.373313044717146e-06, + "loss": 0.7981, + "step": 2121 + }, + { + "epoch": 0.19, + "grad_norm": 12.165056090713861, + "learning_rate": 9.37262342955402e-06, + "loss": 0.7654, + "step": 2122 + }, + { + "epoch": 0.19, + "grad_norm": 2.8634679132321947, + "learning_rate": 9.371933460565477e-06, + "loss": 0.502, + "step": 2123 + }, + { + "epoch": 0.19, + "grad_norm": 15.2642143142363, + "learning_rate": 9.371243137807353e-06, + "loss": 0.7566, + "step": 2124 + }, + { + "epoch": 0.19, + "grad_norm": 20.7429673523153, + "learning_rate": 9.370552461335505e-06, + "loss": 0.9095, + "step": 2125 + }, + { + "epoch": 0.19, + "grad_norm": 16.65634451470929, + "learning_rate": 9.369861431205822e-06, + "loss": 0.9764, + "step": 2126 + }, + { + "epoch": 0.19, + "grad_norm": 13.8152743511733, + "learning_rate": 9.36917004747422e-06, + "loss": 0.8643, + "step": 2127 + }, + { + "epoch": 0.19, + "grad_norm": 8.233829384912884, + "learning_rate": 9.368478310196645e-06, + "loss": 0.8973, + "step": 2128 + }, + { + "epoch": 0.19, + "grad_norm": 16.732502129882715, + "learning_rate": 9.367786219429074e-06, + "loss": 0.8501, + "step": 2129 + }, + { + "epoch": 0.19, + "grad_norm": 9.568418653829214, + "learning_rate": 9.367093775227504e-06, + "loss": 0.8306, + "step": 2130 + }, + { + "epoch": 0.19, + "grad_norm": 6.586272208834497, + "learning_rate": 9.366400977647973e-06, + "loss": 0.8935, + "step": 2131 + }, + { + "epoch": 0.19, + "grad_norm": 12.433168896755852, + "learning_rate": 9.365707826746537e-06, + "loss": 0.7873, + "step": 2132 + }, + { + "epoch": 0.19, + 
"grad_norm": 7.1986807977130685, + "learning_rate": 9.365014322579287e-06, + "loss": 0.9241, + "step": 2133 + }, + { + "epoch": 0.19, + "grad_norm": 14.644350735463927, + "learning_rate": 9.364320465202337e-06, + "loss": 0.9387, + "step": 2134 + }, + { + "epoch": 0.19, + "grad_norm": 3.411806720563647, + "learning_rate": 9.363626254671835e-06, + "loss": 0.5297, + "step": 2135 + }, + { + "epoch": 0.19, + "grad_norm": 8.153018206634298, + "learning_rate": 9.362931691043956e-06, + "loss": 0.7403, + "step": 2136 + }, + { + "epoch": 0.19, + "grad_norm": 13.432027440932835, + "learning_rate": 9.362236774374902e-06, + "loss": 0.8235, + "step": 2137 + }, + { + "epoch": 0.19, + "grad_norm": 13.319669779063908, + "learning_rate": 9.361541504720903e-06, + "loss": 0.8506, + "step": 2138 + }, + { + "epoch": 0.19, + "grad_norm": 9.006455360935082, + "learning_rate": 9.360845882138221e-06, + "loss": 0.9253, + "step": 2139 + }, + { + "epoch": 0.19, + "grad_norm": 8.258993778446268, + "learning_rate": 9.360149906683145e-06, + "loss": 0.8596, + "step": 2140 + }, + { + "epoch": 0.19, + "grad_norm": 3.300222522663566, + "learning_rate": 9.35945357841199e-06, + "loss": 0.5957, + "step": 2141 + }, + { + "epoch": 0.19, + "grad_norm": 9.727307830742932, + "learning_rate": 9.358756897381103e-06, + "loss": 0.7428, + "step": 2142 + }, + { + "epoch": 0.19, + "grad_norm": 9.339067815486786, + "learning_rate": 9.35805986364686e-06, + "loss": 0.7605, + "step": 2143 + }, + { + "epoch": 0.19, + "grad_norm": 9.04856596696957, + "learning_rate": 9.35736247726566e-06, + "loss": 0.8998, + "step": 2144 + }, + { + "epoch": 0.19, + "grad_norm": 3.299031352441482, + "learning_rate": 9.356664738293937e-06, + "loss": 0.5152, + "step": 2145 + }, + { + "epoch": 0.19, + "grad_norm": 7.811023661816413, + "learning_rate": 9.355966646788152e-06, + "loss": 0.8868, + "step": 2146 + }, + { + "epoch": 0.19, + "grad_norm": 10.982182046962983, + "learning_rate": 9.355268202804791e-06, + "loss": 0.8878, + "step": 2147 + }, + { + "epoch": 0.19, + "grad_norm": 2.682956088450831, + "learning_rate": 9.354569406400369e-06, + "loss": 0.4931, + "step": 2148 + }, + { + "epoch": 0.19, + "grad_norm": 6.3064163415793, + "learning_rate": 9.353870257631437e-06, + "loss": 0.8447, + "step": 2149 + }, + { + "epoch": 0.19, + "grad_norm": 9.059788647624034, + "learning_rate": 9.353170756554567e-06, + "loss": 0.7931, + "step": 2150 + }, + { + "epoch": 0.19, + "grad_norm": 6.565870608030552, + "learning_rate": 9.352470903226357e-06, + "loss": 1.0182, + "step": 2151 + }, + { + "epoch": 0.19, + "grad_norm": 10.804245210077717, + "learning_rate": 9.351770697703443e-06, + "loss": 0.8299, + "step": 2152 + }, + { + "epoch": 0.19, + "grad_norm": 7.718540664685148, + "learning_rate": 9.351070140042484e-06, + "loss": 0.7697, + "step": 2153 + }, + { + "epoch": 0.19, + "grad_norm": 7.773239181765744, + "learning_rate": 9.350369230300165e-06, + "loss": 0.9758, + "step": 2154 + }, + { + "epoch": 0.19, + "grad_norm": 10.052957836921342, + "learning_rate": 9.349667968533204e-06, + "loss": 0.9432, + "step": 2155 + }, + { + "epoch": 0.19, + "grad_norm": 8.331944277182057, + "learning_rate": 9.34896635479835e-06, + "loss": 0.8081, + "step": 2156 + }, + { + "epoch": 0.19, + "grad_norm": 15.884721700195657, + "learning_rate": 9.34826438915237e-06, + "loss": 0.8411, + "step": 2157 + }, + { + "epoch": 0.19, + "grad_norm": 8.95038269564721, + "learning_rate": 9.347562071652068e-06, + "loss": 0.777, + "step": 2158 + }, + { + "epoch": 0.19, + "grad_norm": 12.278658518586852, + 
"learning_rate": 9.346859402354277e-06, + "loss": 0.8687, + "step": 2159 + }, + { + "epoch": 0.19, + "grad_norm": 7.1717959043870385, + "learning_rate": 9.346156381315852e-06, + "loss": 0.8573, + "step": 2160 + }, + { + "epoch": 0.19, + "grad_norm": 6.5964884414651355, + "learning_rate": 9.345453008593683e-06, + "loss": 0.7689, + "step": 2161 + }, + { + "epoch": 0.19, + "grad_norm": 9.609193033328712, + "learning_rate": 9.344749284244685e-06, + "loss": 0.7943, + "step": 2162 + }, + { + "epoch": 0.19, + "grad_norm": 7.985143427522885, + "learning_rate": 9.344045208325802e-06, + "loss": 0.8733, + "step": 2163 + }, + { + "epoch": 0.19, + "grad_norm": 8.519044626460758, + "learning_rate": 9.343340780894006e-06, + "loss": 0.8521, + "step": 2164 + }, + { + "epoch": 0.19, + "grad_norm": 8.034287820598887, + "learning_rate": 9.342636002006299e-06, + "loss": 0.9653, + "step": 2165 + }, + { + "epoch": 0.19, + "grad_norm": 9.490827181275648, + "learning_rate": 9.34193087171971e-06, + "loss": 0.7833, + "step": 2166 + }, + { + "epoch": 0.19, + "grad_norm": 7.569110341327698, + "learning_rate": 9.341225390091298e-06, + "loss": 0.7457, + "step": 2167 + }, + { + "epoch": 0.19, + "grad_norm": 9.103373732102614, + "learning_rate": 9.340519557178149e-06, + "loss": 0.8849, + "step": 2168 + }, + { + "epoch": 0.19, + "grad_norm": 9.599753647707871, + "learning_rate": 9.339813373037376e-06, + "loss": 0.9718, + "step": 2169 + }, + { + "epoch": 0.19, + "grad_norm": 10.69983105193812, + "learning_rate": 9.339106837726127e-06, + "loss": 0.7932, + "step": 2170 + }, + { + "epoch": 0.19, + "grad_norm": 6.7319513296927544, + "learning_rate": 9.338399951301567e-06, + "loss": 0.9198, + "step": 2171 + }, + { + "epoch": 0.19, + "grad_norm": 9.308096403939379, + "learning_rate": 9.3376927138209e-06, + "loss": 0.7224, + "step": 2172 + }, + { + "epoch": 0.19, + "grad_norm": 11.737043487837067, + "learning_rate": 9.336985125341355e-06, + "loss": 0.8423, + "step": 2173 + }, + { + "epoch": 0.19, + "grad_norm": 7.0597686423476915, + "learning_rate": 9.336277185920189e-06, + "loss": 0.8987, + "step": 2174 + }, + { + "epoch": 0.19, + "grad_norm": 12.3949863199721, + "learning_rate": 9.335568895614685e-06, + "loss": 0.8093, + "step": 2175 + }, + { + "epoch": 0.19, + "grad_norm": 9.971654997861782, + "learning_rate": 9.334860254482157e-06, + "loss": 0.7, + "step": 2176 + }, + { + "epoch": 0.19, + "grad_norm": 8.415903065049804, + "learning_rate": 9.334151262579946e-06, + "loss": 0.8527, + "step": 2177 + }, + { + "epoch": 0.19, + "grad_norm": 13.985024584781266, + "learning_rate": 9.333441919965428e-06, + "loss": 0.8842, + "step": 2178 + }, + { + "epoch": 0.19, + "grad_norm": 11.052241051011961, + "learning_rate": 9.332732226695997e-06, + "loss": 0.75, + "step": 2179 + }, + { + "epoch": 0.19, + "grad_norm": 13.807070985382378, + "learning_rate": 9.332022182829082e-06, + "loss": 0.7689, + "step": 2180 + }, + { + "epoch": 0.19, + "grad_norm": 7.031894878359894, + "learning_rate": 9.331311788422137e-06, + "loss": 0.7745, + "step": 2181 + }, + { + "epoch": 0.19, + "grad_norm": 12.157626925529847, + "learning_rate": 9.330601043532646e-06, + "loss": 0.8662, + "step": 2182 + }, + { + "epoch": 0.19, + "grad_norm": 2.281085141164664, + "learning_rate": 9.329889948218123e-06, + "loss": 0.4919, + "step": 2183 + }, + { + "epoch": 0.19, + "grad_norm": 8.170952421540782, + "learning_rate": 9.329178502536108e-06, + "loss": 0.7857, + "step": 2184 + }, + { + "epoch": 0.19, + "grad_norm": 7.945549794668539, + "learning_rate": 9.32846670654417e-06, + 
"loss": 0.8916, + "step": 2185 + }, + { + "epoch": 0.19, + "grad_norm": 9.084610638290046, + "learning_rate": 9.327754560299906e-06, + "loss": 0.784, + "step": 2186 + }, + { + "epoch": 0.19, + "grad_norm": 2.09329957360162, + "learning_rate": 9.32704206386094e-06, + "loss": 0.5727, + "step": 2187 + }, + { + "epoch": 0.19, + "grad_norm": 19.971258812805623, + "learning_rate": 9.32632921728493e-06, + "loss": 0.8994, + "step": 2188 + }, + { + "epoch": 0.19, + "grad_norm": 6.259834282369891, + "learning_rate": 9.325616020629555e-06, + "loss": 0.8219, + "step": 2189 + }, + { + "epoch": 0.19, + "grad_norm": 11.850060271483144, + "learning_rate": 9.324902473952529e-06, + "loss": 0.832, + "step": 2190 + }, + { + "epoch": 0.19, + "grad_norm": 15.266623233872766, + "learning_rate": 9.324188577311589e-06, + "loss": 0.8317, + "step": 2191 + }, + { + "epoch": 0.19, + "grad_norm": 2.1707879478056604, + "learning_rate": 9.3234743307645e-06, + "loss": 0.4821, + "step": 2192 + }, + { + "epoch": 0.19, + "grad_norm": 9.682885026979053, + "learning_rate": 9.32275973436906e-06, + "loss": 0.8638, + "step": 2193 + }, + { + "epoch": 0.19, + "grad_norm": 15.326812813250942, + "learning_rate": 9.322044788183094e-06, + "loss": 0.9034, + "step": 2194 + }, + { + "epoch": 0.19, + "grad_norm": 10.767226830959725, + "learning_rate": 9.321329492264452e-06, + "loss": 0.9163, + "step": 2195 + }, + { + "epoch": 0.19, + "grad_norm": 8.627451790793598, + "learning_rate": 9.320613846671016e-06, + "loss": 0.7855, + "step": 2196 + }, + { + "epoch": 0.19, + "grad_norm": 11.462102069545846, + "learning_rate": 9.319897851460696e-06, + "loss": 0.9926, + "step": 2197 + }, + { + "epoch": 0.19, + "grad_norm": 10.968155510101623, + "learning_rate": 9.319181506691426e-06, + "loss": 0.8487, + "step": 2198 + }, + { + "epoch": 0.19, + "grad_norm": 9.85285095639592, + "learning_rate": 9.318464812421172e-06, + "loss": 0.8472, + "step": 2199 + }, + { + "epoch": 0.19, + "grad_norm": 11.347270637524593, + "learning_rate": 9.317747768707932e-06, + "loss": 0.7936, + "step": 2200 + }, + { + "epoch": 0.19, + "grad_norm": 2.5888849709049504, + "learning_rate": 9.317030375609721e-06, + "loss": 0.5376, + "step": 2201 + }, + { + "epoch": 0.19, + "grad_norm": 7.853269462069137, + "learning_rate": 9.316312633184596e-06, + "loss": 0.7552, + "step": 2202 + }, + { + "epoch": 0.19, + "grad_norm": 8.831951789754699, + "learning_rate": 9.315594541490631e-06, + "loss": 0.8103, + "step": 2203 + }, + { + "epoch": 0.19, + "grad_norm": 6.6821927798936915, + "learning_rate": 9.314876100585936e-06, + "loss": 0.8805, + "step": 2204 + }, + { + "epoch": 0.19, + "grad_norm": 14.711449273841751, + "learning_rate": 9.314157310528642e-06, + "loss": 0.8597, + "step": 2205 + }, + { + "epoch": 0.19, + "grad_norm": 7.016949304667369, + "learning_rate": 9.313438171376915e-06, + "loss": 0.8165, + "step": 2206 + }, + { + "epoch": 0.19, + "grad_norm": 7.589642192359774, + "learning_rate": 9.312718683188948e-06, + "loss": 0.7887, + "step": 2207 + }, + { + "epoch": 0.19, + "grad_norm": 2.720606534128751, + "learning_rate": 9.311998846022958e-06, + "loss": 0.4812, + "step": 2208 + }, + { + "epoch": 0.19, + "grad_norm": 17.514944698444182, + "learning_rate": 9.311278659937194e-06, + "loss": 0.8346, + "step": 2209 + }, + { + "epoch": 0.19, + "grad_norm": 8.274309370691054, + "learning_rate": 9.310558124989934e-06, + "loss": 0.7204, + "step": 2210 + }, + { + "epoch": 0.19, + "grad_norm": 10.036067007910322, + "learning_rate": 9.30983724123948e-06, + "loss": 0.795, + "step": 2211 + }, + { + 
"epoch": 0.19, + "grad_norm": 5.918119400643399, + "learning_rate": 9.309116008744164e-06, + "loss": 0.7855, + "step": 2212 + }, + { + "epoch": 0.19, + "grad_norm": 10.272481558410762, + "learning_rate": 9.308394427562348e-06, + "loss": 0.7363, + "step": 2213 + }, + { + "epoch": 0.19, + "grad_norm": 13.288042843539282, + "learning_rate": 9.307672497752422e-06, + "loss": 0.9535, + "step": 2214 + }, + { + "epoch": 0.19, + "grad_norm": 20.756400207468698, + "learning_rate": 9.306950219372805e-06, + "loss": 0.9365, + "step": 2215 + }, + { + "epoch": 0.19, + "grad_norm": 9.513012626329063, + "learning_rate": 9.30622759248194e-06, + "loss": 0.8877, + "step": 2216 + }, + { + "epoch": 0.19, + "grad_norm": 14.991511187879475, + "learning_rate": 9.305504617138299e-06, + "loss": 0.8406, + "step": 2217 + }, + { + "epoch": 0.19, + "grad_norm": 9.640955412517107, + "learning_rate": 9.304781293400387e-06, + "loss": 0.8644, + "step": 2218 + }, + { + "epoch": 0.19, + "grad_norm": 8.584801042425791, + "learning_rate": 9.304057621326734e-06, + "loss": 0.8949, + "step": 2219 + }, + { + "epoch": 0.19, + "grad_norm": 13.076175967214043, + "learning_rate": 9.303333600975898e-06, + "loss": 0.8486, + "step": 2220 + }, + { + "epoch": 0.2, + "grad_norm": 12.418933145644813, + "learning_rate": 9.302609232406465e-06, + "loss": 0.7877, + "step": 2221 + }, + { + "epoch": 0.2, + "grad_norm": 8.746023141143436, + "learning_rate": 9.30188451567705e-06, + "loss": 0.8016, + "step": 2222 + }, + { + "epoch": 0.2, + "grad_norm": 7.6354742532692885, + "learning_rate": 9.301159450846296e-06, + "loss": 0.6834, + "step": 2223 + }, + { + "epoch": 0.2, + "grad_norm": 9.357424062715461, + "learning_rate": 9.300434037972873e-06, + "loss": 0.8142, + "step": 2224 + }, + { + "epoch": 0.2, + "grad_norm": 10.089453728472975, + "learning_rate": 9.29970827711548e-06, + "loss": 0.891, + "step": 2225 + }, + { + "epoch": 0.2, + "grad_norm": 14.219284315930924, + "learning_rate": 9.29898216833285e-06, + "loss": 0.7999, + "step": 2226 + }, + { + "epoch": 0.2, + "grad_norm": 16.215252634379457, + "learning_rate": 9.298255711683729e-06, + "loss": 0.9478, + "step": 2227 + }, + { + "epoch": 0.2, + "grad_norm": 11.56764751521027, + "learning_rate": 9.297528907226907e-06, + "loss": 0.849, + "step": 2228 + }, + { + "epoch": 0.2, + "grad_norm": 4.461774923450567, + "learning_rate": 9.296801755021195e-06, + "loss": 0.4971, + "step": 2229 + }, + { + "epoch": 0.2, + "grad_norm": 2.5743314150077836, + "learning_rate": 9.296074255125434e-06, + "loss": 0.4937, + "step": 2230 + }, + { + "epoch": 0.2, + "grad_norm": 7.733544154248474, + "learning_rate": 9.295346407598486e-06, + "loss": 0.8695, + "step": 2231 + }, + { + "epoch": 0.2, + "grad_norm": 11.356317133830634, + "learning_rate": 9.294618212499255e-06, + "loss": 0.9801, + "step": 2232 + }, + { + "epoch": 0.2, + "grad_norm": 15.803866604246108, + "learning_rate": 9.293889669886663e-06, + "loss": 0.8873, + "step": 2233 + }, + { + "epoch": 0.2, + "grad_norm": 9.280917428666825, + "learning_rate": 9.293160779819658e-06, + "loss": 0.9058, + "step": 2234 + }, + { + "epoch": 0.2, + "grad_norm": 18.43779593776189, + "learning_rate": 9.292431542357226e-06, + "loss": 0.9546, + "step": 2235 + }, + { + "epoch": 0.2, + "grad_norm": 9.858649608582477, + "learning_rate": 9.291701957558374e-06, + "loss": 0.9101, + "step": 2236 + }, + { + "epoch": 0.2, + "grad_norm": 10.299089587923444, + "learning_rate": 9.290972025482137e-06, + "loss": 0.8918, + "step": 2237 + }, + { + "epoch": 0.2, + "grad_norm": 10.570481869374415, + 
"learning_rate": 9.290241746187583e-06, + "loss": 0.746, + "step": 2238 + }, + { + "epoch": 0.2, + "grad_norm": 8.936501876328409, + "learning_rate": 9.289511119733802e-06, + "loss": 0.7905, + "step": 2239 + }, + { + "epoch": 0.2, + "grad_norm": 13.433634829570549, + "learning_rate": 9.288780146179918e-06, + "loss": 0.7346, + "step": 2240 + }, + { + "epoch": 0.2, + "grad_norm": 12.407765976530223, + "learning_rate": 9.288048825585076e-06, + "loss": 0.9406, + "step": 2241 + }, + { + "epoch": 0.2, + "grad_norm": 3.184317663429351, + "learning_rate": 9.287317158008458e-06, + "loss": 0.4265, + "step": 2242 + }, + { + "epoch": 0.2, + "grad_norm": 3.7620011873567267, + "learning_rate": 9.286585143509268e-06, + "loss": 0.5658, + "step": 2243 + }, + { + "epoch": 0.2, + "grad_norm": 9.594151145539701, + "learning_rate": 9.285852782146736e-06, + "loss": 1.0636, + "step": 2244 + }, + { + "epoch": 0.2, + "grad_norm": 2.7182141133743545, + "learning_rate": 9.285120073980127e-06, + "loss": 0.5705, + "step": 2245 + }, + { + "epoch": 0.2, + "grad_norm": 10.760095037976736, + "learning_rate": 9.28438701906873e-06, + "loss": 0.8355, + "step": 2246 + }, + { + "epoch": 0.2, + "grad_norm": 9.393777702917735, + "learning_rate": 9.283653617471862e-06, + "loss": 0.7681, + "step": 2247 + }, + { + "epoch": 0.2, + "grad_norm": 7.67658340491093, + "learning_rate": 9.282919869248867e-06, + "loss": 0.776, + "step": 2248 + }, + { + "epoch": 0.2, + "grad_norm": 12.103220858891877, + "learning_rate": 9.282185774459123e-06, + "loss": 0.7735, + "step": 2249 + }, + { + "epoch": 0.2, + "grad_norm": 11.669417728448648, + "learning_rate": 9.281451333162028e-06, + "loss": 0.7665, + "step": 2250 + }, + { + "epoch": 0.2, + "grad_norm": 10.55015550714984, + "learning_rate": 9.280716545417015e-06, + "loss": 0.9349, + "step": 2251 + }, + { + "epoch": 0.2, + "grad_norm": 10.944906878755717, + "learning_rate": 9.279981411283536e-06, + "loss": 0.786, + "step": 2252 + }, + { + "epoch": 0.2, + "grad_norm": 14.277915147840295, + "learning_rate": 9.279245930821083e-06, + "loss": 0.7817, + "step": 2253 + }, + { + "epoch": 0.2, + "grad_norm": 8.901182215048324, + "learning_rate": 9.278510104089167e-06, + "loss": 1.0225, + "step": 2254 + }, + { + "epoch": 0.2, + "grad_norm": 7.2495729175698695, + "learning_rate": 9.27777393114733e-06, + "loss": 0.7234, + "step": 2255 + }, + { + "epoch": 0.2, + "grad_norm": 4.4792008682993085, + "learning_rate": 9.277037412055143e-06, + "loss": 0.5722, + "step": 2256 + }, + { + "epoch": 0.2, + "grad_norm": 2.644301458044473, + "learning_rate": 9.276300546872202e-06, + "loss": 0.5059, + "step": 2257 + }, + { + "epoch": 0.2, + "grad_norm": 11.940166009169074, + "learning_rate": 9.275563335658136e-06, + "loss": 1.0511, + "step": 2258 + }, + { + "epoch": 0.2, + "grad_norm": 12.416647212335901, + "learning_rate": 9.274825778472594e-06, + "loss": 0.8895, + "step": 2259 + }, + { + "epoch": 0.2, + "grad_norm": 11.158686105543886, + "learning_rate": 9.274087875375264e-06, + "loss": 0.9841, + "step": 2260 + }, + { + "epoch": 0.2, + "grad_norm": 11.88771850826283, + "learning_rate": 9.27334962642585e-06, + "loss": 0.6583, + "step": 2261 + }, + { + "epoch": 0.2, + "grad_norm": 14.417038992261732, + "learning_rate": 9.272611031684094e-06, + "loss": 1.0417, + "step": 2262 + }, + { + "epoch": 0.2, + "grad_norm": 10.182270325356253, + "learning_rate": 9.27187209120976e-06, + "loss": 0.902, + "step": 2263 + }, + { + "epoch": 0.2, + "grad_norm": 10.775223325955078, + "learning_rate": 9.271132805062642e-06, + "loss": 0.8943, + 
"step": 2264 + }, + { + "epoch": 0.2, + "grad_norm": 3.232595662221237, + "learning_rate": 9.270393173302563e-06, + "loss": 0.6163, + "step": 2265 + }, + { + "epoch": 0.2, + "grad_norm": 10.900432128289092, + "learning_rate": 9.26965319598937e-06, + "loss": 0.9698, + "step": 2266 + }, + { + "epoch": 0.2, + "grad_norm": 14.402420840667496, + "learning_rate": 9.268912873182945e-06, + "loss": 0.6922, + "step": 2267 + }, + { + "epoch": 0.2, + "grad_norm": 2.8393001946598906, + "learning_rate": 9.268172204943188e-06, + "loss": 0.602, + "step": 2268 + }, + { + "epoch": 0.2, + "grad_norm": 17.488307228129813, + "learning_rate": 9.26743119133004e-06, + "loss": 0.7133, + "step": 2269 + }, + { + "epoch": 0.2, + "grad_norm": 18.218046852922825, + "learning_rate": 9.266689832403455e-06, + "loss": 0.7243, + "step": 2270 + }, + { + "epoch": 0.2, + "grad_norm": 7.019648135439054, + "learning_rate": 9.26594812822343e-06, + "loss": 0.7153, + "step": 2271 + }, + { + "epoch": 0.2, + "grad_norm": 7.926034781696374, + "learning_rate": 9.265206078849976e-06, + "loss": 0.7366, + "step": 2272 + }, + { + "epoch": 0.2, + "grad_norm": 10.52985468186845, + "learning_rate": 9.264463684343139e-06, + "loss": 0.794, + "step": 2273 + }, + { + "epoch": 0.2, + "grad_norm": 11.16985478609595, + "learning_rate": 9.263720944762998e-06, + "loss": 0.9975, + "step": 2274 + }, + { + "epoch": 0.2, + "grad_norm": 7.89062244189141, + "learning_rate": 9.262977860169647e-06, + "loss": 0.8147, + "step": 2275 + }, + { + "epoch": 0.2, + "grad_norm": 2.4668827779327196, + "learning_rate": 9.262234430623221e-06, + "loss": 0.5156, + "step": 2276 + }, + { + "epoch": 0.2, + "grad_norm": 8.713719462386397, + "learning_rate": 9.261490656183873e-06, + "loss": 0.7864, + "step": 2277 + }, + { + "epoch": 0.2, + "grad_norm": 2.937280105908266, + "learning_rate": 9.260746536911792e-06, + "loss": 0.5325, + "step": 2278 + }, + { + "epoch": 0.2, + "grad_norm": 10.7250929877191, + "learning_rate": 9.260002072867187e-06, + "loss": 0.9616, + "step": 2279 + }, + { + "epoch": 0.2, + "grad_norm": 10.942744614472108, + "learning_rate": 9.259257264110301e-06, + "loss": 0.7592, + "step": 2280 + }, + { + "epoch": 0.2, + "grad_norm": 7.10572779271902, + "learning_rate": 9.258512110701401e-06, + "loss": 0.8415, + "step": 2281 + }, + { + "epoch": 0.2, + "grad_norm": 33.8258609939658, + "learning_rate": 9.257766612700788e-06, + "loss": 0.8263, + "step": 2282 + }, + { + "epoch": 0.2, + "grad_norm": 4.172309361943847, + "learning_rate": 9.25702077016878e-06, + "loss": 0.4818, + "step": 2283 + }, + { + "epoch": 0.2, + "grad_norm": 6.611846696420416, + "learning_rate": 9.256274583165732e-06, + "loss": 0.8492, + "step": 2284 + }, + { + "epoch": 0.2, + "grad_norm": 12.240700289109984, + "learning_rate": 9.255528051752025e-06, + "loss": 0.7903, + "step": 2285 + }, + { + "epoch": 0.2, + "grad_norm": 19.27941087658624, + "learning_rate": 9.254781175988069e-06, + "loss": 0.8682, + "step": 2286 + }, + { + "epoch": 0.2, + "grad_norm": 19.075987860152154, + "learning_rate": 9.254033955934296e-06, + "loss": 0.7932, + "step": 2287 + }, + { + "epoch": 0.2, + "grad_norm": 18.487824657738734, + "learning_rate": 9.253286391651172e-06, + "loss": 0.9377, + "step": 2288 + }, + { + "epoch": 0.2, + "grad_norm": 20.291969568168838, + "learning_rate": 9.25253848319919e-06, + "loss": 0.8371, + "step": 2289 + }, + { + "epoch": 0.2, + "grad_norm": 7.006805518821332, + "learning_rate": 9.251790230638865e-06, + "loss": 0.8277, + "step": 2290 + }, + { + "epoch": 0.2, + "grad_norm": 
13.72950870231064, + "learning_rate": 9.251041634030747e-06, + "loss": 0.7082, + "step": 2291 + }, + { + "epoch": 0.2, + "grad_norm": 8.702102219577531, + "learning_rate": 9.250292693435411e-06, + "loss": 0.6293, + "step": 2292 + }, + { + "epoch": 0.2, + "grad_norm": 12.758474648196211, + "learning_rate": 9.249543408913462e-06, + "loss": 0.77, + "step": 2293 + }, + { + "epoch": 0.2, + "grad_norm": 10.913228153922924, + "learning_rate": 9.248793780525529e-06, + "loss": 0.7806, + "step": 2294 + }, + { + "epoch": 0.2, + "grad_norm": 15.158314747903603, + "learning_rate": 9.24804380833227e-06, + "loss": 0.9186, + "step": 2295 + }, + { + "epoch": 0.2, + "grad_norm": 8.543383375690762, + "learning_rate": 9.247293492394372e-06, + "loss": 0.8648, + "step": 2296 + }, + { + "epoch": 0.2, + "grad_norm": 11.246143819904194, + "learning_rate": 9.24654283277255e-06, + "loss": 0.8058, + "step": 2297 + }, + { + "epoch": 0.2, + "grad_norm": 30.396848976556313, + "learning_rate": 9.245791829527547e-06, + "loss": 0.7925, + "step": 2298 + }, + { + "epoch": 0.2, + "grad_norm": 14.377801643520154, + "learning_rate": 9.24504048272013e-06, + "loss": 0.8573, + "step": 2299 + }, + { + "epoch": 0.2, + "grad_norm": 3.6925593523969322, + "learning_rate": 9.244288792411099e-06, + "loss": 0.5973, + "step": 2300 + }, + { + "epoch": 0.2, + "grad_norm": 3.8398593175559457, + "learning_rate": 9.243536758661277e-06, + "loss": 0.5566, + "step": 2301 + }, + { + "epoch": 0.2, + "grad_norm": 9.658600735744356, + "learning_rate": 9.242784381531522e-06, + "loss": 0.7863, + "step": 2302 + }, + { + "epoch": 0.2, + "grad_norm": 20.44309714655454, + "learning_rate": 9.24203166108271e-06, + "loss": 0.8557, + "step": 2303 + }, + { + "epoch": 0.2, + "grad_norm": 6.331682618521454, + "learning_rate": 9.241278597375755e-06, + "loss": 0.8386, + "step": 2304 + }, + { + "epoch": 0.2, + "grad_norm": 9.587910717989503, + "learning_rate": 9.240525190471588e-06, + "loss": 0.8158, + "step": 2305 + }, + { + "epoch": 0.2, + "grad_norm": 8.811623001668234, + "learning_rate": 9.23977144043118e-06, + "loss": 0.7634, + "step": 2306 + }, + { + "epoch": 0.2, + "grad_norm": 9.697732837192698, + "learning_rate": 9.239017347315515e-06, + "loss": 0.8757, + "step": 2307 + }, + { + "epoch": 0.2, + "grad_norm": 11.56672504897552, + "learning_rate": 9.23826291118562e-06, + "loss": 0.7121, + "step": 2308 + }, + { + "epoch": 0.2, + "grad_norm": 94.69806219589384, + "learning_rate": 9.237508132102541e-06, + "loss": 0.881, + "step": 2309 + }, + { + "epoch": 0.2, + "grad_norm": 3.07841621124905, + "learning_rate": 9.236753010127352e-06, + "loss": 0.6014, + "step": 2310 + }, + { + "epoch": 0.2, + "grad_norm": 8.004000756222812, + "learning_rate": 9.235997545321156e-06, + "loss": 0.8508, + "step": 2311 + }, + { + "epoch": 0.2, + "grad_norm": 10.202709038087294, + "learning_rate": 9.235241737745087e-06, + "loss": 0.8159, + "step": 2312 + }, + { + "epoch": 0.2, + "grad_norm": 9.511674604648494, + "learning_rate": 9.234485587460299e-06, + "loss": 0.8566, + "step": 2313 + }, + { + "epoch": 0.2, + "grad_norm": 12.345964702068532, + "learning_rate": 9.233729094527981e-06, + "loss": 1.1465, + "step": 2314 + }, + { + "epoch": 0.2, + "grad_norm": 15.964916572931036, + "learning_rate": 9.23297225900935e-06, + "loss": 0.949, + "step": 2315 + }, + { + "epoch": 0.2, + "grad_norm": 14.085700696304766, + "learning_rate": 9.232215080965644e-06, + "loss": 0.8734, + "step": 2316 + }, + { + "epoch": 0.2, + "grad_norm": 3.031237405480665, + "learning_rate": 9.23145756045813e-06, + "loss": 
0.5554, + "step": 2317 + }, + { + "epoch": 0.2, + "grad_norm": 10.772388460653298, + "learning_rate": 9.230699697548112e-06, + "loss": 0.7234, + "step": 2318 + }, + { + "epoch": 0.2, + "grad_norm": 11.430468541588658, + "learning_rate": 9.229941492296913e-06, + "loss": 1.0174, + "step": 2319 + }, + { + "epoch": 0.2, + "grad_norm": 6.59543015247318, + "learning_rate": 9.229182944765883e-06, + "loss": 0.8163, + "step": 2320 + }, + { + "epoch": 0.2, + "grad_norm": 3.0124352904489484, + "learning_rate": 9.228424055016403e-06, + "loss": 0.5582, + "step": 2321 + }, + { + "epoch": 0.2, + "grad_norm": 9.230991932336627, + "learning_rate": 9.227664823109884e-06, + "loss": 0.843, + "step": 2322 + }, + { + "epoch": 0.2, + "grad_norm": 8.293289310160121, + "learning_rate": 9.226905249107758e-06, + "loss": 0.858, + "step": 2323 + }, + { + "epoch": 0.2, + "grad_norm": 13.238546525105646, + "learning_rate": 9.226145333071493e-06, + "loss": 0.7997, + "step": 2324 + }, + { + "epoch": 0.2, + "grad_norm": 9.230233014551253, + "learning_rate": 9.225385075062575e-06, + "loss": 0.8449, + "step": 2325 + }, + { + "epoch": 0.2, + "grad_norm": 2.274150072978807, + "learning_rate": 9.224624475142526e-06, + "loss": 0.5944, + "step": 2326 + }, + { + "epoch": 0.2, + "grad_norm": 10.637528020136218, + "learning_rate": 9.223863533372891e-06, + "loss": 0.8661, + "step": 2327 + }, + { + "epoch": 0.2, + "grad_norm": 9.638130690052751, + "learning_rate": 9.223102249815246e-06, + "loss": 0.9484, + "step": 2328 + }, + { + "epoch": 0.2, + "grad_norm": 7.226409532440735, + "learning_rate": 9.222340624531193e-06, + "loss": 0.8323, + "step": 2329 + }, + { + "epoch": 0.2, + "grad_norm": 10.914853897944909, + "learning_rate": 9.221578657582358e-06, + "loss": 0.8629, + "step": 2330 + }, + { + "epoch": 0.2, + "grad_norm": 17.266965852753234, + "learning_rate": 9.2208163490304e-06, + "loss": 0.8815, + "step": 2331 + }, + { + "epoch": 0.2, + "grad_norm": 11.588577193768387, + "learning_rate": 9.220053698937005e-06, + "loss": 0.8514, + "step": 2332 + }, + { + "epoch": 0.2, + "grad_norm": 12.209253048357894, + "learning_rate": 9.219290707363885e-06, + "loss": 0.8564, + "step": 2333 + }, + { + "epoch": 0.2, + "grad_norm": 12.200255917746828, + "learning_rate": 9.21852737437278e-06, + "loss": 0.7757, + "step": 2334 + }, + { + "epoch": 0.21, + "grad_norm": 18.633124190752763, + "learning_rate": 9.217763700025453e-06, + "loss": 0.8418, + "step": 2335 + }, + { + "epoch": 0.21, + "grad_norm": 6.746774739961812, + "learning_rate": 9.216999684383708e-06, + "loss": 0.7247, + "step": 2336 + }, + { + "epoch": 0.21, + "grad_norm": 2.4231658889457157, + "learning_rate": 9.216235327509359e-06, + "loss": 0.5176, + "step": 2337 + }, + { + "epoch": 0.21, + "grad_norm": 13.182147279188952, + "learning_rate": 9.215470629464264e-06, + "loss": 0.8068, + "step": 2338 + }, + { + "epoch": 0.21, + "grad_norm": 8.00854223409351, + "learning_rate": 9.214705590310297e-06, + "loss": 0.8055, + "step": 2339 + }, + { + "epoch": 0.21, + "grad_norm": 11.672631592778243, + "learning_rate": 9.213940210109365e-06, + "loss": 0.7749, + "step": 2340 + }, + { + "epoch": 0.21, + "grad_norm": 7.117945096157749, + "learning_rate": 9.213174488923398e-06, + "loss": 0.8185, + "step": 2341 + }, + { + "epoch": 0.21, + "grad_norm": 24.587827361741542, + "learning_rate": 9.212408426814363e-06, + "loss": 0.8842, + "step": 2342 + }, + { + "epoch": 0.21, + "grad_norm": 12.195597625820334, + "learning_rate": 9.211642023844243e-06, + "loss": 0.8988, + "step": 2343 + }, + { + "epoch": 0.21, + 
"grad_norm": 4.639855391845357, + "learning_rate": 9.210875280075056e-06, + "loss": 0.6356, + "step": 2344 + }, + { + "epoch": 0.21, + "grad_norm": 11.809470908586768, + "learning_rate": 9.210108195568847e-06, + "loss": 0.8688, + "step": 2345 + }, + { + "epoch": 0.21, + "grad_norm": 9.084909645026665, + "learning_rate": 9.209340770387685e-06, + "loss": 0.7549, + "step": 2346 + }, + { + "epoch": 0.21, + "grad_norm": 7.852059678840957, + "learning_rate": 9.208573004593671e-06, + "loss": 0.9365, + "step": 2347 + }, + { + "epoch": 0.21, + "grad_norm": 9.526413059223495, + "learning_rate": 9.207804898248928e-06, + "loss": 0.7808, + "step": 2348 + }, + { + "epoch": 0.21, + "grad_norm": 9.013835931413448, + "learning_rate": 9.207036451415613e-06, + "loss": 0.8512, + "step": 2349 + }, + { + "epoch": 0.21, + "grad_norm": 8.842265407157917, + "learning_rate": 9.206267664155906e-06, + "loss": 0.7088, + "step": 2350 + }, + { + "epoch": 0.21, + "grad_norm": 2.8921595549061525, + "learning_rate": 9.205498536532019e-06, + "loss": 0.4858, + "step": 2351 + }, + { + "epoch": 0.21, + "grad_norm": 3.5751781123274453, + "learning_rate": 9.204729068606182e-06, + "loss": 0.5961, + "step": 2352 + }, + { + "epoch": 0.21, + "grad_norm": 3.2709165386624863, + "learning_rate": 9.203959260440664e-06, + "loss": 0.5136, + "step": 2353 + }, + { + "epoch": 0.21, + "grad_norm": 9.754021763273139, + "learning_rate": 9.203189112097757e-06, + "loss": 0.8119, + "step": 2354 + }, + { + "epoch": 0.21, + "grad_norm": 11.443425096665505, + "learning_rate": 9.202418623639779e-06, + "loss": 0.7722, + "step": 2355 + }, + { + "epoch": 0.21, + "grad_norm": 6.287328853006614, + "learning_rate": 9.201647795129074e-06, + "loss": 0.8487, + "step": 2356 + }, + { + "epoch": 0.21, + "grad_norm": 2.913686666596223, + "learning_rate": 9.200876626628022e-06, + "loss": 0.5706, + "step": 2357 + }, + { + "epoch": 0.21, + "grad_norm": 3.0801795567631736, + "learning_rate": 9.200105118199017e-06, + "loss": 0.5391, + "step": 2358 + }, + { + "epoch": 0.21, + "grad_norm": 17.428183430704063, + "learning_rate": 9.199333269904496e-06, + "loss": 0.9094, + "step": 2359 + }, + { + "epoch": 0.21, + "grad_norm": 22.64260474194957, + "learning_rate": 9.19856108180691e-06, + "loss": 0.7391, + "step": 2360 + }, + { + "epoch": 0.21, + "grad_norm": 8.813518815161748, + "learning_rate": 9.197788553968745e-06, + "loss": 0.8062, + "step": 2361 + }, + { + "epoch": 0.21, + "grad_norm": 9.344447633325919, + "learning_rate": 9.197015686452514e-06, + "loss": 0.8996, + "step": 2362 + }, + { + "epoch": 0.21, + "grad_norm": 12.464923770044516, + "learning_rate": 9.196242479320754e-06, + "loss": 0.8951, + "step": 2363 + }, + { + "epoch": 0.21, + "grad_norm": 12.94198548056362, + "learning_rate": 9.195468932636034e-06, + "loss": 0.8609, + "step": 2364 + }, + { + "epoch": 0.21, + "grad_norm": 17.12476184940029, + "learning_rate": 9.194695046460945e-06, + "loss": 0.838, + "step": 2365 + }, + { + "epoch": 0.21, + "grad_norm": 14.727839841594722, + "learning_rate": 9.193920820858113e-06, + "loss": 0.8199, + "step": 2366 + }, + { + "epoch": 0.21, + "grad_norm": 2.9982975644398584, + "learning_rate": 9.193146255890182e-06, + "loss": 0.5275, + "step": 2367 + }, + { + "epoch": 0.21, + "grad_norm": 7.474111144554206, + "learning_rate": 9.19237135161983e-06, + "loss": 0.9422, + "step": 2368 + }, + { + "epoch": 0.21, + "grad_norm": 7.919963879834924, + "learning_rate": 9.191596108109765e-06, + "loss": 0.9634, + "step": 2369 + }, + { + "epoch": 0.21, + "grad_norm": 2.6536519303862036, + 
"learning_rate": 9.190820525422713e-06, + "loss": 0.4592, + "step": 2370 + }, + { + "epoch": 0.21, + "grad_norm": 10.308418141300878, + "learning_rate": 9.190044603621437e-06, + "loss": 0.7123, + "step": 2371 + }, + { + "epoch": 0.21, + "grad_norm": 11.544324942400909, + "learning_rate": 9.18926834276872e-06, + "loss": 0.9809, + "step": 2372 + }, + { + "epoch": 0.21, + "grad_norm": 7.773460371228996, + "learning_rate": 9.188491742927378e-06, + "loss": 0.8878, + "step": 2373 + }, + { + "epoch": 0.21, + "grad_norm": 11.674170859525715, + "learning_rate": 9.187714804160251e-06, + "loss": 0.8703, + "step": 2374 + }, + { + "epoch": 0.21, + "grad_norm": 7.14863189443222, + "learning_rate": 9.186937526530208e-06, + "loss": 0.6231, + "step": 2375 + }, + { + "epoch": 0.21, + "grad_norm": 11.86646597827937, + "learning_rate": 9.186159910100145e-06, + "loss": 0.82, + "step": 2376 + }, + { + "epoch": 0.21, + "grad_norm": 3.1877433087370353, + "learning_rate": 9.185381954932984e-06, + "loss": 0.5151, + "step": 2377 + }, + { + "epoch": 0.21, + "grad_norm": 8.191566884930701, + "learning_rate": 9.18460366109168e-06, + "loss": 0.6635, + "step": 2378 + }, + { + "epoch": 0.21, + "grad_norm": 11.234050008797041, + "learning_rate": 9.183825028639206e-06, + "loss": 0.8417, + "step": 2379 + }, + { + "epoch": 0.21, + "grad_norm": 10.370654532686086, + "learning_rate": 9.183046057638572e-06, + "loss": 1.0118, + "step": 2380 + }, + { + "epoch": 0.21, + "grad_norm": 28.940916724770428, + "learning_rate": 9.182266748152808e-06, + "loss": 0.9654, + "step": 2381 + }, + { + "epoch": 0.21, + "grad_norm": 10.134689869194627, + "learning_rate": 9.181487100244975e-06, + "loss": 0.851, + "step": 2382 + }, + { + "epoch": 0.21, + "grad_norm": 8.73583332810072, + "learning_rate": 9.180707113978164e-06, + "loss": 0.8547, + "step": 2383 + }, + { + "epoch": 0.21, + "grad_norm": 8.978384722037276, + "learning_rate": 9.179926789415485e-06, + "loss": 0.8987, + "step": 2384 + }, + { + "epoch": 0.21, + "grad_norm": 11.024193667442308, + "learning_rate": 9.179146126620085e-06, + "loss": 0.8052, + "step": 2385 + }, + { + "epoch": 0.21, + "grad_norm": 14.516639717107783, + "learning_rate": 9.178365125655131e-06, + "loss": 0.7995, + "step": 2386 + }, + { + "epoch": 0.21, + "grad_norm": 7.205283709238801, + "learning_rate": 9.177583786583821e-06, + "loss": 0.8814, + "step": 2387 + }, + { + "epoch": 0.21, + "grad_norm": 22.475112932433486, + "learning_rate": 9.17680210946938e-06, + "loss": 0.7592, + "step": 2388 + }, + { + "epoch": 0.21, + "grad_norm": 13.068549799410928, + "learning_rate": 9.17602009437506e-06, + "loss": 0.8015, + "step": 2389 + }, + { + "epoch": 0.21, + "grad_norm": 12.230391532897633, + "learning_rate": 9.175237741364143e-06, + "loss": 0.8232, + "step": 2390 + }, + { + "epoch": 0.21, + "grad_norm": 32.42041585494903, + "learning_rate": 9.17445505049993e-06, + "loss": 0.8893, + "step": 2391 + }, + { + "epoch": 0.21, + "grad_norm": 10.560969133428697, + "learning_rate": 9.173672021845759e-06, + "loss": 0.8223, + "step": 2392 + }, + { + "epoch": 0.21, + "grad_norm": 23.025772205673462, + "learning_rate": 9.172888655464991e-06, + "loss": 0.8258, + "step": 2393 + }, + { + "epoch": 0.21, + "grad_norm": 10.463116082617459, + "learning_rate": 9.172104951421014e-06, + "loss": 0.8539, + "step": 2394 + }, + { + "epoch": 0.21, + "grad_norm": 9.061761000788634, + "learning_rate": 9.171320909777244e-06, + "loss": 0.8482, + "step": 2395 + }, + { + "epoch": 0.21, + "grad_norm": 7.8033784352741895, + "learning_rate": 
9.170536530597125e-06, + "loss": 0.7468, + "step": 2396 + }, + { + "epoch": 0.21, + "grad_norm": 11.09441892109543, + "learning_rate": 9.169751813944128e-06, + "loss": 0.9113, + "step": 2397 + }, + { + "epoch": 0.21, + "grad_norm": 12.683443709146452, + "learning_rate": 9.16896675988175e-06, + "loss": 0.7758, + "step": 2398 + }, + { + "epoch": 0.21, + "grad_norm": 11.742804907823002, + "learning_rate": 9.168181368473514e-06, + "loss": 0.7557, + "step": 2399 + }, + { + "epoch": 0.21, + "grad_norm": 13.623356424204387, + "learning_rate": 9.167395639782978e-06, + "loss": 0.9642, + "step": 2400 + }, + { + "epoch": 0.21, + "grad_norm": 26.3912456993807, + "learning_rate": 9.166609573873718e-06, + "loss": 0.7362, + "step": 2401 + }, + { + "epoch": 0.21, + "grad_norm": 2.2975662425384265, + "learning_rate": 9.165823170809343e-06, + "loss": 0.5207, + "step": 2402 + }, + { + "epoch": 0.21, + "grad_norm": 2.911275392683447, + "learning_rate": 9.165036430653485e-06, + "loss": 0.5656, + "step": 2403 + }, + { + "epoch": 0.21, + "grad_norm": 14.743562222982307, + "learning_rate": 9.164249353469807e-06, + "loss": 0.8487, + "step": 2404 + }, + { + "epoch": 0.21, + "grad_norm": 3.272354228110914, + "learning_rate": 9.163461939322e-06, + "loss": 0.4838, + "step": 2405 + }, + { + "epoch": 0.21, + "grad_norm": 10.961046369521961, + "learning_rate": 9.162674188273777e-06, + "loss": 0.6608, + "step": 2406 + }, + { + "epoch": 0.21, + "grad_norm": 8.72314646394867, + "learning_rate": 9.161886100388884e-06, + "loss": 0.8499, + "step": 2407 + }, + { + "epoch": 0.21, + "grad_norm": 12.47570735988328, + "learning_rate": 9.16109767573109e-06, + "loss": 0.7678, + "step": 2408 + }, + { + "epoch": 0.21, + "grad_norm": 3.571898708942293, + "learning_rate": 9.160308914364194e-06, + "loss": 0.54, + "step": 2409 + }, + { + "epoch": 0.21, + "grad_norm": 8.610894946496378, + "learning_rate": 9.159519816352021e-06, + "loss": 0.7583, + "step": 2410 + }, + { + "epoch": 0.21, + "grad_norm": 11.536499792833663, + "learning_rate": 9.158730381758423e-06, + "loss": 0.782, + "step": 2411 + }, + { + "epoch": 0.21, + "grad_norm": 14.188123763465269, + "learning_rate": 9.15794061064728e-06, + "loss": 0.7538, + "step": 2412 + }, + { + "epoch": 0.21, + "grad_norm": 8.696839190504482, + "learning_rate": 9.1571505030825e-06, + "loss": 0.7154, + "step": 2413 + }, + { + "epoch": 0.21, + "grad_norm": 15.002838418595745, + "learning_rate": 9.156360059128016e-06, + "loss": 0.7889, + "step": 2414 + }, + { + "epoch": 0.21, + "grad_norm": 10.802856538431296, + "learning_rate": 9.155569278847789e-06, + "loss": 0.7978, + "step": 2415 + }, + { + "epoch": 0.21, + "grad_norm": 7.616959361904473, + "learning_rate": 9.154778162305808e-06, + "loss": 0.8596, + "step": 2416 + }, + { + "epoch": 0.21, + "grad_norm": 5.682151565366286, + "learning_rate": 9.153986709566092e-06, + "loss": 0.7709, + "step": 2417 + }, + { + "epoch": 0.21, + "grad_norm": 12.596818676300893, + "learning_rate": 9.153194920692678e-06, + "loss": 0.6081, + "step": 2418 + }, + { + "epoch": 0.21, + "grad_norm": 10.892268627047267, + "learning_rate": 9.152402795749642e-06, + "loss": 0.8341, + "step": 2419 + }, + { + "epoch": 0.21, + "grad_norm": 8.596186958430385, + "learning_rate": 9.151610334801078e-06, + "loss": 0.8607, + "step": 2420 + }, + { + "epoch": 0.21, + "grad_norm": 10.170303986289943, + "learning_rate": 9.150817537911111e-06, + "loss": 0.8885, + "step": 2421 + }, + { + "epoch": 0.21, + "grad_norm": 10.805047947380242, + "learning_rate": 9.150024405143895e-06, + "loss": 0.8246, + 
"step": 2422 + }, + { + "epoch": 0.21, + "grad_norm": 12.749213749676104, + "learning_rate": 9.149230936563607e-06, + "loss": 0.8148, + "step": 2423 + }, + { + "epoch": 0.21, + "grad_norm": 8.717211781004625, + "learning_rate": 9.148437132234452e-06, + "loss": 0.7914, + "step": 2424 + }, + { + "epoch": 0.21, + "grad_norm": 27.35322262610756, + "learning_rate": 9.147642992220664e-06, + "loss": 0.8114, + "step": 2425 + }, + { + "epoch": 0.21, + "grad_norm": 9.20647776338119, + "learning_rate": 9.146848516586507e-06, + "loss": 0.7854, + "step": 2426 + }, + { + "epoch": 0.21, + "grad_norm": 12.38404912785943, + "learning_rate": 9.146053705396265e-06, + "loss": 0.7715, + "step": 2427 + }, + { + "epoch": 0.21, + "grad_norm": 9.336240012110432, + "learning_rate": 9.145258558714254e-06, + "loss": 0.8146, + "step": 2428 + }, + { + "epoch": 0.21, + "grad_norm": 6.93266495094906, + "learning_rate": 9.144463076604815e-06, + "loss": 0.7841, + "step": 2429 + }, + { + "epoch": 0.21, + "grad_norm": 9.607955817831286, + "learning_rate": 9.143667259132319e-06, + "loss": 0.7354, + "step": 2430 + }, + { + "epoch": 0.21, + "grad_norm": 13.509070857319983, + "learning_rate": 9.142871106361159e-06, + "loss": 0.9243, + "step": 2431 + }, + { + "epoch": 0.21, + "grad_norm": 6.265791730416452, + "learning_rate": 9.142074618355763e-06, + "loss": 0.7362, + "step": 2432 + }, + { + "epoch": 0.21, + "grad_norm": 9.249010615204027, + "learning_rate": 9.141277795180576e-06, + "loss": 0.8531, + "step": 2433 + }, + { + "epoch": 0.21, + "grad_norm": 6.859118244324205, + "learning_rate": 9.14048063690008e-06, + "loss": 0.7598, + "step": 2434 + }, + { + "epoch": 0.21, + "grad_norm": 7.393199658219389, + "learning_rate": 9.13968314357878e-06, + "loss": 0.7551, + "step": 2435 + }, + { + "epoch": 0.21, + "grad_norm": 11.656366445404096, + "learning_rate": 9.138885315281203e-06, + "loss": 0.8856, + "step": 2436 + }, + { + "epoch": 0.21, + "grad_norm": 2.823905417079812, + "learning_rate": 9.138087152071912e-06, + "loss": 0.5303, + "step": 2437 + }, + { + "epoch": 0.21, + "grad_norm": 9.706798692751171, + "learning_rate": 9.137288654015493e-06, + "loss": 0.8589, + "step": 2438 + }, + { + "epoch": 0.21, + "grad_norm": 9.933798712915333, + "learning_rate": 9.136489821176558e-06, + "loss": 0.8058, + "step": 2439 + }, + { + "epoch": 0.21, + "grad_norm": 7.951495835766219, + "learning_rate": 9.135690653619746e-06, + "loss": 0.9964, + "step": 2440 + }, + { + "epoch": 0.21, + "grad_norm": 7.55586499391044, + "learning_rate": 9.134891151409726e-06, + "loss": 0.9241, + "step": 2441 + }, + { + "epoch": 0.21, + "grad_norm": 7.87040421626567, + "learning_rate": 9.134091314611193e-06, + "loss": 0.8234, + "step": 2442 + }, + { + "epoch": 0.21, + "grad_norm": 8.232945305966926, + "learning_rate": 9.133291143288865e-06, + "loss": 0.7164, + "step": 2443 + }, + { + "epoch": 0.21, + "grad_norm": 6.521968925650039, + "learning_rate": 9.132490637507496e-06, + "loss": 0.6963, + "step": 2444 + }, + { + "epoch": 0.21, + "grad_norm": 9.600905099048335, + "learning_rate": 9.131689797331857e-06, + "loss": 0.7433, + "step": 2445 + }, + { + "epoch": 0.21, + "grad_norm": 16.3525114485886, + "learning_rate": 9.130888622826755e-06, + "loss": 0.6863, + "step": 2446 + }, + { + "epoch": 0.21, + "grad_norm": 8.748099266109493, + "learning_rate": 9.130087114057013e-06, + "loss": 0.8977, + "step": 2447 + }, + { + "epoch": 0.21, + "grad_norm": 12.665186191990877, + "learning_rate": 9.129285271087494e-06, + "loss": 0.9269, + "step": 2448 + }, + { + "epoch": 0.22, + 
"grad_norm": 3.1612063677042954, + "learning_rate": 9.12848309398308e-06, + "loss": 0.504, + "step": 2449 + }, + { + "epoch": 0.22, + "grad_norm": 10.424774232559617, + "learning_rate": 9.12768058280868e-06, + "loss": 0.6531, + "step": 2450 + }, + { + "epoch": 0.22, + "grad_norm": 13.570487782393293, + "learning_rate": 9.126877737629234e-06, + "loss": 0.908, + "step": 2451 + }, + { + "epoch": 0.22, + "grad_norm": 11.314897374010462, + "learning_rate": 9.126074558509708e-06, + "loss": 0.9047, + "step": 2452 + }, + { + "epoch": 0.22, + "grad_norm": 8.699248890744231, + "learning_rate": 9.12527104551509e-06, + "loss": 0.8017, + "step": 2453 + }, + { + "epoch": 0.22, + "grad_norm": 12.641716475639459, + "learning_rate": 9.124467198710401e-06, + "loss": 0.8479, + "step": 2454 + }, + { + "epoch": 0.22, + "grad_norm": 8.04919128767451, + "learning_rate": 9.123663018160687e-06, + "loss": 0.7831, + "step": 2455 + }, + { + "epoch": 0.22, + "grad_norm": 8.385829921681182, + "learning_rate": 9.12285850393102e-06, + "loss": 0.7825, + "step": 2456 + }, + { + "epoch": 0.22, + "grad_norm": 13.542721757335762, + "learning_rate": 9.122053656086503e-06, + "loss": 0.9307, + "step": 2457 + }, + { + "epoch": 0.22, + "grad_norm": 8.438110474123475, + "learning_rate": 9.121248474692261e-06, + "loss": 0.7107, + "step": 2458 + }, + { + "epoch": 0.22, + "grad_norm": 9.746854974349041, + "learning_rate": 9.120442959813448e-06, + "loss": 0.9873, + "step": 2459 + }, + { + "epoch": 0.22, + "grad_norm": 9.787395248123476, + "learning_rate": 9.119637111515243e-06, + "loss": 0.8628, + "step": 2460 + }, + { + "epoch": 0.22, + "grad_norm": 2.948140249906448, + "learning_rate": 9.118830929862854e-06, + "loss": 0.5063, + "step": 2461 + }, + { + "epoch": 0.22, + "grad_norm": 10.701248944615106, + "learning_rate": 9.11802441492152e-06, + "loss": 0.7427, + "step": 2462 + }, + { + "epoch": 0.22, + "grad_norm": 10.245110732753492, + "learning_rate": 9.1172175667565e-06, + "loss": 0.6878, + "step": 2463 + }, + { + "epoch": 0.22, + "grad_norm": 7.466338243074215, + "learning_rate": 9.116410385433083e-06, + "loss": 0.7133, + "step": 2464 + }, + { + "epoch": 0.22, + "grad_norm": 8.431041367768255, + "learning_rate": 9.115602871016585e-06, + "loss": 0.9648, + "step": 2465 + }, + { + "epoch": 0.22, + "grad_norm": 5.671820152515794, + "learning_rate": 9.114795023572348e-06, + "loss": 0.8556, + "step": 2466 + }, + { + "epoch": 0.22, + "grad_norm": 10.517026250990925, + "learning_rate": 9.113986843165743e-06, + "loss": 0.9535, + "step": 2467 + }, + { + "epoch": 0.22, + "grad_norm": 6.527338155128873, + "learning_rate": 9.113178329862166e-06, + "loss": 0.8101, + "step": 2468 + }, + { + "epoch": 0.22, + "grad_norm": 12.93074626300316, + "learning_rate": 9.112369483727041e-06, + "loss": 0.7853, + "step": 2469 + }, + { + "epoch": 0.22, + "grad_norm": 7.880836388606468, + "learning_rate": 9.111560304825817e-06, + "loss": 0.7561, + "step": 2470 + }, + { + "epoch": 0.22, + "grad_norm": 12.310102286219587, + "learning_rate": 9.110750793223972e-06, + "loss": 0.8983, + "step": 2471 + }, + { + "epoch": 0.22, + "grad_norm": 10.75353370238169, + "learning_rate": 9.109940948987013e-06, + "loss": 0.8926, + "step": 2472 + }, + { + "epoch": 0.22, + "grad_norm": 11.065343787590606, + "learning_rate": 9.109130772180465e-06, + "loss": 1.023, + "step": 2473 + }, + { + "epoch": 0.22, + "grad_norm": 9.857536430551415, + "learning_rate": 9.108320262869894e-06, + "loss": 0.9512, + "step": 2474 + }, + { + "epoch": 0.22, + "grad_norm": 9.706756122528711, + 
"learning_rate": 9.10750942112088e-06, + "loss": 0.8576, + "step": 2475 + }, + { + "epoch": 0.22, + "grad_norm": 8.360021991932996, + "learning_rate": 9.106698246999036e-06, + "loss": 0.8224, + "step": 2476 + }, + { + "epoch": 0.22, + "grad_norm": 41.07079356628693, + "learning_rate": 9.10588674057e-06, + "loss": 0.7095, + "step": 2477 + }, + { + "epoch": 0.22, + "grad_norm": 11.098744635627378, + "learning_rate": 9.105074901899438e-06, + "loss": 0.8712, + "step": 2478 + }, + { + "epoch": 0.22, + "grad_norm": 8.063001655776917, + "learning_rate": 9.104262731053045e-06, + "loss": 0.7586, + "step": 2479 + }, + { + "epoch": 0.22, + "grad_norm": 10.620496891829951, + "learning_rate": 9.103450228096539e-06, + "loss": 0.8398, + "step": 2480 + }, + { + "epoch": 0.22, + "grad_norm": 8.0599534624531, + "learning_rate": 9.102637393095666e-06, + "loss": 0.7677, + "step": 2481 + }, + { + "epoch": 0.22, + "grad_norm": 2.7672786833111713, + "learning_rate": 9.1018242261162e-06, + "loss": 0.5731, + "step": 2482 + }, + { + "epoch": 0.22, + "grad_norm": 5.462682741930739, + "learning_rate": 9.10101072722394e-06, + "loss": 0.7737, + "step": 2483 + }, + { + "epoch": 0.22, + "grad_norm": 9.285332535330898, + "learning_rate": 9.100196896484713e-06, + "loss": 0.9872, + "step": 2484 + }, + { + "epoch": 0.22, + "grad_norm": 7.709048638081124, + "learning_rate": 9.099382733964375e-06, + "loss": 0.802, + "step": 2485 + }, + { + "epoch": 0.22, + "grad_norm": 9.720697955902855, + "learning_rate": 9.098568239728805e-06, + "loss": 0.8777, + "step": 2486 + }, + { + "epoch": 0.22, + "grad_norm": 13.792870802036022, + "learning_rate": 9.097753413843909e-06, + "loss": 0.8513, + "step": 2487 + }, + { + "epoch": 0.22, + "grad_norm": 15.050978049993633, + "learning_rate": 9.096938256375624e-06, + "loss": 0.7865, + "step": 2488 + }, + { + "epoch": 0.22, + "grad_norm": 8.829357144755601, + "learning_rate": 9.09612276738991e-06, + "loss": 0.7292, + "step": 2489 + }, + { + "epoch": 0.22, + "grad_norm": 9.496820910723637, + "learning_rate": 9.095306946952756e-06, + "loss": 0.83, + "step": 2490 + }, + { + "epoch": 0.22, + "grad_norm": 10.106642354927912, + "learning_rate": 9.094490795130175e-06, + "loss": 0.7281, + "step": 2491 + }, + { + "epoch": 0.22, + "grad_norm": 7.7049386346665365, + "learning_rate": 9.093674311988209e-06, + "loss": 0.8836, + "step": 2492 + }, + { + "epoch": 0.22, + "grad_norm": 25.71960966893849, + "learning_rate": 9.09285749759293e-06, + "loss": 0.7605, + "step": 2493 + }, + { + "epoch": 0.22, + "grad_norm": 4.712433094435113, + "learning_rate": 9.092040352010428e-06, + "loss": 0.597, + "step": 2494 + }, + { + "epoch": 0.22, + "grad_norm": 8.675418454498432, + "learning_rate": 9.09122287530683e-06, + "loss": 0.9015, + "step": 2495 + }, + { + "epoch": 0.22, + "grad_norm": 8.769381453194686, + "learning_rate": 9.09040506754828e-06, + "loss": 0.9109, + "step": 2496 + }, + { + "epoch": 0.22, + "grad_norm": 11.32798977112156, + "learning_rate": 9.089586928800955e-06, + "loss": 0.9043, + "step": 2497 + }, + { + "epoch": 0.22, + "grad_norm": 14.43036099427619, + "learning_rate": 9.08876845913106e-06, + "loss": 0.7119, + "step": 2498 + }, + { + "epoch": 0.22, + "grad_norm": 29.02388767026782, + "learning_rate": 9.087949658604822e-06, + "loss": 0.6894, + "step": 2499 + }, + { + "epoch": 0.22, + "grad_norm": 11.926408118439415, + "learning_rate": 9.0871305272885e-06, + "loss": 0.7428, + "step": 2500 + }, + { + "epoch": 0.22, + "grad_norm": 8.058327412890554, + "learning_rate": 9.08631106524837e-06, + "loss": 0.7727, 
+ "step": 2501 + }, + { + "epoch": 0.22, + "grad_norm": 11.834125768560954, + "learning_rate": 9.085491272550749e-06, + "loss": 1.0394, + "step": 2502 + }, + { + "epoch": 0.22, + "grad_norm": 6.664084757253887, + "learning_rate": 9.08467114926197e-06, + "loss": 0.7914, + "step": 2503 + }, + { + "epoch": 0.22, + "grad_norm": 9.04177883155767, + "learning_rate": 9.083850695448395e-06, + "loss": 0.811, + "step": 2504 + }, + { + "epoch": 0.22, + "grad_norm": 6.708847594562984, + "learning_rate": 9.083029911176415e-06, + "loss": 0.6356, + "step": 2505 + }, + { + "epoch": 0.22, + "grad_norm": 12.973186848897035, + "learning_rate": 9.082208796512448e-06, + "loss": 0.8472, + "step": 2506 + }, + { + "epoch": 0.22, + "grad_norm": 7.743187077468889, + "learning_rate": 9.081387351522934e-06, + "loss": 0.8985, + "step": 2507 + }, + { + "epoch": 0.22, + "grad_norm": 8.550045580320873, + "learning_rate": 9.080565576274344e-06, + "loss": 0.7843, + "step": 2508 + }, + { + "epoch": 0.22, + "grad_norm": 7.559021211467816, + "learning_rate": 9.079743470833177e-06, + "loss": 0.7449, + "step": 2509 + }, + { + "epoch": 0.22, + "grad_norm": 10.839000834126933, + "learning_rate": 9.078921035265954e-06, + "loss": 0.7999, + "step": 2510 + }, + { + "epoch": 0.22, + "grad_norm": 2.7604519967664904, + "learning_rate": 9.078098269639224e-06, + "loss": 0.5319, + "step": 2511 + }, + { + "epoch": 0.22, + "grad_norm": 49.045106208111655, + "learning_rate": 9.077275174019567e-06, + "loss": 0.8181, + "step": 2512 + }, + { + "epoch": 0.22, + "grad_norm": 15.612074729459543, + "learning_rate": 9.076451748473586e-06, + "loss": 0.7249, + "step": 2513 + }, + { + "epoch": 0.22, + "grad_norm": 30.751597435839884, + "learning_rate": 9.075627993067911e-06, + "loss": 0.8542, + "step": 2514 + }, + { + "epoch": 0.22, + "grad_norm": 7.935284389023723, + "learning_rate": 9.074803907869195e-06, + "loss": 0.7905, + "step": 2515 + }, + { + "epoch": 0.22, + "grad_norm": 8.706421929213896, + "learning_rate": 9.073979492944128e-06, + "loss": 0.8379, + "step": 2516 + }, + { + "epoch": 0.22, + "grad_norm": 3.976762918410569, + "learning_rate": 9.073154748359413e-06, + "loss": 0.5821, + "step": 2517 + }, + { + "epoch": 0.22, + "grad_norm": 9.820414271649556, + "learning_rate": 9.072329674181796e-06, + "loss": 0.8114, + "step": 2518 + }, + { + "epoch": 0.22, + "grad_norm": 9.280025224281635, + "learning_rate": 9.071504270478034e-06, + "loss": 0.8071, + "step": 2519 + }, + { + "epoch": 0.22, + "grad_norm": 7.1754487484345475, + "learning_rate": 9.070678537314919e-06, + "loss": 1.0017, + "step": 2520 + }, + { + "epoch": 0.22, + "grad_norm": 10.079133305806716, + "learning_rate": 9.069852474759266e-06, + "loss": 0.8904, + "step": 2521 + }, + { + "epoch": 0.22, + "grad_norm": 9.82858678393873, + "learning_rate": 9.069026082877924e-06, + "loss": 0.7495, + "step": 2522 + }, + { + "epoch": 0.22, + "grad_norm": 9.980703814492905, + "learning_rate": 9.068199361737758e-06, + "loss": 0.9608, + "step": 2523 + }, + { + "epoch": 0.22, + "grad_norm": 7.648349271821178, + "learning_rate": 9.067372311405667e-06, + "loss": 0.9567, + "step": 2524 + }, + { + "epoch": 0.22, + "grad_norm": 20.89683820531512, + "learning_rate": 9.066544931948574e-06, + "loss": 0.8472, + "step": 2525 + }, + { + "epoch": 0.22, + "grad_norm": 7.498956847391026, + "learning_rate": 9.065717223433429e-06, + "loss": 0.8917, + "step": 2526 + }, + { + "epoch": 0.22, + "grad_norm": 11.317245971247202, + "learning_rate": 9.06488918592721e-06, + "loss": 0.7865, + "step": 2527 + }, + { + "epoch": 
0.22, + "grad_norm": 9.101194871697684, + "learning_rate": 9.064060819496918e-06, + "loss": 0.7598, + "step": 2528 + }, + { + "epoch": 0.22, + "grad_norm": 6.835672370172251, + "learning_rate": 9.063232124209586e-06, + "loss": 0.7296, + "step": 2529 + }, + { + "epoch": 0.22, + "grad_norm": 10.399803738291823, + "learning_rate": 9.06240310013227e-06, + "loss": 0.8171, + "step": 2530 + }, + { + "epoch": 0.22, + "grad_norm": 9.373313410393823, + "learning_rate": 9.061573747332053e-06, + "loss": 0.947, + "step": 2531 + }, + { + "epoch": 0.22, + "grad_norm": 12.410525030920411, + "learning_rate": 9.060744065876044e-06, + "loss": 0.8483, + "step": 2532 + }, + { + "epoch": 0.22, + "grad_norm": 8.535553679567812, + "learning_rate": 9.05991405583138e-06, + "loss": 0.8072, + "step": 2533 + }, + { + "epoch": 0.22, + "grad_norm": 7.507617787948096, + "learning_rate": 9.059083717265222e-06, + "loss": 0.8788, + "step": 2534 + }, + { + "epoch": 0.22, + "grad_norm": 5.2275798415593995, + "learning_rate": 9.058253050244764e-06, + "loss": 0.5808, + "step": 2535 + }, + { + "epoch": 0.22, + "grad_norm": 9.696787389084847, + "learning_rate": 9.057422054837219e-06, + "loss": 0.8869, + "step": 2536 + }, + { + "epoch": 0.22, + "grad_norm": 8.19876038186887, + "learning_rate": 9.056590731109829e-06, + "loss": 0.8751, + "step": 2537 + }, + { + "epoch": 0.22, + "grad_norm": 13.122703336287342, + "learning_rate": 9.055759079129867e-06, + "loss": 0.8717, + "step": 2538 + }, + { + "epoch": 0.22, + "grad_norm": 9.850882002041951, + "learning_rate": 9.054927098964625e-06, + "loss": 0.7499, + "step": 2539 + }, + { + "epoch": 0.22, + "grad_norm": 11.571899294149862, + "learning_rate": 9.05409479068143e-06, + "loss": 0.8284, + "step": 2540 + }, + { + "epoch": 0.22, + "grad_norm": 21.227259754565093, + "learning_rate": 9.053262154347624e-06, + "loss": 0.9996, + "step": 2541 + }, + { + "epoch": 0.22, + "grad_norm": 9.829078040959251, + "learning_rate": 9.052429190030589e-06, + "loss": 0.7908, + "step": 2542 + }, + { + "epoch": 0.22, + "grad_norm": 8.658165445515257, + "learning_rate": 9.051595897797725e-06, + "loss": 1.0294, + "step": 2543 + }, + { + "epoch": 0.22, + "grad_norm": 10.100003659920604, + "learning_rate": 9.050762277716461e-06, + "loss": 0.8153, + "step": 2544 + }, + { + "epoch": 0.22, + "grad_norm": 15.827147021737531, + "learning_rate": 9.04992832985425e-06, + "loss": 0.7556, + "step": 2545 + }, + { + "epoch": 0.22, + "grad_norm": 13.479516154495537, + "learning_rate": 9.049094054278576e-06, + "loss": 0.9445, + "step": 2546 + }, + { + "epoch": 0.22, + "grad_norm": 11.843247184755743, + "learning_rate": 9.048259451056946e-06, + "loss": 0.8198, + "step": 2547 + }, + { + "epoch": 0.22, + "grad_norm": 2.57265279151902, + "learning_rate": 9.047424520256896e-06, + "loss": 0.4868, + "step": 2548 + }, + { + "epoch": 0.22, + "grad_norm": 17.98847235657688, + "learning_rate": 9.046589261945987e-06, + "loss": 0.7176, + "step": 2549 + }, + { + "epoch": 0.22, + "grad_norm": 8.935915476943384, + "learning_rate": 9.045753676191805e-06, + "loss": 0.7751, + "step": 2550 + }, + { + "epoch": 0.22, + "grad_norm": 10.327344050809726, + "learning_rate": 9.044917763061965e-06, + "loss": 0.8453, + "step": 2551 + }, + { + "epoch": 0.22, + "grad_norm": 10.71236529959535, + "learning_rate": 9.044081522624106e-06, + "loss": 0.8089, + "step": 2552 + }, + { + "epoch": 0.22, + "grad_norm": 10.321996657288492, + "learning_rate": 9.0432449549459e-06, + "loss": 0.641, + "step": 2553 + }, + { + "epoch": 0.22, + "grad_norm": 53.45977061818504, + 
"learning_rate": 9.042408060095038e-06, + "loss": 0.8139, + "step": 2554 + }, + { + "epoch": 0.22, + "grad_norm": 8.50471639740241, + "learning_rate": 9.04157083813924e-06, + "loss": 0.8906, + "step": 2555 + }, + { + "epoch": 0.22, + "grad_norm": 18.04177840970329, + "learning_rate": 9.04073328914625e-06, + "loss": 0.802, + "step": 2556 + }, + { + "epoch": 0.22, + "grad_norm": 21.50581115214884, + "learning_rate": 9.039895413183845e-06, + "loss": 1.0448, + "step": 2557 + }, + { + "epoch": 0.22, + "grad_norm": 11.00974955004464, + "learning_rate": 9.039057210319824e-06, + "loss": 0.8727, + "step": 2558 + }, + { + "epoch": 0.22, + "grad_norm": 2.4631882916671297, + "learning_rate": 9.038218680622014e-06, + "loss": 0.4862, + "step": 2559 + }, + { + "epoch": 0.22, + "grad_norm": 8.533885195826144, + "learning_rate": 9.037379824158264e-06, + "loss": 0.8533, + "step": 2560 + }, + { + "epoch": 0.22, + "grad_norm": 11.977127917928032, + "learning_rate": 9.036540640996452e-06, + "loss": 0.8593, + "step": 2561 + }, + { + "epoch": 0.23, + "grad_norm": 8.175812731689694, + "learning_rate": 9.035701131204489e-06, + "loss": 0.7825, + "step": 2562 + }, + { + "epoch": 0.23, + "grad_norm": 8.497230320865429, + "learning_rate": 9.034861294850301e-06, + "loss": 0.8679, + "step": 2563 + }, + { + "epoch": 0.23, + "grad_norm": 11.479895517687519, + "learning_rate": 9.03402113200185e-06, + "loss": 0.9723, + "step": 2564 + }, + { + "epoch": 0.23, + "grad_norm": 2.361675738241693, + "learning_rate": 9.03318064272712e-06, + "loss": 0.4748, + "step": 2565 + }, + { + "epoch": 0.23, + "grad_norm": 7.949367713629304, + "learning_rate": 9.03233982709412e-06, + "loss": 0.9125, + "step": 2566 + }, + { + "epoch": 0.23, + "grad_norm": 10.08150602523918, + "learning_rate": 9.031498685170888e-06, + "loss": 0.8876, + "step": 2567 + }, + { + "epoch": 0.23, + "grad_norm": 13.346031601413342, + "learning_rate": 9.03065721702549e-06, + "loss": 0.9801, + "step": 2568 + }, + { + "epoch": 0.23, + "grad_norm": 9.641599155279625, + "learning_rate": 9.029815422726015e-06, + "loss": 0.8708, + "step": 2569 + }, + { + "epoch": 0.23, + "grad_norm": 2.6524660343920345, + "learning_rate": 9.028973302340578e-06, + "loss": 0.5695, + "step": 2570 + }, + { + "epoch": 0.23, + "grad_norm": 16.263032243642918, + "learning_rate": 9.028130855937323e-06, + "loss": 0.8687, + "step": 2571 + }, + { + "epoch": 0.23, + "grad_norm": 11.261906869031733, + "learning_rate": 9.02728808358442e-06, + "loss": 0.7381, + "step": 2572 + }, + { + "epoch": 0.23, + "grad_norm": 10.878989356251068, + "learning_rate": 9.026444985350064e-06, + "loss": 0.7442, + "step": 2573 + }, + { + "epoch": 0.23, + "grad_norm": 7.2425822415947785, + "learning_rate": 9.02560156130248e-06, + "loss": 0.8638, + "step": 2574 + }, + { + "epoch": 0.23, + "grad_norm": 3.5116626073060266, + "learning_rate": 9.02475781150991e-06, + "loss": 0.4719, + "step": 2575 + }, + { + "epoch": 0.23, + "grad_norm": 10.427644777325973, + "learning_rate": 9.023913736040636e-06, + "loss": 0.9242, + "step": 2576 + }, + { + "epoch": 0.23, + "grad_norm": 8.589370529663812, + "learning_rate": 9.023069334962954e-06, + "loss": 0.7841, + "step": 2577 + }, + { + "epoch": 0.23, + "grad_norm": 22.00058858443507, + "learning_rate": 9.022224608345194e-06, + "loss": 0.9901, + "step": 2578 + }, + { + "epoch": 0.23, + "grad_norm": 8.96896700573423, + "learning_rate": 9.021379556255709e-06, + "loss": 0.7875, + "step": 2579 + }, + { + "epoch": 0.23, + "grad_norm": 9.851283127549733, + "learning_rate": 9.020534178762879e-06, + 
"loss": 0.7943, + "step": 2580 + }, + { + "epoch": 0.23, + "grad_norm": 13.344165736535993, + "learning_rate": 9.019688475935113e-06, + "loss": 0.7818, + "step": 2581 + }, + { + "epoch": 0.23, + "grad_norm": 10.580218262194084, + "learning_rate": 9.01884244784084e-06, + "loss": 0.8368, + "step": 2582 + }, + { + "epoch": 0.23, + "grad_norm": 15.152954607623686, + "learning_rate": 9.017996094548523e-06, + "loss": 0.8665, + "step": 2583 + }, + { + "epoch": 0.23, + "grad_norm": 9.036857734016364, + "learning_rate": 9.017149416126644e-06, + "loss": 0.8636, + "step": 2584 + }, + { + "epoch": 0.23, + "grad_norm": 14.714683803135536, + "learning_rate": 9.016302412643716e-06, + "loss": 0.8482, + "step": 2585 + }, + { + "epoch": 0.23, + "grad_norm": 8.136044799833, + "learning_rate": 9.015455084168279e-06, + "loss": 0.9185, + "step": 2586 + }, + { + "epoch": 0.23, + "grad_norm": 7.394265949737332, + "learning_rate": 9.014607430768896e-06, + "loss": 0.8523, + "step": 2587 + }, + { + "epoch": 0.23, + "grad_norm": 7.686633125309811, + "learning_rate": 9.013759452514156e-06, + "loss": 0.9518, + "step": 2588 + }, + { + "epoch": 0.23, + "grad_norm": 8.094556814911147, + "learning_rate": 9.01291114947268e-06, + "loss": 0.8782, + "step": 2589 + }, + { + "epoch": 0.23, + "grad_norm": 8.340010528948659, + "learning_rate": 9.012062521713107e-06, + "loss": 0.9044, + "step": 2590 + }, + { + "epoch": 0.23, + "grad_norm": 15.7468707519551, + "learning_rate": 9.011213569304108e-06, + "loss": 0.9686, + "step": 2591 + }, + { + "epoch": 0.23, + "grad_norm": 8.159728367393265, + "learning_rate": 9.010364292314381e-06, + "loss": 0.6794, + "step": 2592 + }, + { + "epoch": 0.23, + "grad_norm": 3.4057295638889444, + "learning_rate": 9.009514690812646e-06, + "loss": 0.6204, + "step": 2593 + }, + { + "epoch": 0.23, + "grad_norm": 10.995492219045452, + "learning_rate": 9.008664764867652e-06, + "loss": 0.7203, + "step": 2594 + }, + { + "epoch": 0.23, + "grad_norm": 10.77995577063491, + "learning_rate": 9.007814514548172e-06, + "loss": 0.8371, + "step": 2595 + }, + { + "epoch": 0.23, + "grad_norm": 13.371637921465135, + "learning_rate": 9.006963939923011e-06, + "loss": 0.9211, + "step": 2596 + }, + { + "epoch": 0.23, + "grad_norm": 14.017314227421243, + "learning_rate": 9.00611304106099e-06, + "loss": 0.9818, + "step": 2597 + }, + { + "epoch": 0.23, + "grad_norm": 8.404885843431742, + "learning_rate": 9.005261818030966e-06, + "loss": 0.8759, + "step": 2598 + }, + { + "epoch": 0.23, + "grad_norm": 11.450552299991548, + "learning_rate": 9.00441027090182e-06, + "loss": 0.9073, + "step": 2599 + }, + { + "epoch": 0.23, + "grad_norm": 7.565092383951006, + "learning_rate": 9.003558399742454e-06, + "loss": 0.8119, + "step": 2600 + }, + { + "epoch": 0.23, + "grad_norm": 6.083400833524064, + "learning_rate": 9.002706204621802e-06, + "loss": 0.8399, + "step": 2601 + }, + { + "epoch": 0.23, + "grad_norm": 2.635415736673015, + "learning_rate": 9.001853685608824e-06, + "loss": 0.5196, + "step": 2602 + }, + { + "epoch": 0.23, + "grad_norm": 13.163399343044263, + "learning_rate": 9.001000842772501e-06, + "loss": 0.7095, + "step": 2603 + }, + { + "epoch": 0.23, + "grad_norm": 14.831711294238744, + "learning_rate": 9.000147676181845e-06, + "loss": 0.8782, + "step": 2604 + }, + { + "epoch": 0.23, + "grad_norm": 9.697528737176013, + "learning_rate": 8.999294185905894e-06, + "loss": 0.7287, + "step": 2605 + }, + { + "epoch": 0.23, + "grad_norm": 2.5002432890612054, + "learning_rate": 8.998440372013709e-06, + "loss": 0.4804, + "step": 2606 + }, + { 
+ "epoch": 0.23, + "grad_norm": 9.935992108614176, + "learning_rate": 8.99758623457438e-06, + "loss": 0.8705, + "step": 2607 + }, + { + "epoch": 0.23, + "grad_norm": 2.1636023294228286, + "learning_rate": 8.996731773657022e-06, + "loss": 0.4973, + "step": 2608 + }, + { + "epoch": 0.23, + "grad_norm": 9.278712091683214, + "learning_rate": 8.995876989330778e-06, + "loss": 0.7553, + "step": 2609 + }, + { + "epoch": 0.23, + "grad_norm": 7.7289484680700085, + "learning_rate": 8.995021881664816e-06, + "loss": 0.8854, + "step": 2610 + }, + { + "epoch": 0.23, + "grad_norm": 9.026496632403578, + "learning_rate": 8.994166450728327e-06, + "loss": 0.9641, + "step": 2611 + }, + { + "epoch": 0.23, + "grad_norm": 23.017807327730264, + "learning_rate": 8.993310696590534e-06, + "loss": 0.8811, + "step": 2612 + }, + { + "epoch": 0.23, + "grad_norm": 9.64204447226167, + "learning_rate": 8.99245461932068e-06, + "loss": 0.8351, + "step": 2613 + }, + { + "epoch": 0.23, + "grad_norm": 3.2045251222441027, + "learning_rate": 8.99159821898804e-06, + "loss": 0.5823, + "step": 2614 + }, + { + "epoch": 0.23, + "grad_norm": 11.354099513582234, + "learning_rate": 8.990741495661913e-06, + "loss": 0.7731, + "step": 2615 + }, + { + "epoch": 0.23, + "grad_norm": 32.33276344588963, + "learning_rate": 8.989884449411622e-06, + "loss": 0.75, + "step": 2616 + }, + { + "epoch": 0.23, + "grad_norm": 2.360573944893467, + "learning_rate": 8.989027080306518e-06, + "loss": 0.4767, + "step": 2617 + }, + { + "epoch": 0.23, + "grad_norm": 27.38268802116189, + "learning_rate": 8.988169388415977e-06, + "loss": 0.7864, + "step": 2618 + }, + { + "epoch": 0.23, + "grad_norm": 11.240382718952233, + "learning_rate": 8.987311373809405e-06, + "loss": 0.8007, + "step": 2619 + }, + { + "epoch": 0.23, + "grad_norm": 11.338031677168848, + "learning_rate": 8.986453036556228e-06, + "loss": 0.92, + "step": 2620 + }, + { + "epoch": 0.23, + "grad_norm": 10.790627584734612, + "learning_rate": 8.985594376725904e-06, + "loss": 0.7879, + "step": 2621 + }, + { + "epoch": 0.23, + "grad_norm": 5.683329986437577, + "learning_rate": 8.984735394387911e-06, + "loss": 0.7585, + "step": 2622 + }, + { + "epoch": 0.23, + "grad_norm": 8.114383031694336, + "learning_rate": 8.983876089611759e-06, + "loss": 0.8262, + "step": 2623 + }, + { + "epoch": 0.23, + "grad_norm": 12.034907837207568, + "learning_rate": 8.98301646246698e-06, + "loss": 0.6498, + "step": 2624 + }, + { + "epoch": 0.23, + "grad_norm": 15.137588431552933, + "learning_rate": 8.982156513023134e-06, + "loss": 0.8162, + "step": 2625 + }, + { + "epoch": 0.23, + "grad_norm": 10.424891065032524, + "learning_rate": 8.981296241349808e-06, + "loss": 0.85, + "step": 2626 + }, + { + "epoch": 0.23, + "grad_norm": 15.727684764106195, + "learning_rate": 8.980435647516612e-06, + "loss": 0.8816, + "step": 2627 + }, + { + "epoch": 0.23, + "grad_norm": 7.904122154724245, + "learning_rate": 8.979574731593183e-06, + "loss": 0.8969, + "step": 2628 + }, + { + "epoch": 0.23, + "grad_norm": 22.411422957249737, + "learning_rate": 8.978713493649189e-06, + "loss": 0.8364, + "step": 2629 + }, + { + "epoch": 0.23, + "grad_norm": 9.355151942742737, + "learning_rate": 8.977851933754317e-06, + "loss": 0.8463, + "step": 2630 + }, + { + "epoch": 0.23, + "grad_norm": 43.96206686017489, + "learning_rate": 8.976990051978283e-06, + "loss": 0.7711, + "step": 2631 + }, + { + "epoch": 0.23, + "grad_norm": 3.217613790185671, + "learning_rate": 8.976127848390828e-06, + "loss": 0.4874, + "step": 2632 + }, + { + "epoch": 0.23, + "grad_norm": 
12.292473562886972, + "learning_rate": 8.975265323061723e-06, + "loss": 0.9639, + "step": 2633 + }, + { + "epoch": 0.23, + "grad_norm": 2.467073582554485, + "learning_rate": 8.97440247606076e-06, + "loss": 0.5237, + "step": 2634 + }, + { + "epoch": 0.23, + "grad_norm": 16.516359587397435, + "learning_rate": 8.97353930745776e-06, + "loss": 0.7574, + "step": 2635 + }, + { + "epoch": 0.23, + "grad_norm": 12.676732825699423, + "learning_rate": 8.97267581732257e-06, + "loss": 0.7567, + "step": 2636 + }, + { + "epoch": 0.23, + "grad_norm": 13.225043077611062, + "learning_rate": 8.971812005725059e-06, + "loss": 0.7454, + "step": 2637 + }, + { + "epoch": 0.23, + "grad_norm": 15.258436295412833, + "learning_rate": 8.970947872735128e-06, + "loss": 0.9491, + "step": 2638 + }, + { + "epoch": 0.23, + "grad_norm": 14.088035633809222, + "learning_rate": 8.970083418422701e-06, + "loss": 0.8565, + "step": 2639 + }, + { + "epoch": 0.23, + "grad_norm": 10.886159480818176, + "learning_rate": 8.969218642857727e-06, + "loss": 0.9058, + "step": 2640 + }, + { + "epoch": 0.23, + "grad_norm": 11.548636232478971, + "learning_rate": 8.968353546110181e-06, + "loss": 0.7116, + "step": 2641 + }, + { + "epoch": 0.23, + "grad_norm": 8.690418394846773, + "learning_rate": 8.96748812825007e-06, + "loss": 0.8341, + "step": 2642 + }, + { + "epoch": 0.23, + "grad_norm": 8.273844195349044, + "learning_rate": 8.966622389347419e-06, + "loss": 0.8054, + "step": 2643 + }, + { + "epoch": 0.23, + "grad_norm": 8.53804662596126, + "learning_rate": 8.965756329472282e-06, + "loss": 0.7543, + "step": 2644 + }, + { + "epoch": 0.23, + "grad_norm": 7.128076955600173, + "learning_rate": 8.964889948694739e-06, + "loss": 0.621, + "step": 2645 + }, + { + "epoch": 0.23, + "grad_norm": 11.465882598485543, + "learning_rate": 8.964023247084898e-06, + "loss": 0.8775, + "step": 2646 + }, + { + "epoch": 0.23, + "grad_norm": 7.730966448537998, + "learning_rate": 8.963156224712888e-06, + "loss": 0.7421, + "step": 2647 + }, + { + "epoch": 0.23, + "grad_norm": 242.203127627351, + "learning_rate": 8.962288881648869e-06, + "loss": 1.013, + "step": 2648 + }, + { + "epoch": 0.23, + "grad_norm": 13.163299672669535, + "learning_rate": 8.961421217963027e-06, + "loss": 0.8465, + "step": 2649 + }, + { + "epoch": 0.23, + "grad_norm": 9.692196180351885, + "learning_rate": 8.960553233725567e-06, + "loss": 0.9074, + "step": 2650 + }, + { + "epoch": 0.23, + "grad_norm": 10.098302969368788, + "learning_rate": 8.959684929006727e-06, + "loss": 0.6775, + "step": 2651 + }, + { + "epoch": 0.23, + "grad_norm": 2.888651035477564, + "learning_rate": 8.95881630387677e-06, + "loss": 0.6065, + "step": 2652 + }, + { + "epoch": 0.23, + "grad_norm": 11.127070686557985, + "learning_rate": 8.957947358405982e-06, + "loss": 0.9073, + "step": 2653 + }, + { + "epoch": 0.23, + "grad_norm": 14.500775616293636, + "learning_rate": 8.95707809266468e-06, + "loss": 0.9357, + "step": 2654 + }, + { + "epoch": 0.23, + "grad_norm": 18.895354024414395, + "learning_rate": 8.9562085067232e-06, + "loss": 0.7657, + "step": 2655 + }, + { + "epoch": 0.23, + "grad_norm": 8.801030910609436, + "learning_rate": 8.955338600651906e-06, + "loss": 0.9016, + "step": 2656 + }, + { + "epoch": 0.23, + "grad_norm": 8.330409024816165, + "learning_rate": 8.954468374521194e-06, + "loss": 0.7773, + "step": 2657 + }, + { + "epoch": 0.23, + "grad_norm": 40.10974853283794, + "learning_rate": 8.953597828401479e-06, + "loss": 0.7122, + "step": 2658 + }, + { + "epoch": 0.23, + "grad_norm": 12.938343076615126, + "learning_rate": 
8.952726962363203e-06, + "loss": 0.8453, + "step": 2659 + }, + { + "epoch": 0.23, + "grad_norm": 7.830307925466359, + "learning_rate": 8.951855776476836e-06, + "loss": 0.8674, + "step": 2660 + }, + { + "epoch": 0.23, + "grad_norm": 9.522380421434555, + "learning_rate": 8.950984270812875e-06, + "loss": 0.7266, + "step": 2661 + }, + { + "epoch": 0.23, + "grad_norm": 7.219627672371025, + "learning_rate": 8.950112445441839e-06, + "loss": 0.8392, + "step": 2662 + }, + { + "epoch": 0.23, + "grad_norm": 13.74280873308201, + "learning_rate": 8.949240300434272e-06, + "loss": 0.938, + "step": 2663 + }, + { + "epoch": 0.23, + "grad_norm": 7.038785917079529, + "learning_rate": 8.948367835860752e-06, + "loss": 0.7838, + "step": 2664 + }, + { + "epoch": 0.23, + "grad_norm": 6.546885990222684, + "learning_rate": 8.947495051791872e-06, + "loss": 0.8871, + "step": 2665 + }, + { + "epoch": 0.23, + "grad_norm": 6.570875807450431, + "learning_rate": 8.946621948298261e-06, + "loss": 0.8965, + "step": 2666 + }, + { + "epoch": 0.23, + "grad_norm": 9.317995848632915, + "learning_rate": 8.945748525450566e-06, + "loss": 0.7833, + "step": 2667 + }, + { + "epoch": 0.23, + "grad_norm": 11.660496780010149, + "learning_rate": 8.944874783319465e-06, + "loss": 0.7723, + "step": 2668 + }, + { + "epoch": 0.23, + "grad_norm": 11.914897554148682, + "learning_rate": 8.944000721975656e-06, + "loss": 0.8347, + "step": 2669 + }, + { + "epoch": 0.23, + "grad_norm": 8.814674974637281, + "learning_rate": 8.94312634148987e-06, + "loss": 0.7302, + "step": 2670 + }, + { + "epoch": 0.23, + "grad_norm": 17.011075054299127, + "learning_rate": 8.94225164193286e-06, + "loss": 0.8974, + "step": 2671 + }, + { + "epoch": 0.23, + "grad_norm": 7.513615806997398, + "learning_rate": 8.941376623375403e-06, + "loss": 0.928, + "step": 2672 + }, + { + "epoch": 0.23, + "grad_norm": 10.437279209789535, + "learning_rate": 8.940501285888309e-06, + "loss": 0.869, + "step": 2673 + }, + { + "epoch": 0.23, + "grad_norm": 5.81192522449214, + "learning_rate": 8.939625629542401e-06, + "loss": 0.739, + "step": 2674 + }, + { + "epoch": 0.23, + "grad_norm": 11.50464952936879, + "learning_rate": 8.938749654408545e-06, + "loss": 0.8408, + "step": 2675 + }, + { + "epoch": 0.24, + "grad_norm": 8.573098339756779, + "learning_rate": 8.937873360557617e-06, + "loss": 0.9357, + "step": 2676 + }, + { + "epoch": 0.24, + "grad_norm": 10.996251423844143, + "learning_rate": 8.936996748060527e-06, + "loss": 0.7964, + "step": 2677 + }, + { + "epoch": 0.24, + "grad_norm": 9.416216987620327, + "learning_rate": 8.936119816988209e-06, + "loss": 0.7478, + "step": 2678 + }, + { + "epoch": 0.24, + "grad_norm": 8.727642351166718, + "learning_rate": 8.935242567411622e-06, + "loss": 0.881, + "step": 2679 + }, + { + "epoch": 0.24, + "grad_norm": 12.450134043643198, + "learning_rate": 8.934364999401752e-06, + "loss": 0.8287, + "step": 2680 + }, + { + "epoch": 0.24, + "grad_norm": 8.512151958796569, + "learning_rate": 8.933487113029613e-06, + "loss": 0.8702, + "step": 2681 + }, + { + "epoch": 0.24, + "grad_norm": 9.20990698327382, + "learning_rate": 8.932608908366239e-06, + "loss": 0.9249, + "step": 2682 + }, + { + "epoch": 0.24, + "grad_norm": 12.206366043134553, + "learning_rate": 8.931730385482692e-06, + "loss": 0.9043, + "step": 2683 + }, + { + "epoch": 0.24, + "grad_norm": 9.41008057643673, + "learning_rate": 8.930851544450063e-06, + "loss": 0.9716, + "step": 2684 + }, + { + "epoch": 0.24, + "grad_norm": 9.452807500846067, + "learning_rate": 8.929972385339466e-06, + "loss": 0.8101, + 
"step": 2685 + }, + { + "epoch": 0.24, + "grad_norm": 7.470808317545056, + "learning_rate": 8.929092908222042e-06, + "loss": 0.833, + "step": 2686 + }, + { + "epoch": 0.24, + "grad_norm": 2.3706429791275165, + "learning_rate": 8.928213113168954e-06, + "loss": 0.4475, + "step": 2687 + }, + { + "epoch": 0.24, + "grad_norm": 7.838029802220112, + "learning_rate": 8.927333000251396e-06, + "loss": 0.7497, + "step": 2688 + }, + { + "epoch": 0.24, + "grad_norm": 8.623827532675444, + "learning_rate": 8.926452569540585e-06, + "loss": 0.7899, + "step": 2689 + }, + { + "epoch": 0.24, + "grad_norm": 8.62003066290925, + "learning_rate": 8.925571821107761e-06, + "loss": 0.9972, + "step": 2690 + }, + { + "epoch": 0.24, + "grad_norm": 19.847204343360662, + "learning_rate": 8.924690755024197e-06, + "loss": 0.864, + "step": 2691 + }, + { + "epoch": 0.24, + "grad_norm": 6.893028749142685, + "learning_rate": 8.923809371361184e-06, + "loss": 0.8239, + "step": 2692 + }, + { + "epoch": 0.24, + "grad_norm": 11.262934641537084, + "learning_rate": 8.922927670190047e-06, + "loss": 0.8831, + "step": 2693 + }, + { + "epoch": 0.24, + "grad_norm": 9.403777913361179, + "learning_rate": 8.922045651582126e-06, + "loss": 0.7279, + "step": 2694 + }, + { + "epoch": 0.24, + "grad_norm": 10.307161786278826, + "learning_rate": 8.921163315608793e-06, + "loss": 0.8284, + "step": 2695 + }, + { + "epoch": 0.24, + "grad_norm": 12.949439487574237, + "learning_rate": 8.92028066234145e-06, + "loss": 0.8601, + "step": 2696 + }, + { + "epoch": 0.24, + "grad_norm": 9.223233162483217, + "learning_rate": 8.919397691851515e-06, + "loss": 0.8482, + "step": 2697 + }, + { + "epoch": 0.24, + "grad_norm": 9.12497462684437, + "learning_rate": 8.918514404210439e-06, + "loss": 0.8867, + "step": 2698 + }, + { + "epoch": 0.24, + "grad_norm": 11.276704424041382, + "learning_rate": 8.917630799489696e-06, + "loss": 0.755, + "step": 2699 + }, + { + "epoch": 0.24, + "grad_norm": 6.737818537647922, + "learning_rate": 8.916746877760785e-06, + "loss": 0.9616, + "step": 2700 + }, + { + "epoch": 0.24, + "grad_norm": 9.321664265280825, + "learning_rate": 8.915862639095232e-06, + "loss": 0.8224, + "step": 2701 + }, + { + "epoch": 0.24, + "grad_norm": 14.011338659018298, + "learning_rate": 8.914978083564588e-06, + "loss": 0.7572, + "step": 2702 + }, + { + "epoch": 0.24, + "grad_norm": 8.952576377241687, + "learning_rate": 8.91409321124043e-06, + "loss": 0.8019, + "step": 2703 + }, + { + "epoch": 0.24, + "grad_norm": 15.16437517256294, + "learning_rate": 8.91320802219436e-06, + "loss": 0.7869, + "step": 2704 + }, + { + "epoch": 0.24, + "grad_norm": 7.80481390425256, + "learning_rate": 8.912322516498004e-06, + "loss": 0.9538, + "step": 2705 + }, + { + "epoch": 0.24, + "grad_norm": 12.763768025608229, + "learning_rate": 8.91143669422302e-06, + "loss": 0.9308, + "step": 2706 + }, + { + "epoch": 0.24, + "grad_norm": 8.22062502712083, + "learning_rate": 8.910550555441085e-06, + "loss": 0.8075, + "step": 2707 + }, + { + "epoch": 0.24, + "grad_norm": 27.553787100896187, + "learning_rate": 8.909664100223903e-06, + "loss": 0.8607, + "step": 2708 + }, + { + "epoch": 0.24, + "grad_norm": 15.265118298131133, + "learning_rate": 8.908777328643206e-06, + "loss": 0.7136, + "step": 2709 + }, + { + "epoch": 0.24, + "grad_norm": 9.468534166143646, + "learning_rate": 8.90789024077075e-06, + "loss": 0.8643, + "step": 2710 + }, + { + "epoch": 0.24, + "grad_norm": 9.808692045773101, + "learning_rate": 8.907002836678315e-06, + "loss": 0.7675, + "step": 2711 + }, + { + "epoch": 0.24, + 
"grad_norm": 20.89170237047966, + "learning_rate": 8.90611511643771e-06, + "loss": 0.8294, + "step": 2712 + }, + { + "epoch": 0.24, + "grad_norm": 7.719788255322534, + "learning_rate": 8.905227080120766e-06, + "loss": 0.8325, + "step": 2713 + }, + { + "epoch": 0.24, + "grad_norm": 6.849823040129696, + "learning_rate": 8.904338727799344e-06, + "loss": 0.6901, + "step": 2714 + }, + { + "epoch": 0.24, + "grad_norm": 11.431983828742245, + "learning_rate": 8.903450059545327e-06, + "loss": 0.7771, + "step": 2715 + }, + { + "epoch": 0.24, + "grad_norm": 8.019106658333575, + "learning_rate": 8.902561075430625e-06, + "loss": 0.7658, + "step": 2716 + }, + { + "epoch": 0.24, + "grad_norm": 12.332764295354535, + "learning_rate": 8.90167177552717e-06, + "loss": 0.8606, + "step": 2717 + }, + { + "epoch": 0.24, + "grad_norm": 8.704477350163472, + "learning_rate": 8.900782159906927e-06, + "loss": 0.858, + "step": 2718 + }, + { + "epoch": 0.24, + "grad_norm": 11.124786270904718, + "learning_rate": 8.89989222864188e-06, + "loss": 0.761, + "step": 2719 + }, + { + "epoch": 0.24, + "grad_norm": 10.710773840344016, + "learning_rate": 8.89900198180404e-06, + "loss": 0.8654, + "step": 2720 + }, + { + "epoch": 0.24, + "grad_norm": 7.033758878223877, + "learning_rate": 8.898111419465444e-06, + "loss": 0.7248, + "step": 2721 + }, + { + "epoch": 0.24, + "grad_norm": 4.521222998672539, + "learning_rate": 8.89722054169816e-06, + "loss": 0.5531, + "step": 2722 + }, + { + "epoch": 0.24, + "grad_norm": 6.561645678258267, + "learning_rate": 8.896329348574269e-06, + "loss": 0.7482, + "step": 2723 + }, + { + "epoch": 0.24, + "grad_norm": 2.3154147968701015, + "learning_rate": 8.895437840165891e-06, + "loss": 0.5145, + "step": 2724 + }, + { + "epoch": 0.24, + "grad_norm": 2.8005070184367877, + "learning_rate": 8.89454601654516e-06, + "loss": 0.522, + "step": 2725 + }, + { + "epoch": 0.24, + "grad_norm": 12.587122910060845, + "learning_rate": 8.893653877784245e-06, + "loss": 0.7354, + "step": 2726 + }, + { + "epoch": 0.24, + "grad_norm": 12.185385744699637, + "learning_rate": 8.892761423955336e-06, + "loss": 0.8466, + "step": 2727 + }, + { + "epoch": 0.24, + "grad_norm": 17.230245230788313, + "learning_rate": 8.891868655130646e-06, + "loss": 0.8913, + "step": 2728 + }, + { + "epoch": 0.24, + "grad_norm": 11.481043355620566, + "learning_rate": 8.890975571382419e-06, + "loss": 0.8169, + "step": 2729 + }, + { + "epoch": 0.24, + "grad_norm": 7.161285288080518, + "learning_rate": 8.890082172782921e-06, + "loss": 0.8612, + "step": 2730 + }, + { + "epoch": 0.24, + "grad_norm": 10.13437047646372, + "learning_rate": 8.889188459404445e-06, + "loss": 0.9108, + "step": 2731 + }, + { + "epoch": 0.24, + "grad_norm": 13.243882555270442, + "learning_rate": 8.888294431319307e-06, + "loss": 0.938, + "step": 2732 + }, + { + "epoch": 0.24, + "grad_norm": 6.3006422685139905, + "learning_rate": 8.887400088599852e-06, + "loss": 0.8213, + "step": 2733 + }, + { + "epoch": 0.24, + "grad_norm": 7.820668911091511, + "learning_rate": 8.886505431318449e-06, + "loss": 0.9148, + "step": 2734 + }, + { + "epoch": 0.24, + "grad_norm": 9.311390025109425, + "learning_rate": 8.88561045954749e-06, + "loss": 0.9808, + "step": 2735 + }, + { + "epoch": 0.24, + "grad_norm": 2.739904156418659, + "learning_rate": 8.884715173359397e-06, + "loss": 0.477, + "step": 2736 + }, + { + "epoch": 0.24, + "grad_norm": 3.1213126575304058, + "learning_rate": 8.883819572826613e-06, + "loss": 0.5635, + "step": 2737 + }, + { + "epoch": 0.24, + "grad_norm": 14.935420169545374, + 
"learning_rate": 8.88292365802161e-06, + "loss": 0.946, + "step": 2738 + }, + { + "epoch": 0.24, + "grad_norm": 16.463498828773197, + "learning_rate": 8.882027429016883e-06, + "loss": 0.7296, + "step": 2739 + }, + { + "epoch": 0.24, + "grad_norm": 14.9999016673107, + "learning_rate": 8.881130885884955e-06, + "loss": 0.7614, + "step": 2740 + }, + { + "epoch": 0.24, + "grad_norm": 2.435941053877029, + "learning_rate": 8.88023402869837e-06, + "loss": 0.421, + "step": 2741 + }, + { + "epoch": 0.24, + "grad_norm": 7.557310474933913, + "learning_rate": 8.879336857529706e-06, + "loss": 1.0387, + "step": 2742 + }, + { + "epoch": 0.24, + "grad_norm": 8.452372618399833, + "learning_rate": 8.878439372451552e-06, + "loss": 0.72, + "step": 2743 + }, + { + "epoch": 0.24, + "grad_norm": 6.996366334121361, + "learning_rate": 8.877541573536537e-06, + "loss": 0.7242, + "step": 2744 + }, + { + "epoch": 0.24, + "grad_norm": 9.08220205922114, + "learning_rate": 8.876643460857308e-06, + "loss": 0.7446, + "step": 2745 + }, + { + "epoch": 0.24, + "grad_norm": 9.232144933819198, + "learning_rate": 8.875745034486538e-06, + "loss": 0.8685, + "step": 2746 + }, + { + "epoch": 0.24, + "grad_norm": 9.84630573179566, + "learning_rate": 8.874846294496928e-06, + "loss": 0.804, + "step": 2747 + }, + { + "epoch": 0.24, + "grad_norm": 8.858867392262637, + "learning_rate": 8.8739472409612e-06, + "loss": 0.7481, + "step": 2748 + }, + { + "epoch": 0.24, + "grad_norm": 23.570708165009318, + "learning_rate": 8.873047873952106e-06, + "loss": 0.9043, + "step": 2749 + }, + { + "epoch": 0.24, + "grad_norm": 7.241429587452319, + "learning_rate": 8.87214819354242e-06, + "loss": 0.7218, + "step": 2750 + }, + { + "epoch": 0.24, + "grad_norm": 7.642424170081916, + "learning_rate": 8.871248199804944e-06, + "loss": 0.9633, + "step": 2751 + }, + { + "epoch": 0.24, + "grad_norm": 7.0065713487540275, + "learning_rate": 8.870347892812504e-06, + "loss": 0.8282, + "step": 2752 + }, + { + "epoch": 0.24, + "grad_norm": 39.41085065939162, + "learning_rate": 8.869447272637948e-06, + "loss": 0.8478, + "step": 2753 + }, + { + "epoch": 0.24, + "grad_norm": 8.710607555759152, + "learning_rate": 8.868546339354156e-06, + "loss": 0.6852, + "step": 2754 + }, + { + "epoch": 0.24, + "grad_norm": 2.6489709568677355, + "learning_rate": 8.867645093034029e-06, + "loss": 0.5612, + "step": 2755 + }, + { + "epoch": 0.24, + "grad_norm": 6.817729069419878, + "learning_rate": 8.866743533750495e-06, + "loss": 0.8496, + "step": 2756 + }, + { + "epoch": 0.24, + "grad_norm": 8.405783484707916, + "learning_rate": 8.865841661576506e-06, + "loss": 0.7418, + "step": 2757 + }, + { + "epoch": 0.24, + "grad_norm": 15.07073828704563, + "learning_rate": 8.864939476585042e-06, + "loss": 0.9617, + "step": 2758 + }, + { + "epoch": 0.24, + "grad_norm": 11.517626241796373, + "learning_rate": 8.864036978849103e-06, + "loss": 0.7745, + "step": 2759 + }, + { + "epoch": 0.24, + "grad_norm": 13.871986945601092, + "learning_rate": 8.863134168441719e-06, + "loss": 0.7412, + "step": 2760 + }, + { + "epoch": 0.24, + "grad_norm": 11.890566077150133, + "learning_rate": 8.862231045435944e-06, + "loss": 0.8033, + "step": 2761 + }, + { + "epoch": 0.24, + "grad_norm": 6.849338257197789, + "learning_rate": 8.861327609904859e-06, + "loss": 0.8318, + "step": 2762 + }, + { + "epoch": 0.24, + "grad_norm": 6.196503345695955, + "learning_rate": 8.860423861921566e-06, + "loss": 0.8701, + "step": 2763 + }, + { + "epoch": 0.24, + "grad_norm": 5.319727358779177, + "learning_rate": 8.859519801559193e-06, + "loss": 
0.6719, + "step": 2764 + }, + { + "epoch": 0.24, + "grad_norm": 8.64990614258632, + "learning_rate": 8.8586154288909e-06, + "loss": 0.7633, + "step": 2765 + }, + { + "epoch": 0.24, + "grad_norm": 9.364560840929796, + "learning_rate": 8.857710743989865e-06, + "loss": 0.8645, + "step": 2766 + }, + { + "epoch": 0.24, + "grad_norm": 3.2063474615880234, + "learning_rate": 8.856805746929294e-06, + "loss": 0.5325, + "step": 2767 + }, + { + "epoch": 0.24, + "grad_norm": 8.873858227120856, + "learning_rate": 8.855900437782417e-06, + "loss": 0.783, + "step": 2768 + }, + { + "epoch": 0.24, + "grad_norm": 7.738771689218286, + "learning_rate": 8.854994816622489e-06, + "loss": 0.8289, + "step": 2769 + }, + { + "epoch": 0.24, + "grad_norm": 6.8793013192160215, + "learning_rate": 8.854088883522794e-06, + "loss": 0.7072, + "step": 2770 + }, + { + "epoch": 0.24, + "grad_norm": 6.9453828617748075, + "learning_rate": 8.853182638556637e-06, + "loss": 0.7973, + "step": 2771 + }, + { + "epoch": 0.24, + "grad_norm": 9.455396063213703, + "learning_rate": 8.852276081797353e-06, + "loss": 1.002, + "step": 2772 + }, + { + "epoch": 0.24, + "grad_norm": 11.059098102671017, + "learning_rate": 8.851369213318293e-06, + "loss": 0.7683, + "step": 2773 + }, + { + "epoch": 0.24, + "grad_norm": 17.696326459843682, + "learning_rate": 8.850462033192843e-06, + "loss": 0.7907, + "step": 2774 + }, + { + "epoch": 0.24, + "grad_norm": 8.775022020458815, + "learning_rate": 8.849554541494412e-06, + "loss": 0.7881, + "step": 2775 + }, + { + "epoch": 0.24, + "grad_norm": 8.000381910939556, + "learning_rate": 8.848646738296432e-06, + "loss": 0.9716, + "step": 2776 + }, + { + "epoch": 0.24, + "grad_norm": 9.322190332164867, + "learning_rate": 8.847738623672357e-06, + "loss": 0.8666, + "step": 2777 + }, + { + "epoch": 0.24, + "grad_norm": 9.86274471615869, + "learning_rate": 8.846830197695672e-06, + "loss": 0.8597, + "step": 2778 + }, + { + "epoch": 0.24, + "grad_norm": 6.083468691615392, + "learning_rate": 8.84592146043989e-06, + "loss": 0.7556, + "step": 2779 + }, + { + "epoch": 0.24, + "grad_norm": 9.94705380848878, + "learning_rate": 8.845012411978538e-06, + "loss": 0.8821, + "step": 2780 + }, + { + "epoch": 0.24, + "grad_norm": 7.6865339884036255, + "learning_rate": 8.844103052385178e-06, + "loss": 0.781, + "step": 2781 + }, + { + "epoch": 0.24, + "grad_norm": 9.177480214472387, + "learning_rate": 8.843193381733395e-06, + "loss": 0.7695, + "step": 2782 + }, + { + "epoch": 0.24, + "grad_norm": 9.522749878297937, + "learning_rate": 8.842283400096795e-06, + "loss": 0.6392, + "step": 2783 + }, + { + "epoch": 0.24, + "grad_norm": 13.949367837659077, + "learning_rate": 8.841373107549014e-06, + "loss": 0.7567, + "step": 2784 + }, + { + "epoch": 0.24, + "grad_norm": 8.075158545098057, + "learning_rate": 8.840462504163711e-06, + "loss": 0.8763, + "step": 2785 + }, + { + "epoch": 0.24, + "grad_norm": 3.3836039266799167, + "learning_rate": 8.83955159001457e-06, + "loss": 0.5112, + "step": 2786 + }, + { + "epoch": 0.24, + "grad_norm": 6.3913436772628565, + "learning_rate": 8.8386403651753e-06, + "loss": 0.7182, + "step": 2787 + }, + { + "epoch": 0.24, + "grad_norm": 5.99153991671191, + "learning_rate": 8.837728829719638e-06, + "loss": 0.8094, + "step": 2788 + }, + { + "epoch": 0.24, + "grad_norm": 21.699966348236924, + "learning_rate": 8.836816983721344e-06, + "loss": 0.8494, + "step": 2789 + }, + { + "epoch": 0.25, + "grad_norm": 9.181348436007964, + "learning_rate": 8.8359048272542e-06, + "loss": 0.7505, + "step": 2790 + }, + { + "epoch": 0.25, 
+ "grad_norm": 9.23228476830211, + "learning_rate": 8.834992360392018e-06, + "loss": 0.8831, + "step": 2791 + }, + { + "epoch": 0.25, + "grad_norm": 11.30103420275404, + "learning_rate": 8.834079583208635e-06, + "loss": 0.8259, + "step": 2792 + }, + { + "epoch": 0.25, + "grad_norm": 9.09473997591444, + "learning_rate": 8.833166495777909e-06, + "loss": 0.9313, + "step": 2793 + }, + { + "epoch": 0.25, + "grad_norm": 9.16519927139401, + "learning_rate": 8.832253098173726e-06, + "loss": 0.8223, + "step": 2794 + }, + { + "epoch": 0.25, + "grad_norm": 13.632652312094217, + "learning_rate": 8.831339390469998e-06, + "loss": 0.8644, + "step": 2795 + }, + { + "epoch": 0.25, + "grad_norm": 12.195417604396257, + "learning_rate": 8.830425372740658e-06, + "loss": 0.8481, + "step": 2796 + }, + { + "epoch": 0.25, + "grad_norm": 2.543314046086127, + "learning_rate": 8.829511045059672e-06, + "loss": 0.5506, + "step": 2797 + }, + { + "epoch": 0.25, + "grad_norm": 6.770569162176066, + "learning_rate": 8.828596407501018e-06, + "loss": 0.9147, + "step": 2798 + }, + { + "epoch": 0.25, + "grad_norm": 9.199138634924491, + "learning_rate": 8.827681460138712e-06, + "loss": 0.686, + "step": 2799 + }, + { + "epoch": 0.25, + "grad_norm": 6.712855997118154, + "learning_rate": 8.826766203046791e-06, + "loss": 0.783, + "step": 2800 + }, + { + "epoch": 0.25, + "grad_norm": 6.159328189897741, + "learning_rate": 8.825850636299313e-06, + "loss": 0.7707, + "step": 2801 + }, + { + "epoch": 0.25, + "grad_norm": 9.254121928875907, + "learning_rate": 8.824934759970366e-06, + "loss": 0.8668, + "step": 2802 + }, + { + "epoch": 0.25, + "grad_norm": 11.88372782151253, + "learning_rate": 8.824018574134061e-06, + "loss": 0.8566, + "step": 2803 + }, + { + "epoch": 0.25, + "grad_norm": 7.51589834225485, + "learning_rate": 8.823102078864533e-06, + "loss": 0.9073, + "step": 2804 + }, + { + "epoch": 0.25, + "grad_norm": 8.075409737936086, + "learning_rate": 8.822185274235947e-06, + "loss": 0.7773, + "step": 2805 + }, + { + "epoch": 0.25, + "grad_norm": 9.509139973447889, + "learning_rate": 8.821268160322482e-06, + "loss": 0.8034, + "step": 2806 + }, + { + "epoch": 0.25, + "grad_norm": 15.141944975134372, + "learning_rate": 8.820350737198357e-06, + "loss": 0.8473, + "step": 2807 + }, + { + "epoch": 0.25, + "grad_norm": 5.950928797708698, + "learning_rate": 8.819433004937805e-06, + "loss": 0.8259, + "step": 2808 + }, + { + "epoch": 0.25, + "grad_norm": 5.916411585716418, + "learning_rate": 8.818514963615086e-06, + "loss": 0.8681, + "step": 2809 + }, + { + "epoch": 0.25, + "grad_norm": 7.418346281050323, + "learning_rate": 8.81759661330449e-06, + "loss": 0.6964, + "step": 2810 + }, + { + "epoch": 0.25, + "grad_norm": 8.599976034376152, + "learning_rate": 8.816677954080324e-06, + "loss": 0.7986, + "step": 2811 + }, + { + "epoch": 0.25, + "grad_norm": 9.083625313401269, + "learning_rate": 8.815758986016927e-06, + "loss": 0.8051, + "step": 2812 + }, + { + "epoch": 0.25, + "grad_norm": 7.256377288120837, + "learning_rate": 8.814839709188661e-06, + "loss": 0.9548, + "step": 2813 + }, + { + "epoch": 0.25, + "grad_norm": 3.096443532795902, + "learning_rate": 8.81392012366991e-06, + "loss": 0.51, + "step": 2814 + }, + { + "epoch": 0.25, + "grad_norm": 10.87314190162265, + "learning_rate": 8.813000229535089e-06, + "loss": 0.779, + "step": 2815 + }, + { + "epoch": 0.25, + "grad_norm": 5.171556349356453, + "learning_rate": 8.812080026858632e-06, + "loss": 0.8116, + "step": 2816 + }, + { + "epoch": 0.25, + "grad_norm": 13.315745189702143, + 
"learning_rate": 8.811159515714998e-06, + "loss": 0.9851, + "step": 2817 + }, + { + "epoch": 0.25, + "grad_norm": 8.718841698711552, + "learning_rate": 8.810238696178676e-06, + "loss": 0.7809, + "step": 2818 + }, + { + "epoch": 0.25, + "grad_norm": 26.65122558303073, + "learning_rate": 8.809317568324178e-06, + "loss": 0.8475, + "step": 2819 + }, + { + "epoch": 0.25, + "grad_norm": 10.845503948041177, + "learning_rate": 8.808396132226039e-06, + "loss": 0.7655, + "step": 2820 + }, + { + "epoch": 0.25, + "grad_norm": 11.676754266049356, + "learning_rate": 8.807474387958819e-06, + "loss": 0.8434, + "step": 2821 + }, + { + "epoch": 0.25, + "grad_norm": 3.5496962215365366, + "learning_rate": 8.806552335597105e-06, + "loss": 0.6174, + "step": 2822 + }, + { + "epoch": 0.25, + "grad_norm": 10.907010774289892, + "learning_rate": 8.805629975215509e-06, + "loss": 0.8848, + "step": 2823 + }, + { + "epoch": 0.25, + "grad_norm": 8.32275150710283, + "learning_rate": 8.804707306888663e-06, + "loss": 0.7884, + "step": 2824 + }, + { + "epoch": 0.25, + "grad_norm": 9.017762081418512, + "learning_rate": 8.803784330691232e-06, + "loss": 0.7397, + "step": 2825 + }, + { + "epoch": 0.25, + "grad_norm": 8.438074202085682, + "learning_rate": 8.802861046697902e-06, + "loss": 0.9056, + "step": 2826 + }, + { + "epoch": 0.25, + "grad_norm": 10.850014292410766, + "learning_rate": 8.80193745498338e-06, + "loss": 0.8213, + "step": 2827 + }, + { + "epoch": 0.25, + "grad_norm": 5.84330164675956, + "learning_rate": 8.801013555622403e-06, + "loss": 0.8422, + "step": 2828 + }, + { + "epoch": 0.25, + "grad_norm": 7.677698815891883, + "learning_rate": 8.800089348689733e-06, + "loss": 0.9225, + "step": 2829 + }, + { + "epoch": 0.25, + "grad_norm": 2.6362097951626438, + "learning_rate": 8.799164834260153e-06, + "loss": 0.4899, + "step": 2830 + }, + { + "epoch": 0.25, + "grad_norm": 19.50886217599729, + "learning_rate": 8.798240012408475e-06, + "loss": 0.8075, + "step": 2831 + }, + { + "epoch": 0.25, + "grad_norm": 7.9673595763489535, + "learning_rate": 8.797314883209533e-06, + "loss": 0.8497, + "step": 2832 + }, + { + "epoch": 0.25, + "grad_norm": 7.698994263941197, + "learning_rate": 8.796389446738188e-06, + "loss": 0.805, + "step": 2833 + }, + { + "epoch": 0.25, + "grad_norm": 6.420960514172805, + "learning_rate": 8.795463703069323e-06, + "loss": 0.8728, + "step": 2834 + }, + { + "epoch": 0.25, + "grad_norm": 7.144917004433905, + "learning_rate": 8.794537652277846e-06, + "loss": 0.7967, + "step": 2835 + }, + { + "epoch": 0.25, + "grad_norm": 5.562291528177638, + "learning_rate": 8.793611294438698e-06, + "loss": 0.8656, + "step": 2836 + }, + { + "epoch": 0.25, + "grad_norm": 7.036410539291599, + "learning_rate": 8.792684629626833e-06, + "loss": 0.777, + "step": 2837 + }, + { + "epoch": 0.25, + "grad_norm": 13.195129293259399, + "learning_rate": 8.791757657917236e-06, + "loss": 0.9215, + "step": 2838 + }, + { + "epoch": 0.25, + "grad_norm": 9.26671435626022, + "learning_rate": 8.790830379384918e-06, + "loss": 0.779, + "step": 2839 + }, + { + "epoch": 0.25, + "grad_norm": 6.691230258099832, + "learning_rate": 8.78990279410491e-06, + "loss": 0.9636, + "step": 2840 + }, + { + "epoch": 0.25, + "grad_norm": 6.965056051957785, + "learning_rate": 8.788974902152274e-06, + "loss": 0.8583, + "step": 2841 + }, + { + "epoch": 0.25, + "grad_norm": 11.765926727891122, + "learning_rate": 8.788046703602089e-06, + "loss": 0.9804, + "step": 2842 + }, + { + "epoch": 0.25, + "grad_norm": 2.8314267027736277, + "learning_rate": 8.787118198529468e-06, + 
"loss": 0.5892, + "step": 2843 + }, + { + "epoch": 0.25, + "grad_norm": 6.901427626601832, + "learning_rate": 8.78618938700954e-06, + "loss": 0.9525, + "step": 2844 + }, + { + "epoch": 0.25, + "grad_norm": 9.57072812160973, + "learning_rate": 8.785260269117466e-06, + "loss": 0.7497, + "step": 2845 + }, + { + "epoch": 0.25, + "grad_norm": 27.016474086373094, + "learning_rate": 8.784330844928428e-06, + "loss": 0.8603, + "step": 2846 + }, + { + "epoch": 0.25, + "grad_norm": 8.582294042464051, + "learning_rate": 8.783401114517632e-06, + "loss": 0.919, + "step": 2847 + }, + { + "epoch": 0.25, + "grad_norm": 10.499442994749588, + "learning_rate": 8.782471077960311e-06, + "loss": 0.8035, + "step": 2848 + }, + { + "epoch": 0.25, + "grad_norm": 8.730176856263276, + "learning_rate": 8.781540735331722e-06, + "loss": 0.8998, + "step": 2849 + }, + { + "epoch": 0.25, + "grad_norm": 9.738132763216116, + "learning_rate": 8.780610086707149e-06, + "loss": 0.6888, + "step": 2850 + }, + { + "epoch": 0.25, + "grad_norm": 10.210597106459064, + "learning_rate": 8.779679132161893e-06, + "loss": 0.8399, + "step": 2851 + }, + { + "epoch": 0.25, + "grad_norm": 3.081080461004908, + "learning_rate": 8.778747871771293e-06, + "loss": 0.5033, + "step": 2852 + }, + { + "epoch": 0.25, + "grad_norm": 8.267750077067975, + "learning_rate": 8.777816305610698e-06, + "loss": 0.92, + "step": 2853 + }, + { + "epoch": 0.25, + "grad_norm": 9.780337032124065, + "learning_rate": 8.776884433755493e-06, + "loss": 0.8492, + "step": 2854 + }, + { + "epoch": 0.25, + "grad_norm": 8.152717308747377, + "learning_rate": 8.775952256281081e-06, + "loss": 0.7809, + "step": 2855 + }, + { + "epoch": 0.25, + "grad_norm": 2.326767075482379, + "learning_rate": 8.775019773262893e-06, + "loss": 0.5004, + "step": 2856 + }, + { + "epoch": 0.25, + "grad_norm": 14.295695004465612, + "learning_rate": 8.774086984776386e-06, + "loss": 0.7693, + "step": 2857 + }, + { + "epoch": 0.25, + "grad_norm": 10.722246363989704, + "learning_rate": 8.773153890897037e-06, + "loss": 0.9377, + "step": 2858 + }, + { + "epoch": 0.25, + "grad_norm": 11.090100407903211, + "learning_rate": 8.772220491700351e-06, + "loss": 0.9294, + "step": 2859 + }, + { + "epoch": 0.25, + "grad_norm": 17.055951478429435, + "learning_rate": 8.77128678726186e-06, + "loss": 0.8763, + "step": 2860 + }, + { + "epoch": 0.25, + "grad_norm": 8.071537791139939, + "learning_rate": 8.770352777657112e-06, + "loss": 0.8804, + "step": 2861 + }, + { + "epoch": 0.25, + "grad_norm": 9.018109415311898, + "learning_rate": 8.769418462961692e-06, + "loss": 0.8779, + "step": 2862 + }, + { + "epoch": 0.25, + "grad_norm": 6.884031912671427, + "learning_rate": 8.768483843251199e-06, + "loss": 0.867, + "step": 2863 + }, + { + "epoch": 0.25, + "grad_norm": 2.2818167530152262, + "learning_rate": 8.767548918601261e-06, + "loss": 0.4377, + "step": 2864 + }, + { + "epoch": 0.25, + "grad_norm": 8.938684920921718, + "learning_rate": 8.766613689087532e-06, + "loss": 0.9878, + "step": 2865 + }, + { + "epoch": 0.25, + "grad_norm": 7.481276704176392, + "learning_rate": 8.765678154785689e-06, + "loss": 0.907, + "step": 2866 + }, + { + "epoch": 0.25, + "grad_norm": 8.387463166564029, + "learning_rate": 8.764742315771434e-06, + "loss": 0.7325, + "step": 2867 + }, + { + "epoch": 0.25, + "grad_norm": 6.960189436409377, + "learning_rate": 8.763806172120492e-06, + "loss": 0.9563, + "step": 2868 + }, + { + "epoch": 0.25, + "grad_norm": 12.440181360339887, + "learning_rate": 8.762869723908616e-06, + "loss": 0.7291, + "step": 2869 + }, + { + 
"epoch": 0.25, + "grad_norm": 9.625440517735687, + "learning_rate": 8.761932971211581e-06, + "loss": 0.7938, + "step": 2870 + }, + { + "epoch": 0.25, + "grad_norm": 8.930901009229617, + "learning_rate": 8.760995914105188e-06, + "loss": 0.8707, + "step": 2871 + }, + { + "epoch": 0.25, + "grad_norm": 10.331419654020008, + "learning_rate": 8.760058552665262e-06, + "loss": 0.7546, + "step": 2872 + }, + { + "epoch": 0.25, + "grad_norm": 6.202678920724311, + "learning_rate": 8.75912088696765e-06, + "loss": 0.6683, + "step": 2873 + }, + { + "epoch": 0.25, + "grad_norm": 8.438435668034645, + "learning_rate": 8.758182917088233e-06, + "loss": 0.903, + "step": 2874 + }, + { + "epoch": 0.25, + "grad_norm": 8.686650791024844, + "learning_rate": 8.757244643102903e-06, + "loss": 0.809, + "step": 2875 + }, + { + "epoch": 0.25, + "grad_norm": 8.610099251935473, + "learning_rate": 8.756306065087588e-06, + "loss": 0.8347, + "step": 2876 + }, + { + "epoch": 0.25, + "grad_norm": 10.00366048155551, + "learning_rate": 8.755367183118232e-06, + "loss": 0.755, + "step": 2877 + }, + { + "epoch": 0.25, + "grad_norm": 8.688479386797441, + "learning_rate": 8.754427997270811e-06, + "loss": 0.669, + "step": 2878 + }, + { + "epoch": 0.25, + "grad_norm": 9.46012320678665, + "learning_rate": 8.753488507621323e-06, + "loss": 0.8177, + "step": 2879 + }, + { + "epoch": 0.25, + "grad_norm": 6.70913914242898, + "learning_rate": 8.752548714245787e-06, + "loss": 0.9159, + "step": 2880 + }, + { + "epoch": 0.25, + "grad_norm": 6.394069948595565, + "learning_rate": 8.751608617220254e-06, + "loss": 0.7075, + "step": 2881 + }, + { + "epoch": 0.25, + "grad_norm": 7.953553015952459, + "learning_rate": 8.750668216620789e-06, + "loss": 0.7651, + "step": 2882 + }, + { + "epoch": 0.25, + "grad_norm": 2.6655261576612226, + "learning_rate": 8.749727512523491e-06, + "loss": 0.5075, + "step": 2883 + }, + { + "epoch": 0.25, + "grad_norm": 9.31964368386908, + "learning_rate": 8.748786505004482e-06, + "loss": 0.8757, + "step": 2884 + }, + { + "epoch": 0.25, + "grad_norm": 8.761460050796583, + "learning_rate": 8.747845194139903e-06, + "loss": 0.8994, + "step": 2885 + }, + { + "epoch": 0.25, + "grad_norm": 9.182955695562496, + "learning_rate": 8.746903580005927e-06, + "loss": 0.7609, + "step": 2886 + }, + { + "epoch": 0.25, + "grad_norm": 10.525693566782776, + "learning_rate": 8.745961662678744e-06, + "loss": 0.7967, + "step": 2887 + }, + { + "epoch": 0.25, + "grad_norm": 14.810914441134662, + "learning_rate": 8.745019442234574e-06, + "loss": 0.75, + "step": 2888 + }, + { + "epoch": 0.25, + "grad_norm": 8.619401479532996, + "learning_rate": 8.744076918749662e-06, + "loss": 0.8179, + "step": 2889 + }, + { + "epoch": 0.25, + "grad_norm": 6.632662706053884, + "learning_rate": 8.743134092300275e-06, + "loss": 0.67, + "step": 2890 + }, + { + "epoch": 0.25, + "grad_norm": 2.8101819632516856, + "learning_rate": 8.7421909629627e-06, + "loss": 0.5364, + "step": 2891 + }, + { + "epoch": 0.25, + "grad_norm": 12.379389732869932, + "learning_rate": 8.74124753081326e-06, + "loss": 0.8215, + "step": 2892 + }, + { + "epoch": 0.25, + "grad_norm": 11.435560093634798, + "learning_rate": 8.740303795928292e-06, + "loss": 0.8528, + "step": 2893 + }, + { + "epoch": 0.25, + "grad_norm": 6.150867742150004, + "learning_rate": 8.739359758384162e-06, + "loss": 0.8189, + "step": 2894 + }, + { + "epoch": 0.25, + "grad_norm": 7.607606903580161, + "learning_rate": 8.73841541825726e-06, + "loss": 0.7881, + "step": 2895 + }, + { + "epoch": 0.25, + "grad_norm": 22.835595093704363, + 
"learning_rate": 8.737470775624003e-06, + "loss": 0.8275, + "step": 2896 + }, + { + "epoch": 0.25, + "grad_norm": 10.19625718079821, + "learning_rate": 8.736525830560826e-06, + "loss": 0.7652, + "step": 2897 + }, + { + "epoch": 0.25, + "grad_norm": 8.713472049353355, + "learning_rate": 8.735580583144195e-06, + "loss": 0.8643, + "step": 2898 + }, + { + "epoch": 0.25, + "grad_norm": 6.7901950785624905, + "learning_rate": 8.734635033450597e-06, + "loss": 0.6921, + "step": 2899 + }, + { + "epoch": 0.25, + "grad_norm": 8.651379783646089, + "learning_rate": 8.733689181556544e-06, + "loss": 0.9499, + "step": 2900 + }, + { + "epoch": 0.25, + "grad_norm": 7.996406883324831, + "learning_rate": 8.732743027538572e-06, + "loss": 0.6961, + "step": 2901 + }, + { + "epoch": 0.25, + "grad_norm": 10.441951683036491, + "learning_rate": 8.731796571473245e-06, + "loss": 0.834, + "step": 2902 + }, + { + "epoch": 0.25, + "grad_norm": 6.476545319965869, + "learning_rate": 8.730849813437147e-06, + "loss": 0.7903, + "step": 2903 + }, + { + "epoch": 0.26, + "grad_norm": 9.505341834618788, + "learning_rate": 8.729902753506888e-06, + "loss": 0.6182, + "step": 2904 + }, + { + "epoch": 0.26, + "grad_norm": 8.131171970761018, + "learning_rate": 8.728955391759102e-06, + "loss": 0.9491, + "step": 2905 + }, + { + "epoch": 0.26, + "grad_norm": 8.193269273575336, + "learning_rate": 8.728007728270449e-06, + "loss": 0.9943, + "step": 2906 + }, + { + "epoch": 0.26, + "grad_norm": 9.370950003838809, + "learning_rate": 8.72705976311761e-06, + "loss": 0.8945, + "step": 2907 + }, + { + "epoch": 0.26, + "grad_norm": 7.338537905470208, + "learning_rate": 8.726111496377297e-06, + "loss": 0.7194, + "step": 2908 + }, + { + "epoch": 0.26, + "grad_norm": 9.736807659521261, + "learning_rate": 8.725162928126239e-06, + "loss": 1.0532, + "step": 2909 + }, + { + "epoch": 0.26, + "grad_norm": 7.162817769425723, + "learning_rate": 8.724214058441191e-06, + "loss": 0.8421, + "step": 2910 + }, + { + "epoch": 0.26, + "grad_norm": 10.191170519439725, + "learning_rate": 8.72326488739894e-06, + "loss": 0.8342, + "step": 2911 + }, + { + "epoch": 0.26, + "grad_norm": 16.549781458770777, + "learning_rate": 8.722315415076286e-06, + "loss": 0.8538, + "step": 2912 + }, + { + "epoch": 0.26, + "grad_norm": 2.6794761976105095, + "learning_rate": 8.721365641550061e-06, + "loss": 0.5121, + "step": 2913 + }, + { + "epoch": 0.26, + "grad_norm": 6.774704875372642, + "learning_rate": 8.720415566897118e-06, + "loss": 0.768, + "step": 2914 + }, + { + "epoch": 0.26, + "grad_norm": 6.421484210891573, + "learning_rate": 8.719465191194335e-06, + "loss": 0.7758, + "step": 2915 + }, + { + "epoch": 0.26, + "grad_norm": 7.0825842111221196, + "learning_rate": 8.718514514518617e-06, + "loss": 0.8663, + "step": 2916 + }, + { + "epoch": 0.26, + "grad_norm": 5.882085132770092, + "learning_rate": 8.717563536946889e-06, + "loss": 0.7725, + "step": 2917 + }, + { + "epoch": 0.26, + "grad_norm": 7.23128783499513, + "learning_rate": 8.716612258556104e-06, + "loss": 1.024, + "step": 2918 + }, + { + "epoch": 0.26, + "grad_norm": 15.203536140484973, + "learning_rate": 8.715660679423237e-06, + "loss": 0.8636, + "step": 2919 + }, + { + "epoch": 0.26, + "grad_norm": 11.183065986593185, + "learning_rate": 8.714708799625289e-06, + "loss": 0.776, + "step": 2920 + }, + { + "epoch": 0.26, + "grad_norm": 6.410081477583485, + "learning_rate": 8.713756619239285e-06, + "loss": 0.8414, + "step": 2921 + }, + { + "epoch": 0.26, + "grad_norm": 28.25447448757409, + "learning_rate": 8.712804138342271e-06, + 
"loss": 0.811, + "step": 2922 + }, + { + "epoch": 0.26, + "grad_norm": 3.077303513207197, + "learning_rate": 8.711851357011322e-06, + "loss": 0.4554, + "step": 2923 + }, + { + "epoch": 0.26, + "grad_norm": 11.381091475848772, + "learning_rate": 8.710898275323537e-06, + "loss": 0.7896, + "step": 2924 + }, + { + "epoch": 0.26, + "grad_norm": 5.760139112277035, + "learning_rate": 8.709944893356035e-06, + "loss": 0.7887, + "step": 2925 + }, + { + "epoch": 0.26, + "grad_norm": 9.913586350621223, + "learning_rate": 8.708991211185964e-06, + "loss": 0.7596, + "step": 2926 + }, + { + "epoch": 0.26, + "grad_norm": 12.521722061917465, + "learning_rate": 8.708037228890494e-06, + "loss": 0.8391, + "step": 2927 + }, + { + "epoch": 0.26, + "grad_norm": 6.693498061483151, + "learning_rate": 8.70708294654682e-06, + "loss": 0.817, + "step": 2928 + }, + { + "epoch": 0.26, + "grad_norm": 13.05931853826081, + "learning_rate": 8.706128364232161e-06, + "loss": 0.8091, + "step": 2929 + }, + { + "epoch": 0.26, + "grad_norm": 7.541990909166846, + "learning_rate": 8.705173482023757e-06, + "loss": 0.7547, + "step": 2930 + }, + { + "epoch": 0.26, + "grad_norm": 7.406240917840451, + "learning_rate": 8.70421829999888e-06, + "loss": 0.7613, + "step": 2931 + }, + { + "epoch": 0.26, + "grad_norm": 7.545958708535751, + "learning_rate": 8.703262818234818e-06, + "loss": 0.8345, + "step": 2932 + }, + { + "epoch": 0.26, + "grad_norm": 9.901156420862415, + "learning_rate": 8.70230703680889e-06, + "loss": 0.9164, + "step": 2933 + }, + { + "epoch": 0.26, + "grad_norm": 12.155428195491684, + "learning_rate": 8.701350955798435e-06, + "loss": 0.8978, + "step": 2934 + }, + { + "epoch": 0.26, + "grad_norm": 9.980501725507809, + "learning_rate": 8.700394575280818e-06, + "loss": 0.7634, + "step": 2935 + }, + { + "epoch": 0.26, + "grad_norm": 16.354660172414974, + "learning_rate": 8.699437895333425e-06, + "loss": 0.8211, + "step": 2936 + }, + { + "epoch": 0.26, + "grad_norm": 6.90816516612611, + "learning_rate": 8.698480916033674e-06, + "loss": 0.8021, + "step": 2937 + }, + { + "epoch": 0.26, + "grad_norm": 6.3931608334559575, + "learning_rate": 8.697523637458997e-06, + "loss": 0.8475, + "step": 2938 + }, + { + "epoch": 0.26, + "grad_norm": 2.5186488201149637, + "learning_rate": 8.69656605968686e-06, + "loss": 0.5278, + "step": 2939 + }, + { + "epoch": 0.26, + "grad_norm": 8.261295040321855, + "learning_rate": 8.695608182794745e-06, + "loss": 0.8421, + "step": 2940 + }, + { + "epoch": 0.26, + "grad_norm": 7.994395805369903, + "learning_rate": 8.694650006860165e-06, + "loss": 0.7829, + "step": 2941 + }, + { + "epoch": 0.26, + "grad_norm": 2.891961002813286, + "learning_rate": 8.693691531960652e-06, + "loss": 0.5424, + "step": 2942 + }, + { + "epoch": 0.26, + "grad_norm": 8.39587902049651, + "learning_rate": 8.692732758173761e-06, + "loss": 0.8228, + "step": 2943 + }, + { + "epoch": 0.26, + "grad_norm": 8.830074024988372, + "learning_rate": 8.691773685577082e-06, + "loss": 0.6943, + "step": 2944 + }, + { + "epoch": 0.26, + "grad_norm": 10.056968748712245, + "learning_rate": 8.690814314248215e-06, + "loss": 0.8886, + "step": 2945 + }, + { + "epoch": 0.26, + "grad_norm": 8.2465852004846, + "learning_rate": 8.689854644264792e-06, + "loss": 0.7196, + "step": 2946 + }, + { + "epoch": 0.26, + "grad_norm": 5.703057016933375, + "learning_rate": 8.688894675704472e-06, + "loss": 0.7556, + "step": 2947 + }, + { + "epoch": 0.26, + "grad_norm": 7.078747130727275, + "learning_rate": 8.68793440864493e-06, + "loss": 0.8744, + "step": 2948 + }, + { + 
"epoch": 0.26, + "grad_norm": 6.0018923223355785, + "learning_rate": 8.686973843163868e-06, + "loss": 0.804, + "step": 2949 + }, + { + "epoch": 0.26, + "grad_norm": 7.074386834635933, + "learning_rate": 8.686012979339017e-06, + "loss": 0.8493, + "step": 2950 + }, + { + "epoch": 0.26, + "grad_norm": 7.689644223226183, + "learning_rate": 8.685051817248127e-06, + "loss": 0.7567, + "step": 2951 + }, + { + "epoch": 0.26, + "grad_norm": 3.6018527936503975, + "learning_rate": 8.684090356968974e-06, + "loss": 0.5362, + "step": 2952 + }, + { + "epoch": 0.26, + "grad_norm": 10.846287427196174, + "learning_rate": 8.683128598579358e-06, + "loss": 0.8957, + "step": 2953 + }, + { + "epoch": 0.26, + "grad_norm": 8.097264006018523, + "learning_rate": 8.6821665421571e-06, + "loss": 0.7907, + "step": 2954 + }, + { + "epoch": 0.26, + "grad_norm": 8.091015861484678, + "learning_rate": 8.681204187780054e-06, + "loss": 0.9904, + "step": 2955 + }, + { + "epoch": 0.26, + "grad_norm": 8.759369216269292, + "learning_rate": 8.680241535526085e-06, + "loss": 0.7903, + "step": 2956 + }, + { + "epoch": 0.26, + "grad_norm": 8.504376681135412, + "learning_rate": 8.679278585473095e-06, + "loss": 0.7797, + "step": 2957 + }, + { + "epoch": 0.26, + "grad_norm": 7.073878745011099, + "learning_rate": 8.678315337699e-06, + "loss": 0.8184, + "step": 2958 + }, + { + "epoch": 0.26, + "grad_norm": 5.657713460985453, + "learning_rate": 8.67735179228175e-06, + "loss": 0.9456, + "step": 2959 + }, + { + "epoch": 0.26, + "grad_norm": 7.848646886966014, + "learning_rate": 8.676387949299307e-06, + "loss": 0.8846, + "step": 2960 + }, + { + "epoch": 0.26, + "grad_norm": 7.724390824736029, + "learning_rate": 8.675423808829667e-06, + "loss": 0.7928, + "step": 2961 + }, + { + "epoch": 0.26, + "grad_norm": 5.2991304409877715, + "learning_rate": 8.674459370950848e-06, + "loss": 0.759, + "step": 2962 + }, + { + "epoch": 0.26, + "grad_norm": 9.6131580355416, + "learning_rate": 8.673494635740886e-06, + "loss": 0.8202, + "step": 2963 + }, + { + "epoch": 0.26, + "grad_norm": 7.9406936951629925, + "learning_rate": 8.672529603277849e-06, + "loss": 0.7274, + "step": 2964 + }, + { + "epoch": 0.26, + "grad_norm": 7.706925631777547, + "learning_rate": 8.671564273639828e-06, + "loss": 0.8393, + "step": 2965 + }, + { + "epoch": 0.26, + "grad_norm": 8.302891494774716, + "learning_rate": 8.670598646904931e-06, + "loss": 0.8454, + "step": 2966 + }, + { + "epoch": 0.26, + "grad_norm": 6.144504990542853, + "learning_rate": 8.669632723151297e-06, + "loss": 0.8328, + "step": 2967 + }, + { + "epoch": 0.26, + "grad_norm": 13.68576344804128, + "learning_rate": 8.668666502457089e-06, + "loss": 0.8979, + "step": 2968 + }, + { + "epoch": 0.26, + "grad_norm": 7.808043459600445, + "learning_rate": 8.66769998490049e-06, + "loss": 0.8008, + "step": 2969 + }, + { + "epoch": 0.26, + "grad_norm": 6.999130873041443, + "learning_rate": 8.666733170559706e-06, + "loss": 0.7054, + "step": 2970 + }, + { + "epoch": 0.26, + "grad_norm": 7.976467693462358, + "learning_rate": 8.665766059512977e-06, + "loss": 0.8249, + "step": 2971 + }, + { + "epoch": 0.26, + "grad_norm": 7.531842763307472, + "learning_rate": 8.664798651838555e-06, + "loss": 0.7417, + "step": 2972 + }, + { + "epoch": 0.26, + "grad_norm": 4.006125455784613, + "learning_rate": 8.66383094761472e-06, + "loss": 0.5824, + "step": 2973 + }, + { + "epoch": 0.26, + "grad_norm": 11.377248587574789, + "learning_rate": 8.662862946919781e-06, + "loss": 1.0121, + "step": 2974 + }, + { + "epoch": 0.26, + "grad_norm": 9.059082859532792, + 
"learning_rate": 8.661894649832065e-06, + "loss": 0.6613, + "step": 2975 + }, + { + "epoch": 0.26, + "grad_norm": 12.186230289799184, + "learning_rate": 8.660926056429926e-06, + "loss": 0.8063, + "step": 2976 + }, + { + "epoch": 0.26, + "grad_norm": 12.115896053542029, + "learning_rate": 8.659957166791739e-06, + "loss": 0.7132, + "step": 2977 + }, + { + "epoch": 0.26, + "grad_norm": 6.566938026129322, + "learning_rate": 8.658987980995908e-06, + "loss": 0.8297, + "step": 2978 + }, + { + "epoch": 0.26, + "grad_norm": 5.988014517176099, + "learning_rate": 8.658018499120852e-06, + "loss": 0.8951, + "step": 2979 + }, + { + "epoch": 0.26, + "grad_norm": 6.458162590610563, + "learning_rate": 8.657048721245027e-06, + "loss": 0.9368, + "step": 2980 + }, + { + "epoch": 0.26, + "grad_norm": 4.819776052011941, + "learning_rate": 8.656078647446902e-06, + "loss": 0.7444, + "step": 2981 + }, + { + "epoch": 0.26, + "grad_norm": 7.748313595050877, + "learning_rate": 8.655108277804975e-06, + "loss": 0.8241, + "step": 2982 + }, + { + "epoch": 0.26, + "grad_norm": 11.90498498017788, + "learning_rate": 8.654137612397765e-06, + "loss": 0.9127, + "step": 2983 + }, + { + "epoch": 0.26, + "grad_norm": 7.341984096896056, + "learning_rate": 8.653166651303819e-06, + "loss": 0.7263, + "step": 2984 + }, + { + "epoch": 0.26, + "grad_norm": 6.377326436612921, + "learning_rate": 8.652195394601704e-06, + "loss": 0.7764, + "step": 2985 + }, + { + "epoch": 0.26, + "grad_norm": 17.881227685657887, + "learning_rate": 8.651223842370013e-06, + "loss": 0.7731, + "step": 2986 + }, + { + "epoch": 0.26, + "grad_norm": 7.057738319525283, + "learning_rate": 8.650251994687361e-06, + "loss": 0.7664, + "step": 2987 + }, + { + "epoch": 0.26, + "grad_norm": 6.263370858673192, + "learning_rate": 8.64927985163239e-06, + "loss": 0.8251, + "step": 2988 + }, + { + "epoch": 0.26, + "grad_norm": 6.260230760513899, + "learning_rate": 8.648307413283762e-06, + "loss": 0.7194, + "step": 2989 + }, + { + "epoch": 0.26, + "grad_norm": 9.4297890827432, + "learning_rate": 8.64733467972017e-06, + "loss": 0.8745, + "step": 2990 + }, + { + "epoch": 0.26, + "grad_norm": 9.474421834692517, + "learning_rate": 8.64636165102032e-06, + "loss": 0.8607, + "step": 2991 + }, + { + "epoch": 0.26, + "grad_norm": 15.216781868177288, + "learning_rate": 8.645388327262954e-06, + "loss": 0.8288, + "step": 2992 + }, + { + "epoch": 0.26, + "grad_norm": 5.066666451857499, + "learning_rate": 8.644414708526824e-06, + "loss": 0.8042, + "step": 2993 + }, + { + "epoch": 0.26, + "grad_norm": 9.040729679640648, + "learning_rate": 8.643440794890723e-06, + "loss": 0.8282, + "step": 2994 + }, + { + "epoch": 0.26, + "grad_norm": 3.128363968672994, + "learning_rate": 8.642466586433448e-06, + "loss": 0.4683, + "step": 2995 + }, + { + "epoch": 0.26, + "grad_norm": 7.352386835947175, + "learning_rate": 8.64149208323384e-06, + "loss": 0.7402, + "step": 2996 + }, + { + "epoch": 0.26, + "grad_norm": 2.882909864588861, + "learning_rate": 8.64051728537075e-06, + "loss": 0.5002, + "step": 2997 + }, + { + "epoch": 0.26, + "grad_norm": 9.336947602695554, + "learning_rate": 8.639542192923054e-06, + "loss": 1.102, + "step": 2998 + }, + { + "epoch": 0.26, + "grad_norm": 5.065902650119693, + "learning_rate": 8.63856680596966e-06, + "loss": 0.7339, + "step": 2999 + }, + { + "epoch": 0.26, + "grad_norm": 7.031746843015993, + "learning_rate": 8.63759112458949e-06, + "loss": 0.7326, + "step": 3000 + }, + { + "epoch": 0.26, + "grad_norm": 8.664370954538581, + "learning_rate": 8.636615148861498e-06, + "loss": 
0.8407, + "step": 3001 + }, + { + "epoch": 0.26, + "grad_norm": 8.891033684080627, + "learning_rate": 8.635638878864659e-06, + "loss": 0.8263, + "step": 3002 + }, + { + "epoch": 0.26, + "grad_norm": 7.957209496846805, + "learning_rate": 8.634662314677968e-06, + "loss": 0.8365, + "step": 3003 + }, + { + "epoch": 0.26, + "grad_norm": 7.314615324789968, + "learning_rate": 8.63368545638045e-06, + "loss": 0.6703, + "step": 3004 + }, + { + "epoch": 0.26, + "grad_norm": 8.162590329284125, + "learning_rate": 8.632708304051146e-06, + "loss": 0.849, + "step": 3005 + }, + { + "epoch": 0.26, + "grad_norm": 7.707351094189534, + "learning_rate": 8.631730857769132e-06, + "loss": 0.7379, + "step": 3006 + }, + { + "epoch": 0.26, + "grad_norm": 7.4217496017037, + "learning_rate": 8.630753117613498e-06, + "loss": 0.8888, + "step": 3007 + }, + { + "epoch": 0.26, + "grad_norm": 7.220203986785347, + "learning_rate": 8.629775083663358e-06, + "loss": 0.913, + "step": 3008 + }, + { + "epoch": 0.26, + "grad_norm": 6.976370151700154, + "learning_rate": 8.62879675599786e-06, + "loss": 0.9195, + "step": 3009 + }, + { + "epoch": 0.26, + "grad_norm": 9.229134015335282, + "learning_rate": 8.627818134696162e-06, + "loss": 0.8073, + "step": 3010 + }, + { + "epoch": 0.26, + "grad_norm": 6.465307922111533, + "learning_rate": 8.626839219837456e-06, + "loss": 0.659, + "step": 3011 + }, + { + "epoch": 0.26, + "grad_norm": 7.6349432482527195, + "learning_rate": 8.625860011500953e-06, + "loss": 0.7923, + "step": 3012 + }, + { + "epoch": 0.26, + "grad_norm": 7.739146286263802, + "learning_rate": 8.62488050976589e-06, + "loss": 0.9085, + "step": 3013 + }, + { + "epoch": 0.26, + "grad_norm": 10.014252180597143, + "learning_rate": 8.623900714711524e-06, + "loss": 0.736, + "step": 3014 + }, + { + "epoch": 0.26, + "grad_norm": 9.853991379552404, + "learning_rate": 8.622920626417141e-06, + "loss": 0.7602, + "step": 3015 + }, + { + "epoch": 0.26, + "grad_norm": 7.186176345039314, + "learning_rate": 8.621940244962048e-06, + "loss": 0.7738, + "step": 3016 + }, + { + "epoch": 0.26, + "grad_norm": 8.569283851509915, + "learning_rate": 8.620959570425575e-06, + "loss": 0.7975, + "step": 3017 + }, + { + "epoch": 0.27, + "grad_norm": 8.602772496609063, + "learning_rate": 8.619978602887078e-06, + "loss": 1.0282, + "step": 3018 + }, + { + "epoch": 0.27, + "grad_norm": 3.86186653320029, + "learning_rate": 8.618997342425932e-06, + "loss": 0.5984, + "step": 3019 + }, + { + "epoch": 0.27, + "grad_norm": 7.393305464237552, + "learning_rate": 8.61801578912154e-06, + "loss": 0.8901, + "step": 3020 + }, + { + "epoch": 0.27, + "grad_norm": 17.224110940326515, + "learning_rate": 8.617033943053332e-06, + "loss": 0.817, + "step": 3021 + }, + { + "epoch": 0.27, + "grad_norm": 6.9078913897262995, + "learning_rate": 8.616051804300751e-06, + "loss": 0.8189, + "step": 3022 + }, + { + "epoch": 0.27, + "grad_norm": 12.433497671033127, + "learning_rate": 8.615069372943274e-06, + "loss": 0.8644, + "step": 3023 + }, + { + "epoch": 0.27, + "grad_norm": 5.699879557059219, + "learning_rate": 8.614086649060398e-06, + "loss": 0.6393, + "step": 3024 + }, + { + "epoch": 0.27, + "grad_norm": 7.130546007225425, + "learning_rate": 8.61310363273164e-06, + "loss": 0.8605, + "step": 3025 + }, + { + "epoch": 0.27, + "grad_norm": 14.910742342687815, + "learning_rate": 8.612120324036548e-06, + "loss": 0.6249, + "step": 3026 + }, + { + "epoch": 0.27, + "grad_norm": 7.796389003572444, + "learning_rate": 8.611136723054686e-06, + "loss": 0.8906, + "step": 3027 + }, + { + "epoch": 0.27, + 
"grad_norm": 8.176641496921217, + "learning_rate": 8.610152829865647e-06, + "loss": 0.9394, + "step": 3028 + }, + { + "epoch": 0.27, + "grad_norm": 13.506926348055476, + "learning_rate": 8.609168644549048e-06, + "loss": 0.8251, + "step": 3029 + }, + { + "epoch": 0.27, + "grad_norm": 19.201731739491954, + "learning_rate": 8.608184167184525e-06, + "loss": 0.8142, + "step": 3030 + }, + { + "epoch": 0.27, + "grad_norm": 2.1706307607722626, + "learning_rate": 8.607199397851741e-06, + "loss": 0.4931, + "step": 3031 + }, + { + "epoch": 0.27, + "grad_norm": 7.461684395802114, + "learning_rate": 8.606214336630382e-06, + "loss": 0.7527, + "step": 3032 + }, + { + "epoch": 0.27, + "grad_norm": 5.731169746967573, + "learning_rate": 8.605228983600158e-06, + "loss": 0.8893, + "step": 3033 + }, + { + "epoch": 0.27, + "grad_norm": 11.196326063558377, + "learning_rate": 8.604243338840801e-06, + "loss": 1.0228, + "step": 3034 + }, + { + "epoch": 0.27, + "grad_norm": 5.483284318523815, + "learning_rate": 8.60325740243207e-06, + "loss": 0.6898, + "step": 3035 + }, + { + "epoch": 0.27, + "grad_norm": 11.799156942976365, + "learning_rate": 8.602271174453742e-06, + "loss": 0.934, + "step": 3036 + }, + { + "epoch": 0.27, + "grad_norm": 11.866143932583714, + "learning_rate": 8.601284654985623e-06, + "loss": 0.8592, + "step": 3037 + }, + { + "epoch": 0.27, + "grad_norm": 9.594325963478887, + "learning_rate": 8.600297844107541e-06, + "loss": 0.7949, + "step": 3038 + }, + { + "epoch": 0.27, + "grad_norm": 7.26845765903899, + "learning_rate": 8.599310741899345e-06, + "loss": 0.757, + "step": 3039 + }, + { + "epoch": 0.27, + "grad_norm": 9.64732868362241, + "learning_rate": 8.598323348440911e-06, + "loss": 0.7964, + "step": 3040 + }, + { + "epoch": 0.27, + "grad_norm": 8.364295488098723, + "learning_rate": 8.597335663812137e-06, + "loss": 0.7698, + "step": 3041 + }, + { + "epoch": 0.27, + "grad_norm": 6.310517060878988, + "learning_rate": 8.596347688092945e-06, + "loss": 0.6859, + "step": 3042 + }, + { + "epoch": 0.27, + "grad_norm": 8.457732549138145, + "learning_rate": 8.59535942136328e-06, + "loss": 0.8305, + "step": 3043 + }, + { + "epoch": 0.27, + "grad_norm": 7.274067697275562, + "learning_rate": 8.594370863703113e-06, + "loss": 0.8257, + "step": 3044 + }, + { + "epoch": 0.27, + "grad_norm": 6.664365411554445, + "learning_rate": 8.593382015192434e-06, + "loss": 0.7715, + "step": 3045 + }, + { + "epoch": 0.27, + "grad_norm": 5.868353736667625, + "learning_rate": 8.592392875911259e-06, + "loss": 0.8655, + "step": 3046 + }, + { + "epoch": 0.27, + "grad_norm": 12.673263383212188, + "learning_rate": 8.591403445939628e-06, + "loss": 0.8382, + "step": 3047 + }, + { + "epoch": 0.27, + "grad_norm": 12.79881217293614, + "learning_rate": 8.590413725357605e-06, + "loss": 1.0085, + "step": 3048 + }, + { + "epoch": 0.27, + "grad_norm": 14.7939657299943, + "learning_rate": 8.589423714245275e-06, + "loss": 0.9705, + "step": 3049 + }, + { + "epoch": 0.27, + "grad_norm": 5.922387293806707, + "learning_rate": 8.588433412682749e-06, + "loss": 0.8089, + "step": 3050 + }, + { + "epoch": 0.27, + "grad_norm": 9.352138860648935, + "learning_rate": 8.58744282075016e-06, + "loss": 0.6304, + "step": 3051 + }, + { + "epoch": 0.27, + "grad_norm": 7.575620091153556, + "learning_rate": 8.586451938527665e-06, + "loss": 0.8687, + "step": 3052 + }, + { + "epoch": 0.27, + "grad_norm": 8.491510035281609, + "learning_rate": 8.585460766095446e-06, + "loss": 0.7904, + "step": 3053 + }, + { + "epoch": 0.27, + "grad_norm": 9.696578710266785, + 
"learning_rate": 8.584469303533705e-06, + "loss": 0.8311, + "step": 3054 + }, + { + "epoch": 0.27, + "grad_norm": 10.922768931321473, + "learning_rate": 8.58347755092267e-06, + "loss": 0.9098, + "step": 3055 + }, + { + "epoch": 0.27, + "grad_norm": 19.889914038087785, + "learning_rate": 8.582485508342593e-06, + "loss": 0.7852, + "step": 3056 + }, + { + "epoch": 0.27, + "grad_norm": 9.791110079728254, + "learning_rate": 8.581493175873749e-06, + "loss": 0.6628, + "step": 3057 + }, + { + "epoch": 0.27, + "grad_norm": 9.242997858903351, + "learning_rate": 8.580500553596433e-06, + "loss": 0.8403, + "step": 3058 + }, + { + "epoch": 0.27, + "grad_norm": 7.82302870372864, + "learning_rate": 8.57950764159097e-06, + "loss": 0.7595, + "step": 3059 + }, + { + "epoch": 0.27, + "grad_norm": 9.630342018350625, + "learning_rate": 8.5785144399377e-06, + "loss": 0.7341, + "step": 3060 + }, + { + "epoch": 0.27, + "grad_norm": 11.909487898380455, + "learning_rate": 8.577520948716996e-06, + "loss": 0.8957, + "step": 3061 + }, + { + "epoch": 0.27, + "grad_norm": 6.701336281642899, + "learning_rate": 8.576527168009248e-06, + "loss": 0.7743, + "step": 3062 + }, + { + "epoch": 0.27, + "grad_norm": 8.088894428219135, + "learning_rate": 8.57553309789487e-06, + "loss": 0.8659, + "step": 3063 + }, + { + "epoch": 0.27, + "grad_norm": 14.170615495039378, + "learning_rate": 8.5745387384543e-06, + "loss": 0.8992, + "step": 3064 + }, + { + "epoch": 0.27, + "grad_norm": 8.107778869570362, + "learning_rate": 8.573544089768005e-06, + "loss": 0.8276, + "step": 3065 + }, + { + "epoch": 0.27, + "grad_norm": 5.312532533400759, + "learning_rate": 8.572549151916462e-06, + "loss": 0.808, + "step": 3066 + }, + { + "epoch": 0.27, + "grad_norm": 14.786658567112982, + "learning_rate": 8.571553924980188e-06, + "loss": 0.7595, + "step": 3067 + }, + { + "epoch": 0.27, + "grad_norm": 9.084194332697642, + "learning_rate": 8.57055840903971e-06, + "loss": 0.7887, + "step": 3068 + }, + { + "epoch": 0.27, + "grad_norm": 9.722161692244137, + "learning_rate": 8.569562604175585e-06, + "loss": 0.7162, + "step": 3069 + }, + { + "epoch": 0.27, + "grad_norm": 2.0478891945738225, + "learning_rate": 8.568566510468392e-06, + "loss": 0.4919, + "step": 3070 + }, + { + "epoch": 0.27, + "grad_norm": 7.37767748293205, + "learning_rate": 8.56757012799873e-06, + "loss": 0.8353, + "step": 3071 + }, + { + "epoch": 0.27, + "grad_norm": 11.144005248359102, + "learning_rate": 8.566573456847232e-06, + "loss": 0.9247, + "step": 3072 + }, + { + "epoch": 0.27, + "grad_norm": 7.419071052740864, + "learning_rate": 8.56557649709454e-06, + "loss": 0.844, + "step": 3073 + }, + { + "epoch": 0.27, + "grad_norm": 5.986354569019454, + "learning_rate": 8.564579248821329e-06, + "loss": 0.887, + "step": 3074 + }, + { + "epoch": 0.27, + "grad_norm": 8.917885588959294, + "learning_rate": 8.563581712108295e-06, + "loss": 0.8546, + "step": 3075 + }, + { + "epoch": 0.27, + "grad_norm": 9.474464717853863, + "learning_rate": 8.562583887036157e-06, + "loss": 0.9347, + "step": 3076 + }, + { + "epoch": 0.27, + "grad_norm": 10.17139551094473, + "learning_rate": 8.561585773685657e-06, + "loss": 0.8357, + "step": 3077 + }, + { + "epoch": 0.27, + "grad_norm": 12.405518080950506, + "learning_rate": 8.560587372137561e-06, + "loss": 0.8737, + "step": 3078 + }, + { + "epoch": 0.27, + "grad_norm": 14.587498516933627, + "learning_rate": 8.559588682472656e-06, + "loss": 0.8793, + "step": 3079 + }, + { + "epoch": 0.27, + "grad_norm": 14.0457999093386, + "learning_rate": 8.558589704771756e-06, + "loss": 
0.833, + "step": 3080 + }, + { + "epoch": 0.27, + "grad_norm": 8.404748380871242, + "learning_rate": 8.557590439115697e-06, + "loss": 0.744, + "step": 3081 + }, + { + "epoch": 0.27, + "grad_norm": 21.61088230507084, + "learning_rate": 8.556590885585338e-06, + "loss": 0.7641, + "step": 3082 + }, + { + "epoch": 0.27, + "grad_norm": 6.124231587216938, + "learning_rate": 8.55559104426156e-06, + "loss": 0.824, + "step": 3083 + }, + { + "epoch": 0.27, + "grad_norm": 6.267610068619695, + "learning_rate": 8.55459091522527e-06, + "loss": 0.7114, + "step": 3084 + }, + { + "epoch": 0.27, + "grad_norm": 9.582369784315803, + "learning_rate": 8.553590498557394e-06, + "loss": 0.7662, + "step": 3085 + }, + { + "epoch": 0.27, + "grad_norm": 9.361072738335178, + "learning_rate": 8.552589794338889e-06, + "loss": 0.982, + "step": 3086 + }, + { + "epoch": 0.27, + "grad_norm": 4.8729033222375095, + "learning_rate": 8.551588802650723e-06, + "loss": 0.6788, + "step": 3087 + }, + { + "epoch": 0.27, + "grad_norm": 11.375598758196062, + "learning_rate": 8.550587523573903e-06, + "loss": 0.7436, + "step": 3088 + }, + { + "epoch": 0.27, + "grad_norm": 6.9925509591947685, + "learning_rate": 8.549585957189444e-06, + "loss": 0.8598, + "step": 3089 + }, + { + "epoch": 0.27, + "grad_norm": 2.6887936873141767, + "learning_rate": 8.548584103578395e-06, + "loss": 0.5183, + "step": 3090 + }, + { + "epoch": 0.27, + "grad_norm": 8.397732076590335, + "learning_rate": 8.547581962821823e-06, + "loss": 0.8844, + "step": 3091 + }, + { + "epoch": 0.27, + "grad_norm": 2.0714161563004816, + "learning_rate": 8.546579535000819e-06, + "loss": 0.4816, + "step": 3092 + }, + { + "epoch": 0.27, + "grad_norm": 11.405446589464221, + "learning_rate": 8.5455768201965e-06, + "loss": 0.7984, + "step": 3093 + }, + { + "epoch": 0.27, + "grad_norm": 8.06421402802851, + "learning_rate": 8.544573818490001e-06, + "loss": 0.6994, + "step": 3094 + }, + { + "epoch": 0.27, + "grad_norm": 10.505507852602213, + "learning_rate": 8.543570529962484e-06, + "loss": 0.939, + "step": 3095 + }, + { + "epoch": 0.27, + "grad_norm": 12.963611831651532, + "learning_rate": 8.542566954695137e-06, + "loss": 0.9368, + "step": 3096 + }, + { + "epoch": 0.27, + "grad_norm": 7.177071157672863, + "learning_rate": 8.541563092769161e-06, + "loss": 0.8546, + "step": 3097 + }, + { + "epoch": 0.27, + "grad_norm": 8.54662978077648, + "learning_rate": 8.540558944265793e-06, + "loss": 0.8078, + "step": 3098 + }, + { + "epoch": 0.27, + "grad_norm": 6.72605233074442, + "learning_rate": 8.539554509266286e-06, + "loss": 0.8596, + "step": 3099 + }, + { + "epoch": 0.27, + "grad_norm": 5.2226245329599195, + "learning_rate": 8.538549787851913e-06, + "loss": 0.7672, + "step": 3100 + }, + { + "epoch": 0.27, + "grad_norm": 11.25598206067046, + "learning_rate": 8.53754478010398e-06, + "loss": 0.8226, + "step": 3101 + }, + { + "epoch": 0.27, + "grad_norm": 5.735030161788527, + "learning_rate": 8.536539486103805e-06, + "loss": 0.9262, + "step": 3102 + }, + { + "epoch": 0.27, + "grad_norm": 58.54175421373675, + "learning_rate": 8.535533905932739e-06, + "loss": 0.8282, + "step": 3103 + }, + { + "epoch": 0.27, + "grad_norm": 7.7506103062135745, + "learning_rate": 8.53452803967215e-06, + "loss": 0.8769, + "step": 3104 + }, + { + "epoch": 0.27, + "grad_norm": 12.342932560210127, + "learning_rate": 8.533521887403432e-06, + "loss": 0.927, + "step": 3105 + }, + { + "epoch": 0.27, + "grad_norm": 13.60467356290392, + "learning_rate": 8.532515449207999e-06, + "loss": 0.6565, + "step": 3106 + }, + { + "epoch": 0.27, 
+ "grad_norm": 10.514679760485999, + "learning_rate": 8.531508725167293e-06, + "loss": 0.8797, + "step": 3107 + }, + { + "epoch": 0.27, + "grad_norm": 7.997552616678686, + "learning_rate": 8.530501715362775e-06, + "loss": 0.9176, + "step": 3108 + }, + { + "epoch": 0.27, + "grad_norm": 7.772768946166449, + "learning_rate": 8.529494419875932e-06, + "loss": 0.9164, + "step": 3109 + }, + { + "epoch": 0.27, + "grad_norm": 9.206126076825841, + "learning_rate": 8.52848683878827e-06, + "loss": 0.7392, + "step": 3110 + }, + { + "epoch": 0.27, + "grad_norm": 8.473376005996116, + "learning_rate": 8.527478972181324e-06, + "loss": 0.8929, + "step": 3111 + }, + { + "epoch": 0.27, + "grad_norm": 9.249763386694372, + "learning_rate": 8.526470820136646e-06, + "loss": 0.8362, + "step": 3112 + }, + { + "epoch": 0.27, + "grad_norm": 2.250253414105023, + "learning_rate": 8.525462382735816e-06, + "loss": 0.4666, + "step": 3113 + }, + { + "epoch": 0.27, + "grad_norm": 7.628852621928183, + "learning_rate": 8.524453660060434e-06, + "loss": 0.7495, + "step": 3114 + }, + { + "epoch": 0.27, + "grad_norm": 6.771373978364439, + "learning_rate": 8.523444652192123e-06, + "loss": 0.7543, + "step": 3115 + }, + { + "epoch": 0.27, + "grad_norm": 3.161257703140788, + "learning_rate": 8.522435359212533e-06, + "loss": 0.5406, + "step": 3116 + }, + { + "epoch": 0.27, + "grad_norm": 5.905236724840074, + "learning_rate": 8.521425781203333e-06, + "loss": 0.8289, + "step": 3117 + }, + { + "epoch": 0.27, + "grad_norm": 9.280316871683143, + "learning_rate": 8.520415918246216e-06, + "loss": 0.7942, + "step": 3118 + }, + { + "epoch": 0.27, + "grad_norm": 8.85887880803998, + "learning_rate": 8.5194057704229e-06, + "loss": 0.7564, + "step": 3119 + }, + { + "epoch": 0.27, + "grad_norm": 8.88831625428789, + "learning_rate": 8.518395337815123e-06, + "loss": 0.7611, + "step": 3120 + }, + { + "epoch": 0.27, + "grad_norm": 9.045219563503363, + "learning_rate": 8.517384620504647e-06, + "loss": 0.925, + "step": 3121 + }, + { + "epoch": 0.27, + "grad_norm": 12.747900420187522, + "learning_rate": 8.516373618573258e-06, + "loss": 0.9476, + "step": 3122 + }, + { + "epoch": 0.27, + "grad_norm": 7.277933544017275, + "learning_rate": 8.515362332102766e-06, + "loss": 0.8133, + "step": 3123 + }, + { + "epoch": 0.27, + "grad_norm": 6.5238781880003405, + "learning_rate": 8.514350761175e-06, + "loss": 0.8096, + "step": 3124 + }, + { + "epoch": 0.27, + "grad_norm": 6.831983408966379, + "learning_rate": 8.513338905871819e-06, + "loss": 0.7534, + "step": 3125 + }, + { + "epoch": 0.27, + "grad_norm": 10.815872258870897, + "learning_rate": 8.512326766275096e-06, + "loss": 0.9156, + "step": 3126 + }, + { + "epoch": 0.27, + "grad_norm": 5.288943951549614, + "learning_rate": 8.511314342466735e-06, + "loss": 0.7966, + "step": 3127 + }, + { + "epoch": 0.27, + "grad_norm": 2.44801961219163, + "learning_rate": 8.510301634528656e-06, + "loss": 0.5133, + "step": 3128 + }, + { + "epoch": 0.27, + "grad_norm": 6.3954607790537805, + "learning_rate": 8.50928864254281e-06, + "loss": 0.8272, + "step": 3129 + }, + { + "epoch": 0.27, + "grad_norm": 8.482559618535705, + "learning_rate": 8.508275366591161e-06, + "loss": 0.8727, + "step": 3130 + }, + { + "epoch": 0.27, + "grad_norm": 10.698382200048481, + "learning_rate": 8.50726180675571e-06, + "loss": 0.8596, + "step": 3131 + }, + { + "epoch": 0.28, + "grad_norm": 18.87561949917552, + "learning_rate": 8.506247963118464e-06, + "loss": 1.0312, + "step": 3132 + }, + { + "epoch": 0.28, + "grad_norm": 7.671160179286012, + 
"learning_rate": 8.505233835761466e-06, + "loss": 0.7999, + "step": 3133 + }, + { + "epoch": 0.28, + "grad_norm": 10.191093055757914, + "learning_rate": 8.504219424766777e-06, + "loss": 0.6496, + "step": 3134 + }, + { + "epoch": 0.28, + "grad_norm": 8.731973002142391, + "learning_rate": 8.503204730216481e-06, + "loss": 0.7103, + "step": 3135 + }, + { + "epoch": 0.28, + "grad_norm": 6.714905266809431, + "learning_rate": 8.502189752192685e-06, + "loss": 0.8094, + "step": 3136 + }, + { + "epoch": 0.28, + "grad_norm": 5.5557878187401615, + "learning_rate": 8.50117449077752e-06, + "loss": 0.8565, + "step": 3137 + }, + { + "epoch": 0.28, + "grad_norm": 9.485482760993612, + "learning_rate": 8.500158946053139e-06, + "loss": 0.9105, + "step": 3138 + }, + { + "epoch": 0.28, + "grad_norm": 14.658033564595131, + "learning_rate": 8.49914311810172e-06, + "loss": 0.7425, + "step": 3139 + }, + { + "epoch": 0.28, + "grad_norm": 15.298911190603057, + "learning_rate": 8.498127007005459e-06, + "loss": 0.9161, + "step": 3140 + }, + { + "epoch": 0.28, + "grad_norm": 9.93874852355141, + "learning_rate": 8.49711061284658e-06, + "loss": 0.7007, + "step": 3141 + }, + { + "epoch": 0.28, + "grad_norm": 8.088601973618472, + "learning_rate": 8.496093935707326e-06, + "loss": 1.02, + "step": 3142 + }, + { + "epoch": 0.28, + "grad_norm": 6.625966484789252, + "learning_rate": 8.495076975669968e-06, + "loss": 0.6636, + "step": 3143 + }, + { + "epoch": 0.28, + "grad_norm": 8.595146449363487, + "learning_rate": 8.494059732816794e-06, + "loss": 0.9201, + "step": 3144 + }, + { + "epoch": 0.28, + "grad_norm": 7.553305787250429, + "learning_rate": 8.493042207230117e-06, + "loss": 0.8635, + "step": 3145 + }, + { + "epoch": 0.28, + "grad_norm": 11.253080475992501, + "learning_rate": 8.492024398992278e-06, + "loss": 0.8804, + "step": 3146 + }, + { + "epoch": 0.28, + "grad_norm": 18.745658427471373, + "learning_rate": 8.491006308185632e-06, + "loss": 0.9761, + "step": 3147 + }, + { + "epoch": 0.28, + "grad_norm": 10.376999886738474, + "learning_rate": 8.48998793489256e-06, + "loss": 0.7112, + "step": 3148 + }, + { + "epoch": 0.28, + "grad_norm": 8.800288775160066, + "learning_rate": 8.488969279195474e-06, + "loss": 0.8241, + "step": 3149 + }, + { + "epoch": 0.28, + "grad_norm": 7.073851159375679, + "learning_rate": 8.487950341176794e-06, + "loss": 0.6641, + "step": 3150 + }, + { + "epoch": 0.28, + "grad_norm": 7.881612765269817, + "learning_rate": 8.486931120918976e-06, + "loss": 0.8134, + "step": 3151 + }, + { + "epoch": 0.28, + "grad_norm": 12.694214327606193, + "learning_rate": 8.485911618504493e-06, + "loss": 0.8728, + "step": 3152 + }, + { + "epoch": 0.28, + "grad_norm": 11.166927512509606, + "learning_rate": 8.484891834015837e-06, + "loss": 0.8935, + "step": 3153 + }, + { + "epoch": 0.28, + "grad_norm": 4.801487413070018, + "learning_rate": 8.483871767535535e-06, + "loss": 0.6893, + "step": 3154 + }, + { + "epoch": 0.28, + "grad_norm": 10.869220383260144, + "learning_rate": 8.482851419146121e-06, + "loss": 0.8508, + "step": 3155 + }, + { + "epoch": 0.28, + "grad_norm": 6.57994841023509, + "learning_rate": 8.481830788930166e-06, + "loss": 0.8456, + "step": 3156 + }, + { + "epoch": 0.28, + "grad_norm": 18.648951718611436, + "learning_rate": 8.480809876970253e-06, + "loss": 0.8919, + "step": 3157 + }, + { + "epoch": 0.28, + "grad_norm": 8.732825568086332, + "learning_rate": 8.479788683348996e-06, + "loss": 0.8331, + "step": 3158 + }, + { + "epoch": 0.28, + "grad_norm": 5.6799088386582195, + "learning_rate": 8.478767208149027e-06, 
+ "loss": 0.8853, + "step": 3159 + }, + { + "epoch": 0.28, + "grad_norm": 7.888372053363535, + "learning_rate": 8.477745451453003e-06, + "loss": 0.6349, + "step": 3160 + }, + { + "epoch": 0.28, + "grad_norm": 8.173109117016471, + "learning_rate": 8.476723413343599e-06, + "loss": 0.7257, + "step": 3161 + }, + { + "epoch": 0.28, + "grad_norm": 8.080614674383542, + "learning_rate": 8.475701093903523e-06, + "loss": 0.8721, + "step": 3162 + }, + { + "epoch": 0.28, + "grad_norm": 11.915920302461835, + "learning_rate": 8.474678493215495e-06, + "loss": 0.7436, + "step": 3163 + }, + { + "epoch": 0.28, + "grad_norm": 11.049736277957729, + "learning_rate": 8.473655611362262e-06, + "loss": 0.8534, + "step": 3164 + }, + { + "epoch": 0.28, + "grad_norm": 6.261002260178128, + "learning_rate": 8.472632448426596e-06, + "loss": 0.7226, + "step": 3165 + }, + { + "epoch": 0.28, + "grad_norm": 5.087437569238574, + "learning_rate": 8.47160900449129e-06, + "loss": 0.8388, + "step": 3166 + }, + { + "epoch": 0.28, + "grad_norm": 7.97046535041165, + "learning_rate": 8.470585279639156e-06, + "loss": 0.8925, + "step": 3167 + }, + { + "epoch": 0.28, + "grad_norm": 8.794779583262983, + "learning_rate": 8.469561273953035e-06, + "loss": 0.874, + "step": 3168 + }, + { + "epoch": 0.28, + "grad_norm": 6.669576986294365, + "learning_rate": 8.468536987515788e-06, + "loss": 0.7539, + "step": 3169 + }, + { + "epoch": 0.28, + "grad_norm": 9.631625919129084, + "learning_rate": 8.467512420410295e-06, + "loss": 0.9871, + "step": 3170 + }, + { + "epoch": 0.28, + "grad_norm": 14.605374713617152, + "learning_rate": 8.466487572719468e-06, + "loss": 0.8273, + "step": 3171 + }, + { + "epoch": 0.28, + "grad_norm": 6.357686887373408, + "learning_rate": 8.465462444526231e-06, + "loss": 0.6422, + "step": 3172 + }, + { + "epoch": 0.28, + "grad_norm": 2.772492911101937, + "learning_rate": 8.464437035913537e-06, + "loss": 0.4907, + "step": 3173 + }, + { + "epoch": 0.28, + "grad_norm": 6.370438379450058, + "learning_rate": 8.463411346964363e-06, + "loss": 0.9217, + "step": 3174 + }, + { + "epoch": 0.28, + "grad_norm": 12.77648188649008, + "learning_rate": 8.462385377761701e-06, + "loss": 0.8027, + "step": 3175 + }, + { + "epoch": 0.28, + "grad_norm": 6.813465767349711, + "learning_rate": 8.461359128388575e-06, + "loss": 0.7609, + "step": 3176 + }, + { + "epoch": 0.28, + "grad_norm": 2.312810120301569, + "learning_rate": 8.460332598928027e-06, + "loss": 0.5127, + "step": 3177 + }, + { + "epoch": 0.28, + "grad_norm": 10.526048799546295, + "learning_rate": 8.45930578946312e-06, + "loss": 0.6467, + "step": 3178 + }, + { + "epoch": 0.28, + "grad_norm": 6.604021117739147, + "learning_rate": 8.458278700076942e-06, + "loss": 0.9044, + "step": 3179 + }, + { + "epoch": 0.28, + "grad_norm": 6.19471593229433, + "learning_rate": 8.457251330852608e-06, + "loss": 0.7666, + "step": 3180 + }, + { + "epoch": 0.28, + "grad_norm": 4.63777645905137, + "learning_rate": 8.456223681873242e-06, + "loss": 0.9117, + "step": 3181 + }, + { + "epoch": 0.28, + "grad_norm": 2.516748699190015, + "learning_rate": 8.455195753222008e-06, + "loss": 0.535, + "step": 3182 + }, + { + "epoch": 0.28, + "grad_norm": 2.5341402494539698, + "learning_rate": 8.45416754498208e-06, + "loss": 0.5322, + "step": 3183 + }, + { + "epoch": 0.28, + "grad_norm": 6.674986279008562, + "learning_rate": 8.45313905723666e-06, + "loss": 0.7261, + "step": 3184 + }, + { + "epoch": 0.28, + "grad_norm": 7.535989473127013, + "learning_rate": 8.452110290068973e-06, + "loss": 0.9172, + "step": 3185 + }, + { + 
"epoch": 0.28, + "grad_norm": 3.83788207632623, + "learning_rate": 8.451081243562261e-06, + "loss": 0.5363, + "step": 3186 + }, + { + "epoch": 0.28, + "grad_norm": 8.288288078569098, + "learning_rate": 8.450051917799797e-06, + "loss": 0.646, + "step": 3187 + }, + { + "epoch": 0.28, + "grad_norm": 5.806962151880584, + "learning_rate": 8.44902231286487e-06, + "loss": 0.8111, + "step": 3188 + }, + { + "epoch": 0.28, + "grad_norm": 9.212561691171079, + "learning_rate": 8.447992428840795e-06, + "loss": 0.8417, + "step": 3189 + }, + { + "epoch": 0.28, + "grad_norm": 6.708902467171544, + "learning_rate": 8.446962265810907e-06, + "loss": 0.9408, + "step": 3190 + }, + { + "epoch": 0.28, + "grad_norm": 9.293214546781861, + "learning_rate": 8.445931823858568e-06, + "loss": 0.6503, + "step": 3191 + }, + { + "epoch": 0.28, + "grad_norm": 8.574771077272118, + "learning_rate": 8.444901103067158e-06, + "loss": 0.9667, + "step": 3192 + }, + { + "epoch": 0.28, + "grad_norm": 3.881314264594567, + "learning_rate": 8.44387010352008e-06, + "loss": 0.5382, + "step": 3193 + }, + { + "epoch": 0.28, + "grad_norm": 7.0730778611234255, + "learning_rate": 8.44283882530076e-06, + "loss": 0.9596, + "step": 3194 + }, + { + "epoch": 0.28, + "grad_norm": 9.86980792105976, + "learning_rate": 8.441807268492653e-06, + "loss": 0.7707, + "step": 3195 + }, + { + "epoch": 0.28, + "grad_norm": 8.168437290172319, + "learning_rate": 8.440775433179223e-06, + "loss": 0.927, + "step": 3196 + }, + { + "epoch": 0.28, + "grad_norm": 8.486649612202303, + "learning_rate": 8.439743319443972e-06, + "loss": 0.8405, + "step": 3197 + }, + { + "epoch": 0.28, + "grad_norm": 9.395384796389806, + "learning_rate": 8.43871092737041e-06, + "loss": 0.728, + "step": 3198 + }, + { + "epoch": 0.28, + "grad_norm": 13.69509955953549, + "learning_rate": 8.43767825704208e-06, + "loss": 0.8882, + "step": 3199 + }, + { + "epoch": 0.28, + "grad_norm": 8.76615487070126, + "learning_rate": 8.436645308542547e-06, + "loss": 0.829, + "step": 3200 + }, + { + "epoch": 0.28, + "grad_norm": 6.49416772950944, + "learning_rate": 8.43561208195539e-06, + "loss": 0.8603, + "step": 3201 + }, + { + "epoch": 0.28, + "grad_norm": 7.25092587342527, + "learning_rate": 8.434578577364218e-06, + "loss": 0.8878, + "step": 3202 + }, + { + "epoch": 0.28, + "grad_norm": 8.48815935137707, + "learning_rate": 8.433544794852662e-06, + "loss": 0.964, + "step": 3203 + }, + { + "epoch": 0.28, + "grad_norm": 7.622640992330007, + "learning_rate": 8.432510734504371e-06, + "loss": 0.8214, + "step": 3204 + }, + { + "epoch": 0.28, + "grad_norm": 6.2914054688141885, + "learning_rate": 8.431476396403022e-06, + "loss": 0.7712, + "step": 3205 + }, + { + "epoch": 0.28, + "grad_norm": 9.184812071275433, + "learning_rate": 8.430441780632312e-06, + "loss": 0.6917, + "step": 3206 + }, + { + "epoch": 0.28, + "grad_norm": 9.169913454778811, + "learning_rate": 8.429406887275958e-06, + "loss": 0.8181, + "step": 3207 + }, + { + "epoch": 0.28, + "grad_norm": 6.640148389146151, + "learning_rate": 8.428371716417703e-06, + "loss": 0.8961, + "step": 3208 + }, + { + "epoch": 0.28, + "grad_norm": 11.122421892241727, + "learning_rate": 8.42733626814131e-06, + "loss": 0.7178, + "step": 3209 + }, + { + "epoch": 0.28, + "grad_norm": 7.835317079892041, + "learning_rate": 8.42630054253057e-06, + "loss": 0.7826, + "step": 3210 + }, + { + "epoch": 0.28, + "grad_norm": 7.290157503069916, + "learning_rate": 8.425264539669289e-06, + "loss": 0.8275, + "step": 3211 + }, + { + "epoch": 0.28, + "grad_norm": 8.774766399578127, + 
"learning_rate": 8.424228259641299e-06, + "loss": 0.8479, + "step": 3212 + }, + { + "epoch": 0.28, + "grad_norm": 10.575565150022703, + "learning_rate": 8.423191702530453e-06, + "loss": 0.9274, + "step": 3213 + }, + { + "epoch": 0.28, + "grad_norm": 8.639758881118102, + "learning_rate": 8.42215486842063e-06, + "loss": 0.8245, + "step": 3214 + }, + { + "epoch": 0.28, + "grad_norm": 8.486890092408824, + "learning_rate": 8.421117757395728e-06, + "loss": 0.7704, + "step": 3215 + }, + { + "epoch": 0.28, + "grad_norm": 5.99782247375563, + "learning_rate": 8.420080369539665e-06, + "loss": 0.8015, + "step": 3216 + }, + { + "epoch": 0.28, + "grad_norm": 8.552919769357052, + "learning_rate": 8.419042704936389e-06, + "loss": 0.6928, + "step": 3217 + }, + { + "epoch": 0.28, + "grad_norm": 5.779483081997814, + "learning_rate": 8.418004763669864e-06, + "loss": 0.6635, + "step": 3218 + }, + { + "epoch": 0.28, + "grad_norm": 2.614083791250978, + "learning_rate": 8.416966545824082e-06, + "loss": 0.5705, + "step": 3219 + }, + { + "epoch": 0.28, + "grad_norm": 23.172877333190698, + "learning_rate": 8.415928051483049e-06, + "loss": 0.895, + "step": 3220 + }, + { + "epoch": 0.28, + "grad_norm": 8.939314928067647, + "learning_rate": 8.4148892807308e-06, + "loss": 0.8779, + "step": 3221 + }, + { + "epoch": 0.28, + "grad_norm": 5.831296857889294, + "learning_rate": 8.41385023365139e-06, + "loss": 0.8594, + "step": 3222 + }, + { + "epoch": 0.28, + "grad_norm": 10.137410864828436, + "learning_rate": 8.4128109103289e-06, + "loss": 0.7498, + "step": 3223 + }, + { + "epoch": 0.28, + "grad_norm": 3.9469711973900337, + "learning_rate": 8.411771310847426e-06, + "loss": 0.5243, + "step": 3224 + }, + { + "epoch": 0.28, + "grad_norm": 6.985383195471843, + "learning_rate": 8.410731435291093e-06, + "loss": 0.864, + "step": 3225 + }, + { + "epoch": 0.28, + "grad_norm": 7.224065847844632, + "learning_rate": 8.409691283744046e-06, + "loss": 0.8428, + "step": 3226 + }, + { + "epoch": 0.28, + "grad_norm": 9.470716577154464, + "learning_rate": 8.408650856290453e-06, + "loss": 0.8046, + "step": 3227 + }, + { + "epoch": 0.28, + "grad_norm": 8.388641532731882, + "learning_rate": 8.407610153014501e-06, + "loss": 0.9257, + "step": 3228 + }, + { + "epoch": 0.28, + "grad_norm": 6.571095196261312, + "learning_rate": 8.406569174000404e-06, + "loss": 0.7916, + "step": 3229 + }, + { + "epoch": 0.28, + "grad_norm": 11.475541989925311, + "learning_rate": 8.405527919332396e-06, + "loss": 0.627, + "step": 3230 + }, + { + "epoch": 0.28, + "grad_norm": 9.215175855147082, + "learning_rate": 8.404486389094734e-06, + "loss": 0.9029, + "step": 3231 + }, + { + "epoch": 0.28, + "grad_norm": 35.81888127332189, + "learning_rate": 8.403444583371698e-06, + "loss": 0.8138, + "step": 3232 + }, + { + "epoch": 0.28, + "grad_norm": 10.393795666150487, + "learning_rate": 8.402402502247585e-06, + "loss": 0.98, + "step": 3233 + }, + { + "epoch": 0.28, + "grad_norm": 6.904056055400079, + "learning_rate": 8.401360145806723e-06, + "loss": 0.8723, + "step": 3234 + }, + { + "epoch": 0.28, + "grad_norm": 7.874985016113198, + "learning_rate": 8.400317514133454e-06, + "loss": 0.9758, + "step": 3235 + }, + { + "epoch": 0.28, + "grad_norm": 7.422870112273012, + "learning_rate": 8.39927460731215e-06, + "loss": 0.8422, + "step": 3236 + }, + { + "epoch": 0.28, + "grad_norm": 9.422308301483163, + "learning_rate": 8.398231425427197e-06, + "loss": 0.8418, + "step": 3237 + }, + { + "epoch": 0.28, + "grad_norm": 18.518007385346646, + "learning_rate": 8.397187968563011e-06, + "loss": 
0.8145, + "step": 3238 + }, + { + "epoch": 0.28, + "grad_norm": 21.916473638726465, + "learning_rate": 8.396144236804026e-06, + "loss": 0.7676, + "step": 3239 + }, + { + "epoch": 0.28, + "grad_norm": 2.3180061724862004, + "learning_rate": 8.395100230234699e-06, + "loss": 0.5149, + "step": 3240 + }, + { + "epoch": 0.28, + "grad_norm": 7.080087966008208, + "learning_rate": 8.394055948939508e-06, + "loss": 0.8225, + "step": 3241 + }, + { + "epoch": 0.28, + "grad_norm": 8.833264570474764, + "learning_rate": 8.393011393002955e-06, + "loss": 0.8049, + "step": 3242 + }, + { + "epoch": 0.28, + "grad_norm": 2.104360432624469, + "learning_rate": 8.391966562509563e-06, + "loss": 0.5321, + "step": 3243 + }, + { + "epoch": 0.28, + "grad_norm": 19.464916491346827, + "learning_rate": 8.390921457543881e-06, + "loss": 0.8742, + "step": 3244 + }, + { + "epoch": 0.28, + "grad_norm": 10.556718346879325, + "learning_rate": 8.389876078190475e-06, + "loss": 0.8187, + "step": 3245 + }, + { + "epoch": 0.29, + "grad_norm": 8.56656536572149, + "learning_rate": 8.388830424533935e-06, + "loss": 0.8163, + "step": 3246 + }, + { + "epoch": 0.29, + "grad_norm": 7.256630066550686, + "learning_rate": 8.387784496658872e-06, + "loss": 0.7443, + "step": 3247 + }, + { + "epoch": 0.29, + "grad_norm": 8.407955415427805, + "learning_rate": 8.386738294649926e-06, + "loss": 0.8873, + "step": 3248 + }, + { + "epoch": 0.29, + "grad_norm": 7.468012705467907, + "learning_rate": 8.38569181859175e-06, + "loss": 0.8625, + "step": 3249 + }, + { + "epoch": 0.29, + "grad_norm": 6.603601731750605, + "learning_rate": 8.384645068569024e-06, + "loss": 0.7543, + "step": 3250 + }, + { + "epoch": 0.29, + "grad_norm": 10.51764490323785, + "learning_rate": 8.383598044666448e-06, + "loss": 0.9293, + "step": 3251 + }, + { + "epoch": 0.29, + "grad_norm": 20.35179735664073, + "learning_rate": 8.382550746968747e-06, + "loss": 0.885, + "step": 3252 + }, + { + "epoch": 0.29, + "grad_norm": 15.357863927453472, + "learning_rate": 8.381503175560666e-06, + "loss": 0.7691, + "step": 3253 + }, + { + "epoch": 0.29, + "grad_norm": 9.55208689557372, + "learning_rate": 8.380455330526973e-06, + "loss": 0.7507, + "step": 3254 + }, + { + "epoch": 0.29, + "grad_norm": 3.743386665415296, + "learning_rate": 8.379407211952459e-06, + "loss": 0.4561, + "step": 3255 + }, + { + "epoch": 0.29, + "grad_norm": 3.605436407710461, + "learning_rate": 8.378358819921934e-06, + "loss": 0.5875, + "step": 3256 + }, + { + "epoch": 0.29, + "grad_norm": 9.52225052180041, + "learning_rate": 8.377310154520232e-06, + "loss": 0.6994, + "step": 3257 + }, + { + "epoch": 0.29, + "grad_norm": 7.660284435788593, + "learning_rate": 8.376261215832211e-06, + "loss": 0.8751, + "step": 3258 + }, + { + "epoch": 0.29, + "grad_norm": 12.61899645568873, + "learning_rate": 8.375212003942747e-06, + "loss": 0.7391, + "step": 3259 + }, + { + "epoch": 0.29, + "grad_norm": 6.156233988431764, + "learning_rate": 8.374162518936744e-06, + "loss": 0.6744, + "step": 3260 + }, + { + "epoch": 0.29, + "grad_norm": 11.125432318182412, + "learning_rate": 8.373112760899121e-06, + "loss": 0.7234, + "step": 3261 + }, + { + "epoch": 0.29, + "grad_norm": 8.628868848705439, + "learning_rate": 8.372062729914827e-06, + "loss": 0.8335, + "step": 3262 + }, + { + "epoch": 0.29, + "grad_norm": 7.113764785723414, + "learning_rate": 8.371012426068822e-06, + "loss": 0.7422, + "step": 3263 + }, + { + "epoch": 0.29, + "grad_norm": 6.653735983095822, + "learning_rate": 8.369961849446103e-06, + "loss": 0.8515, + "step": 3264 + }, + { + "epoch": 
0.29, + "grad_norm": 8.861866729560012, + "learning_rate": 8.368911000131674e-06, + "loss": 0.7671, + "step": 3265 + }, + { + "epoch": 0.29, + "grad_norm": 6.982151725019161, + "learning_rate": 8.367859878210571e-06, + "loss": 0.8289, + "step": 3266 + }, + { + "epoch": 0.29, + "grad_norm": 2.6860495390456305, + "learning_rate": 8.36680848376785e-06, + "loss": 0.6042, + "step": 3267 + }, + { + "epoch": 0.29, + "grad_norm": 10.769213403672204, + "learning_rate": 8.365756816888586e-06, + "loss": 0.9345, + "step": 3268 + }, + { + "epoch": 0.29, + "grad_norm": 8.492725294607125, + "learning_rate": 8.36470487765788e-06, + "loss": 0.7629, + "step": 3269 + }, + { + "epoch": 0.29, + "grad_norm": 7.941294372831879, + "learning_rate": 8.36365266616085e-06, + "loss": 0.8253, + "step": 3270 + }, + { + "epoch": 0.29, + "grad_norm": 9.77531244015847, + "learning_rate": 8.362600182482644e-06, + "loss": 0.7575, + "step": 3271 + }, + { + "epoch": 0.29, + "grad_norm": 8.681734661413858, + "learning_rate": 8.361547426708424e-06, + "loss": 0.8117, + "step": 3272 + }, + { + "epoch": 0.29, + "grad_norm": 10.898183794404963, + "learning_rate": 8.360494398923378e-06, + "loss": 0.8895, + "step": 3273 + }, + { + "epoch": 0.29, + "grad_norm": 12.184369876398414, + "learning_rate": 8.359441099212712e-06, + "loss": 0.8264, + "step": 3274 + }, + { + "epoch": 0.29, + "grad_norm": 7.6020986404433435, + "learning_rate": 8.358387527661664e-06, + "loss": 0.8571, + "step": 3275 + }, + { + "epoch": 0.29, + "grad_norm": 13.151710656761809, + "learning_rate": 8.357333684355485e-06, + "loss": 0.9524, + "step": 3276 + }, + { + "epoch": 0.29, + "grad_norm": 44.69361398467823, + "learning_rate": 8.356279569379444e-06, + "loss": 0.8855, + "step": 3277 + }, + { + "epoch": 0.29, + "grad_norm": 25.917975073329146, + "learning_rate": 8.355225182818847e-06, + "loss": 0.7541, + "step": 3278 + }, + { + "epoch": 0.29, + "grad_norm": 11.258097722356196, + "learning_rate": 8.354170524759008e-06, + "loss": 0.7212, + "step": 3279 + }, + { + "epoch": 0.29, + "grad_norm": 2.4178917361728955, + "learning_rate": 8.353115595285268e-06, + "loss": 0.5264, + "step": 3280 + }, + { + "epoch": 0.29, + "grad_norm": 12.652199114078567, + "learning_rate": 8.352060394482994e-06, + "loss": 0.8579, + "step": 3281 + }, + { + "epoch": 0.29, + "grad_norm": 6.016589068912108, + "learning_rate": 8.35100492243757e-06, + "loss": 0.8623, + "step": 3282 + }, + { + "epoch": 0.29, + "grad_norm": 8.228074981551671, + "learning_rate": 8.349949179234398e-06, + "loss": 0.7276, + "step": 3283 + }, + { + "epoch": 0.29, + "grad_norm": 7.876401483471793, + "learning_rate": 8.348893164958912e-06, + "loss": 0.6997, + "step": 3284 + }, + { + "epoch": 0.29, + "grad_norm": 8.642345656269839, + "learning_rate": 8.347836879696562e-06, + "loss": 0.8155, + "step": 3285 + }, + { + "epoch": 0.29, + "grad_norm": 3.222681747045196, + "learning_rate": 8.346780323532821e-06, + "loss": 0.6053, + "step": 3286 + }, + { + "epoch": 0.29, + "grad_norm": 6.385639399075591, + "learning_rate": 8.345723496553184e-06, + "loss": 0.6488, + "step": 3287 + }, + { + "epoch": 0.29, + "grad_norm": 7.9146141755700965, + "learning_rate": 8.344666398843165e-06, + "loss": 0.8428, + "step": 3288 + }, + { + "epoch": 0.29, + "grad_norm": 8.060189640584623, + "learning_rate": 8.343609030488306e-06, + "loss": 0.8223, + "step": 3289 + }, + { + "epoch": 0.29, + "grad_norm": 8.283733648104143, + "learning_rate": 8.342551391574165e-06, + "loss": 0.7511, + "step": 3290 + }, + { + "epoch": 0.29, + "grad_norm": 
6.622418516243713, + "learning_rate": 8.341493482186326e-06, + "loss": 0.9262, + "step": 3291 + }, + { + "epoch": 0.29, + "grad_norm": 9.074488558422157, + "learning_rate": 8.340435302410393e-06, + "loss": 0.7949, + "step": 3292 + }, + { + "epoch": 0.29, + "grad_norm": 11.222174909991127, + "learning_rate": 8.339376852331992e-06, + "loss": 0.8665, + "step": 3293 + }, + { + "epoch": 0.29, + "grad_norm": 10.424150141818723, + "learning_rate": 8.33831813203677e-06, + "loss": 0.9038, + "step": 3294 + }, + { + "epoch": 0.29, + "grad_norm": 9.282309372666171, + "learning_rate": 8.337259141610399e-06, + "loss": 0.7738, + "step": 3295 + }, + { + "epoch": 0.29, + "grad_norm": 16.7575735150019, + "learning_rate": 8.336199881138568e-06, + "loss": 0.868, + "step": 3296 + }, + { + "epoch": 0.29, + "grad_norm": 5.84427206739459, + "learning_rate": 8.335140350706992e-06, + "loss": 0.8336, + "step": 3297 + }, + { + "epoch": 0.29, + "grad_norm": 5.626776869842072, + "learning_rate": 8.334080550401408e-06, + "loss": 0.6391, + "step": 3298 + }, + { + "epoch": 0.29, + "grad_norm": 13.595995640368967, + "learning_rate": 8.333020480307572e-06, + "loss": 0.8188, + "step": 3299 + }, + { + "epoch": 0.29, + "grad_norm": 6.906018146347141, + "learning_rate": 8.331960140511261e-06, + "loss": 0.8443, + "step": 3300 + }, + { + "epoch": 0.29, + "grad_norm": 9.642664698414338, + "learning_rate": 8.33089953109828e-06, + "loss": 0.9869, + "step": 3301 + }, + { + "epoch": 0.29, + "grad_norm": 8.601754221825892, + "learning_rate": 8.329838652154449e-06, + "loss": 0.6541, + "step": 3302 + }, + { + "epoch": 0.29, + "grad_norm": 27.015557584458204, + "learning_rate": 8.328777503765613e-06, + "loss": 0.7428, + "step": 3303 + }, + { + "epoch": 0.29, + "grad_norm": 14.995961250690268, + "learning_rate": 8.327716086017638e-06, + "loss": 0.8229, + "step": 3304 + }, + { + "epoch": 0.29, + "grad_norm": 8.203123713591127, + "learning_rate": 8.326654398996412e-06, + "loss": 0.9, + "step": 3305 + }, + { + "epoch": 0.29, + "grad_norm": 7.155437489844302, + "learning_rate": 8.325592442787847e-06, + "loss": 0.7923, + "step": 3306 + }, + { + "epoch": 0.29, + "grad_norm": 8.519506495625564, + "learning_rate": 8.324530217477873e-06, + "loss": 0.7912, + "step": 3307 + }, + { + "epoch": 0.29, + "grad_norm": 14.43068877748966, + "learning_rate": 8.323467723152445e-06, + "loss": 0.7767, + "step": 3308 + }, + { + "epoch": 0.29, + "grad_norm": 2.3623628639600676, + "learning_rate": 8.322404959897536e-06, + "loss": 0.5041, + "step": 3309 + }, + { + "epoch": 0.29, + "grad_norm": 6.272721515631472, + "learning_rate": 8.321341927799143e-06, + "loss": 0.7297, + "step": 3310 + }, + { + "epoch": 0.29, + "grad_norm": 8.587079217936921, + "learning_rate": 8.320278626943288e-06, + "loss": 0.817, + "step": 3311 + }, + { + "epoch": 0.29, + "grad_norm": 3.3777810867084925, + "learning_rate": 8.319215057416007e-06, + "loss": 0.6076, + "step": 3312 + }, + { + "epoch": 0.29, + "grad_norm": 12.003011891331704, + "learning_rate": 8.318151219303368e-06, + "loss": 0.657, + "step": 3313 + }, + { + "epoch": 0.29, + "grad_norm": 8.651972913781593, + "learning_rate": 8.31708711269145e-06, + "loss": 0.8655, + "step": 3314 + }, + { + "epoch": 0.29, + "grad_norm": 5.748120202104353, + "learning_rate": 8.316022737666362e-06, + "loss": 0.5684, + "step": 3315 + }, + { + "epoch": 0.29, + "grad_norm": 7.2699299131678, + "learning_rate": 8.314958094314228e-06, + "loss": 0.7826, + "step": 3316 + }, + { + "epoch": 0.29, + "grad_norm": 13.98050370163564, + "learning_rate": 
8.3138931827212e-06, + "loss": 0.7469, + "step": 3317 + }, + { + "epoch": 0.29, + "grad_norm": 2.553599864404236, + "learning_rate": 8.312828002973448e-06, + "loss": 0.6057, + "step": 3318 + }, + { + "epoch": 0.29, + "grad_norm": 8.791084426713152, + "learning_rate": 8.311762555157164e-06, + "loss": 0.6536, + "step": 3319 + }, + { + "epoch": 0.29, + "grad_norm": 5.942464316887609, + "learning_rate": 8.310696839358563e-06, + "loss": 0.8183, + "step": 3320 + }, + { + "epoch": 0.29, + "grad_norm": 8.074649818402786, + "learning_rate": 8.309630855663882e-06, + "loss": 0.8168, + "step": 3321 + }, + { + "epoch": 0.29, + "grad_norm": 11.962479563961882, + "learning_rate": 8.308564604159377e-06, + "loss": 0.8506, + "step": 3322 + }, + { + "epoch": 0.29, + "grad_norm": 8.139217772772902, + "learning_rate": 8.307498084931327e-06, + "loss": 0.7559, + "step": 3323 + }, + { + "epoch": 0.29, + "grad_norm": 7.836552652455766, + "learning_rate": 8.306431298066036e-06, + "loss": 0.7236, + "step": 3324 + }, + { + "epoch": 0.29, + "grad_norm": 2.784975167734778, + "learning_rate": 8.305364243649822e-06, + "loss": 0.5327, + "step": 3325 + }, + { + "epoch": 0.29, + "grad_norm": 10.87517874974802, + "learning_rate": 8.304296921769031e-06, + "loss": 0.8177, + "step": 3326 + }, + { + "epoch": 0.29, + "grad_norm": 12.38124181471453, + "learning_rate": 8.30322933251003e-06, + "loss": 0.794, + "step": 3327 + }, + { + "epoch": 0.29, + "grad_norm": 8.401955814415517, + "learning_rate": 8.302161475959207e-06, + "loss": 0.7914, + "step": 3328 + }, + { + "epoch": 0.29, + "grad_norm": 8.989354082950392, + "learning_rate": 8.30109335220297e-06, + "loss": 0.879, + "step": 3329 + }, + { + "epoch": 0.29, + "grad_norm": 15.952768997343641, + "learning_rate": 8.30002496132775e-06, + "loss": 0.8661, + "step": 3330 + }, + { + "epoch": 0.29, + "grad_norm": 12.847509166169301, + "learning_rate": 8.29895630342e-06, + "loss": 0.7652, + "step": 3331 + }, + { + "epoch": 0.29, + "grad_norm": 15.352566366069828, + "learning_rate": 8.297887378566194e-06, + "loss": 0.8074, + "step": 3332 + }, + { + "epoch": 0.29, + "grad_norm": 9.76806057766376, + "learning_rate": 8.296818186852825e-06, + "loss": 0.9143, + "step": 3333 + }, + { + "epoch": 0.29, + "grad_norm": 3.867477620408964, + "learning_rate": 8.295748728366414e-06, + "loss": 0.6571, + "step": 3334 + }, + { + "epoch": 0.29, + "grad_norm": 10.248958641682469, + "learning_rate": 8.294679003193497e-06, + "loss": 0.815, + "step": 3335 + }, + { + "epoch": 0.29, + "grad_norm": 9.617295087675538, + "learning_rate": 8.293609011420636e-06, + "loss": 0.8542, + "step": 3336 + }, + { + "epoch": 0.29, + "grad_norm": 11.044726635980169, + "learning_rate": 8.292538753134412e-06, + "loss": 0.7894, + "step": 3337 + }, + { + "epoch": 0.29, + "grad_norm": 7.38351080367891, + "learning_rate": 8.291468228421429e-06, + "loss": 0.7628, + "step": 3338 + }, + { + "epoch": 0.29, + "grad_norm": 2.7298460507897913, + "learning_rate": 8.290397437368311e-06, + "loss": 0.6134, + "step": 3339 + }, + { + "epoch": 0.29, + "grad_norm": 9.629016503873912, + "learning_rate": 8.289326380061708e-06, + "loss": 1.0189, + "step": 3340 + }, + { + "epoch": 0.29, + "grad_norm": 8.262413782070452, + "learning_rate": 8.288255056588284e-06, + "loss": 0.8154, + "step": 3341 + }, + { + "epoch": 0.29, + "grad_norm": 11.253396290107663, + "learning_rate": 8.287183467034732e-06, + "loss": 0.748, + "step": 3342 + }, + { + "epoch": 0.29, + "grad_norm": 13.644935716420017, + "learning_rate": 8.286111611487762e-06, + "loss": 0.9261, + "step": 
3343 + }, + { + "epoch": 0.29, + "grad_norm": 3.02178467830843, + "learning_rate": 8.285039490034104e-06, + "loss": 0.5383, + "step": 3344 + }, + { + "epoch": 0.29, + "grad_norm": 6.475217294441489, + "learning_rate": 8.283967102760518e-06, + "loss": 0.9496, + "step": 3345 + }, + { + "epoch": 0.29, + "grad_norm": 6.28883196976804, + "learning_rate": 8.282894449753775e-06, + "loss": 0.8155, + "step": 3346 + }, + { + "epoch": 0.29, + "grad_norm": 8.865799641509566, + "learning_rate": 8.281821531100673e-06, + "loss": 0.9703, + "step": 3347 + }, + { + "epoch": 0.29, + "grad_norm": 10.261804481907188, + "learning_rate": 8.280748346888032e-06, + "loss": 0.8683, + "step": 3348 + }, + { + "epoch": 0.29, + "grad_norm": 10.200286069751442, + "learning_rate": 8.279674897202694e-06, + "loss": 0.8185, + "step": 3349 + }, + { + "epoch": 0.29, + "grad_norm": 12.539773611917376, + "learning_rate": 8.278601182131518e-06, + "loss": 0.7614, + "step": 3350 + }, + { + "epoch": 0.29, + "grad_norm": 14.62948785751598, + "learning_rate": 8.277527201761387e-06, + "loss": 0.9362, + "step": 3351 + }, + { + "epoch": 0.29, + "grad_norm": 8.146678181269086, + "learning_rate": 8.276452956179209e-06, + "loss": 0.7825, + "step": 3352 + }, + { + "epoch": 0.29, + "grad_norm": 5.613927892001161, + "learning_rate": 8.275378445471905e-06, + "loss": 0.836, + "step": 3353 + }, + { + "epoch": 0.29, + "grad_norm": 8.794700836236922, + "learning_rate": 8.274303669726427e-06, + "loss": 0.7962, + "step": 3354 + }, + { + "epoch": 0.29, + "grad_norm": 8.955781323391554, + "learning_rate": 8.273228629029742e-06, + "loss": 0.8288, + "step": 3355 + }, + { + "epoch": 0.29, + "grad_norm": 10.950410818803803, + "learning_rate": 8.272153323468842e-06, + "loss": 1.0562, + "step": 3356 + }, + { + "epoch": 0.29, + "grad_norm": 10.56183585365, + "learning_rate": 8.271077753130736e-06, + "loss": 0.8479, + "step": 3357 + }, + { + "epoch": 0.29, + "grad_norm": 3.8861914688077785, + "learning_rate": 8.270001918102462e-06, + "loss": 0.6667, + "step": 3358 + }, + { + "epoch": 0.29, + "grad_norm": 11.809642272569004, + "learning_rate": 8.268925818471069e-06, + "loss": 0.8278, + "step": 3359 + }, + { + "epoch": 0.3, + "grad_norm": 17.74528897126826, + "learning_rate": 8.26784945432364e-06, + "loss": 0.9477, + "step": 3360 + }, + { + "epoch": 0.3, + "grad_norm": 6.899837769674577, + "learning_rate": 8.266772825747267e-06, + "loss": 0.7144, + "step": 3361 + }, + { + "epoch": 0.3, + "grad_norm": 2.7263554268723422, + "learning_rate": 8.265695932829071e-06, + "loss": 0.533, + "step": 3362 + }, + { + "epoch": 0.3, + "grad_norm": 7.875805809903537, + "learning_rate": 8.264618775656193e-06, + "loss": 0.7865, + "step": 3363 + }, + { + "epoch": 0.3, + "grad_norm": 5.4364383051103555, + "learning_rate": 8.263541354315793e-06, + "loss": 0.9115, + "step": 3364 + }, + { + "epoch": 0.3, + "grad_norm": 8.456250252661095, + "learning_rate": 8.262463668895055e-06, + "loss": 0.8672, + "step": 3365 + }, + { + "epoch": 0.3, + "grad_norm": 7.489790066805471, + "learning_rate": 8.261385719481186e-06, + "loss": 0.9189, + "step": 3366 + }, + { + "epoch": 0.3, + "grad_norm": 7.551321808320784, + "learning_rate": 8.260307506161407e-06, + "loss": 0.9146, + "step": 3367 + }, + { + "epoch": 0.3, + "grad_norm": 13.010327864675736, + "learning_rate": 8.259229029022969e-06, + "loss": 0.7472, + "step": 3368 + }, + { + "epoch": 0.3, + "grad_norm": 7.684025810939782, + "learning_rate": 8.25815028815314e-06, + "loss": 0.9214, + "step": 3369 + }, + { + "epoch": 0.3, + "grad_norm": 
9.646407704977427, + "learning_rate": 8.25707128363921e-06, + "loss": 0.9043, + "step": 3370 + }, + { + "epoch": 0.3, + "grad_norm": 3.3136439218204052, + "learning_rate": 8.255992015568486e-06, + "loss": 0.5104, + "step": 3371 + }, + { + "epoch": 0.3, + "grad_norm": 6.6814286265151, + "learning_rate": 8.254912484028308e-06, + "loss": 0.7301, + "step": 3372 + }, + { + "epoch": 0.3, + "grad_norm": 8.488676404796161, + "learning_rate": 8.253832689106024e-06, + "loss": 0.8062, + "step": 3373 + }, + { + "epoch": 0.3, + "grad_norm": 6.028850141409825, + "learning_rate": 8.252752630889011e-06, + "loss": 0.801, + "step": 3374 + }, + { + "epoch": 0.3, + "grad_norm": 10.240339559862626, + "learning_rate": 8.251672309464667e-06, + "loss": 1.0153, + "step": 3375 + }, + { + "epoch": 0.3, + "grad_norm": 3.030210312003958, + "learning_rate": 8.25059172492041e-06, + "loss": 0.4813, + "step": 3376 + }, + { + "epoch": 0.3, + "grad_norm": 9.191428472045503, + "learning_rate": 8.249510877343674e-06, + "loss": 0.8807, + "step": 3377 + }, + { + "epoch": 0.3, + "grad_norm": 9.455887967585298, + "learning_rate": 8.248429766821925e-06, + "loss": 0.7992, + "step": 3378 + }, + { + "epoch": 0.3, + "grad_norm": 8.97110233541002, + "learning_rate": 8.247348393442643e-06, + "loss": 0.8253, + "step": 3379 + }, + { + "epoch": 0.3, + "grad_norm": 7.896093374366543, + "learning_rate": 8.246266757293331e-06, + "loss": 0.6984, + "step": 3380 + }, + { + "epoch": 0.3, + "grad_norm": 9.455715747700037, + "learning_rate": 8.245184858461512e-06, + "loss": 0.8431, + "step": 3381 + }, + { + "epoch": 0.3, + "grad_norm": 8.161295717548905, + "learning_rate": 8.244102697034732e-06, + "loss": 0.9779, + "step": 3382 + }, + { + "epoch": 0.3, + "grad_norm": 8.691965815581375, + "learning_rate": 8.24302027310056e-06, + "loss": 0.8752, + "step": 3383 + }, + { + "epoch": 0.3, + "grad_norm": 1.8521572376125006, + "learning_rate": 8.24193758674658e-06, + "loss": 0.437, + "step": 3384 + }, + { + "epoch": 0.3, + "grad_norm": 11.190277078839811, + "learning_rate": 8.240854638060404e-06, + "loss": 0.9103, + "step": 3385 + }, + { + "epoch": 0.3, + "grad_norm": 3.5275127638642116, + "learning_rate": 8.239771427129661e-06, + "loss": 0.5291, + "step": 3386 + }, + { + "epoch": 0.3, + "grad_norm": 7.734678173344992, + "learning_rate": 8.238687954042003e-06, + "loss": 0.766, + "step": 3387 + }, + { + "epoch": 0.3, + "grad_norm": 9.514043740644363, + "learning_rate": 8.237604218885104e-06, + "loss": 0.854, + "step": 3388 + }, + { + "epoch": 0.3, + "grad_norm": 18.198412177784995, + "learning_rate": 8.236520221746657e-06, + "loss": 0.7809, + "step": 3389 + }, + { + "epoch": 0.3, + "grad_norm": 6.515274702943235, + "learning_rate": 8.235435962714378e-06, + "loss": 0.8773, + "step": 3390 + }, + { + "epoch": 0.3, + "grad_norm": 9.68825258644562, + "learning_rate": 8.234351441876002e-06, + "loss": 0.9235, + "step": 3391 + }, + { + "epoch": 0.3, + "grad_norm": 9.446000607786141, + "learning_rate": 8.233266659319287e-06, + "loss": 0.9457, + "step": 3392 + }, + { + "epoch": 0.3, + "grad_norm": 14.426436794585618, + "learning_rate": 8.232181615132011e-06, + "loss": 0.7646, + "step": 3393 + }, + { + "epoch": 0.3, + "grad_norm": 2.1637182124367795, + "learning_rate": 8.231096309401978e-06, + "loss": 0.4623, + "step": 3394 + }, + { + "epoch": 0.3, + "grad_norm": 8.101166210260363, + "learning_rate": 8.230010742217006e-06, + "loss": 0.6803, + "step": 3395 + }, + { + "epoch": 0.3, + "grad_norm": 13.636239688268152, + "learning_rate": 8.228924913664935e-06, + "loss": 
0.9049, + "step": 3396 + }, + { + "epoch": 0.3, + "grad_norm": 12.160109889950647, + "learning_rate": 8.227838823833633e-06, + "loss": 0.7417, + "step": 3397 + }, + { + "epoch": 0.3, + "grad_norm": 6.886136338682345, + "learning_rate": 8.226752472810982e-06, + "loss": 0.7379, + "step": 3398 + }, + { + "epoch": 0.3, + "grad_norm": 8.934500732441194, + "learning_rate": 8.225665860684888e-06, + "loss": 0.6448, + "step": 3399 + }, + { + "epoch": 0.3, + "grad_norm": 27.76897069700421, + "learning_rate": 8.22457898754328e-06, + "loss": 0.8855, + "step": 3400 + }, + { + "epoch": 0.3, + "grad_norm": 9.26558684256552, + "learning_rate": 8.223491853474101e-06, + "loss": 0.8142, + "step": 3401 + }, + { + "epoch": 0.3, + "grad_norm": 10.829678447240259, + "learning_rate": 8.222404458565327e-06, + "loss": 0.9835, + "step": 3402 + }, + { + "epoch": 0.3, + "grad_norm": 9.559250231855671, + "learning_rate": 8.221316802904942e-06, + "loss": 0.8752, + "step": 3403 + }, + { + "epoch": 0.3, + "grad_norm": 6.170965035130193, + "learning_rate": 8.22022888658096e-06, + "loss": 0.8899, + "step": 3404 + }, + { + "epoch": 0.3, + "grad_norm": 10.370520348804616, + "learning_rate": 8.219140709681412e-06, + "loss": 0.7956, + "step": 3405 + }, + { + "epoch": 0.3, + "grad_norm": 18.228599654319616, + "learning_rate": 8.218052272294354e-06, + "loss": 0.8627, + "step": 3406 + }, + { + "epoch": 0.3, + "grad_norm": 7.78035532547529, + "learning_rate": 8.216963574507859e-06, + "loss": 0.9701, + "step": 3407 + }, + { + "epoch": 0.3, + "grad_norm": 11.62327800373347, + "learning_rate": 8.215874616410022e-06, + "loss": 0.8323, + "step": 3408 + }, + { + "epoch": 0.3, + "grad_norm": 20.03405027363456, + "learning_rate": 8.214785398088963e-06, + "loss": 0.7733, + "step": 3409 + }, + { + "epoch": 0.3, + "grad_norm": 10.48830509211779, + "learning_rate": 8.213695919632814e-06, + "loss": 0.6545, + "step": 3410 + }, + { + "epoch": 0.3, + "grad_norm": 9.598655097791207, + "learning_rate": 8.212606181129737e-06, + "loss": 0.8282, + "step": 3411 + }, + { + "epoch": 0.3, + "grad_norm": 7.59261404549607, + "learning_rate": 8.211516182667913e-06, + "loss": 0.7658, + "step": 3412 + }, + { + "epoch": 0.3, + "grad_norm": 8.042475313456762, + "learning_rate": 8.210425924335541e-06, + "loss": 0.668, + "step": 3413 + }, + { + "epoch": 0.3, + "grad_norm": 3.3267411551912662, + "learning_rate": 8.209335406220844e-06, + "loss": 0.5639, + "step": 3414 + }, + { + "epoch": 0.3, + "grad_norm": 14.263968549506327, + "learning_rate": 8.208244628412064e-06, + "loss": 0.8121, + "step": 3415 + }, + { + "epoch": 0.3, + "grad_norm": 12.261252842684787, + "learning_rate": 8.207153590997466e-06, + "loss": 0.965, + "step": 3416 + }, + { + "epoch": 0.3, + "grad_norm": 8.685246311264084, + "learning_rate": 8.206062294065333e-06, + "loss": 0.8708, + "step": 3417 + }, + { + "epoch": 0.3, + "grad_norm": 11.113862411742755, + "learning_rate": 8.204970737703972e-06, + "loss": 0.8433, + "step": 3418 + }, + { + "epoch": 0.3, + "grad_norm": 7.944270246188547, + "learning_rate": 8.203878922001712e-06, + "loss": 0.8825, + "step": 3419 + }, + { + "epoch": 0.3, + "grad_norm": 6.623314387111109, + "learning_rate": 8.202786847046897e-06, + "loss": 0.8169, + "step": 3420 + }, + { + "epoch": 0.3, + "grad_norm": 8.175994602086192, + "learning_rate": 8.2016945129279e-06, + "loss": 0.8327, + "step": 3421 + }, + { + "epoch": 0.3, + "grad_norm": 10.582837026941249, + "learning_rate": 8.200601919733106e-06, + "loss": 0.7736, + "step": 3422 + }, + { + "epoch": 0.3, + "grad_norm": 
7.0054966019024825, + "learning_rate": 8.19950906755093e-06, + "loss": 0.7929, + "step": 3423 + }, + { + "epoch": 0.3, + "grad_norm": 8.72713772006109, + "learning_rate": 8.198415956469803e-06, + "loss": 0.7418, + "step": 3424 + }, + { + "epoch": 0.3, + "grad_norm": 6.878068158124625, + "learning_rate": 8.197322586578177e-06, + "loss": 0.72, + "step": 3425 + }, + { + "epoch": 0.3, + "grad_norm": 7.388826206720679, + "learning_rate": 8.196228957964525e-06, + "loss": 0.6985, + "step": 3426 + }, + { + "epoch": 0.3, + "grad_norm": 12.057394350706168, + "learning_rate": 8.195135070717344e-06, + "loss": 0.8526, + "step": 3427 + }, + { + "epoch": 0.3, + "grad_norm": 3.270011663845428, + "learning_rate": 8.194040924925146e-06, + "loss": 0.5574, + "step": 3428 + }, + { + "epoch": 0.3, + "grad_norm": 9.46848539320959, + "learning_rate": 8.19294652067647e-06, + "loss": 0.7276, + "step": 3429 + }, + { + "epoch": 0.3, + "grad_norm": 10.695178360834968, + "learning_rate": 8.191851858059872e-06, + "loss": 0.7197, + "step": 3430 + }, + { + "epoch": 0.3, + "grad_norm": 2.370632679584042, + "learning_rate": 8.190756937163933e-06, + "loss": 0.4494, + "step": 3431 + }, + { + "epoch": 0.3, + "grad_norm": 6.641211374641154, + "learning_rate": 8.189661758077248e-06, + "loss": 0.9042, + "step": 3432 + }, + { + "epoch": 0.3, + "grad_norm": 9.71247250821788, + "learning_rate": 8.18856632088844e-06, + "loss": 0.7972, + "step": 3433 + }, + { + "epoch": 0.3, + "grad_norm": 22.733052041058976, + "learning_rate": 8.187470625686149e-06, + "loss": 0.8757, + "step": 3434 + }, + { + "epoch": 0.3, + "grad_norm": 9.230805351816088, + "learning_rate": 8.186374672559038e-06, + "loss": 0.8293, + "step": 3435 + }, + { + "epoch": 0.3, + "grad_norm": 12.457610001817892, + "learning_rate": 8.18527846159579e-06, + "loss": 0.6858, + "step": 3436 + }, + { + "epoch": 0.3, + "grad_norm": 15.05030552561652, + "learning_rate": 8.184181992885104e-06, + "loss": 0.8793, + "step": 3437 + }, + { + "epoch": 0.3, + "grad_norm": 9.785007333759557, + "learning_rate": 8.183085266515709e-06, + "loss": 0.8857, + "step": 3438 + }, + { + "epoch": 0.3, + "grad_norm": 8.488264572013414, + "learning_rate": 8.18198828257635e-06, + "loss": 0.9027, + "step": 3439 + }, + { + "epoch": 0.3, + "grad_norm": 10.729059580123907, + "learning_rate": 8.180891041155791e-06, + "loss": 0.9711, + "step": 3440 + }, + { + "epoch": 0.3, + "grad_norm": 9.7347051336122, + "learning_rate": 8.179793542342821e-06, + "loss": 1.0683, + "step": 3441 + }, + { + "epoch": 0.3, + "grad_norm": 10.827177898739953, + "learning_rate": 8.178695786226247e-06, + "loss": 0.9033, + "step": 3442 + }, + { + "epoch": 0.3, + "grad_norm": 10.58298057150334, + "learning_rate": 8.177597772894896e-06, + "loss": 0.8053, + "step": 3443 + }, + { + "epoch": 0.3, + "grad_norm": 7.181324255000663, + "learning_rate": 8.176499502437621e-06, + "loss": 0.8099, + "step": 3444 + }, + { + "epoch": 0.3, + "grad_norm": 13.738322469522506, + "learning_rate": 8.175400974943289e-06, + "loss": 0.8482, + "step": 3445 + }, + { + "epoch": 0.3, + "grad_norm": 13.64142701820933, + "learning_rate": 8.17430219050079e-06, + "loss": 0.7369, + "step": 3446 + }, + { + "epoch": 0.3, + "grad_norm": 13.321646529110211, + "learning_rate": 8.173203149199041e-06, + "loss": 0.8978, + "step": 3447 + }, + { + "epoch": 0.3, + "grad_norm": 7.041950624674003, + "learning_rate": 8.17210385112697e-06, + "loss": 0.9095, + "step": 3448 + }, + { + "epoch": 0.3, + "grad_norm": 11.253208885096688, + "learning_rate": 8.171004296373533e-06, + "loss": 
0.8213, + "step": 3449 + }, + { + "epoch": 0.3, + "grad_norm": 2.3147282033870646, + "learning_rate": 8.169904485027702e-06, + "loss": 0.5283, + "step": 3450 + }, + { + "epoch": 0.3, + "grad_norm": 17.32823791516391, + "learning_rate": 8.168804417178473e-06, + "loss": 0.6421, + "step": 3451 + }, + { + "epoch": 0.3, + "grad_norm": 3.1807836335555026, + "learning_rate": 8.167704092914861e-06, + "loss": 0.5539, + "step": 3452 + }, + { + "epoch": 0.3, + "grad_norm": 8.71852621626734, + "learning_rate": 8.166603512325905e-06, + "loss": 0.8821, + "step": 3453 + }, + { + "epoch": 0.3, + "grad_norm": 8.274799860369715, + "learning_rate": 8.165502675500658e-06, + "loss": 0.7995, + "step": 3454 + }, + { + "epoch": 0.3, + "grad_norm": 8.597498050279892, + "learning_rate": 8.164401582528202e-06, + "loss": 0.8041, + "step": 3455 + }, + { + "epoch": 0.3, + "grad_norm": 9.889328518521536, + "learning_rate": 8.163300233497632e-06, + "loss": 0.8262, + "step": 3456 + }, + { + "epoch": 0.3, + "grad_norm": 3.038433960833323, + "learning_rate": 8.16219862849807e-06, + "loss": 0.5116, + "step": 3457 + }, + { + "epoch": 0.3, + "grad_norm": 5.454682474007254, + "learning_rate": 8.161096767618655e-06, + "loss": 0.5165, + "step": 3458 + }, + { + "epoch": 0.3, + "grad_norm": 7.393680347085186, + "learning_rate": 8.159994650948549e-06, + "loss": 0.6419, + "step": 3459 + }, + { + "epoch": 0.3, + "grad_norm": 22.674726613796444, + "learning_rate": 8.15889227857693e-06, + "loss": 0.8261, + "step": 3460 + }, + { + "epoch": 0.3, + "grad_norm": 13.939262723047241, + "learning_rate": 8.157789650593003e-06, + "loss": 0.8149, + "step": 3461 + }, + { + "epoch": 0.3, + "grad_norm": 11.711049098196598, + "learning_rate": 8.156686767085991e-06, + "loss": 0.7735, + "step": 3462 + }, + { + "epoch": 0.3, + "grad_norm": 15.334956122482037, + "learning_rate": 8.155583628145135e-06, + "loss": 0.7578, + "step": 3463 + }, + { + "epoch": 0.3, + "grad_norm": 10.603910998900233, + "learning_rate": 8.154480233859703e-06, + "loss": 0.7702, + "step": 3464 + }, + { + "epoch": 0.3, + "grad_norm": 12.941216664947916, + "learning_rate": 8.153376584318977e-06, + "loss": 0.8323, + "step": 3465 + }, + { + "epoch": 0.3, + "grad_norm": 14.97615134776413, + "learning_rate": 8.15227267961226e-06, + "loss": 0.6864, + "step": 3466 + }, + { + "epoch": 0.3, + "grad_norm": 9.789470160390707, + "learning_rate": 8.151168519828885e-06, + "loss": 0.8632, + "step": 3467 + }, + { + "epoch": 0.3, + "grad_norm": 7.40725152743787, + "learning_rate": 8.150064105058196e-06, + "loss": 0.7527, + "step": 3468 + }, + { + "epoch": 0.3, + "grad_norm": 6.813478616363257, + "learning_rate": 8.148959435389557e-06, + "loss": 0.7208, + "step": 3469 + }, + { + "epoch": 0.3, + "grad_norm": 12.973275293639091, + "learning_rate": 8.147854510912359e-06, + "loss": 0.8369, + "step": 3470 + }, + { + "epoch": 0.3, + "grad_norm": 7.028067651885325, + "learning_rate": 8.14674933171601e-06, + "loss": 0.8671, + "step": 3471 + }, + { + "epoch": 0.3, + "grad_norm": 3.901553265082286, + "learning_rate": 8.145643897889938e-06, + "loss": 0.4909, + "step": 3472 + }, + { + "epoch": 0.31, + "grad_norm": 8.829882263139123, + "learning_rate": 8.144538209523598e-06, + "loss": 0.7632, + "step": 3473 + }, + { + "epoch": 0.31, + "grad_norm": 13.481436269702602, + "learning_rate": 8.143432266706455e-06, + "loss": 0.7791, + "step": 3474 + }, + { + "epoch": 0.31, + "grad_norm": 13.539087758237628, + "learning_rate": 8.142326069528001e-06, + "loss": 0.7943, + "step": 3475 + }, + { + "epoch": 0.31, + 
"grad_norm": 10.59430670800061, + "learning_rate": 8.141219618077749e-06, + "loss": 0.7462, + "step": 3476 + }, + { + "epoch": 0.31, + "grad_norm": 9.771463357898998, + "learning_rate": 8.14011291244523e-06, + "loss": 0.7764, + "step": 3477 + }, + { + "epoch": 0.31, + "grad_norm": 3.2760219444978, + "learning_rate": 8.139005952719999e-06, + "loss": 0.5527, + "step": 3478 + }, + { + "epoch": 0.31, + "grad_norm": 7.67187763524382, + "learning_rate": 8.137898738991626e-06, + "loss": 0.7522, + "step": 3479 + }, + { + "epoch": 0.31, + "grad_norm": 9.945021937865297, + "learning_rate": 8.136791271349709e-06, + "loss": 0.933, + "step": 3480 + }, + { + "epoch": 0.31, + "grad_norm": 10.282680828542109, + "learning_rate": 8.135683549883858e-06, + "loss": 0.7942, + "step": 3481 + }, + { + "epoch": 0.31, + "grad_norm": 17.219209819793328, + "learning_rate": 8.134575574683711e-06, + "loss": 0.6865, + "step": 3482 + }, + { + "epoch": 0.31, + "grad_norm": 5.600147091735667, + "learning_rate": 8.133467345838921e-06, + "loss": 0.6407, + "step": 3483 + }, + { + "epoch": 0.31, + "grad_norm": 21.560076636751994, + "learning_rate": 8.132358863439168e-06, + "loss": 0.6079, + "step": 3484 + }, + { + "epoch": 0.31, + "grad_norm": 14.538256862055105, + "learning_rate": 8.131250127574146e-06, + "loss": 0.9363, + "step": 3485 + }, + { + "epoch": 0.31, + "grad_norm": 7.809550480382988, + "learning_rate": 8.130141138333571e-06, + "loss": 0.7256, + "step": 3486 + }, + { + "epoch": 0.31, + "grad_norm": 11.882759713082132, + "learning_rate": 8.129031895807182e-06, + "loss": 0.6649, + "step": 3487 + }, + { + "epoch": 0.31, + "grad_norm": 9.36947393365372, + "learning_rate": 8.127922400084736e-06, + "loss": 0.7841, + "step": 3488 + }, + { + "epoch": 0.31, + "grad_norm": 10.497393499915548, + "learning_rate": 8.126812651256014e-06, + "loss": 0.8194, + "step": 3489 + }, + { + "epoch": 0.31, + "grad_norm": 3.34342014925125, + "learning_rate": 8.125702649410814e-06, + "loss": 0.5542, + "step": 3490 + }, + { + "epoch": 0.31, + "grad_norm": 2.580190890873106, + "learning_rate": 8.124592394638952e-06, + "loss": 0.5241, + "step": 3491 + }, + { + "epoch": 0.31, + "grad_norm": 2.561164816343557, + "learning_rate": 8.123481887030274e-06, + "loss": 0.4896, + "step": 3492 + }, + { + "epoch": 0.31, + "grad_norm": 24.102033636421094, + "learning_rate": 8.122371126674636e-06, + "loss": 0.7879, + "step": 3493 + }, + { + "epoch": 0.31, + "grad_norm": 3.528362548068185, + "learning_rate": 8.12126011366192e-06, + "loss": 0.5904, + "step": 3494 + }, + { + "epoch": 0.31, + "grad_norm": 14.997683538228536, + "learning_rate": 8.120148848082028e-06, + "loss": 0.8456, + "step": 3495 + }, + { + "epoch": 0.31, + "grad_norm": 7.458291992103334, + "learning_rate": 8.119037330024881e-06, + "loss": 0.7554, + "step": 3496 + }, + { + "epoch": 0.31, + "grad_norm": 11.506400675331156, + "learning_rate": 8.117925559580422e-06, + "loss": 0.7289, + "step": 3497 + }, + { + "epoch": 0.31, + "grad_norm": 9.725081724103909, + "learning_rate": 8.116813536838613e-06, + "loss": 0.7592, + "step": 3498 + }, + { + "epoch": 0.31, + "grad_norm": 9.410013716246487, + "learning_rate": 8.115701261889437e-06, + "loss": 0.9204, + "step": 3499 + }, + { + "epoch": 0.31, + "grad_norm": 10.66838640515633, + "learning_rate": 8.114588734822899e-06, + "loss": 0.9464, + "step": 3500 + }, + { + "epoch": 0.31, + "grad_norm": 2.8965088527575946, + "learning_rate": 8.11347595572902e-06, + "loss": 0.5682, + "step": 3501 + }, + { + "epoch": 0.31, + "grad_norm": 11.458616533424713, + 
"learning_rate": 8.112362924697845e-06, + "loss": 0.7929, + "step": 3502 + }, + { + "epoch": 0.31, + "grad_norm": 15.975626596200378, + "learning_rate": 8.11124964181944e-06, + "loss": 0.7502, + "step": 3503 + }, + { + "epoch": 0.31, + "grad_norm": 16.882303047990973, + "learning_rate": 8.110136107183889e-06, + "loss": 0.8499, + "step": 3504 + }, + { + "epoch": 0.31, + "grad_norm": 11.759474726321791, + "learning_rate": 8.109022320881298e-06, + "loss": 0.8438, + "step": 3505 + }, + { + "epoch": 0.31, + "grad_norm": 4.467652237486764, + "learning_rate": 8.107908283001793e-06, + "loss": 0.627, + "step": 3506 + }, + { + "epoch": 0.31, + "grad_norm": 20.500424859606998, + "learning_rate": 8.106793993635518e-06, + "loss": 1.0084, + "step": 3507 + }, + { + "epoch": 0.31, + "grad_norm": 10.016805310481374, + "learning_rate": 8.105679452872642e-06, + "loss": 0.726, + "step": 3508 + }, + { + "epoch": 0.31, + "grad_norm": 6.393121293505268, + "learning_rate": 8.10456466080335e-06, + "loss": 0.7855, + "step": 3509 + }, + { + "epoch": 0.31, + "grad_norm": 10.548998167127534, + "learning_rate": 8.10344961751785e-06, + "loss": 0.8951, + "step": 3510 + }, + { + "epoch": 0.31, + "grad_norm": 12.057184200487965, + "learning_rate": 8.10233432310637e-06, + "loss": 0.7812, + "step": 3511 + }, + { + "epoch": 0.31, + "grad_norm": 8.044343735078007, + "learning_rate": 8.101218777659157e-06, + "loss": 0.979, + "step": 3512 + }, + { + "epoch": 0.31, + "grad_norm": 11.303232843178103, + "learning_rate": 8.100102981266478e-06, + "loss": 0.9039, + "step": 3513 + }, + { + "epoch": 0.31, + "grad_norm": 8.19071343661498, + "learning_rate": 8.098986934018625e-06, + "loss": 0.6685, + "step": 3514 + }, + { + "epoch": 0.31, + "grad_norm": 8.696190563098229, + "learning_rate": 8.097870636005902e-06, + "loss": 0.9369, + "step": 3515 + }, + { + "epoch": 0.31, + "grad_norm": 8.294468542538224, + "learning_rate": 8.096754087318639e-06, + "loss": 0.856, + "step": 3516 + }, + { + "epoch": 0.31, + "grad_norm": 8.216808618591186, + "learning_rate": 8.09563728804719e-06, + "loss": 0.9348, + "step": 3517 + }, + { + "epoch": 0.31, + "grad_norm": 11.882820006333537, + "learning_rate": 8.094520238281918e-06, + "loss": 0.7999, + "step": 3518 + }, + { + "epoch": 0.31, + "grad_norm": 7.162326312121886, + "learning_rate": 8.093402938113218e-06, + "loss": 0.7989, + "step": 3519 + }, + { + "epoch": 0.31, + "grad_norm": 10.331502015019202, + "learning_rate": 8.092285387631495e-06, + "loss": 0.8252, + "step": 3520 + }, + { + "epoch": 0.31, + "grad_norm": 9.397920330506684, + "learning_rate": 8.091167586927184e-06, + "loss": 0.9677, + "step": 3521 + }, + { + "epoch": 0.31, + "grad_norm": 14.575157177542893, + "learning_rate": 8.090049536090733e-06, + "loss": 0.7141, + "step": 3522 + }, + { + "epoch": 0.31, + "grad_norm": 8.414572384098275, + "learning_rate": 8.088931235212614e-06, + "loss": 0.8997, + "step": 3523 + }, + { + "epoch": 0.31, + "grad_norm": 5.8571607651869275, + "learning_rate": 8.087812684383318e-06, + "loss": 0.8271, + "step": 3524 + }, + { + "epoch": 0.31, + "grad_norm": 11.339208570321974, + "learning_rate": 8.086693883693356e-06, + "loss": 0.815, + "step": 3525 + }, + { + "epoch": 0.31, + "grad_norm": 19.35983700229807, + "learning_rate": 8.085574833233259e-06, + "loss": 0.7748, + "step": 3526 + }, + { + "epoch": 0.31, + "grad_norm": 2.978948785842116, + "learning_rate": 8.084455533093579e-06, + "loss": 0.5446, + "step": 3527 + }, + { + "epoch": 0.31, + "grad_norm": 3.3663271865654343, + "learning_rate": 8.083335983364889e-06, + 
"loss": 0.511, + "step": 3528 + }, + { + "epoch": 0.31, + "grad_norm": 13.825202618018839, + "learning_rate": 8.082216184137779e-06, + "loss": 0.8314, + "step": 3529 + }, + { + "epoch": 0.31, + "grad_norm": 10.583690479508995, + "learning_rate": 8.081096135502863e-06, + "loss": 0.8357, + "step": 3530 + }, + { + "epoch": 0.31, + "grad_norm": 8.684085918650974, + "learning_rate": 8.079975837550773e-06, + "loss": 0.9701, + "step": 3531 + }, + { + "epoch": 0.31, + "grad_norm": 18.624740458859133, + "learning_rate": 8.078855290372161e-06, + "loss": 0.7401, + "step": 3532 + }, + { + "epoch": 0.31, + "grad_norm": 7.527296117455529, + "learning_rate": 8.077734494057698e-06, + "loss": 0.8898, + "step": 3533 + }, + { + "epoch": 0.31, + "grad_norm": 10.097192221676748, + "learning_rate": 8.076613448698083e-06, + "loss": 0.8391, + "step": 3534 + }, + { + "epoch": 0.31, + "grad_norm": 8.305158897182364, + "learning_rate": 8.075492154384024e-06, + "loss": 0.6854, + "step": 3535 + }, + { + "epoch": 0.31, + "grad_norm": 18.169875630267875, + "learning_rate": 8.074370611206255e-06, + "loss": 0.8609, + "step": 3536 + }, + { + "epoch": 0.31, + "grad_norm": 6.946512932763637, + "learning_rate": 8.07324881925553e-06, + "loss": 0.8734, + "step": 3537 + }, + { + "epoch": 0.31, + "grad_norm": 9.006783145242107, + "learning_rate": 8.072126778622624e-06, + "loss": 0.8227, + "step": 3538 + }, + { + "epoch": 0.31, + "grad_norm": 24.376132525057027, + "learning_rate": 8.071004489398329e-06, + "loss": 0.8532, + "step": 3539 + }, + { + "epoch": 0.31, + "grad_norm": 9.524924685419599, + "learning_rate": 8.06988195167346e-06, + "loss": 0.8548, + "step": 3540 + }, + { + "epoch": 0.31, + "grad_norm": 15.91270883496897, + "learning_rate": 8.068759165538848e-06, + "loss": 0.7629, + "step": 3541 + }, + { + "epoch": 0.31, + "grad_norm": 3.0282845672085137, + "learning_rate": 8.06763613108535e-06, + "loss": 0.5628, + "step": 3542 + }, + { + "epoch": 0.31, + "grad_norm": 17.73428584473079, + "learning_rate": 8.066512848403837e-06, + "loss": 0.8148, + "step": 3543 + }, + { + "epoch": 0.31, + "grad_norm": 13.477130042170504, + "learning_rate": 8.065389317585207e-06, + "loss": 0.8772, + "step": 3544 + }, + { + "epoch": 0.31, + "grad_norm": 13.217855720603028, + "learning_rate": 8.064265538720374e-06, + "loss": 0.766, + "step": 3545 + }, + { + "epoch": 0.31, + "grad_norm": 9.668199618393578, + "learning_rate": 8.063141511900268e-06, + "loss": 0.8221, + "step": 3546 + }, + { + "epoch": 0.31, + "grad_norm": 10.262233094621193, + "learning_rate": 8.062017237215848e-06, + "loss": 0.771, + "step": 3547 + }, + { + "epoch": 0.31, + "grad_norm": 7.72647538902935, + "learning_rate": 8.060892714758086e-06, + "loss": 0.7822, + "step": 3548 + }, + { + "epoch": 0.31, + "grad_norm": 9.901847353755384, + "learning_rate": 8.059767944617977e-06, + "loss": 0.947, + "step": 3549 + }, + { + "epoch": 0.31, + "grad_norm": 9.946923890064284, + "learning_rate": 8.058642926886535e-06, + "loss": 0.9353, + "step": 3550 + }, + { + "epoch": 0.31, + "grad_norm": 10.421067305220545, + "learning_rate": 8.057517661654796e-06, + "loss": 0.8149, + "step": 3551 + }, + { + "epoch": 0.31, + "grad_norm": 8.669888968686728, + "learning_rate": 8.056392149013813e-06, + "loss": 0.7295, + "step": 3552 + }, + { + "epoch": 0.31, + "grad_norm": 8.432195883854362, + "learning_rate": 8.055266389054661e-06, + "loss": 0.8299, + "step": 3553 + }, + { + "epoch": 0.31, + "grad_norm": 11.987487779659672, + "learning_rate": 8.054140381868435e-06, + "loss": 0.9024, + "step": 3554 + }, + { 
+ "epoch": 0.31, + "grad_norm": 9.960589744797533, + "learning_rate": 8.053014127546252e-06, + "loss": 0.7532, + "step": 3555 + }, + { + "epoch": 0.31, + "grad_norm": 6.900820273894923, + "learning_rate": 8.051887626179243e-06, + "loss": 0.7352, + "step": 3556 + }, + { + "epoch": 0.31, + "grad_norm": 9.162269193934417, + "learning_rate": 8.050760877858561e-06, + "loss": 0.7213, + "step": 3557 + }, + { + "epoch": 0.31, + "grad_norm": 14.405882659978696, + "learning_rate": 8.049633882675388e-06, + "loss": 0.8473, + "step": 3558 + }, + { + "epoch": 0.31, + "grad_norm": 15.582610455762644, + "learning_rate": 8.048506640720911e-06, + "loss": 0.7965, + "step": 3559 + }, + { + "epoch": 0.31, + "grad_norm": 12.22628867441387, + "learning_rate": 8.047379152086347e-06, + "loss": 0.8286, + "step": 3560 + }, + { + "epoch": 0.31, + "grad_norm": 30.369168317642092, + "learning_rate": 8.046251416862934e-06, + "loss": 0.7764, + "step": 3561 + }, + { + "epoch": 0.31, + "grad_norm": 2.1605907417941363, + "learning_rate": 8.045123435141921e-06, + "loss": 0.5141, + "step": 3562 + }, + { + "epoch": 0.31, + "grad_norm": 25.918148805535736, + "learning_rate": 8.043995207014587e-06, + "loss": 0.8647, + "step": 3563 + }, + { + "epoch": 0.31, + "grad_norm": 16.40795765319198, + "learning_rate": 8.042866732572224e-06, + "loss": 0.7848, + "step": 3564 + }, + { + "epoch": 0.31, + "grad_norm": 12.109344995716112, + "learning_rate": 8.041738011906144e-06, + "loss": 0.9166, + "step": 3565 + }, + { + "epoch": 0.31, + "grad_norm": 6.978673409980145, + "learning_rate": 8.040609045107686e-06, + "loss": 0.7567, + "step": 3566 + }, + { + "epoch": 0.31, + "grad_norm": 10.566607701260953, + "learning_rate": 8.039479832268202e-06, + "loss": 0.8923, + "step": 3567 + }, + { + "epoch": 0.31, + "grad_norm": 9.947496166080176, + "learning_rate": 8.038350373479065e-06, + "loss": 1.0222, + "step": 3568 + }, + { + "epoch": 0.31, + "grad_norm": 14.011598476510702, + "learning_rate": 8.037220668831669e-06, + "loss": 0.7432, + "step": 3569 + }, + { + "epoch": 0.31, + "grad_norm": 11.293160011465472, + "learning_rate": 8.03609071841743e-06, + "loss": 0.9367, + "step": 3570 + }, + { + "epoch": 0.31, + "grad_norm": 9.634661993560542, + "learning_rate": 8.034960522327779e-06, + "loss": 0.8308, + "step": 3571 + }, + { + "epoch": 0.31, + "grad_norm": 11.56826081392934, + "learning_rate": 8.033830080654172e-06, + "loss": 0.8737, + "step": 3572 + }, + { + "epoch": 0.31, + "grad_norm": 13.122973931992153, + "learning_rate": 8.03269939348808e-06, + "loss": 0.8197, + "step": 3573 + }, + { + "epoch": 0.31, + "grad_norm": 10.419109388971412, + "learning_rate": 8.031568460921002e-06, + "loss": 0.6817, + "step": 3574 + }, + { + "epoch": 0.31, + "grad_norm": 12.126026770486915, + "learning_rate": 8.030437283044443e-06, + "loss": 0.7691, + "step": 3575 + }, + { + "epoch": 0.31, + "grad_norm": 7.891476103367027, + "learning_rate": 8.02930585994994e-06, + "loss": 0.9192, + "step": 3576 + }, + { + "epoch": 0.31, + "grad_norm": 9.120830001081563, + "learning_rate": 8.028174191729048e-06, + "loss": 0.8083, + "step": 3577 + }, + { + "epoch": 0.31, + "grad_norm": 2.392546279354245, + "learning_rate": 8.027042278473336e-06, + "loss": 0.5615, + "step": 3578 + }, + { + "epoch": 0.31, + "grad_norm": 13.947770911445627, + "learning_rate": 8.025910120274399e-06, + "loss": 0.8791, + "step": 3579 + }, + { + "epoch": 0.31, + "grad_norm": 9.433333563665109, + "learning_rate": 8.024777717223846e-06, + "loss": 0.7476, + "step": 3580 + }, + { + "epoch": 0.31, + "grad_norm": 
11.545110717990864, + "learning_rate": 8.023645069413314e-06, + "loss": 0.7931, + "step": 3581 + }, + { + "epoch": 0.31, + "grad_norm": 14.039757242329337, + "learning_rate": 8.022512176934452e-06, + "loss": 0.8433, + "step": 3582 + }, + { + "epoch": 0.31, + "grad_norm": 8.603150883050601, + "learning_rate": 8.021379039878931e-06, + "loss": 0.8792, + "step": 3583 + }, + { + "epoch": 0.31, + "grad_norm": 8.425157785224394, + "learning_rate": 8.020245658338447e-06, + "loss": 0.8151, + "step": 3584 + }, + { + "epoch": 0.31, + "grad_norm": 11.17888107834352, + "learning_rate": 8.019112032404706e-06, + "loss": 0.7459, + "step": 3585 + }, + { + "epoch": 0.31, + "grad_norm": 7.202293670941295, + "learning_rate": 8.01797816216944e-06, + "loss": 0.857, + "step": 3586 + }, + { + "epoch": 0.32, + "grad_norm": 8.319043374130368, + "learning_rate": 8.016844047724404e-06, + "loss": 0.8308, + "step": 3587 + }, + { + "epoch": 0.32, + "grad_norm": 12.585580255974333, + "learning_rate": 8.015709689161364e-06, + "loss": 0.7643, + "step": 3588 + }, + { + "epoch": 0.32, + "grad_norm": 26.07475707157845, + "learning_rate": 8.014575086572113e-06, + "loss": 0.7935, + "step": 3589 + }, + { + "epoch": 0.32, + "grad_norm": 16.38137754380505, + "learning_rate": 8.013440240048461e-06, + "loss": 0.6556, + "step": 3590 + }, + { + "epoch": 0.32, + "grad_norm": 8.859210638582837, + "learning_rate": 8.012305149682238e-06, + "loss": 0.6863, + "step": 3591 + }, + { + "epoch": 0.32, + "grad_norm": 8.551193942105483, + "learning_rate": 8.011169815565291e-06, + "loss": 0.8001, + "step": 3592 + }, + { + "epoch": 0.32, + "grad_norm": 9.150306647700686, + "learning_rate": 8.010034237789493e-06, + "loss": 0.7769, + "step": 3593 + }, + { + "epoch": 0.32, + "grad_norm": 6.278025260574702, + "learning_rate": 8.00889841644673e-06, + "loss": 0.6772, + "step": 3594 + }, + { + "epoch": 0.32, + "grad_norm": 5.041111604593404, + "learning_rate": 8.007762351628914e-06, + "loss": 0.699, + "step": 3595 + }, + { + "epoch": 0.32, + "grad_norm": 10.263036287807974, + "learning_rate": 8.00662604342797e-06, + "loss": 0.9115, + "step": 3596 + }, + { + "epoch": 0.32, + "grad_norm": 7.868717815940381, + "learning_rate": 8.005489491935848e-06, + "loss": 0.8063, + "step": 3597 + }, + { + "epoch": 0.32, + "grad_norm": 5.11802292598749, + "learning_rate": 8.004352697244516e-06, + "loss": 0.7995, + "step": 3598 + }, + { + "epoch": 0.32, + "grad_norm": 21.891509028770667, + "learning_rate": 8.003215659445962e-06, + "loss": 0.8141, + "step": 3599 + }, + { + "epoch": 0.32, + "grad_norm": 17.071786371185546, + "learning_rate": 8.002078378632192e-06, + "loss": 0.7219, + "step": 3600 + }, + { + "epoch": 0.32, + "grad_norm": 3.5103950908390353, + "learning_rate": 8.000940854895233e-06, + "loss": 0.5734, + "step": 3601 + }, + { + "epoch": 0.32, + "grad_norm": 10.217283377993617, + "learning_rate": 7.999803088327131e-06, + "loss": 0.8318, + "step": 3602 + }, + { + "epoch": 0.32, + "grad_norm": 6.785921818798741, + "learning_rate": 7.998665079019954e-06, + "loss": 0.8107, + "step": 3603 + }, + { + "epoch": 0.32, + "grad_norm": 14.276992184097628, + "learning_rate": 7.997526827065787e-06, + "loss": 0.6735, + "step": 3604 + }, + { + "epoch": 0.32, + "grad_norm": 20.165613138986593, + "learning_rate": 7.996388332556735e-06, + "loss": 0.8938, + "step": 3605 + }, + { + "epoch": 0.32, + "grad_norm": 11.351051921239375, + "learning_rate": 7.995249595584923e-06, + "loss": 0.7489, + "step": 3606 + }, + { + "epoch": 0.32, + "grad_norm": 5.268420380481079, + "learning_rate": 
7.994110616242496e-06, + "loss": 0.7569, + "step": 3607 + }, + { + "epoch": 0.32, + "grad_norm": 12.349123047780282, + "learning_rate": 7.99297139462162e-06, + "loss": 0.7305, + "step": 3608 + }, + { + "epoch": 0.32, + "grad_norm": 3.145456899614564, + "learning_rate": 7.991831930814475e-06, + "loss": 0.626, + "step": 3609 + }, + { + "epoch": 0.32, + "grad_norm": 10.5957149712525, + "learning_rate": 7.99069222491327e-06, + "loss": 0.6623, + "step": 3610 + }, + { + "epoch": 0.32, + "grad_norm": 10.453589120509564, + "learning_rate": 7.989552277010222e-06, + "loss": 0.7742, + "step": 3611 + }, + { + "epoch": 0.32, + "grad_norm": 8.166701467448426, + "learning_rate": 7.988412087197577e-06, + "loss": 0.7729, + "step": 3612 + }, + { + "epoch": 0.32, + "grad_norm": 47.30516680788544, + "learning_rate": 7.987271655567598e-06, + "loss": 0.7196, + "step": 3613 + }, + { + "epoch": 0.32, + "grad_norm": 2.523667373386807, + "learning_rate": 7.986130982212565e-06, + "loss": 0.5837, + "step": 3614 + }, + { + "epoch": 0.32, + "grad_norm": 8.568890669779309, + "learning_rate": 7.98499006722478e-06, + "loss": 0.7561, + "step": 3615 + }, + { + "epoch": 0.32, + "grad_norm": 2.909717341590154, + "learning_rate": 7.983848910696565e-06, + "loss": 0.5607, + "step": 3616 + }, + { + "epoch": 0.32, + "grad_norm": 14.771787516942991, + "learning_rate": 7.982707512720257e-06, + "loss": 0.8817, + "step": 3617 + }, + { + "epoch": 0.32, + "grad_norm": 28.099288818642187, + "learning_rate": 7.981565873388222e-06, + "loss": 0.8971, + "step": 3618 + }, + { + "epoch": 0.32, + "grad_norm": 8.77401078159559, + "learning_rate": 7.980423992792833e-06, + "loss": 0.8453, + "step": 3619 + }, + { + "epoch": 0.32, + "grad_norm": 6.778810539626058, + "learning_rate": 7.979281871026493e-06, + "loss": 0.8358, + "step": 3620 + }, + { + "epoch": 0.32, + "grad_norm": 11.640069568389139, + "learning_rate": 7.978139508181622e-06, + "loss": 0.7739, + "step": 3621 + }, + { + "epoch": 0.32, + "grad_norm": 9.641261929280379, + "learning_rate": 7.976996904350653e-06, + "loss": 0.8202, + "step": 3622 + }, + { + "epoch": 0.32, + "grad_norm": 10.238981592300773, + "learning_rate": 7.97585405962605e-06, + "loss": 0.6589, + "step": 3623 + }, + { + "epoch": 0.32, + "grad_norm": 6.833980168352535, + "learning_rate": 7.974710974100284e-06, + "loss": 0.7317, + "step": 3624 + }, + { + "epoch": 0.32, + "grad_norm": 11.000804407464019, + "learning_rate": 7.973567647865854e-06, + "loss": 0.906, + "step": 3625 + }, + { + "epoch": 0.32, + "grad_norm": 10.080154861296695, + "learning_rate": 7.972424081015275e-06, + "loss": 0.7542, + "step": 3626 + }, + { + "epoch": 0.32, + "grad_norm": 13.292378757151148, + "learning_rate": 7.971280273641086e-06, + "loss": 0.7674, + "step": 3627 + }, + { + "epoch": 0.32, + "grad_norm": 7.544093097542714, + "learning_rate": 7.97013622583584e-06, + "loss": 0.8018, + "step": 3628 + }, + { + "epoch": 0.32, + "grad_norm": 15.153347465534877, + "learning_rate": 7.96899193769211e-06, + "loss": 0.7939, + "step": 3629 + }, + { + "epoch": 0.32, + "grad_norm": 11.010010171478427, + "learning_rate": 7.967847409302492e-06, + "loss": 0.8516, + "step": 3630 + }, + { + "epoch": 0.32, + "grad_norm": 11.352558921983668, + "learning_rate": 7.966702640759598e-06, + "loss": 0.8273, + "step": 3631 + }, + { + "epoch": 0.32, + "grad_norm": 17.62021688414949, + "learning_rate": 7.96555763215606e-06, + "loss": 0.8941, + "step": 3632 + }, + { + "epoch": 0.32, + "grad_norm": 11.090081742476748, + "learning_rate": 7.964412383584532e-06, + "loss": 1.0046, + 
"step": 3633 + }, + { + "epoch": 0.32, + "grad_norm": 10.631155026271257, + "learning_rate": 7.963266895137686e-06, + "loss": 0.6766, + "step": 3634 + }, + { + "epoch": 0.32, + "grad_norm": 12.015959514638999, + "learning_rate": 7.96212116690821e-06, + "loss": 0.9345, + "step": 3635 + }, + { + "epoch": 0.32, + "grad_norm": 2.605941301642703, + "learning_rate": 7.96097519898882e-06, + "loss": 0.6144, + "step": 3636 + }, + { + "epoch": 0.32, + "grad_norm": 3.1318117699116645, + "learning_rate": 7.959828991472239e-06, + "loss": 0.5234, + "step": 3637 + }, + { + "epoch": 0.32, + "grad_norm": 3.3628067020491836, + "learning_rate": 7.95868254445122e-06, + "loss": 0.546, + "step": 3638 + }, + { + "epoch": 0.32, + "grad_norm": 7.791102460414167, + "learning_rate": 7.95753585801853e-06, + "loss": 0.7386, + "step": 3639 + }, + { + "epoch": 0.32, + "grad_norm": 8.271061327236358, + "learning_rate": 7.95638893226696e-06, + "loss": 0.8078, + "step": 3640 + }, + { + "epoch": 0.32, + "grad_norm": 16.713335324143596, + "learning_rate": 7.955241767289315e-06, + "loss": 0.9332, + "step": 3641 + }, + { + "epoch": 0.32, + "grad_norm": 8.796830862885953, + "learning_rate": 7.954094363178421e-06, + "loss": 0.8151, + "step": 3642 + }, + { + "epoch": 0.32, + "grad_norm": 8.64400998379649, + "learning_rate": 7.952946720027125e-06, + "loss": 0.6543, + "step": 3643 + }, + { + "epoch": 0.32, + "grad_norm": 3.9017741604192944, + "learning_rate": 7.951798837928294e-06, + "loss": 0.4703, + "step": 3644 + }, + { + "epoch": 0.32, + "grad_norm": 7.931559078764929, + "learning_rate": 7.95065071697481e-06, + "loss": 0.8415, + "step": 3645 + }, + { + "epoch": 0.32, + "grad_norm": 10.580820457795578, + "learning_rate": 7.949502357259577e-06, + "loss": 0.7356, + "step": 3646 + }, + { + "epoch": 0.32, + "grad_norm": 13.185814309088379, + "learning_rate": 7.948353758875521e-06, + "loss": 0.8076, + "step": 3647 + }, + { + "epoch": 0.32, + "grad_norm": 12.840755952242906, + "learning_rate": 7.947204921915583e-06, + "loss": 0.8573, + "step": 3648 + }, + { + "epoch": 0.32, + "grad_norm": 15.879009944725853, + "learning_rate": 7.946055846472727e-06, + "loss": 0.745, + "step": 3649 + }, + { + "epoch": 0.32, + "grad_norm": 7.2736997462558, + "learning_rate": 7.944906532639929e-06, + "loss": 0.7933, + "step": 3650 + }, + { + "epoch": 0.32, + "grad_norm": 11.20994932980082, + "learning_rate": 7.943756980510196e-06, + "loss": 0.8477, + "step": 3651 + }, + { + "epoch": 0.32, + "grad_norm": 12.635249726289747, + "learning_rate": 7.942607190176542e-06, + "loss": 0.8251, + "step": 3652 + }, + { + "epoch": 0.32, + "grad_norm": 8.436572087837598, + "learning_rate": 7.941457161732011e-06, + "loss": 0.7697, + "step": 3653 + }, + { + "epoch": 0.32, + "grad_norm": 8.454172371301661, + "learning_rate": 7.940306895269658e-06, + "loss": 0.8637, + "step": 3654 + }, + { + "epoch": 0.32, + "grad_norm": 10.972780666521267, + "learning_rate": 7.939156390882562e-06, + "loss": 0.8754, + "step": 3655 + }, + { + "epoch": 0.32, + "grad_norm": 9.955290806453036, + "learning_rate": 7.93800564866382e-06, + "loss": 0.7803, + "step": 3656 + }, + { + "epoch": 0.32, + "grad_norm": 3.381689752223644, + "learning_rate": 7.936854668706548e-06, + "loss": 0.5658, + "step": 3657 + }, + { + "epoch": 0.32, + "grad_norm": 10.742742917339458, + "learning_rate": 7.935703451103884e-06, + "loss": 0.7853, + "step": 3658 + }, + { + "epoch": 0.32, + "grad_norm": 23.824568599409687, + "learning_rate": 7.934551995948977e-06, + "loss": 0.8655, + "step": 3659 + }, + { + "epoch": 0.32, + 
"grad_norm": 9.051359718704967, + "learning_rate": 7.933400303335003e-06, + "loss": 0.9221, + "step": 3660 + }, + { + "epoch": 0.32, + "grad_norm": 11.515563427895822, + "learning_rate": 7.93224837335516e-06, + "loss": 0.6853, + "step": 3661 + }, + { + "epoch": 0.32, + "grad_norm": 9.376671527842282, + "learning_rate": 7.931096206102654e-06, + "loss": 0.6971, + "step": 3662 + }, + { + "epoch": 0.32, + "grad_norm": 8.204281996648307, + "learning_rate": 7.929943801670717e-06, + "loss": 0.9044, + "step": 3663 + }, + { + "epoch": 0.32, + "grad_norm": 9.736965366746523, + "learning_rate": 7.928791160152603e-06, + "loss": 0.7505, + "step": 3664 + }, + { + "epoch": 0.32, + "grad_norm": 7.947237461048696, + "learning_rate": 7.927638281641581e-06, + "loss": 0.9233, + "step": 3665 + }, + { + "epoch": 0.32, + "grad_norm": 7.412776909650221, + "learning_rate": 7.926485166230938e-06, + "loss": 0.6811, + "step": 3666 + }, + { + "epoch": 0.32, + "grad_norm": 13.005446547729033, + "learning_rate": 7.925331814013982e-06, + "loss": 0.6763, + "step": 3667 + }, + { + "epoch": 0.32, + "grad_norm": 8.044503616329973, + "learning_rate": 7.924178225084043e-06, + "loss": 0.7983, + "step": 3668 + }, + { + "epoch": 0.32, + "grad_norm": 13.120717516838795, + "learning_rate": 7.923024399534466e-06, + "loss": 0.8855, + "step": 3669 + }, + { + "epoch": 0.32, + "grad_norm": 7.834383201434681, + "learning_rate": 7.921870337458616e-06, + "loss": 0.7823, + "step": 3670 + }, + { + "epoch": 0.32, + "grad_norm": 7.719957358998602, + "learning_rate": 7.92071603894988e-06, + "loss": 0.6895, + "step": 3671 + }, + { + "epoch": 0.32, + "grad_norm": 8.092649947385173, + "learning_rate": 7.91956150410166e-06, + "loss": 0.7514, + "step": 3672 + }, + { + "epoch": 0.32, + "grad_norm": 8.233104656982384, + "learning_rate": 7.91840673300738e-06, + "loss": 0.7099, + "step": 3673 + }, + { + "epoch": 0.32, + "grad_norm": 11.373999607022043, + "learning_rate": 7.917251725760483e-06, + "loss": 0.8828, + "step": 3674 + }, + { + "epoch": 0.32, + "grad_norm": 7.360503533365887, + "learning_rate": 7.916096482454425e-06, + "loss": 0.8088, + "step": 3675 + }, + { + "epoch": 0.32, + "grad_norm": 7.352347357157703, + "learning_rate": 7.914941003182694e-06, + "loss": 0.6716, + "step": 3676 + }, + { + "epoch": 0.32, + "grad_norm": 14.219396645955431, + "learning_rate": 7.913785288038783e-06, + "loss": 0.7575, + "step": 3677 + }, + { + "epoch": 0.32, + "grad_norm": 10.287132375498759, + "learning_rate": 7.912629337116215e-06, + "loss": 0.8964, + "step": 3678 + }, + { + "epoch": 0.32, + "grad_norm": 7.2717431151090555, + "learning_rate": 7.911473150508527e-06, + "loss": 0.7276, + "step": 3679 + }, + { + "epoch": 0.32, + "grad_norm": 11.883959695025338, + "learning_rate": 7.91031672830927e-06, + "loss": 0.8375, + "step": 3680 + }, + { + "epoch": 0.32, + "grad_norm": 12.668661658092395, + "learning_rate": 7.90916007061203e-06, + "loss": 0.8983, + "step": 3681 + }, + { + "epoch": 0.32, + "grad_norm": 9.017882354886124, + "learning_rate": 7.908003177510392e-06, + "loss": 0.7623, + "step": 3682 + }, + { + "epoch": 0.32, + "grad_norm": 21.716332466567902, + "learning_rate": 7.906846049097976e-06, + "loss": 0.7086, + "step": 3683 + }, + { + "epoch": 0.32, + "grad_norm": 16.670357948905927, + "learning_rate": 7.905688685468414e-06, + "loss": 0.7383, + "step": 3684 + }, + { + "epoch": 0.32, + "grad_norm": 2.3911220359060517, + "learning_rate": 7.904531086715356e-06, + "loss": 0.5188, + "step": 3685 + }, + { + "epoch": 0.32, + "grad_norm": 13.828369235919283, + 
"learning_rate": 7.903373252932474e-06, + "loss": 0.7593, + "step": 3686 + }, + { + "epoch": 0.32, + "grad_norm": 18.608449007322406, + "learning_rate": 7.902215184213457e-06, + "loss": 0.7687, + "step": 3687 + }, + { + "epoch": 0.32, + "grad_norm": 10.390548018755824, + "learning_rate": 7.901056880652016e-06, + "loss": 0.8305, + "step": 3688 + }, + { + "epoch": 0.32, + "grad_norm": 2.805933493833933, + "learning_rate": 7.899898342341876e-06, + "loss": 0.4591, + "step": 3689 + }, + { + "epoch": 0.32, + "grad_norm": 11.817753088767384, + "learning_rate": 7.898739569376788e-06, + "loss": 0.8439, + "step": 3690 + }, + { + "epoch": 0.32, + "grad_norm": 14.106683258199606, + "learning_rate": 7.897580561850516e-06, + "loss": 0.8652, + "step": 3691 + }, + { + "epoch": 0.32, + "grad_norm": 16.67881099925124, + "learning_rate": 7.896421319856843e-06, + "loss": 0.8268, + "step": 3692 + }, + { + "epoch": 0.32, + "grad_norm": 8.05596611669659, + "learning_rate": 7.895261843489575e-06, + "loss": 0.748, + "step": 3693 + }, + { + "epoch": 0.32, + "grad_norm": 3.9900321853438467, + "learning_rate": 7.894102132842535e-06, + "loss": 0.6424, + "step": 3694 + }, + { + "epoch": 0.32, + "grad_norm": 11.471269648807565, + "learning_rate": 7.892942188009566e-06, + "loss": 0.7714, + "step": 3695 + }, + { + "epoch": 0.32, + "grad_norm": 11.689589212519795, + "learning_rate": 7.891782009084525e-06, + "loss": 0.725, + "step": 3696 + }, + { + "epoch": 0.32, + "grad_norm": 13.351756710608415, + "learning_rate": 7.890621596161295e-06, + "loss": 0.7231, + "step": 3697 + }, + { + "epoch": 0.32, + "grad_norm": 3.0900292607017916, + "learning_rate": 7.889460949333775e-06, + "loss": 0.5196, + "step": 3698 + }, + { + "epoch": 0.32, + "grad_norm": 11.394139859924646, + "learning_rate": 7.88830006869588e-06, + "loss": 0.7608, + "step": 3699 + }, + { + "epoch": 0.32, + "grad_norm": 7.464656720603205, + "learning_rate": 7.887138954341547e-06, + "loss": 0.8835, + "step": 3700 + }, + { + "epoch": 0.33, + "grad_norm": 12.705465806555207, + "learning_rate": 7.885977606364735e-06, + "loss": 0.7558, + "step": 3701 + }, + { + "epoch": 0.33, + "grad_norm": 15.145763119672814, + "learning_rate": 7.884816024859414e-06, + "loss": 0.7339, + "step": 3702 + }, + { + "epoch": 0.33, + "grad_norm": 9.145782726985022, + "learning_rate": 7.883654209919579e-06, + "loss": 0.8734, + "step": 3703 + }, + { + "epoch": 0.33, + "grad_norm": 8.19955243705943, + "learning_rate": 7.88249216163924e-06, + "loss": 0.8932, + "step": 3704 + }, + { + "epoch": 0.33, + "grad_norm": 10.408792060865665, + "learning_rate": 7.881329880112435e-06, + "loss": 0.7155, + "step": 3705 + }, + { + "epoch": 0.33, + "grad_norm": 3.2354592834941465, + "learning_rate": 7.880167365433204e-06, + "loss": 0.5508, + "step": 3706 + }, + { + "epoch": 0.33, + "grad_norm": 10.203611657946787, + "learning_rate": 7.879004617695624e-06, + "loss": 0.7057, + "step": 3707 + }, + { + "epoch": 0.33, + "grad_norm": 9.01957563464443, + "learning_rate": 7.877841636993777e-06, + "loss": 0.6855, + "step": 3708 + }, + { + "epoch": 0.33, + "grad_norm": 9.35810225861942, + "learning_rate": 7.876678423421774e-06, + "loss": 0.7917, + "step": 3709 + }, + { + "epoch": 0.33, + "grad_norm": 7.98448821271601, + "learning_rate": 7.875514977073737e-06, + "loss": 0.7899, + "step": 3710 + }, + { + "epoch": 0.33, + "grad_norm": 14.48200955994999, + "learning_rate": 7.87435129804381e-06, + "loss": 0.9283, + "step": 3711 + }, + { + "epoch": 0.33, + "grad_norm": 5.963915662962667, + "learning_rate": 7.87318738642616e-06, 
+ "loss": 0.716, + "step": 3712 + }, + { + "epoch": 0.33, + "grad_norm": 7.787450251782846, + "learning_rate": 7.872023242314964e-06, + "loss": 0.7916, + "step": 3713 + }, + { + "epoch": 0.33, + "grad_norm": 10.062253606013275, + "learning_rate": 7.870858865804425e-06, + "loss": 0.8456, + "step": 3714 + }, + { + "epoch": 0.33, + "grad_norm": 11.47092777594, + "learning_rate": 7.869694256988762e-06, + "loss": 0.8296, + "step": 3715 + }, + { + "epoch": 0.33, + "grad_norm": 7.988427947484089, + "learning_rate": 7.868529415962212e-06, + "loss": 0.8459, + "step": 3716 + }, + { + "epoch": 0.33, + "grad_norm": 6.349093316720154, + "learning_rate": 7.867364342819035e-06, + "loss": 0.8483, + "step": 3717 + }, + { + "epoch": 0.33, + "grad_norm": 3.4758181703480417, + "learning_rate": 7.866199037653505e-06, + "loss": 0.5196, + "step": 3718 + }, + { + "epoch": 0.33, + "grad_norm": 9.15829895825719, + "learning_rate": 7.865033500559916e-06, + "loss": 0.8697, + "step": 3719 + }, + { + "epoch": 0.33, + "grad_norm": 13.988207452645087, + "learning_rate": 7.863867731632581e-06, + "loss": 0.7745, + "step": 3720 + }, + { + "epoch": 0.33, + "grad_norm": 10.849913400534774, + "learning_rate": 7.862701730965833e-06, + "loss": 0.768, + "step": 3721 + }, + { + "epoch": 0.33, + "grad_norm": 9.943101377508913, + "learning_rate": 7.861535498654025e-06, + "loss": 0.8131, + "step": 3722 + }, + { + "epoch": 0.33, + "grad_norm": 19.44283986082784, + "learning_rate": 7.860369034791523e-06, + "loss": 0.8543, + "step": 3723 + }, + { + "epoch": 0.33, + "grad_norm": 10.946122466349962, + "learning_rate": 7.859202339472717e-06, + "loss": 0.8151, + "step": 3724 + }, + { + "epoch": 0.33, + "grad_norm": 26.721257617153306, + "learning_rate": 7.858035412792014e-06, + "loss": 0.7713, + "step": 3725 + }, + { + "epoch": 0.33, + "grad_norm": 9.2470620018948, + "learning_rate": 7.856868254843837e-06, + "loss": 0.9961, + "step": 3726 + }, + { + "epoch": 0.33, + "grad_norm": 11.029534647547706, + "learning_rate": 7.855700865722636e-06, + "loss": 0.6221, + "step": 3727 + }, + { + "epoch": 0.33, + "grad_norm": 15.86413269930123, + "learning_rate": 7.85453324552287e-06, + "loss": 0.9129, + "step": 3728 + }, + { + "epoch": 0.33, + "grad_norm": 12.149566945143729, + "learning_rate": 7.85336539433902e-06, + "loss": 1.2215, + "step": 3729 + }, + { + "epoch": 0.33, + "grad_norm": 10.645150495007519, + "learning_rate": 7.852197312265592e-06, + "loss": 0.8976, + "step": 3730 + }, + { + "epoch": 0.33, + "grad_norm": 14.853843253887309, + "learning_rate": 7.851028999397101e-06, + "loss": 0.9104, + "step": 3731 + }, + { + "epoch": 0.33, + "grad_norm": 10.099298597373357, + "learning_rate": 7.849860455828085e-06, + "loss": 0.9464, + "step": 3732 + }, + { + "epoch": 0.33, + "grad_norm": 10.272811272294383, + "learning_rate": 7.848691681653102e-06, + "loss": 0.6773, + "step": 3733 + }, + { + "epoch": 0.33, + "grad_norm": 9.161583598473557, + "learning_rate": 7.847522676966725e-06, + "loss": 0.9513, + "step": 3734 + }, + { + "epoch": 0.33, + "grad_norm": 20.373801988673293, + "learning_rate": 7.846353441863553e-06, + "loss": 0.6859, + "step": 3735 + }, + { + "epoch": 0.33, + "grad_norm": 8.991552867386764, + "learning_rate": 7.845183976438193e-06, + "loss": 0.8857, + "step": 3736 + }, + { + "epoch": 0.33, + "grad_norm": 7.923217120288348, + "learning_rate": 7.844014280785277e-06, + "loss": 0.863, + "step": 3737 + }, + { + "epoch": 0.33, + "grad_norm": 9.51182298942481, + "learning_rate": 7.84284435499946e-06, + "loss": 0.8189, + "step": 3738 + }, + { + 
"epoch": 0.33, + "grad_norm": 8.743222346750926, + "learning_rate": 7.841674199175404e-06, + "loss": 0.7488, + "step": 3739 + }, + { + "epoch": 0.33, + "grad_norm": 10.761953010531833, + "learning_rate": 7.840503813407798e-06, + "loss": 0.7671, + "step": 3740 + }, + { + "epoch": 0.33, + "grad_norm": 10.835178863711818, + "learning_rate": 7.83933319779135e-06, + "loss": 0.7713, + "step": 3741 + }, + { + "epoch": 0.33, + "grad_norm": 8.45589224220471, + "learning_rate": 7.838162352420782e-06, + "loss": 0.9009, + "step": 3742 + }, + { + "epoch": 0.33, + "grad_norm": 11.385869566090577, + "learning_rate": 7.836991277390837e-06, + "loss": 0.8983, + "step": 3743 + }, + { + "epoch": 0.33, + "grad_norm": 9.004489278202582, + "learning_rate": 7.835819972796277e-06, + "loss": 0.6505, + "step": 3744 + }, + { + "epoch": 0.33, + "grad_norm": 8.09609278369687, + "learning_rate": 7.834648438731881e-06, + "loss": 0.7615, + "step": 3745 + }, + { + "epoch": 0.33, + "grad_norm": 9.11545467725209, + "learning_rate": 7.833476675292451e-06, + "loss": 0.7901, + "step": 3746 + }, + { + "epoch": 0.33, + "grad_norm": 7.884555329249683, + "learning_rate": 7.8323046825728e-06, + "loss": 0.9012, + "step": 3747 + }, + { + "epoch": 0.33, + "grad_norm": 9.017069242684371, + "learning_rate": 7.831132460667767e-06, + "loss": 0.7893, + "step": 3748 + }, + { + "epoch": 0.33, + "grad_norm": 16.337687676454355, + "learning_rate": 7.829960009672203e-06, + "loss": 0.9471, + "step": 3749 + }, + { + "epoch": 0.33, + "grad_norm": 67.13410025416484, + "learning_rate": 7.828787329680983e-06, + "loss": 0.7878, + "step": 3750 + }, + { + "epoch": 0.33, + "grad_norm": 6.527762505409227, + "learning_rate": 7.827614420788998e-06, + "loss": 0.8699, + "step": 3751 + }, + { + "epoch": 0.33, + "grad_norm": 10.93736996569136, + "learning_rate": 7.826441283091158e-06, + "loss": 0.8319, + "step": 3752 + }, + { + "epoch": 0.33, + "grad_norm": 3.748774771478255, + "learning_rate": 7.82526791668239e-06, + "loss": 0.6258, + "step": 3753 + }, + { + "epoch": 0.33, + "grad_norm": 8.486179512756873, + "learning_rate": 7.824094321657642e-06, + "loss": 0.7853, + "step": 3754 + }, + { + "epoch": 0.33, + "grad_norm": 10.011338853408747, + "learning_rate": 7.82292049811188e-06, + "loss": 0.7216, + "step": 3755 + }, + { + "epoch": 0.33, + "grad_norm": 16.41973217705509, + "learning_rate": 7.821746446140084e-06, + "loss": 0.6934, + "step": 3756 + }, + { + "epoch": 0.33, + "grad_norm": 7.910767522549293, + "learning_rate": 7.820572165837263e-06, + "loss": 0.7673, + "step": 3757 + }, + { + "epoch": 0.33, + "grad_norm": 10.776686172912234, + "learning_rate": 7.819397657298431e-06, + "loss": 0.7915, + "step": 3758 + }, + { + "epoch": 0.33, + "grad_norm": 34.900283043377755, + "learning_rate": 7.818222920618632e-06, + "loss": 0.7673, + "step": 3759 + }, + { + "epoch": 0.33, + "grad_norm": 6.485427084743866, + "learning_rate": 7.817047955892922e-06, + "loss": 0.6871, + "step": 3760 + }, + { + "epoch": 0.33, + "grad_norm": 6.054567270542756, + "learning_rate": 7.815872763216378e-06, + "loss": 0.6544, + "step": 3761 + }, + { + "epoch": 0.33, + "grad_norm": 11.805755912731684, + "learning_rate": 7.814697342684094e-06, + "loss": 0.8554, + "step": 3762 + }, + { + "epoch": 0.33, + "grad_norm": 10.713829537473107, + "learning_rate": 7.813521694391183e-06, + "loss": 0.8388, + "step": 3763 + }, + { + "epoch": 0.33, + "grad_norm": 16.481594472231887, + "learning_rate": 7.812345818432774e-06, + "loss": 0.8232, + "step": 3764 + }, + { + "epoch": 0.33, + "grad_norm": 
12.516208678316223, + "learning_rate": 7.811169714904023e-06, + "loss": 0.5729, + "step": 3765 + }, + { + "epoch": 0.33, + "grad_norm": 9.135750956756404, + "learning_rate": 7.809993383900091e-06, + "loss": 0.8158, + "step": 3766 + }, + { + "epoch": 0.33, + "grad_norm": 18.43219181833982, + "learning_rate": 7.808816825516172e-06, + "loss": 0.8198, + "step": 3767 + }, + { + "epoch": 0.33, + "grad_norm": 10.152118637621038, + "learning_rate": 7.807640039847468e-06, + "loss": 0.7631, + "step": 3768 + }, + { + "epoch": 0.33, + "grad_norm": 6.106340472700748, + "learning_rate": 7.8064630269892e-06, + "loss": 0.741, + "step": 3769 + }, + { + "epoch": 0.33, + "grad_norm": 12.429067551120934, + "learning_rate": 7.805285787036611e-06, + "loss": 0.9186, + "step": 3770 + }, + { + "epoch": 0.33, + "grad_norm": 8.211282270167223, + "learning_rate": 7.804108320084966e-06, + "loss": 0.6891, + "step": 3771 + }, + { + "epoch": 0.33, + "grad_norm": 18.428704371661116, + "learning_rate": 7.802930626229539e-06, + "loss": 0.8158, + "step": 3772 + }, + { + "epoch": 0.33, + "grad_norm": 10.112669993198264, + "learning_rate": 7.801752705565627e-06, + "loss": 0.7873, + "step": 3773 + }, + { + "epoch": 0.33, + "grad_norm": 10.503098006330399, + "learning_rate": 7.800574558188548e-06, + "loss": 0.7427, + "step": 3774 + }, + { + "epoch": 0.33, + "grad_norm": 8.482164311954444, + "learning_rate": 7.799396184193633e-06, + "loss": 0.7019, + "step": 3775 + }, + { + "epoch": 0.33, + "grad_norm": 26.743522088765634, + "learning_rate": 7.798217583676236e-06, + "loss": 0.7654, + "step": 3776 + }, + { + "epoch": 0.33, + "grad_norm": 10.99363292523377, + "learning_rate": 7.797038756731726e-06, + "loss": 0.7248, + "step": 3777 + }, + { + "epoch": 0.33, + "grad_norm": 14.84100137633765, + "learning_rate": 7.795859703455493e-06, + "loss": 0.7987, + "step": 3778 + }, + { + "epoch": 0.33, + "grad_norm": 10.099716771102608, + "learning_rate": 7.794680423942945e-06, + "loss": 0.6884, + "step": 3779 + }, + { + "epoch": 0.33, + "grad_norm": 9.795816592782495, + "learning_rate": 7.793500918289503e-06, + "loss": 0.6423, + "step": 3780 + }, + { + "epoch": 0.33, + "grad_norm": 13.614101791464003, + "learning_rate": 7.792321186590615e-06, + "loss": 0.8817, + "step": 3781 + }, + { + "epoch": 0.33, + "grad_norm": 25.08533535939842, + "learning_rate": 7.791141228941741e-06, + "loss": 0.742, + "step": 3782 + }, + { + "epoch": 0.33, + "grad_norm": 2.746928182977886, + "learning_rate": 7.78996104543836e-06, + "loss": 0.5601, + "step": 3783 + }, + { + "epoch": 0.33, + "grad_norm": 11.78170281607098, + "learning_rate": 7.788780636175975e-06, + "loss": 0.8362, + "step": 3784 + }, + { + "epoch": 0.33, + "grad_norm": 2.823954138559099, + "learning_rate": 7.787600001250098e-06, + "loss": 0.5435, + "step": 3785 + }, + { + "epoch": 0.33, + "grad_norm": 8.297090918318585, + "learning_rate": 7.786419140756267e-06, + "loss": 0.8396, + "step": 3786 + }, + { + "epoch": 0.33, + "grad_norm": 3.698382666986178, + "learning_rate": 7.785238054790033e-06, + "loss": 0.5038, + "step": 3787 + }, + { + "epoch": 0.33, + "grad_norm": 14.039744720398811, + "learning_rate": 7.78405674344697e-06, + "loss": 0.7386, + "step": 3788 + }, + { + "epoch": 0.33, + "grad_norm": 16.89001620351972, + "learning_rate": 7.782875206822663e-06, + "loss": 0.8225, + "step": 3789 + }, + { + "epoch": 0.33, + "grad_norm": 16.017565587632443, + "learning_rate": 7.781693445012728e-06, + "loss": 0.8399, + "step": 3790 + }, + { + "epoch": 0.33, + "grad_norm": 14.744218520941358, + "learning_rate": 
7.780511458112783e-06, + "loss": 0.9107, + "step": 3791 + }, + { + "epoch": 0.33, + "grad_norm": 7.691326915909074, + "learning_rate": 7.77932924621848e-06, + "loss": 0.8364, + "step": 3792 + }, + { + "epoch": 0.33, + "grad_norm": 8.682348844244169, + "learning_rate": 7.778146809425472e-06, + "loss": 0.724, + "step": 3793 + }, + { + "epoch": 0.33, + "grad_norm": 11.086646739818065, + "learning_rate": 7.776964147829448e-06, + "loss": 0.7869, + "step": 3794 + }, + { + "epoch": 0.33, + "grad_norm": 2.4139320610972588, + "learning_rate": 7.775781261526106e-06, + "loss": 0.5674, + "step": 3795 + }, + { + "epoch": 0.33, + "grad_norm": 11.238712435121766, + "learning_rate": 7.77459815061116e-06, + "loss": 0.8945, + "step": 3796 + }, + { + "epoch": 0.33, + "grad_norm": 14.452073241999248, + "learning_rate": 7.773414815180347e-06, + "loss": 0.7982, + "step": 3797 + }, + { + "epoch": 0.33, + "grad_norm": 2.5093432079826394, + "learning_rate": 7.772231255329422e-06, + "loss": 0.5269, + "step": 3798 + }, + { + "epoch": 0.33, + "grad_norm": 6.9899849287343985, + "learning_rate": 7.771047471154153e-06, + "loss": 0.7766, + "step": 3799 + }, + { + "epoch": 0.33, + "grad_norm": 7.815799523011844, + "learning_rate": 7.769863462750332e-06, + "loss": 0.8394, + "step": 3800 + }, + { + "epoch": 0.33, + "grad_norm": 13.746166083107829, + "learning_rate": 7.76867923021377e-06, + "loss": 0.7606, + "step": 3801 + }, + { + "epoch": 0.33, + "grad_norm": 8.13960692806002, + "learning_rate": 7.767494773640287e-06, + "loss": 0.79, + "step": 3802 + }, + { + "epoch": 0.33, + "grad_norm": 2.9520168583935913, + "learning_rate": 7.76631009312573e-06, + "loss": 0.5179, + "step": 3803 + }, + { + "epoch": 0.33, + "grad_norm": 10.047648618720771, + "learning_rate": 7.765125188765964e-06, + "loss": 0.8045, + "step": 3804 + }, + { + "epoch": 0.33, + "grad_norm": 7.8638696203971366, + "learning_rate": 7.763940060656868e-06, + "loss": 0.7585, + "step": 3805 + }, + { + "epoch": 0.33, + "grad_norm": 12.831357055154863, + "learning_rate": 7.762754708894337e-06, + "loss": 0.9479, + "step": 3806 + }, + { + "epoch": 0.33, + "grad_norm": 3.1750029226704064, + "learning_rate": 7.761569133574291e-06, + "loss": 0.5539, + "step": 3807 + }, + { + "epoch": 0.33, + "grad_norm": 25.655173587331923, + "learning_rate": 7.760383334792667e-06, + "loss": 0.9262, + "step": 3808 + }, + { + "epoch": 0.33, + "grad_norm": 8.798008565980366, + "learning_rate": 7.759197312645413e-06, + "loss": 0.7581, + "step": 3809 + }, + { + "epoch": 0.33, + "grad_norm": 10.861566930163203, + "learning_rate": 7.758011067228503e-06, + "loss": 0.9001, + "step": 3810 + }, + { + "epoch": 0.33, + "grad_norm": 10.564995776164958, + "learning_rate": 7.756824598637924e-06, + "loss": 0.7613, + "step": 3811 + }, + { + "epoch": 0.33, + "grad_norm": 11.563695661782255, + "learning_rate": 7.755637906969687e-06, + "loss": 0.6956, + "step": 3812 + }, + { + "epoch": 0.33, + "grad_norm": 8.020404782004606, + "learning_rate": 7.754450992319811e-06, + "loss": 0.7448, + "step": 3813 + }, + { + "epoch": 0.33, + "grad_norm": 13.788172342517047, + "learning_rate": 7.753263854784344e-06, + "loss": 0.8298, + "step": 3814 + }, + { + "epoch": 0.34, + "grad_norm": 18.153330907187367, + "learning_rate": 7.752076494459346e-06, + "loss": 0.843, + "step": 3815 + }, + { + "epoch": 0.34, + "grad_norm": 12.59056660690292, + "learning_rate": 7.750888911440895e-06, + "loss": 0.8675, + "step": 3816 + }, + { + "epoch": 0.34, + "grad_norm": 2.3049057598056697, + "learning_rate": 7.749701105825089e-06, + "loss": 
0.4881, + "step": 3817 + }, + { + "epoch": 0.34, + "grad_norm": 3.537876478659659, + "learning_rate": 7.748513077708044e-06, + "loss": 0.5648, + "step": 3818 + }, + { + "epoch": 0.34, + "grad_norm": 20.039514479414038, + "learning_rate": 7.747324827185893e-06, + "loss": 0.8676, + "step": 3819 + }, + { + "epoch": 0.34, + "grad_norm": 10.574341300143985, + "learning_rate": 7.746136354354787e-06, + "loss": 1.0333, + "step": 3820 + }, + { + "epoch": 0.34, + "grad_norm": 3.161700532552664, + "learning_rate": 7.744947659310894e-06, + "loss": 0.6478, + "step": 3821 + }, + { + "epoch": 0.34, + "grad_norm": 5.627428672538101, + "learning_rate": 7.743758742150404e-06, + "loss": 0.8714, + "step": 3822 + }, + { + "epoch": 0.34, + "grad_norm": 12.356255261801692, + "learning_rate": 7.742569602969519e-06, + "loss": 0.6765, + "step": 3823 + }, + { + "epoch": 0.34, + "grad_norm": 11.454562949189635, + "learning_rate": 7.741380241864465e-06, + "loss": 0.8873, + "step": 3824 + }, + { + "epoch": 0.34, + "grad_norm": 10.218626110642608, + "learning_rate": 7.740190658931482e-06, + "loss": 0.9504, + "step": 3825 + }, + { + "epoch": 0.34, + "grad_norm": 9.695229521398941, + "learning_rate": 7.739000854266829e-06, + "loss": 0.8074, + "step": 3826 + }, + { + "epoch": 0.34, + "grad_norm": 10.50252363731724, + "learning_rate": 7.73781082796678e-06, + "loss": 0.8566, + "step": 3827 + }, + { + "epoch": 0.34, + "grad_norm": 16.603434103660934, + "learning_rate": 7.736620580127637e-06, + "loss": 0.7876, + "step": 3828 + }, + { + "epoch": 0.34, + "grad_norm": 27.471375333211558, + "learning_rate": 7.735430110845707e-06, + "loss": 0.6754, + "step": 3829 + }, + { + "epoch": 0.34, + "grad_norm": 8.029988269299906, + "learning_rate": 7.734239420217322e-06, + "loss": 0.7755, + "step": 3830 + }, + { + "epoch": 0.34, + "grad_norm": 9.145758105711828, + "learning_rate": 7.733048508338832e-06, + "loss": 0.7416, + "step": 3831 + }, + { + "epoch": 0.34, + "grad_norm": 8.697781810057705, + "learning_rate": 7.731857375306604e-06, + "loss": 0.8009, + "step": 3832 + }, + { + "epoch": 0.34, + "grad_norm": 9.432786741920589, + "learning_rate": 7.730666021217019e-06, + "loss": 0.7308, + "step": 3833 + }, + { + "epoch": 0.34, + "grad_norm": 15.282041859450352, + "learning_rate": 7.729474446166484e-06, + "loss": 0.6799, + "step": 3834 + }, + { + "epoch": 0.34, + "grad_norm": 11.863629315689261, + "learning_rate": 7.728282650251416e-06, + "loss": 0.7565, + "step": 3835 + }, + { + "epoch": 0.34, + "grad_norm": 4.867297937701518, + "learning_rate": 7.727090633568254e-06, + "loss": 0.8402, + "step": 3836 + }, + { + "epoch": 0.34, + "grad_norm": 8.09511307502535, + "learning_rate": 7.725898396213454e-06, + "loss": 0.7306, + "step": 3837 + }, + { + "epoch": 0.34, + "grad_norm": 6.696274259727556, + "learning_rate": 7.724705938283493e-06, + "loss": 0.7876, + "step": 3838 + }, + { + "epoch": 0.34, + "grad_norm": 8.575611060111264, + "learning_rate": 7.723513259874855e-06, + "loss": 0.7673, + "step": 3839 + }, + { + "epoch": 0.34, + "grad_norm": 11.784421611482676, + "learning_rate": 7.722320361084057e-06, + "loss": 0.8315, + "step": 3840 + }, + { + "epoch": 0.34, + "grad_norm": 10.723740351277286, + "learning_rate": 7.721127242007625e-06, + "loss": 0.7816, + "step": 3841 + }, + { + "epoch": 0.34, + "grad_norm": 12.132607477396268, + "learning_rate": 7.7199339027421e-06, + "loss": 0.8349, + "step": 3842 + }, + { + "epoch": 0.34, + "grad_norm": 2.983211262206762, + "learning_rate": 7.71874034338405e-06, + "loss": 0.6006, + "step": 3843 + }, + { + 
"epoch": 0.34, + "grad_norm": 9.166356240351272, + "learning_rate": 7.717546564030052e-06, + "loss": 0.7427, + "step": 3844 + }, + { + "epoch": 0.34, + "grad_norm": 8.118115145415894, + "learning_rate": 7.716352564776709e-06, + "loss": 0.8525, + "step": 3845 + }, + { + "epoch": 0.34, + "grad_norm": 19.78597678822605, + "learning_rate": 7.715158345720633e-06, + "loss": 0.6656, + "step": 3846 + }, + { + "epoch": 0.34, + "grad_norm": 8.212465006145727, + "learning_rate": 7.71396390695846e-06, + "loss": 0.7794, + "step": 3847 + }, + { + "epoch": 0.34, + "grad_norm": 10.559606116162717, + "learning_rate": 7.712769248586843e-06, + "loss": 0.8918, + "step": 3848 + }, + { + "epoch": 0.34, + "grad_norm": 11.280434461567978, + "learning_rate": 7.711574370702451e-06, + "loss": 0.7251, + "step": 3849 + }, + { + "epoch": 0.34, + "grad_norm": 29.06217911929704, + "learning_rate": 7.710379273401972e-06, + "loss": 0.8775, + "step": 3850 + }, + { + "epoch": 0.34, + "grad_norm": 9.325241629422738, + "learning_rate": 7.70918395678211e-06, + "loss": 0.8143, + "step": 3851 + }, + { + "epoch": 0.34, + "grad_norm": 8.566659548563624, + "learning_rate": 7.707988420939588e-06, + "loss": 0.8399, + "step": 3852 + }, + { + "epoch": 0.34, + "grad_norm": 3.942675770760893, + "learning_rate": 7.706792665971147e-06, + "loss": 0.5791, + "step": 3853 + }, + { + "epoch": 0.34, + "grad_norm": 10.76968215194795, + "learning_rate": 7.705596691973546e-06, + "loss": 0.8535, + "step": 3854 + }, + { + "epoch": 0.34, + "grad_norm": 8.289900425345468, + "learning_rate": 7.704400499043563e-06, + "loss": 0.8176, + "step": 3855 + }, + { + "epoch": 0.34, + "grad_norm": 7.7344904812329345, + "learning_rate": 7.703204087277989e-06, + "loss": 0.6999, + "step": 3856 + }, + { + "epoch": 0.34, + "grad_norm": 10.64070970783752, + "learning_rate": 7.702007456773638e-06, + "loss": 0.7932, + "step": 3857 + }, + { + "epoch": 0.34, + "grad_norm": 3.233375806671925, + "learning_rate": 7.700810607627336e-06, + "loss": 0.5787, + "step": 3858 + }, + { + "epoch": 0.34, + "grad_norm": 6.504679773750085, + "learning_rate": 7.699613539935933e-06, + "loss": 0.8872, + "step": 3859 + }, + { + "epoch": 0.34, + "grad_norm": 7.246845662098848, + "learning_rate": 7.698416253796293e-06, + "loss": 0.7152, + "step": 3860 + }, + { + "epoch": 0.34, + "grad_norm": 6.554285289935077, + "learning_rate": 7.6972187493053e-06, + "loss": 0.7371, + "step": 3861 + }, + { + "epoch": 0.34, + "grad_norm": 9.18090684282212, + "learning_rate": 7.69602102655985e-06, + "loss": 0.8173, + "step": 3862 + }, + { + "epoch": 0.34, + "grad_norm": 11.495193995375832, + "learning_rate": 7.694823085656863e-06, + "loss": 0.8259, + "step": 3863 + }, + { + "epoch": 0.34, + "grad_norm": 6.902569548491149, + "learning_rate": 7.693624926693276e-06, + "loss": 0.7015, + "step": 3864 + }, + { + "epoch": 0.34, + "grad_norm": 3.9919942429561437, + "learning_rate": 7.69242654976604e-06, + "loss": 0.5675, + "step": 3865 + }, + { + "epoch": 0.34, + "grad_norm": 9.062157695049944, + "learning_rate": 7.691227954972127e-06, + "loss": 0.8334, + "step": 3866 + }, + { + "epoch": 0.34, + "grad_norm": 9.390255021038824, + "learning_rate": 7.690029142408524e-06, + "loss": 0.8057, + "step": 3867 + }, + { + "epoch": 0.34, + "grad_norm": 8.498684840972745, + "learning_rate": 7.688830112172238e-06, + "loss": 0.9363, + "step": 3868 + }, + { + "epoch": 0.34, + "grad_norm": 7.536086560264097, + "learning_rate": 7.687630864360291e-06, + "loss": 0.7205, + "step": 3869 + }, + { + "epoch": 0.34, + "grad_norm": 
6.302388137275441, + "learning_rate": 7.686431399069725e-06, + "loss": 0.6593, + "step": 3870 + }, + { + "epoch": 0.34, + "grad_norm": 5.220461893471065, + "learning_rate": 7.6852317163976e-06, + "loss": 0.6102, + "step": 3871 + }, + { + "epoch": 0.34, + "grad_norm": 8.35938827490016, + "learning_rate": 7.684031816440991e-06, + "loss": 0.7935, + "step": 3872 + }, + { + "epoch": 0.34, + "grad_norm": 9.462431721148286, + "learning_rate": 7.682831699296991e-06, + "loss": 0.9209, + "step": 3873 + }, + { + "epoch": 0.34, + "grad_norm": 8.330336890984755, + "learning_rate": 7.681631365062712e-06, + "loss": 0.7932, + "step": 3874 + }, + { + "epoch": 0.34, + "grad_norm": 11.997614607468913, + "learning_rate": 7.680430813835285e-06, + "loss": 0.9933, + "step": 3875 + }, + { + "epoch": 0.34, + "grad_norm": 9.974738952908218, + "learning_rate": 7.679230045711856e-06, + "loss": 0.6815, + "step": 3876 + }, + { + "epoch": 0.34, + "grad_norm": 6.410426912841582, + "learning_rate": 7.678029060789587e-06, + "loss": 0.6802, + "step": 3877 + }, + { + "epoch": 0.34, + "grad_norm": 10.058711707036295, + "learning_rate": 7.67682785916566e-06, + "loss": 0.9044, + "step": 3878 + }, + { + "epoch": 0.34, + "grad_norm": 22.435738465900695, + "learning_rate": 7.675626440937278e-06, + "loss": 0.7698, + "step": 3879 + }, + { + "epoch": 0.34, + "grad_norm": 14.070342701842295, + "learning_rate": 7.674424806201654e-06, + "loss": 0.7707, + "step": 3880 + }, + { + "epoch": 0.34, + "grad_norm": 10.378966528097585, + "learning_rate": 7.673222955056023e-06, + "loss": 0.7646, + "step": 3881 + }, + { + "epoch": 0.34, + "grad_norm": 16.000596114970243, + "learning_rate": 7.672020887597636e-06, + "loss": 0.8816, + "step": 3882 + }, + { + "epoch": 0.34, + "grad_norm": 15.618663221209063, + "learning_rate": 7.670818603923765e-06, + "loss": 0.6544, + "step": 3883 + }, + { + "epoch": 0.34, + "grad_norm": 7.5398605786158335, + "learning_rate": 7.669616104131697e-06, + "loss": 0.8467, + "step": 3884 + }, + { + "epoch": 0.34, + "grad_norm": 8.502386406088078, + "learning_rate": 7.668413388318731e-06, + "loss": 0.9146, + "step": 3885 + }, + { + "epoch": 0.34, + "grad_norm": 2.9644977695031978, + "learning_rate": 7.667210456582194e-06, + "loss": 0.5496, + "step": 3886 + }, + { + "epoch": 0.34, + "grad_norm": 11.441908874349561, + "learning_rate": 7.666007309019423e-06, + "loss": 0.8374, + "step": 3887 + }, + { + "epoch": 0.34, + "grad_norm": 6.380931067859205, + "learning_rate": 7.664803945727774e-06, + "loss": 0.8116, + "step": 3888 + }, + { + "epoch": 0.34, + "grad_norm": 2.5290634106875984, + "learning_rate": 7.663600366804625e-06, + "loss": 0.5409, + "step": 3889 + }, + { + "epoch": 0.34, + "grad_norm": 14.740775866971806, + "learning_rate": 7.662396572347364e-06, + "loss": 0.6885, + "step": 3890 + }, + { + "epoch": 0.34, + "grad_norm": 8.164537809007696, + "learning_rate": 7.661192562453401e-06, + "loss": 0.8274, + "step": 3891 + }, + { + "epoch": 0.34, + "grad_norm": 8.077508841053284, + "learning_rate": 7.659988337220164e-06, + "loss": 0.7866, + "step": 3892 + }, + { + "epoch": 0.34, + "grad_norm": 13.614561184366318, + "learning_rate": 7.658783896745093e-06, + "loss": 0.9668, + "step": 3893 + }, + { + "epoch": 0.34, + "grad_norm": 7.550405738268819, + "learning_rate": 7.657579241125652e-06, + "loss": 0.9205, + "step": 3894 + }, + { + "epoch": 0.34, + "grad_norm": 14.815000451517248, + "learning_rate": 7.656374370459321e-06, + "loss": 0.9081, + "step": 3895 + }, + { + "epoch": 0.34, + "grad_norm": 9.829724916809372, + 
"learning_rate": 7.655169284843593e-06, + "loss": 0.8745, + "step": 3896 + }, + { + "epoch": 0.34, + "grad_norm": 7.115058081050513, + "learning_rate": 7.653963984375982e-06, + "loss": 0.7316, + "step": 3897 + }, + { + "epoch": 0.34, + "grad_norm": 25.174252078614252, + "learning_rate": 7.652758469154023e-06, + "loss": 0.7991, + "step": 3898 + }, + { + "epoch": 0.34, + "grad_norm": 11.303301261272596, + "learning_rate": 7.651552739275259e-06, + "loss": 0.843, + "step": 3899 + }, + { + "epoch": 0.34, + "grad_norm": 2.4172332436585964, + "learning_rate": 7.65034679483726e-06, + "loss": 0.5398, + "step": 3900 + }, + { + "epoch": 0.34, + "grad_norm": 15.493902038172585, + "learning_rate": 7.649140635937606e-06, + "loss": 0.7192, + "step": 3901 + }, + { + "epoch": 0.34, + "grad_norm": 11.399585975699996, + "learning_rate": 7.647934262673899e-06, + "loss": 0.7712, + "step": 3902 + }, + { + "epoch": 0.34, + "grad_norm": 9.013223014385188, + "learning_rate": 7.646727675143755e-06, + "loss": 0.8182, + "step": 3903 + }, + { + "epoch": 0.34, + "grad_norm": 10.725723084062475, + "learning_rate": 7.645520873444811e-06, + "loss": 0.8284, + "step": 3904 + }, + { + "epoch": 0.34, + "grad_norm": 11.380979434425601, + "learning_rate": 7.644313857674718e-06, + "loss": 0.781, + "step": 3905 + }, + { + "epoch": 0.34, + "grad_norm": 11.318524277066542, + "learning_rate": 7.643106627931148e-06, + "loss": 0.8242, + "step": 3906 + }, + { + "epoch": 0.34, + "grad_norm": 11.538050781699132, + "learning_rate": 7.641899184311784e-06, + "loss": 0.7078, + "step": 3907 + }, + { + "epoch": 0.34, + "grad_norm": 9.479490847641019, + "learning_rate": 7.640691526914333e-06, + "loss": 0.7113, + "step": 3908 + }, + { + "epoch": 0.34, + "grad_norm": 8.090193386619234, + "learning_rate": 7.639483655836515e-06, + "loss": 0.7907, + "step": 3909 + }, + { + "epoch": 0.34, + "grad_norm": 18.075652205292393, + "learning_rate": 7.638275571176073e-06, + "loss": 0.9253, + "step": 3910 + }, + { + "epoch": 0.34, + "grad_norm": 7.89484872762941, + "learning_rate": 7.637067273030759e-06, + "loss": 0.8514, + "step": 3911 + }, + { + "epoch": 0.34, + "grad_norm": 9.8215323567191, + "learning_rate": 7.635858761498347e-06, + "loss": 0.9327, + "step": 3912 + }, + { + "epoch": 0.34, + "grad_norm": 8.525724736522102, + "learning_rate": 7.63465003667663e-06, + "loss": 0.8617, + "step": 3913 + }, + { + "epoch": 0.34, + "grad_norm": 7.693724278852611, + "learning_rate": 7.633441098663412e-06, + "loss": 0.8732, + "step": 3914 + }, + { + "epoch": 0.34, + "grad_norm": 2.390813069119867, + "learning_rate": 7.632231947556523e-06, + "loss": 0.4659, + "step": 3915 + }, + { + "epoch": 0.34, + "grad_norm": 7.293105883519997, + "learning_rate": 7.631022583453804e-06, + "loss": 0.8816, + "step": 3916 + }, + { + "epoch": 0.34, + "grad_norm": 12.15904602753064, + "learning_rate": 7.629813006453114e-06, + "loss": 0.8451, + "step": 3917 + }, + { + "epoch": 0.34, + "grad_norm": 10.056658953283618, + "learning_rate": 7.628603216652327e-06, + "loss": 0.757, + "step": 3918 + }, + { + "epoch": 0.34, + "grad_norm": 6.439568027684935, + "learning_rate": 7.627393214149341e-06, + "loss": 0.7696, + "step": 3919 + }, + { + "epoch": 0.34, + "grad_norm": 7.809902009243983, + "learning_rate": 7.6261829990420685e-06, + "loss": 0.8248, + "step": 3920 + }, + { + "epoch": 0.34, + "grad_norm": 3.2082978286484494, + "learning_rate": 7.624972571428435e-06, + "loss": 0.5808, + "step": 3921 + }, + { + "epoch": 0.34, + "grad_norm": 9.061409143822972, + "learning_rate": 
7.6237619314063884e-06, + "loss": 0.8243, + "step": 3922 + }, + { + "epoch": 0.34, + "grad_norm": 6.476086179183928, + "learning_rate": 7.62255107907389e-06, + "loss": 0.8606, + "step": 3923 + }, + { + "epoch": 0.34, + "grad_norm": 7.142111620442133, + "learning_rate": 7.62134001452892e-06, + "loss": 0.8167, + "step": 3924 + }, + { + "epoch": 0.34, + "grad_norm": 6.355789811602934, + "learning_rate": 7.620128737869476e-06, + "loss": 0.6924, + "step": 3925 + }, + { + "epoch": 0.34, + "grad_norm": 6.290074897292337, + "learning_rate": 7.618917249193576e-06, + "loss": 0.8365, + "step": 3926 + }, + { + "epoch": 0.34, + "grad_norm": 7.873228863088276, + "learning_rate": 7.6177055485992455e-06, + "loss": 0.7782, + "step": 3927 + }, + { + "epoch": 0.34, + "grad_norm": 7.765936559877259, + "learning_rate": 7.616493636184538e-06, + "loss": 0.7093, + "step": 3928 + }, + { + "epoch": 0.35, + "grad_norm": 14.620132624539925, + "learning_rate": 7.6152815120475165e-06, + "loss": 0.909, + "step": 3929 + }, + { + "epoch": 0.35, + "grad_norm": 7.374449788528988, + "learning_rate": 7.614069176286267e-06, + "loss": 0.6129, + "step": 3930 + }, + { + "epoch": 0.35, + "grad_norm": 7.897394654046696, + "learning_rate": 7.612856628998886e-06, + "loss": 0.8413, + "step": 3931 + }, + { + "epoch": 0.35, + "grad_norm": 7.472427170674936, + "learning_rate": 7.611643870283494e-06, + "loss": 0.9498, + "step": 3932 + }, + { + "epoch": 0.35, + "grad_norm": 7.275594827951904, + "learning_rate": 7.6104309002382245e-06, + "loss": 0.7025, + "step": 3933 + }, + { + "epoch": 0.35, + "grad_norm": 6.57483781131763, + "learning_rate": 7.609217718961228e-06, + "loss": 0.7365, + "step": 3934 + }, + { + "epoch": 0.35, + "grad_norm": 2.7210343378372657, + "learning_rate": 7.608004326550675e-06, + "loss": 0.4974, + "step": 3935 + }, + { + "epoch": 0.35, + "grad_norm": 6.6393777929551, + "learning_rate": 7.606790723104749e-06, + "loss": 0.7779, + "step": 3936 + }, + { + "epoch": 0.35, + "grad_norm": 10.944532047217356, + "learning_rate": 7.6055769087216545e-06, + "loss": 0.7376, + "step": 3937 + }, + { + "epoch": 0.35, + "grad_norm": 8.027259036115481, + "learning_rate": 7.604362883499609e-06, + "loss": 0.7509, + "step": 3938 + }, + { + "epoch": 0.35, + "grad_norm": 9.866289575618971, + "learning_rate": 7.603148647536853e-06, + "loss": 0.8226, + "step": 3939 + }, + { + "epoch": 0.35, + "grad_norm": 2.922423303722492, + "learning_rate": 7.601934200931638e-06, + "loss": 0.5672, + "step": 3940 + }, + { + "epoch": 0.35, + "grad_norm": 9.25920403896589, + "learning_rate": 7.600719543782237e-06, + "loss": 0.7943, + "step": 3941 + }, + { + "epoch": 0.35, + "grad_norm": 8.521001266879116, + "learning_rate": 7.599504676186934e-06, + "loss": 0.7563, + "step": 3942 + }, + { + "epoch": 0.35, + "grad_norm": 5.135538639306115, + "learning_rate": 7.598289598244038e-06, + "loss": 0.5598, + "step": 3943 + }, + { + "epoch": 0.35, + "grad_norm": 5.868256490136458, + "learning_rate": 7.5970743100518704e-06, + "loss": 0.6581, + "step": 3944 + }, + { + "epoch": 0.35, + "grad_norm": 6.7793979190038725, + "learning_rate": 7.5958588117087685e-06, + "loss": 0.8431, + "step": 3945 + }, + { + "epoch": 0.35, + "grad_norm": 8.483415812779715, + "learning_rate": 7.594643103313091e-06, + "loss": 0.7136, + "step": 3946 + }, + { + "epoch": 0.35, + "grad_norm": 10.736301232606797, + "learning_rate": 7.593427184963208e-06, + "loss": 0.7967, + "step": 3947 + }, + { + "epoch": 0.35, + "grad_norm": 17.100777289562767, + "learning_rate": 7.59221105675751e-06, + "loss": 
0.7539, + "step": 3948 + }, + { + "epoch": 0.35, + "grad_norm": 3.198915202080439, + "learning_rate": 7.590994718794408e-06, + "loss": 0.5906, + "step": 3949 + }, + { + "epoch": 0.35, + "grad_norm": 9.576697033070243, + "learning_rate": 7.5897781711723215e-06, + "loss": 0.84, + "step": 3950 + }, + { + "epoch": 0.35, + "grad_norm": 9.200021072757801, + "learning_rate": 7.588561413989695e-06, + "loss": 0.8046, + "step": 3951 + }, + { + "epoch": 0.35, + "grad_norm": 8.30364069780401, + "learning_rate": 7.587344447344983e-06, + "loss": 0.77, + "step": 3952 + }, + { + "epoch": 0.35, + "grad_norm": 7.920114622881582, + "learning_rate": 7.586127271336663e-06, + "loss": 0.7696, + "step": 3953 + }, + { + "epoch": 0.35, + "grad_norm": 8.088015219427048, + "learning_rate": 7.584909886063226e-06, + "loss": 0.8445, + "step": 3954 + }, + { + "epoch": 0.35, + "grad_norm": 19.426855905812115, + "learning_rate": 7.583692291623179e-06, + "loss": 0.7226, + "step": 3955 + }, + { + "epoch": 0.35, + "grad_norm": 18.0078152651729, + "learning_rate": 7.582474488115052e-06, + "loss": 0.8692, + "step": 3956 + }, + { + "epoch": 0.35, + "grad_norm": 9.159129965088576, + "learning_rate": 7.581256475637384e-06, + "loss": 0.7917, + "step": 3957 + }, + { + "epoch": 0.35, + "grad_norm": 5.5671278048198145, + "learning_rate": 7.580038254288736e-06, + "loss": 0.8188, + "step": 3958 + }, + { + "epoch": 0.35, + "grad_norm": 11.506797730713727, + "learning_rate": 7.578819824167684e-06, + "loss": 0.7765, + "step": 3959 + }, + { + "epoch": 0.35, + "grad_norm": 9.00974100268924, + "learning_rate": 7.577601185372821e-06, + "loss": 0.7147, + "step": 3960 + }, + { + "epoch": 0.35, + "grad_norm": 13.451667808645315, + "learning_rate": 7.576382338002759e-06, + "loss": 0.7269, + "step": 3961 + }, + { + "epoch": 0.35, + "grad_norm": 24.51790196245228, + "learning_rate": 7.575163282156121e-06, + "loss": 0.7153, + "step": 3962 + }, + { + "epoch": 0.35, + "grad_norm": 8.053703744108978, + "learning_rate": 7.573944017931557e-06, + "loss": 0.8004, + "step": 3963 + }, + { + "epoch": 0.35, + "grad_norm": 18.087752760057075, + "learning_rate": 7.572724545427723e-06, + "loss": 0.8396, + "step": 3964 + }, + { + "epoch": 0.35, + "grad_norm": 6.785181281233911, + "learning_rate": 7.5715048647432995e-06, + "loss": 0.7804, + "step": 3965 + }, + { + "epoch": 0.35, + "grad_norm": 8.255669118755605, + "learning_rate": 7.570284975976978e-06, + "loss": 0.7098, + "step": 3966 + }, + { + "epoch": 0.35, + "grad_norm": 9.09713465254974, + "learning_rate": 7.569064879227474e-06, + "loss": 0.7579, + "step": 3967 + }, + { + "epoch": 0.35, + "grad_norm": 7.586368910254791, + "learning_rate": 7.567844574593512e-06, + "loss": 0.8578, + "step": 3968 + }, + { + "epoch": 0.35, + "grad_norm": 9.150593466745862, + "learning_rate": 7.566624062173837e-06, + "loss": 0.7133, + "step": 3969 + }, + { + "epoch": 0.35, + "grad_norm": 7.17766901643168, + "learning_rate": 7.565403342067215e-06, + "loss": 0.755, + "step": 3970 + }, + { + "epoch": 0.35, + "grad_norm": 17.43409586699016, + "learning_rate": 7.56418241437242e-06, + "loss": 0.7309, + "step": 3971 + }, + { + "epoch": 0.35, + "grad_norm": 7.445492182783555, + "learning_rate": 7.56296127918825e-06, + "loss": 0.6922, + "step": 3972 + }, + { + "epoch": 0.35, + "grad_norm": 7.6253340180304985, + "learning_rate": 7.561739936613517e-06, + "loss": 0.9378, + "step": 3973 + }, + { + "epoch": 0.35, + "grad_norm": 7.67125910320869, + "learning_rate": 7.560518386747049e-06, + "loss": 1.0094, + "step": 3974 + }, + { + "epoch": 0.35, 
+ "grad_norm": 10.50783581088593, + "learning_rate": 7.559296629687693e-06, + "loss": 0.7892, + "step": 3975 + }, + { + "epoch": 0.35, + "grad_norm": 9.184526781117313, + "learning_rate": 7.55807466553431e-06, + "loss": 0.7005, + "step": 3976 + }, + { + "epoch": 0.35, + "grad_norm": 5.262887518094319, + "learning_rate": 7.556852494385781e-06, + "loss": 0.8928, + "step": 3977 + }, + { + "epoch": 0.35, + "grad_norm": 8.705701956246324, + "learning_rate": 7.555630116341e-06, + "loss": 0.8195, + "step": 3978 + }, + { + "epoch": 0.35, + "grad_norm": 8.885931858842127, + "learning_rate": 7.554407531498882e-06, + "loss": 0.81, + "step": 3979 + }, + { + "epoch": 0.35, + "grad_norm": 6.149942826108538, + "learning_rate": 7.553184739958355e-06, + "loss": 0.6061, + "step": 3980 + }, + { + "epoch": 0.35, + "grad_norm": 2.4656338210105817, + "learning_rate": 7.551961741818367e-06, + "loss": 0.5068, + "step": 3981 + }, + { + "epoch": 0.35, + "grad_norm": 12.629861298900122, + "learning_rate": 7.550738537177878e-06, + "loss": 0.8831, + "step": 3982 + }, + { + "epoch": 0.35, + "grad_norm": 9.418057632945574, + "learning_rate": 7.549515126135871e-06, + "loss": 0.7443, + "step": 3983 + }, + { + "epoch": 0.35, + "grad_norm": 10.141071296286084, + "learning_rate": 7.548291508791341e-06, + "loss": 0.8924, + "step": 3984 + }, + { + "epoch": 0.35, + "grad_norm": 12.098536599301168, + "learning_rate": 7.5470676852433e-06, + "loss": 0.808, + "step": 3985 + }, + { + "epoch": 0.35, + "grad_norm": 7.33953112287265, + "learning_rate": 7.545843655590779e-06, + "loss": 0.7404, + "step": 3986 + }, + { + "epoch": 0.35, + "grad_norm": 7.537012557873799, + "learning_rate": 7.544619419932825e-06, + "loss": 0.8253, + "step": 3987 + }, + { + "epoch": 0.35, + "grad_norm": 9.733212441354736, + "learning_rate": 7.543394978368499e-06, + "loss": 0.9939, + "step": 3988 + }, + { + "epoch": 0.35, + "grad_norm": 8.80401423780282, + "learning_rate": 7.542170330996883e-06, + "loss": 0.7259, + "step": 3989 + }, + { + "epoch": 0.35, + "grad_norm": 9.508445422330379, + "learning_rate": 7.540945477917072e-06, + "loss": 0.8428, + "step": 3990 + }, + { + "epoch": 0.35, + "grad_norm": 3.016166768317236, + "learning_rate": 7.53972041922818e-06, + "loss": 0.6208, + "step": 3991 + }, + { + "epoch": 0.35, + "grad_norm": 8.719856336395123, + "learning_rate": 7.538495155029337e-06, + "loss": 0.8084, + "step": 3992 + }, + { + "epoch": 0.35, + "grad_norm": 9.22084799981738, + "learning_rate": 7.537269685419688e-06, + "loss": 0.7519, + "step": 3993 + }, + { + "epoch": 0.35, + "grad_norm": 8.339655601341269, + "learning_rate": 7.536044010498396e-06, + "loss": 0.7766, + "step": 3994 + }, + { + "epoch": 0.35, + "grad_norm": 2.4074896843138838, + "learning_rate": 7.534818130364641e-06, + "loss": 0.5333, + "step": 3995 + }, + { + "epoch": 0.35, + "grad_norm": 9.43519723886775, + "learning_rate": 7.533592045117621e-06, + "loss": 0.6927, + "step": 3996 + }, + { + "epoch": 0.35, + "grad_norm": 12.712431393307183, + "learning_rate": 7.532365754856545e-06, + "loss": 0.8389, + "step": 3997 + }, + { + "epoch": 0.35, + "grad_norm": 9.21890387171665, + "learning_rate": 7.531139259680645e-06, + "loss": 0.9199, + "step": 3998 + }, + { + "epoch": 0.35, + "grad_norm": 10.46800546078943, + "learning_rate": 7.529912559689166e-06, + "loss": 0.8162, + "step": 3999 + }, + { + "epoch": 0.35, + "grad_norm": 11.62688291514835, + "learning_rate": 7.5286856549813735e-06, + "loss": 0.8878, + "step": 4000 + }, + { + "epoch": 0.35, + "grad_norm": 2.7547441413516514, + 
"learning_rate": 7.527458545656542e-06, + "loss": 0.5468, + "step": 4001 + }, + { + "epoch": 0.35, + "grad_norm": 7.493537000850039, + "learning_rate": 7.526231231813969e-06, + "loss": 0.6139, + "step": 4002 + }, + { + "epoch": 0.35, + "grad_norm": 8.199519924073998, + "learning_rate": 7.525003713552968e-06, + "loss": 0.8632, + "step": 4003 + }, + { + "epoch": 0.35, + "grad_norm": 30.946987316284126, + "learning_rate": 7.523775990972866e-06, + "loss": 0.694, + "step": 4004 + }, + { + "epoch": 0.35, + "grad_norm": 7.764554217019674, + "learning_rate": 7.5225480641730084e-06, + "loss": 0.8135, + "step": 4005 + }, + { + "epoch": 0.35, + "grad_norm": 6.731911357496425, + "learning_rate": 7.5213199332527585e-06, + "loss": 0.783, + "step": 4006 + }, + { + "epoch": 0.35, + "grad_norm": 6.845464030406883, + "learning_rate": 7.520091598311494e-06, + "loss": 0.7567, + "step": 4007 + }, + { + "epoch": 0.35, + "grad_norm": 7.515047875587519, + "learning_rate": 7.518863059448608e-06, + "loss": 0.8081, + "step": 4008 + }, + { + "epoch": 0.35, + "grad_norm": 13.721295549068362, + "learning_rate": 7.517634316763513e-06, + "loss": 0.9599, + "step": 4009 + }, + { + "epoch": 0.35, + "grad_norm": 9.681597995583274, + "learning_rate": 7.51640537035564e-06, + "loss": 0.7223, + "step": 4010 + }, + { + "epoch": 0.35, + "grad_norm": 9.137534245941534, + "learning_rate": 7.515176220324426e-06, + "loss": 0.7291, + "step": 4011 + }, + { + "epoch": 0.35, + "grad_norm": 9.471175253817167, + "learning_rate": 7.5139468667693386e-06, + "loss": 0.6987, + "step": 4012 + }, + { + "epoch": 0.35, + "grad_norm": 5.978376956150296, + "learning_rate": 7.512717309789852e-06, + "loss": 0.6536, + "step": 4013 + }, + { + "epoch": 0.35, + "grad_norm": 6.6404737021015645, + "learning_rate": 7.511487549485459e-06, + "loss": 0.8632, + "step": 4014 + }, + { + "epoch": 0.35, + "grad_norm": 7.24319030868556, + "learning_rate": 7.510257585955674e-06, + "loss": 0.7839, + "step": 4015 + }, + { + "epoch": 0.35, + "grad_norm": 6.848299358651625, + "learning_rate": 7.509027419300017e-06, + "loss": 0.8327, + "step": 4016 + }, + { + "epoch": 0.35, + "grad_norm": 7.999849769952475, + "learning_rate": 7.507797049618038e-06, + "loss": 0.8679, + "step": 4017 + }, + { + "epoch": 0.35, + "grad_norm": 7.749638020986158, + "learning_rate": 7.506566477009292e-06, + "loss": 0.8218, + "step": 4018 + }, + { + "epoch": 0.35, + "grad_norm": 11.653603628394048, + "learning_rate": 7.5053357015733555e-06, + "loss": 0.6756, + "step": 4019 + }, + { + "epoch": 0.35, + "grad_norm": 8.685679423243533, + "learning_rate": 7.504104723409823e-06, + "loss": 0.8362, + "step": 4020 + }, + { + "epoch": 0.35, + "grad_norm": 7.356294134880953, + "learning_rate": 7.502873542618301e-06, + "loss": 0.7149, + "step": 4021 + }, + { + "epoch": 0.35, + "grad_norm": 7.9215404957025575, + "learning_rate": 7.501642159298414e-06, + "loss": 0.8776, + "step": 4022 + }, + { + "epoch": 0.35, + "grad_norm": 2.7287869013298094, + "learning_rate": 7.500410573549807e-06, + "loss": 0.4988, + "step": 4023 + }, + { + "epoch": 0.35, + "grad_norm": 22.71081578013992, + "learning_rate": 7.499178785472134e-06, + "loss": 0.9529, + "step": 4024 + }, + { + "epoch": 0.35, + "grad_norm": 9.031723617314322, + "learning_rate": 7.497946795165071e-06, + "loss": 0.6931, + "step": 4025 + }, + { + "epoch": 0.35, + "grad_norm": 7.227128031928849, + "learning_rate": 7.4967146027283085e-06, + "loss": 0.6431, + "step": 4026 + }, + { + "epoch": 0.35, + "grad_norm": 9.104098467884372, + "learning_rate": 
7.495482208261554e-06, + "loss": 0.7286, + "step": 4027 + }, + { + "epoch": 0.35, + "grad_norm": 15.042257707440395, + "learning_rate": 7.49424961186453e-06, + "loss": 0.8065, + "step": 4028 + }, + { + "epoch": 0.35, + "grad_norm": 5.77619574925267, + "learning_rate": 7.493016813636976e-06, + "loss": 0.7948, + "step": 4029 + }, + { + "epoch": 0.35, + "grad_norm": 7.902749568111473, + "learning_rate": 7.491783813678649e-06, + "loss": 0.7963, + "step": 4030 + }, + { + "epoch": 0.35, + "grad_norm": 2.6063273843194357, + "learning_rate": 7.49055061208932e-06, + "loss": 0.5239, + "step": 4031 + }, + { + "epoch": 0.35, + "grad_norm": 14.04241321470559, + "learning_rate": 7.489317208968778e-06, + "loss": 0.7044, + "step": 4032 + }, + { + "epoch": 0.35, + "grad_norm": 7.354894688870216, + "learning_rate": 7.488083604416831e-06, + "loss": 0.7086, + "step": 4033 + }, + { + "epoch": 0.35, + "grad_norm": 3.2868687081047003, + "learning_rate": 7.486849798533293e-06, + "loss": 0.5978, + "step": 4034 + }, + { + "epoch": 0.35, + "grad_norm": 2.8713867642007793, + "learning_rate": 7.485615791418009e-06, + "loss": 0.5229, + "step": 4035 + }, + { + "epoch": 0.35, + "grad_norm": 10.068293594391715, + "learning_rate": 7.484381583170829e-06, + "loss": 0.798, + "step": 4036 + }, + { + "epoch": 0.35, + "grad_norm": 18.04106141502968, + "learning_rate": 7.483147173891624e-06, + "loss": 0.7474, + "step": 4037 + }, + { + "epoch": 0.35, + "grad_norm": 6.584475793746839, + "learning_rate": 7.48191256368028e-06, + "loss": 0.7491, + "step": 4038 + }, + { + "epoch": 0.35, + "grad_norm": 6.701218903112596, + "learning_rate": 7.480677752636699e-06, + "loss": 0.898, + "step": 4039 + }, + { + "epoch": 0.35, + "grad_norm": 7.924150388771717, + "learning_rate": 7.479442740860802e-06, + "loss": 0.7846, + "step": 4040 + }, + { + "epoch": 0.35, + "grad_norm": 11.269670040721607, + "learning_rate": 7.478207528452522e-06, + "loss": 0.7687, + "step": 4041 + }, + { + "epoch": 0.35, + "grad_norm": 6.396875817852838, + "learning_rate": 7.4769721155118115e-06, + "loss": 0.7344, + "step": 4042 + }, + { + "epoch": 0.36, + "grad_norm": 11.488729191623825, + "learning_rate": 7.475736502138637e-06, + "loss": 0.724, + "step": 4043 + }, + { + "epoch": 0.36, + "grad_norm": 7.3391255697707, + "learning_rate": 7.474500688432984e-06, + "loss": 0.9171, + "step": 4044 + }, + { + "epoch": 0.36, + "grad_norm": 5.751779438291744, + "learning_rate": 7.47326467449485e-06, + "loss": 0.7639, + "step": 4045 + }, + { + "epoch": 0.36, + "grad_norm": 15.114078461656993, + "learning_rate": 7.472028460424254e-06, + "loss": 0.7421, + "step": 4046 + }, + { + "epoch": 0.36, + "grad_norm": 10.127394485403597, + "learning_rate": 7.470792046321227e-06, + "loss": 0.8192, + "step": 4047 + }, + { + "epoch": 0.36, + "grad_norm": 3.3717953798551985, + "learning_rate": 7.469555432285816e-06, + "loss": 0.5611, + "step": 4048 + }, + { + "epoch": 0.36, + "grad_norm": 6.179296019171882, + "learning_rate": 7.468318618418089e-06, + "loss": 0.7794, + "step": 4049 + }, + { + "epoch": 0.36, + "grad_norm": 7.845197710380337, + "learning_rate": 7.467081604818125e-06, + "loss": 0.7349, + "step": 4050 + }, + { + "epoch": 0.36, + "grad_norm": 8.534634105736265, + "learning_rate": 7.465844391586019e-06, + "loss": 0.7405, + "step": 4051 + }, + { + "epoch": 0.36, + "grad_norm": 11.563932566398458, + "learning_rate": 7.464606978821889e-06, + "loss": 0.7183, + "step": 4052 + }, + { + "epoch": 0.36, + "grad_norm": 6.870425744000303, + "learning_rate": 7.463369366625862e-06, + "loss": 0.8107, + 
"step": 4053 + }, + { + "epoch": 0.36, + "grad_norm": 23.587545288543428, + "learning_rate": 7.4621315550980825e-06, + "loss": 0.707, + "step": 4054 + }, + { + "epoch": 0.36, + "grad_norm": 8.651666989431016, + "learning_rate": 7.4608935443387135e-06, + "loss": 0.7619, + "step": 4055 + }, + { + "epoch": 0.36, + "grad_norm": 8.616497037257057, + "learning_rate": 7.459655334447932e-06, + "loss": 0.8019, + "step": 4056 + }, + { + "epoch": 0.36, + "grad_norm": 5.770146474115134, + "learning_rate": 7.458416925525932e-06, + "loss": 0.5314, + "step": 4057 + }, + { + "epoch": 0.36, + "grad_norm": 19.54450488907055, + "learning_rate": 7.457178317672925e-06, + "loss": 0.6253, + "step": 4058 + }, + { + "epoch": 0.36, + "grad_norm": 8.597375895437846, + "learning_rate": 7.455939510989134e-06, + "loss": 0.7976, + "step": 4059 + }, + { + "epoch": 0.36, + "grad_norm": 2.508483565399146, + "learning_rate": 7.454700505574805e-06, + "loss": 0.5313, + "step": 4060 + }, + { + "epoch": 0.36, + "grad_norm": 7.15778467109325, + "learning_rate": 7.4534613015301935e-06, + "loss": 0.6406, + "step": 4061 + }, + { + "epoch": 0.36, + "grad_norm": 7.631630533986943, + "learning_rate": 7.452221898955574e-06, + "loss": 0.6763, + "step": 4062 + }, + { + "epoch": 0.36, + "grad_norm": 2.9298063423307545, + "learning_rate": 7.450982297951238e-06, + "loss": 0.5415, + "step": 4063 + }, + { + "epoch": 0.36, + "grad_norm": 2.578954271420968, + "learning_rate": 7.449742498617492e-06, + "loss": 0.5112, + "step": 4064 + }, + { + "epoch": 0.36, + "grad_norm": 7.561782692341439, + "learning_rate": 7.448502501054658e-06, + "loss": 0.8792, + "step": 4065 + }, + { + "epoch": 0.36, + "grad_norm": 5.8445975252942395, + "learning_rate": 7.447262305363074e-06, + "loss": 0.7102, + "step": 4066 + }, + { + "epoch": 0.36, + "grad_norm": 10.050878435290855, + "learning_rate": 7.4460219116430944e-06, + "loss": 0.8629, + "step": 4067 + }, + { + "epoch": 0.36, + "grad_norm": 15.341721908166402, + "learning_rate": 7.44478131999509e-06, + "loss": 0.6671, + "step": 4068 + }, + { + "epoch": 0.36, + "grad_norm": 9.313811573139207, + "learning_rate": 7.44354053051945e-06, + "loss": 0.754, + "step": 4069 + }, + { + "epoch": 0.36, + "grad_norm": 9.668856988691159, + "learning_rate": 7.442299543316573e-06, + "loss": 0.8128, + "step": 4070 + }, + { + "epoch": 0.36, + "grad_norm": 7.377603305983573, + "learning_rate": 7.44105835848688e-06, + "loss": 0.7913, + "step": 4071 + }, + { + "epoch": 0.36, + "grad_norm": 10.673809313384464, + "learning_rate": 7.439816976130806e-06, + "loss": 0.7422, + "step": 4072 + }, + { + "epoch": 0.36, + "grad_norm": 16.672730850751567, + "learning_rate": 7.438575396348801e-06, + "loss": 0.7052, + "step": 4073 + }, + { + "epoch": 0.36, + "grad_norm": 7.428355933988194, + "learning_rate": 7.437333619241329e-06, + "loss": 0.8976, + "step": 4074 + }, + { + "epoch": 0.36, + "grad_norm": 2.4676166968320468, + "learning_rate": 7.436091644908876e-06, + "loss": 0.5203, + "step": 4075 + }, + { + "epoch": 0.36, + "grad_norm": 6.928965262202256, + "learning_rate": 7.434849473451939e-06, + "loss": 0.9193, + "step": 4076 + }, + { + "epoch": 0.36, + "grad_norm": 9.863338656397552, + "learning_rate": 7.433607104971034e-06, + "loss": 0.8157, + "step": 4077 + }, + { + "epoch": 0.36, + "grad_norm": 27.525941740642974, + "learning_rate": 7.43236453956669e-06, + "loss": 0.6886, + "step": 4078 + }, + { + "epoch": 0.36, + "grad_norm": 2.1634169802208447, + "learning_rate": 7.431121777339453e-06, + "loss": 0.4093, + "step": 4079 + }, + { + "epoch": 
0.36, + "grad_norm": 10.553030088336016, + "learning_rate": 7.4298788183898865e-06, + "loss": 0.8853, + "step": 4080 + }, + { + "epoch": 0.36, + "grad_norm": 8.548372438284973, + "learning_rate": 7.428635662818566e-06, + "loss": 0.778, + "step": 4081 + }, + { + "epoch": 0.36, + "grad_norm": 6.67848555441902, + "learning_rate": 7.427392310726088e-06, + "loss": 0.8167, + "step": 4082 + }, + { + "epoch": 0.36, + "grad_norm": 13.559004947186903, + "learning_rate": 7.426148762213063e-06, + "loss": 0.8331, + "step": 4083 + }, + { + "epoch": 0.36, + "grad_norm": 49.153344847012825, + "learning_rate": 7.424905017380116e-06, + "loss": 0.8651, + "step": 4084 + }, + { + "epoch": 0.36, + "grad_norm": 2.893777607028321, + "learning_rate": 7.423661076327889e-06, + "loss": 0.6354, + "step": 4085 + }, + { + "epoch": 0.36, + "grad_norm": 3.7871466909137053, + "learning_rate": 7.422416939157037e-06, + "loss": 0.5285, + "step": 4086 + }, + { + "epoch": 0.36, + "grad_norm": 7.353176338001196, + "learning_rate": 7.421172605968237e-06, + "loss": 0.6713, + "step": 4087 + }, + { + "epoch": 0.36, + "grad_norm": 2.589044467538788, + "learning_rate": 7.4199280768621776e-06, + "loss": 0.543, + "step": 4088 + }, + { + "epoch": 0.36, + "grad_norm": 7.829496932719495, + "learning_rate": 7.418683351939564e-06, + "loss": 0.807, + "step": 4089 + }, + { + "epoch": 0.36, + "grad_norm": 17.153794010813023, + "learning_rate": 7.4174384313011165e-06, + "loss": 0.8389, + "step": 4090 + }, + { + "epoch": 0.36, + "grad_norm": 7.266536214341661, + "learning_rate": 7.416193315047571e-06, + "loss": 0.7445, + "step": 4091 + }, + { + "epoch": 0.36, + "grad_norm": 9.066753226617063, + "learning_rate": 7.414948003279683e-06, + "loss": 1.0333, + "step": 4092 + }, + { + "epoch": 0.36, + "grad_norm": 11.327762207261799, + "learning_rate": 7.413702496098218e-06, + "loss": 0.6447, + "step": 4093 + }, + { + "epoch": 0.36, + "grad_norm": 11.129486722501802, + "learning_rate": 7.412456793603963e-06, + "loss": 0.6425, + "step": 4094 + }, + { + "epoch": 0.36, + "grad_norm": 11.153933529534529, + "learning_rate": 7.411210895897718e-06, + "loss": 0.9176, + "step": 4095 + }, + { + "epoch": 0.36, + "grad_norm": 7.893096012174923, + "learning_rate": 7.409964803080296e-06, + "loss": 0.6846, + "step": 4096 + }, + { + "epoch": 0.36, + "grad_norm": 11.11233475021718, + "learning_rate": 7.408718515252533e-06, + "loss": 0.8075, + "step": 4097 + }, + { + "epoch": 0.36, + "grad_norm": 7.427862861369478, + "learning_rate": 7.407472032515275e-06, + "loss": 0.9054, + "step": 4098 + }, + { + "epoch": 0.36, + "grad_norm": 11.443951103072715, + "learning_rate": 7.4062253549693816e-06, + "loss": 0.7546, + "step": 4099 + }, + { + "epoch": 0.36, + "grad_norm": 8.75275880508346, + "learning_rate": 7.404978482715738e-06, + "loss": 0.9333, + "step": 4100 + }, + { + "epoch": 0.36, + "grad_norm": 9.935630341919476, + "learning_rate": 7.403731415855236e-06, + "loss": 0.8707, + "step": 4101 + }, + { + "epoch": 0.36, + "grad_norm": 17.36612047322883, + "learning_rate": 7.4024841544887835e-06, + "loss": 0.7673, + "step": 4102 + }, + { + "epoch": 0.36, + "grad_norm": 2.031860071977032, + "learning_rate": 7.401236698717313e-06, + "loss": 0.5418, + "step": 4103 + }, + { + "epoch": 0.36, + "grad_norm": 87.67304612375297, + "learning_rate": 7.39998904864176e-06, + "loss": 0.8931, + "step": 4104 + }, + { + "epoch": 0.36, + "grad_norm": 6.895276538326906, + "learning_rate": 7.3987412043630866e-06, + "loss": 0.84, + "step": 4105 + }, + { + "epoch": 0.36, + "grad_norm": 
9.432330894524767, + "learning_rate": 7.3974931659822656e-06, + "loss": 0.7267, + "step": 4106 + }, + { + "epoch": 0.36, + "grad_norm": 7.084900336790308, + "learning_rate": 7.396244933600285e-06, + "loss": 0.6508, + "step": 4107 + }, + { + "epoch": 0.36, + "grad_norm": 9.405562346642736, + "learning_rate": 7.39499650731815e-06, + "loss": 0.7566, + "step": 4108 + }, + { + "epoch": 0.36, + "grad_norm": 7.068199819759221, + "learning_rate": 7.393747887236883e-06, + "loss": 0.8985, + "step": 4109 + }, + { + "epoch": 0.36, + "grad_norm": 54.68747647764097, + "learning_rate": 7.3924990734575175e-06, + "loss": 0.8328, + "step": 4110 + }, + { + "epoch": 0.36, + "grad_norm": 7.2436758957774074, + "learning_rate": 7.391250066081107e-06, + "loss": 0.7443, + "step": 4111 + }, + { + "epoch": 0.36, + "grad_norm": 20.96952275661023, + "learning_rate": 7.3900008652087195e-06, + "loss": 0.9189, + "step": 4112 + }, + { + "epoch": 0.36, + "grad_norm": 7.851577920677699, + "learning_rate": 7.388751470941436e-06, + "loss": 0.8995, + "step": 4113 + }, + { + "epoch": 0.36, + "grad_norm": 9.970721200546697, + "learning_rate": 7.387501883380358e-06, + "loss": 0.6672, + "step": 4114 + }, + { + "epoch": 0.36, + "grad_norm": 11.416397702076267, + "learning_rate": 7.3862521026265986e-06, + "loss": 0.7526, + "step": 4115 + }, + { + "epoch": 0.36, + "grad_norm": 9.784749204045589, + "learning_rate": 7.385002128781289e-06, + "loss": 0.7152, + "step": 4116 + }, + { + "epoch": 0.36, + "grad_norm": 7.582559799581756, + "learning_rate": 7.3837519619455734e-06, + "loss": 0.7412, + "step": 4117 + }, + { + "epoch": 0.36, + "grad_norm": 12.52271319349592, + "learning_rate": 7.382501602220615e-06, + "loss": 0.7749, + "step": 4118 + }, + { + "epoch": 0.36, + "grad_norm": 7.677847318963557, + "learning_rate": 7.38125104970759e-06, + "loss": 0.7893, + "step": 4119 + }, + { + "epoch": 0.36, + "grad_norm": 17.486076860357404, + "learning_rate": 7.38000030450769e-06, + "loss": 0.8429, + "step": 4120 + }, + { + "epoch": 0.36, + "grad_norm": 4.073211770296243, + "learning_rate": 7.378749366722127e-06, + "loss": 0.6562, + "step": 4121 + }, + { + "epoch": 0.36, + "grad_norm": 8.44631022089056, + "learning_rate": 7.377498236452118e-06, + "loss": 0.5978, + "step": 4122 + }, + { + "epoch": 0.36, + "grad_norm": 13.223783841560108, + "learning_rate": 7.3762469137989105e-06, + "loss": 0.8766, + "step": 4123 + }, + { + "epoch": 0.36, + "grad_norm": 9.473120781816613, + "learning_rate": 7.3749953988637535e-06, + "loss": 0.7763, + "step": 4124 + }, + { + "epoch": 0.36, + "grad_norm": 8.53447023911812, + "learning_rate": 7.373743691747919e-06, + "loss": 0.6214, + "step": 4125 + }, + { + "epoch": 0.36, + "grad_norm": 10.019030493572803, + "learning_rate": 7.372491792552694e-06, + "loss": 0.8422, + "step": 4126 + }, + { + "epoch": 0.36, + "grad_norm": 11.130886507835012, + "learning_rate": 7.37123970137938e-06, + "loss": 0.732, + "step": 4127 + }, + { + "epoch": 0.36, + "grad_norm": 7.922232141578236, + "learning_rate": 7.369987418329293e-06, + "loss": 0.817, + "step": 4128 + }, + { + "epoch": 0.36, + "grad_norm": 8.029333271058128, + "learning_rate": 7.368734943503767e-06, + "loss": 0.7103, + "step": 4129 + }, + { + "epoch": 0.36, + "grad_norm": 8.484931211268597, + "learning_rate": 7.36748227700415e-06, + "loss": 0.7354, + "step": 4130 + }, + { + "epoch": 0.36, + "grad_norm": 9.871822899089068, + "learning_rate": 7.366229418931803e-06, + "loss": 0.8289, + "step": 4131 + }, + { + "epoch": 0.36, + "grad_norm": 9.122087437846849, + "learning_rate": 
7.3649763693881104e-06, + "loss": 0.815, + "step": 4132 + }, + { + "epoch": 0.36, + "grad_norm": 2.2603842748913507, + "learning_rate": 7.3637231284744625e-06, + "loss": 0.4774, + "step": 4133 + }, + { + "epoch": 0.36, + "grad_norm": 10.20299322174705, + "learning_rate": 7.36246969629227e-06, + "loss": 0.7092, + "step": 4134 + }, + { + "epoch": 0.36, + "grad_norm": 8.22799998733332, + "learning_rate": 7.36121607294296e-06, + "loss": 0.6913, + "step": 4135 + }, + { + "epoch": 0.36, + "grad_norm": 8.342086051040466, + "learning_rate": 7.3599622585279725e-06, + "loss": 0.872, + "step": 4136 + }, + { + "epoch": 0.36, + "grad_norm": 8.618367286802064, + "learning_rate": 7.3587082531487675e-06, + "loss": 0.8598, + "step": 4137 + }, + { + "epoch": 0.36, + "grad_norm": 7.502915226492183, + "learning_rate": 7.357454056906812e-06, + "loss": 0.7154, + "step": 4138 + }, + { + "epoch": 0.36, + "grad_norm": 6.65256519631903, + "learning_rate": 7.356199669903596e-06, + "loss": 0.7912, + "step": 4139 + }, + { + "epoch": 0.36, + "grad_norm": 8.44898015528005, + "learning_rate": 7.354945092240621e-06, + "loss": 0.8386, + "step": 4140 + }, + { + "epoch": 0.36, + "grad_norm": 12.504742154786724, + "learning_rate": 7.353690324019407e-06, + "loss": 0.7125, + "step": 4141 + }, + { + "epoch": 0.36, + "grad_norm": 8.751544721807612, + "learning_rate": 7.3524353653414885e-06, + "loss": 0.7681, + "step": 4142 + }, + { + "epoch": 0.36, + "grad_norm": 7.304864844770889, + "learning_rate": 7.351180216308411e-06, + "loss": 0.6231, + "step": 4143 + }, + { + "epoch": 0.36, + "grad_norm": 8.710895703271474, + "learning_rate": 7.349924877021743e-06, + "loss": 0.7899, + "step": 4144 + }, + { + "epoch": 0.36, + "grad_norm": 20.70248310637586, + "learning_rate": 7.348669347583063e-06, + "loss": 0.9061, + "step": 4145 + }, + { + "epoch": 0.36, + "grad_norm": 40.627309544831256, + "learning_rate": 7.347413628093965e-06, + "loss": 0.8461, + "step": 4146 + }, + { + "epoch": 0.36, + "grad_norm": 8.301915912599382, + "learning_rate": 7.346157718656062e-06, + "loss": 0.8105, + "step": 4147 + }, + { + "epoch": 0.36, + "grad_norm": 3.205621598146446, + "learning_rate": 7.344901619370977e-06, + "loss": 0.5806, + "step": 4148 + }, + { + "epoch": 0.36, + "grad_norm": 12.29018072427109, + "learning_rate": 7.343645330340357e-06, + "loss": 0.7603, + "step": 4149 + }, + { + "epoch": 0.36, + "grad_norm": 2.382887613148542, + "learning_rate": 7.342388851665852e-06, + "loss": 0.5312, + "step": 4150 + }, + { + "epoch": 0.36, + "grad_norm": 7.221035960328189, + "learning_rate": 7.341132183449139e-06, + "loss": 0.8925, + "step": 4151 + }, + { + "epoch": 0.36, + "grad_norm": 9.452880362843867, + "learning_rate": 7.339875325791904e-06, + "loss": 0.7917, + "step": 4152 + }, + { + "epoch": 0.36, + "grad_norm": 9.08431098716898, + "learning_rate": 7.33861827879585e-06, + "loss": 0.8556, + "step": 4153 + }, + { + "epoch": 0.36, + "grad_norm": 13.834537214293755, + "learning_rate": 7.337361042562695e-06, + "loss": 0.9201, + "step": 4154 + }, + { + "epoch": 0.36, + "grad_norm": 8.065308890780871, + "learning_rate": 7.336103617194173e-06, + "loss": 0.6809, + "step": 4155 + }, + { + "epoch": 0.36, + "grad_norm": 9.429413254576554, + "learning_rate": 7.334846002792031e-06, + "loss": 0.6969, + "step": 4156 + }, + { + "epoch": 0.37, + "grad_norm": 6.488658726719006, + "learning_rate": 7.3335881994580335e-06, + "loss": 0.7582, + "step": 4157 + }, + { + "epoch": 0.37, + "grad_norm": 33.11312299371921, + "learning_rate": 7.332330207293963e-06, + "loss": 0.879, + 
"step": 4158 + }, + { + "epoch": 0.37, + "grad_norm": 6.363728252072287, + "learning_rate": 7.331072026401611e-06, + "loss": 0.8937, + "step": 4159 + }, + { + "epoch": 0.37, + "grad_norm": 8.62769445187746, + "learning_rate": 7.329813656882787e-06, + "loss": 0.6086, + "step": 4160 + }, + { + "epoch": 0.37, + "grad_norm": 6.8910098853511235, + "learning_rate": 7.328555098839318e-06, + "loss": 0.8327, + "step": 4161 + }, + { + "epoch": 0.37, + "grad_norm": 13.751338275018393, + "learning_rate": 7.327296352373045e-06, + "loss": 0.6718, + "step": 4162 + }, + { + "epoch": 0.37, + "grad_norm": 8.724938525800939, + "learning_rate": 7.326037417585819e-06, + "loss": 0.7532, + "step": 4163 + }, + { + "epoch": 0.37, + "grad_norm": 8.840310873069292, + "learning_rate": 7.324778294579518e-06, + "loss": 0.7511, + "step": 4164 + }, + { + "epoch": 0.37, + "grad_norm": 10.980668407426037, + "learning_rate": 7.323518983456022e-06, + "loss": 0.9073, + "step": 4165 + }, + { + "epoch": 0.37, + "grad_norm": 15.164767102544058, + "learning_rate": 7.322259484317236e-06, + "loss": 0.6627, + "step": 4166 + }, + { + "epoch": 0.37, + "grad_norm": 7.55190112386766, + "learning_rate": 7.320999797265076e-06, + "loss": 0.8009, + "step": 4167 + }, + { + "epoch": 0.37, + "grad_norm": 8.756780228990307, + "learning_rate": 7.319739922401472e-06, + "loss": 0.7042, + "step": 4168 + }, + { + "epoch": 0.37, + "grad_norm": 7.825405809076905, + "learning_rate": 7.318479859828372e-06, + "loss": 0.823, + "step": 4169 + }, + { + "epoch": 0.37, + "grad_norm": 3.9963160565435656, + "learning_rate": 7.31721960964774e-06, + "loss": 0.6642, + "step": 4170 + }, + { + "epoch": 0.37, + "grad_norm": 11.376907343896784, + "learning_rate": 7.315959171961549e-06, + "loss": 0.61, + "step": 4171 + }, + { + "epoch": 0.37, + "grad_norm": 7.388378142609373, + "learning_rate": 7.314698546871797e-06, + "loss": 1.0106, + "step": 4172 + }, + { + "epoch": 0.37, + "grad_norm": 10.34510816778849, + "learning_rate": 7.313437734480489e-06, + "loss": 0.8064, + "step": 4173 + }, + { + "epoch": 0.37, + "grad_norm": 8.831132151342215, + "learning_rate": 7.312176734889644e-06, + "loss": 0.7425, + "step": 4174 + }, + { + "epoch": 0.37, + "grad_norm": 8.699913623082914, + "learning_rate": 7.310915548201307e-06, + "loss": 0.8732, + "step": 4175 + }, + { + "epoch": 0.37, + "grad_norm": 13.428699544443955, + "learning_rate": 7.309654174517528e-06, + "loss": 0.9993, + "step": 4176 + }, + { + "epoch": 0.37, + "grad_norm": 6.573115379808732, + "learning_rate": 7.308392613940374e-06, + "loss": 0.6779, + "step": 4177 + }, + { + "epoch": 0.37, + "grad_norm": 7.006470329897844, + "learning_rate": 7.30713086657193e-06, + "loss": 0.7205, + "step": 4178 + }, + { + "epoch": 0.37, + "grad_norm": 12.060392507863298, + "learning_rate": 7.305868932514293e-06, + "loss": 0.8121, + "step": 4179 + }, + { + "epoch": 0.37, + "grad_norm": 10.941016987662524, + "learning_rate": 7.304606811869578e-06, + "loss": 0.9693, + "step": 4180 + }, + { + "epoch": 0.37, + "grad_norm": 10.456836797126932, + "learning_rate": 7.303344504739914e-06, + "loss": 0.8136, + "step": 4181 + }, + { + "epoch": 0.37, + "grad_norm": 18.088226410424067, + "learning_rate": 7.302082011227444e-06, + "loss": 0.8563, + "step": 4182 + }, + { + "epoch": 0.37, + "grad_norm": 14.607825985057737, + "learning_rate": 7.300819331434326e-06, + "loss": 0.8883, + "step": 4183 + }, + { + "epoch": 0.37, + "grad_norm": 10.04412532902516, + "learning_rate": 7.299556465462736e-06, + "loss": 0.9071, + "step": 4184 + }, + { + "epoch": 0.37, 
+ "grad_norm": 8.312678365987068, + "learning_rate": 7.298293413414862e-06, + "loss": 0.6372, + "step": 4185 + }, + { + "epoch": 0.37, + "grad_norm": 7.631779251353098, + "learning_rate": 7.297030175392906e-06, + "loss": 0.7142, + "step": 4186 + }, + { + "epoch": 0.37, + "grad_norm": 7.913344521123398, + "learning_rate": 7.295766751499091e-06, + "loss": 0.7768, + "step": 4187 + }, + { + "epoch": 0.37, + "grad_norm": 6.307224854420024, + "learning_rate": 7.2945031418356485e-06, + "loss": 0.8758, + "step": 4188 + }, + { + "epoch": 0.37, + "grad_norm": 11.692224082830883, + "learning_rate": 7.293239346504829e-06, + "loss": 0.9737, + "step": 4189 + }, + { + "epoch": 0.37, + "grad_norm": 8.45680086214289, + "learning_rate": 7.2919753656088965e-06, + "loss": 0.8101, + "step": 4190 + }, + { + "epoch": 0.37, + "grad_norm": 8.223157530642588, + "learning_rate": 7.290711199250129e-06, + "loss": 0.7981, + "step": 4191 + }, + { + "epoch": 0.37, + "grad_norm": 8.549127395308917, + "learning_rate": 7.289446847530822e-06, + "loss": 0.7823, + "step": 4192 + }, + { + "epoch": 0.37, + "grad_norm": 8.646363732344387, + "learning_rate": 7.288182310553285e-06, + "loss": 0.7227, + "step": 4193 + }, + { + "epoch": 0.37, + "grad_norm": 7.458359399018529, + "learning_rate": 7.286917588419841e-06, + "loss": 0.7151, + "step": 4194 + }, + { + "epoch": 0.37, + "grad_norm": 6.510187276633661, + "learning_rate": 7.28565268123283e-06, + "loss": 0.7057, + "step": 4195 + }, + { + "epoch": 0.37, + "grad_norm": 6.724487940625326, + "learning_rate": 7.284387589094606e-06, + "loss": 0.7843, + "step": 4196 + }, + { + "epoch": 0.37, + "grad_norm": 6.394544987066132, + "learning_rate": 7.283122312107538e-06, + "loss": 0.6571, + "step": 4197 + }, + { + "epoch": 0.37, + "grad_norm": 10.582386860009034, + "learning_rate": 7.2818568503740104e-06, + "loss": 0.8147, + "step": 4198 + }, + { + "epoch": 0.37, + "grad_norm": 2.4571470040856087, + "learning_rate": 7.280591203996423e-06, + "loss": 0.4522, + "step": 4199 + }, + { + "epoch": 0.37, + "grad_norm": 7.6613889361556256, + "learning_rate": 7.279325373077187e-06, + "loss": 0.8665, + "step": 4200 + }, + { + "epoch": 0.37, + "grad_norm": 6.9570942336847486, + "learning_rate": 7.278059357718735e-06, + "loss": 0.7198, + "step": 4201 + }, + { + "epoch": 0.37, + "grad_norm": 9.931090858959498, + "learning_rate": 7.276793158023509e-06, + "loss": 0.9464, + "step": 4202 + }, + { + "epoch": 0.37, + "grad_norm": 7.449047332312249, + "learning_rate": 7.2755267740939664e-06, + "loss": 0.7307, + "step": 4203 + }, + { + "epoch": 0.37, + "grad_norm": 9.267733049985331, + "learning_rate": 7.274260206032584e-06, + "loss": 0.7945, + "step": 4204 + }, + { + "epoch": 0.37, + "grad_norm": 7.89432939542872, + "learning_rate": 7.272993453941848e-06, + "loss": 0.7021, + "step": 4205 + }, + { + "epoch": 0.37, + "grad_norm": 9.595523947731627, + "learning_rate": 7.271726517924263e-06, + "loss": 0.7738, + "step": 4206 + }, + { + "epoch": 0.37, + "grad_norm": 12.569077765791448, + "learning_rate": 7.270459398082346e-06, + "loss": 0.9741, + "step": 4207 + }, + { + "epoch": 0.37, + "grad_norm": 11.625556988712756, + "learning_rate": 7.2691920945186325e-06, + "loss": 0.7582, + "step": 4208 + }, + { + "epoch": 0.37, + "grad_norm": 6.947438754788942, + "learning_rate": 7.267924607335669e-06, + "loss": 0.592, + "step": 4209 + }, + { + "epoch": 0.37, + "grad_norm": 8.639049227732379, + "learning_rate": 7.266656936636018e-06, + "loss": 0.7401, + "step": 4210 + }, + { + "epoch": 0.37, + "grad_norm": 8.431435352748904, + 
"learning_rate": 7.265389082522258e-06, + "loss": 0.8358, + "step": 4211 + }, + { + "epoch": 0.37, + "grad_norm": 9.298484332799513, + "learning_rate": 7.264121045096983e-06, + "loss": 0.895, + "step": 4212 + }, + { + "epoch": 0.37, + "grad_norm": 14.446512779485612, + "learning_rate": 7.2628528244628e-06, + "loss": 1.0047, + "step": 4213 + }, + { + "epoch": 0.37, + "grad_norm": 9.260781011094881, + "learning_rate": 7.261584420722328e-06, + "loss": 0.7891, + "step": 4214 + }, + { + "epoch": 0.37, + "grad_norm": 9.27145017461563, + "learning_rate": 7.260315833978209e-06, + "loss": 0.7303, + "step": 4215 + }, + { + "epoch": 0.37, + "grad_norm": 6.565013549414908, + "learning_rate": 7.259047064333091e-06, + "loss": 0.6994, + "step": 4216 + }, + { + "epoch": 0.37, + "grad_norm": 14.875850310225035, + "learning_rate": 7.257778111889641e-06, + "loss": 1.0254, + "step": 4217 + }, + { + "epoch": 0.37, + "grad_norm": 10.196233778817218, + "learning_rate": 7.256508976750545e-06, + "loss": 0.7294, + "step": 4218 + }, + { + "epoch": 0.37, + "grad_norm": 9.328062145616308, + "learning_rate": 7.2552396590184945e-06, + "loss": 0.6942, + "step": 4219 + }, + { + "epoch": 0.37, + "grad_norm": 9.286783718130515, + "learning_rate": 7.2539701587962015e-06, + "loss": 0.8308, + "step": 4220 + }, + { + "epoch": 0.37, + "grad_norm": 19.57670872311623, + "learning_rate": 7.252700476186394e-06, + "loss": 0.8305, + "step": 4221 + }, + { + "epoch": 0.37, + "grad_norm": 8.311301289234255, + "learning_rate": 7.2514306112918095e-06, + "loss": 0.8139, + "step": 4222 + }, + { + "epoch": 0.37, + "grad_norm": 11.57107374072217, + "learning_rate": 7.250160564215206e-06, + "loss": 0.7695, + "step": 4223 + }, + { + "epoch": 0.37, + "grad_norm": 6.127749198417821, + "learning_rate": 7.248890335059353e-06, + "loss": 0.6744, + "step": 4224 + }, + { + "epoch": 0.37, + "grad_norm": 7.28880497291457, + "learning_rate": 7.2476199239270354e-06, + "loss": 0.7892, + "step": 4225 + }, + { + "epoch": 0.37, + "grad_norm": 11.037115014355656, + "learning_rate": 7.24634933092105e-06, + "loss": 0.8821, + "step": 4226 + }, + { + "epoch": 0.37, + "grad_norm": 12.303457521312758, + "learning_rate": 7.245078556144215e-06, + "loss": 0.6241, + "step": 4227 + }, + { + "epoch": 0.37, + "grad_norm": 15.757017463012184, + "learning_rate": 7.243807599699357e-06, + "loss": 0.8606, + "step": 4228 + }, + { + "epoch": 0.37, + "grad_norm": 7.533015671175947, + "learning_rate": 7.242536461689321e-06, + "loss": 0.8871, + "step": 4229 + }, + { + "epoch": 0.37, + "grad_norm": 10.543485955534107, + "learning_rate": 7.2412651422169645e-06, + "loss": 0.7751, + "step": 4230 + }, + { + "epoch": 0.37, + "grad_norm": 18.167470628489323, + "learning_rate": 7.239993641385161e-06, + "loss": 0.9564, + "step": 4231 + }, + { + "epoch": 0.37, + "grad_norm": 13.833304116705374, + "learning_rate": 7.238721959296798e-06, + "loss": 0.7486, + "step": 4232 + }, + { + "epoch": 0.37, + "grad_norm": 7.383792569277386, + "learning_rate": 7.237450096054779e-06, + "loss": 0.6009, + "step": 4233 + }, + { + "epoch": 0.37, + "grad_norm": 7.425470076388714, + "learning_rate": 7.2361780517620175e-06, + "loss": 0.8457, + "step": 4234 + }, + { + "epoch": 0.37, + "grad_norm": 18.201607834545968, + "learning_rate": 7.23490582652145e-06, + "loss": 0.7401, + "step": 4235 + }, + { + "epoch": 0.37, + "grad_norm": 6.346023196874105, + "learning_rate": 7.2336334204360206e-06, + "loss": 0.9015, + "step": 4236 + }, + { + "epoch": 0.37, + "grad_norm": 10.979452149775913, + "learning_rate": 
7.232360833608688e-06, + "loss": 0.8661, + "step": 4237 + }, + { + "epoch": 0.37, + "grad_norm": 3.521617974681032, + "learning_rate": 7.231088066142432e-06, + "loss": 0.6246, + "step": 4238 + }, + { + "epoch": 0.37, + "grad_norm": 6.178920762931778, + "learning_rate": 7.229815118140241e-06, + "loss": 0.7442, + "step": 4239 + }, + { + "epoch": 0.37, + "grad_norm": 5.611907176168912, + "learning_rate": 7.228541989705119e-06, + "loss": 0.7776, + "step": 4240 + }, + { + "epoch": 0.37, + "grad_norm": 6.423316038129589, + "learning_rate": 7.227268680940087e-06, + "loss": 0.8661, + "step": 4241 + }, + { + "epoch": 0.37, + "grad_norm": 15.61864669172623, + "learning_rate": 7.225995191948177e-06, + "loss": 0.8603, + "step": 4242 + }, + { + "epoch": 0.37, + "grad_norm": 6.770072196911455, + "learning_rate": 7.22472152283244e-06, + "loss": 0.7451, + "step": 4243 + }, + { + "epoch": 0.37, + "grad_norm": 5.320888997457571, + "learning_rate": 7.223447673695937e-06, + "loss": 0.7724, + "step": 4244 + }, + { + "epoch": 0.37, + "grad_norm": 8.952574203971675, + "learning_rate": 7.2221736446417475e-06, + "loss": 0.8075, + "step": 4245 + }, + { + "epoch": 0.37, + "grad_norm": 5.824146842577657, + "learning_rate": 7.220899435772962e-06, + "loss": 0.7833, + "step": 4246 + }, + { + "epoch": 0.37, + "grad_norm": 11.82520642552967, + "learning_rate": 7.21962504719269e-06, + "loss": 0.8361, + "step": 4247 + }, + { + "epoch": 0.37, + "grad_norm": 3.270402185876539, + "learning_rate": 7.21835047900405e-06, + "loss": 0.5246, + "step": 4248 + }, + { + "epoch": 0.37, + "grad_norm": 8.723244044336017, + "learning_rate": 7.21707573131018e-06, + "loss": 0.8291, + "step": 4249 + }, + { + "epoch": 0.37, + "grad_norm": 9.583580221582261, + "learning_rate": 7.21580080421423e-06, + "loss": 0.7793, + "step": 4250 + }, + { + "epoch": 0.37, + "grad_norm": 16.5064723941237, + "learning_rate": 7.214525697819363e-06, + "loss": 0.8249, + "step": 4251 + }, + { + "epoch": 0.37, + "grad_norm": 8.978723441433141, + "learning_rate": 7.2132504122287626e-06, + "loss": 0.8281, + "step": 4252 + }, + { + "epoch": 0.37, + "grad_norm": 7.989046509844019, + "learning_rate": 7.21197494754562e-06, + "loss": 0.7782, + "step": 4253 + }, + { + "epoch": 0.37, + "grad_norm": 15.605701047645876, + "learning_rate": 7.2106993038731435e-06, + "loss": 0.7391, + "step": 4254 + }, + { + "epoch": 0.37, + "grad_norm": 8.362965441826274, + "learning_rate": 7.209423481314556e-06, + "loss": 0.6467, + "step": 4255 + }, + { + "epoch": 0.37, + "grad_norm": 8.802222162916816, + "learning_rate": 7.2081474799730965e-06, + "loss": 0.7616, + "step": 4256 + }, + { + "epoch": 0.37, + "grad_norm": 6.69340421168566, + "learning_rate": 7.206871299952015e-06, + "loss": 0.7617, + "step": 4257 + }, + { + "epoch": 0.37, + "grad_norm": 12.435353694423933, + "learning_rate": 7.20559494135458e-06, + "loss": 0.8635, + "step": 4258 + }, + { + "epoch": 0.37, + "grad_norm": 4.415775704169373, + "learning_rate": 7.204318404284071e-06, + "loss": 0.5977, + "step": 4259 + }, + { + "epoch": 0.37, + "grad_norm": 13.803833425315469, + "learning_rate": 7.203041688843783e-06, + "loss": 0.7909, + "step": 4260 + }, + { + "epoch": 0.37, + "grad_norm": 12.6885722578884, + "learning_rate": 7.201764795137026e-06, + "loss": 0.8025, + "step": 4261 + }, + { + "epoch": 0.37, + "grad_norm": 5.703955366272616, + "learning_rate": 7.200487723267124e-06, + "loss": 0.7957, + "step": 4262 + }, + { + "epoch": 0.37, + "grad_norm": 7.928656073802879, + "learning_rate": 7.199210473337415e-06, + "loss": 0.8272, + 
"step": 4263 + }, + { + "epoch": 0.37, + "grad_norm": 8.797748074557791, + "learning_rate": 7.197933045451256e-06, + "loss": 0.8252, + "step": 4264 + }, + { + "epoch": 0.37, + "grad_norm": 10.377045443078124, + "learning_rate": 7.196655439712008e-06, + "loss": 0.8963, + "step": 4265 + }, + { + "epoch": 0.37, + "grad_norm": 6.923536744224651, + "learning_rate": 7.195377656223057e-06, + "loss": 0.694, + "step": 4266 + }, + { + "epoch": 0.37, + "grad_norm": 10.08792200223668, + "learning_rate": 7.194099695087797e-06, + "loss": 0.8917, + "step": 4267 + }, + { + "epoch": 0.37, + "grad_norm": 7.609113001064463, + "learning_rate": 7.192821556409639e-06, + "loss": 0.6821, + "step": 4268 + }, + { + "epoch": 0.37, + "grad_norm": 6.655181366206341, + "learning_rate": 7.19154324029201e-06, + "loss": 0.9587, + "step": 4269 + }, + { + "epoch": 0.38, + "grad_norm": 2.903848891906123, + "learning_rate": 7.190264746838347e-06, + "loss": 0.5349, + "step": 4270 + }, + { + "epoch": 0.38, + "grad_norm": 7.675641221235051, + "learning_rate": 7.1889860761521026e-06, + "loss": 0.8157, + "step": 4271 + }, + { + "epoch": 0.38, + "grad_norm": 9.677455776383114, + "learning_rate": 7.1877072283367475e-06, + "loss": 0.8156, + "step": 4272 + }, + { + "epoch": 0.38, + "grad_norm": 13.167689496302554, + "learning_rate": 7.1864282034957636e-06, + "loss": 0.836, + "step": 4273 + }, + { + "epoch": 0.38, + "grad_norm": 9.239511191235128, + "learning_rate": 7.185149001732643e-06, + "loss": 0.9079, + "step": 4274 + }, + { + "epoch": 0.38, + "grad_norm": 8.774186309715589, + "learning_rate": 7.183869623150903e-06, + "loss": 0.8513, + "step": 4275 + }, + { + "epoch": 0.38, + "grad_norm": 10.441885811466376, + "learning_rate": 7.182590067854065e-06, + "loss": 0.7427, + "step": 4276 + }, + { + "epoch": 0.38, + "grad_norm": 8.817600658893847, + "learning_rate": 7.181310335945668e-06, + "loss": 0.9288, + "step": 4277 + }, + { + "epoch": 0.38, + "grad_norm": 5.1511512928450784, + "learning_rate": 7.180030427529269e-06, + "loss": 0.7774, + "step": 4278 + }, + { + "epoch": 0.38, + "grad_norm": 4.541837395215971, + "learning_rate": 7.178750342708432e-06, + "loss": 0.7224, + "step": 4279 + }, + { + "epoch": 0.38, + "grad_norm": 7.588494272100689, + "learning_rate": 7.177470081586743e-06, + "loss": 0.7124, + "step": 4280 + }, + { + "epoch": 0.38, + "grad_norm": 5.743353113304169, + "learning_rate": 7.176189644267797e-06, + "loss": 0.8251, + "step": 4281 + }, + { + "epoch": 0.38, + "grad_norm": 5.045522286076304, + "learning_rate": 7.174909030855204e-06, + "loss": 0.7389, + "step": 4282 + }, + { + "epoch": 0.38, + "grad_norm": 6.868099589565909, + "learning_rate": 7.1736282414525905e-06, + "loss": 0.883, + "step": 4283 + }, + { + "epoch": 0.38, + "grad_norm": 4.259453578416971, + "learning_rate": 7.172347276163596e-06, + "loss": 0.6925, + "step": 4284 + }, + { + "epoch": 0.38, + "grad_norm": 10.607324666664462, + "learning_rate": 7.171066135091873e-06, + "loss": 0.8142, + "step": 4285 + }, + { + "epoch": 0.38, + "grad_norm": 8.520802586972643, + "learning_rate": 7.169784818341088e-06, + "loss": 0.8403, + "step": 4286 + }, + { + "epoch": 0.38, + "grad_norm": 4.277539099338132, + "learning_rate": 7.168503326014926e-06, + "loss": 0.7415, + "step": 4287 + }, + { + "epoch": 0.38, + "grad_norm": 7.9418291304442254, + "learning_rate": 7.167221658217083e-06, + "loss": 0.8083, + "step": 4288 + }, + { + "epoch": 0.38, + "grad_norm": 13.219198288957587, + "learning_rate": 7.165939815051267e-06, + "loss": 0.7482, + "step": 4289 + }, + { + "epoch": 
0.38, + "grad_norm": 8.296223284274431, + "learning_rate": 7.164657796621205e-06, + "loss": 0.8216, + "step": 4290 + }, + { + "epoch": 0.38, + "grad_norm": 6.829731858527355, + "learning_rate": 7.163375603030634e-06, + "loss": 0.7264, + "step": 4291 + }, + { + "epoch": 0.38, + "grad_norm": 8.858622711645923, + "learning_rate": 7.16209323438331e-06, + "loss": 0.858, + "step": 4292 + }, + { + "epoch": 0.38, + "grad_norm": 7.504040564873065, + "learning_rate": 7.160810690782995e-06, + "loss": 0.7237, + "step": 4293 + }, + { + "epoch": 0.38, + "grad_norm": 11.32413586800157, + "learning_rate": 7.159527972333474e-06, + "loss": 0.7659, + "step": 4294 + }, + { + "epoch": 0.38, + "grad_norm": 7.37164723237538, + "learning_rate": 7.1582450791385425e-06, + "loss": 0.8182, + "step": 4295 + }, + { + "epoch": 0.38, + "grad_norm": 13.787804178612893, + "learning_rate": 7.1569620113020085e-06, + "loss": 0.6666, + "step": 4296 + }, + { + "epoch": 0.38, + "grad_norm": 6.568554005195951, + "learning_rate": 7.155678768927696e-06, + "loss": 0.8333, + "step": 4297 + }, + { + "epoch": 0.38, + "grad_norm": 7.130950763870329, + "learning_rate": 7.154395352119446e-06, + "loss": 0.8982, + "step": 4298 + }, + { + "epoch": 0.38, + "grad_norm": 34.74985251630934, + "learning_rate": 7.153111760981105e-06, + "loss": 0.7805, + "step": 4299 + }, + { + "epoch": 0.38, + "grad_norm": 15.826996034615902, + "learning_rate": 7.1518279956165425e-06, + "loss": 0.6901, + "step": 4300 + }, + { + "epoch": 0.38, + "grad_norm": 6.478297011003456, + "learning_rate": 7.150544056129638e-06, + "loss": 0.8692, + "step": 4301 + }, + { + "epoch": 0.38, + "grad_norm": 6.55589373054387, + "learning_rate": 7.149259942624287e-06, + "loss": 0.7028, + "step": 4302 + }, + { + "epoch": 0.38, + "grad_norm": 9.071291710911618, + "learning_rate": 7.1479756552043944e-06, + "loss": 0.9202, + "step": 4303 + }, + { + "epoch": 0.38, + "grad_norm": 4.093210936406697, + "learning_rate": 7.146691193973886e-06, + "loss": 0.6663, + "step": 4304 + }, + { + "epoch": 0.38, + "grad_norm": 7.9063938874861455, + "learning_rate": 7.145406559036697e-06, + "loss": 0.8385, + "step": 4305 + }, + { + "epoch": 0.38, + "grad_norm": 8.302713659454474, + "learning_rate": 7.1441217504967775e-06, + "loss": 0.6926, + "step": 4306 + }, + { + "epoch": 0.38, + "grad_norm": 10.002117271823025, + "learning_rate": 7.142836768458092e-06, + "loss": 0.8524, + "step": 4307 + }, + { + "epoch": 0.38, + "grad_norm": 8.91022212170721, + "learning_rate": 7.141551613024621e-06, + "loss": 0.7538, + "step": 4308 + }, + { + "epoch": 0.38, + "grad_norm": 6.671817989005663, + "learning_rate": 7.140266284300355e-06, + "loss": 0.8598, + "step": 4309 + }, + { + "epoch": 0.38, + "grad_norm": 6.673661761790604, + "learning_rate": 7.138980782389302e-06, + "loss": 0.7879, + "step": 4310 + }, + { + "epoch": 0.38, + "grad_norm": 8.39448372991862, + "learning_rate": 7.137695107395482e-06, + "loss": 0.7247, + "step": 4311 + }, + { + "epoch": 0.38, + "grad_norm": 9.001041860622998, + "learning_rate": 7.136409259422929e-06, + "loss": 0.6963, + "step": 4312 + }, + { + "epoch": 0.38, + "grad_norm": 2.6227900368939308, + "learning_rate": 7.135123238575693e-06, + "loss": 0.5348, + "step": 4313 + }, + { + "epoch": 0.38, + "grad_norm": 9.829420526279499, + "learning_rate": 7.133837044957835e-06, + "loss": 0.9111, + "step": 4314 + }, + { + "epoch": 0.38, + "grad_norm": 18.175992459995204, + "learning_rate": 7.1325506786734345e-06, + "loss": 0.8115, + "step": 4315 + }, + { + "epoch": 0.38, + "grad_norm": 
6.619901051702438, + "learning_rate": 7.13126413982658e-06, + "loss": 0.8995, + "step": 4316 + }, + { + "epoch": 0.38, + "grad_norm": 11.811195201334634, + "learning_rate": 7.129977428521375e-06, + "loss": 0.7875, + "step": 4317 + }, + { + "epoch": 0.38, + "grad_norm": 12.632535625678173, + "learning_rate": 7.128690544861941e-06, + "loss": 0.8901, + "step": 4318 + }, + { + "epoch": 0.38, + "grad_norm": 7.627742887449954, + "learning_rate": 7.12740348895241e-06, + "loss": 0.6181, + "step": 4319 + }, + { + "epoch": 0.38, + "grad_norm": 8.705996302219962, + "learning_rate": 7.126116260896924e-06, + "loss": 0.8033, + "step": 4320 + }, + { + "epoch": 0.38, + "grad_norm": 14.523819369369638, + "learning_rate": 7.124828860799651e-06, + "loss": 0.8593, + "step": 4321 + }, + { + "epoch": 0.38, + "grad_norm": 8.88646654074108, + "learning_rate": 7.123541288764759e-06, + "loss": 0.7925, + "step": 4322 + }, + { + "epoch": 0.38, + "grad_norm": 6.815668147156321, + "learning_rate": 7.122253544896438e-06, + "loss": 0.7619, + "step": 4323 + }, + { + "epoch": 0.38, + "grad_norm": 8.023480966803259, + "learning_rate": 7.120965629298891e-06, + "loss": 0.7196, + "step": 4324 + }, + { + "epoch": 0.38, + "grad_norm": 9.93546236988735, + "learning_rate": 7.119677542076333e-06, + "loss": 0.6993, + "step": 4325 + }, + { + "epoch": 0.38, + "grad_norm": 10.484679156463917, + "learning_rate": 7.118389283332994e-06, + "loss": 0.8746, + "step": 4326 + }, + { + "epoch": 0.38, + "grad_norm": 9.124502977035183, + "learning_rate": 7.117100853173118e-06, + "loss": 0.8846, + "step": 4327 + }, + { + "epoch": 0.38, + "grad_norm": 2.3609400486061176, + "learning_rate": 7.115812251700962e-06, + "loss": 0.5216, + "step": 4328 + }, + { + "epoch": 0.38, + "grad_norm": 5.561943766407394, + "learning_rate": 7.1145234790207984e-06, + "loss": 0.912, + "step": 4329 + }, + { + "epoch": 0.38, + "grad_norm": 10.334150043774729, + "learning_rate": 7.113234535236913e-06, + "loss": 0.8872, + "step": 4330 + }, + { + "epoch": 0.38, + "grad_norm": 9.871644198859334, + "learning_rate": 7.1119454204536036e-06, + "loss": 0.768, + "step": 4331 + }, + { + "epoch": 0.38, + "grad_norm": 3.5182809517705107, + "learning_rate": 7.110656134775183e-06, + "loss": 0.5928, + "step": 4332 + }, + { + "epoch": 0.38, + "grad_norm": 5.968757939793793, + "learning_rate": 7.109366678305981e-06, + "loss": 0.7906, + "step": 4333 + }, + { + "epoch": 0.38, + "grad_norm": 8.57479833639797, + "learning_rate": 7.1080770511503336e-06, + "loss": 0.9103, + "step": 4334 + }, + { + "epoch": 0.38, + "grad_norm": 12.730956802945931, + "learning_rate": 7.1067872534126004e-06, + "loss": 0.8225, + "step": 4335 + }, + { + "epoch": 0.38, + "grad_norm": 25.958867248675126, + "learning_rate": 7.105497285197146e-06, + "loss": 0.7504, + "step": 4336 + }, + { + "epoch": 0.38, + "grad_norm": 9.13426372789524, + "learning_rate": 7.104207146608353e-06, + "loss": 0.7446, + "step": 4337 + }, + { + "epoch": 0.38, + "grad_norm": 15.519971010862323, + "learning_rate": 7.102916837750619e-06, + "loss": 0.8788, + "step": 4338 + }, + { + "epoch": 0.38, + "grad_norm": 7.858717108246173, + "learning_rate": 7.101626358728352e-06, + "loss": 0.732, + "step": 4339 + }, + { + "epoch": 0.38, + "grad_norm": 9.21601015262091, + "learning_rate": 7.100335709645975e-06, + "loss": 0.7515, + "step": 4340 + }, + { + "epoch": 0.38, + "grad_norm": 6.177822995967037, + "learning_rate": 7.099044890607927e-06, + "loss": 0.98, + "step": 4341 + }, + { + "epoch": 0.38, + "grad_norm": 8.831123178743413, + "learning_rate": 
7.097753901718658e-06, + "loss": 0.841, + "step": 4342 + }, + { + "epoch": 0.38, + "grad_norm": 10.256604591045974, + "learning_rate": 7.096462743082633e-06, + "loss": 0.8244, + "step": 4343 + }, + { + "epoch": 0.38, + "grad_norm": 3.1170104340703775, + "learning_rate": 7.095171414804329e-06, + "loss": 0.5008, + "step": 4344 + }, + { + "epoch": 0.38, + "grad_norm": 2.6717314419391256, + "learning_rate": 7.09387991698824e-06, + "loss": 0.5636, + "step": 4345 + }, + { + "epoch": 0.38, + "grad_norm": 7.3160781096305625, + "learning_rate": 7.092588249738871e-06, + "loss": 0.7125, + "step": 4346 + }, + { + "epoch": 0.38, + "grad_norm": 8.431408591470458, + "learning_rate": 7.091296413160742e-06, + "loss": 0.829, + "step": 4347 + }, + { + "epoch": 0.38, + "grad_norm": 8.150524634887098, + "learning_rate": 7.090004407358385e-06, + "loss": 0.7463, + "step": 4348 + }, + { + "epoch": 0.38, + "grad_norm": 7.815909466919527, + "learning_rate": 7.088712232436349e-06, + "loss": 0.8586, + "step": 4349 + }, + { + "epoch": 0.38, + "grad_norm": 8.128770075929394, + "learning_rate": 7.087419888499194e-06, + "loss": 0.7642, + "step": 4350 + }, + { + "epoch": 0.38, + "grad_norm": 24.6046217611164, + "learning_rate": 7.086127375651492e-06, + "loss": 0.8498, + "step": 4351 + }, + { + "epoch": 0.38, + "grad_norm": 8.04246126067049, + "learning_rate": 7.084834693997834e-06, + "loss": 0.8552, + "step": 4352 + }, + { + "epoch": 0.38, + "grad_norm": 9.216672743870735, + "learning_rate": 7.08354184364282e-06, + "loss": 0.8423, + "step": 4353 + }, + { + "epoch": 0.38, + "grad_norm": 2.5971532490666287, + "learning_rate": 7.082248824691066e-06, + "loss": 0.5283, + "step": 4354 + }, + { + "epoch": 0.38, + "grad_norm": 6.819288645338891, + "learning_rate": 7.080955637247202e-06, + "loss": 0.7171, + "step": 4355 + }, + { + "epoch": 0.38, + "grad_norm": 5.812581813296302, + "learning_rate": 7.079662281415869e-06, + "loss": 0.7057, + "step": 4356 + }, + { + "epoch": 0.38, + "grad_norm": 5.080895785414747, + "learning_rate": 7.0783687573017215e-06, + "loss": 0.7458, + "step": 4357 + }, + { + "epoch": 0.38, + "grad_norm": 6.696018097870272, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.7645, + "step": 4358 + }, + { + "epoch": 0.38, + "grad_norm": 7.505997603932912, + "learning_rate": 7.075781204643685e-06, + "loss": 1.0469, + "step": 4359 + }, + { + "epoch": 0.38, + "grad_norm": 7.0040110071272315, + "learning_rate": 7.074487176309174e-06, + "loss": 0.9559, + "step": 4360 + }, + { + "epoch": 0.38, + "grad_norm": 12.849925453383525, + "learning_rate": 7.073192980110612e-06, + "loss": 0.8399, + "step": 4361 + }, + { + "epoch": 0.38, + "grad_norm": 14.103526328120084, + "learning_rate": 7.0718986161527235e-06, + "loss": 0.8236, + "step": 4362 + }, + { + "epoch": 0.38, + "grad_norm": 6.023823761216292, + "learning_rate": 7.070604084540245e-06, + "loss": 0.6547, + "step": 4363 + }, + { + "epoch": 0.38, + "grad_norm": 11.60621626637406, + "learning_rate": 7.069309385377927e-06, + "loss": 0.6796, + "step": 4364 + }, + { + "epoch": 0.38, + "grad_norm": 7.160464772361005, + "learning_rate": 7.068014518770536e-06, + "loss": 0.7828, + "step": 4365 + }, + { + "epoch": 0.38, + "grad_norm": 13.871072240564585, + "learning_rate": 7.066719484822851e-06, + "loss": 0.8578, + "step": 4366 + }, + { + "epoch": 0.38, + "grad_norm": 2.887656717816784, + "learning_rate": 7.0654242836396634e-06, + "loss": 0.5899, + "step": 4367 + }, + { + "epoch": 0.38, + "grad_norm": 11.358110226833418, + "learning_rate": 7.064128915325777e-06, + "loss": 
0.8255, + "step": 4368 + }, + { + "epoch": 0.38, + "grad_norm": 6.344311448564115, + "learning_rate": 7.062833379986014e-06, + "loss": 0.8413, + "step": 4369 + }, + { + "epoch": 0.38, + "grad_norm": 7.881171404848917, + "learning_rate": 7.0615376777252045e-06, + "loss": 0.8845, + "step": 4370 + }, + { + "epoch": 0.38, + "grad_norm": 11.085429353621564, + "learning_rate": 7.0602418086481935e-06, + "loss": 0.7852, + "step": 4371 + }, + { + "epoch": 0.38, + "grad_norm": 10.698827510901527, + "learning_rate": 7.058945772859845e-06, + "loss": 0.799, + "step": 4372 + }, + { + "epoch": 0.38, + "grad_norm": 7.443600111490494, + "learning_rate": 7.05764957046503e-06, + "loss": 0.7844, + "step": 4373 + }, + { + "epoch": 0.38, + "grad_norm": 16.01174354090694, + "learning_rate": 7.056353201568631e-06, + "loss": 0.7381, + "step": 4374 + }, + { + "epoch": 0.38, + "grad_norm": 2.638460501808738, + "learning_rate": 7.055056666275555e-06, + "loss": 0.498, + "step": 4375 + }, + { + "epoch": 0.38, + "grad_norm": 10.776839042192195, + "learning_rate": 7.05375996469071e-06, + "loss": 0.7101, + "step": 4376 + }, + { + "epoch": 0.38, + "grad_norm": 3.1602418687727, + "learning_rate": 7.052463096919025e-06, + "loss": 0.5062, + "step": 4377 + }, + { + "epoch": 0.38, + "grad_norm": 8.879990892466195, + "learning_rate": 7.051166063065441e-06, + "loss": 0.7493, + "step": 4378 + }, + { + "epoch": 0.38, + "grad_norm": 10.801971360688881, + "learning_rate": 7.049868863234911e-06, + "loss": 0.7898, + "step": 4379 + }, + { + "epoch": 0.38, + "grad_norm": 6.835744338495152, + "learning_rate": 7.048571497532402e-06, + "loss": 0.657, + "step": 4380 + }, + { + "epoch": 0.38, + "grad_norm": 18.38726550236623, + "learning_rate": 7.047273966062896e-06, + "loss": 0.9791, + "step": 4381 + }, + { + "epoch": 0.38, + "grad_norm": 7.180885625245812, + "learning_rate": 7.0459762689313846e-06, + "loss": 0.6251, + "step": 4382 + }, + { + "epoch": 0.38, + "grad_norm": 9.56667662521771, + "learning_rate": 7.044678406242876e-06, + "loss": 0.8594, + "step": 4383 + }, + { + "epoch": 0.39, + "grad_norm": 7.510125340299081, + "learning_rate": 7.043380378102394e-06, + "loss": 0.7367, + "step": 4384 + }, + { + "epoch": 0.39, + "grad_norm": 19.58832473747313, + "learning_rate": 7.04208218461497e-06, + "loss": 0.7854, + "step": 4385 + }, + { + "epoch": 0.39, + "grad_norm": 2.179648187439884, + "learning_rate": 7.040783825885653e-06, + "loss": 0.5454, + "step": 4386 + }, + { + "epoch": 0.39, + "grad_norm": 13.29559157260246, + "learning_rate": 7.039485302019504e-06, + "loss": 0.8692, + "step": 4387 + }, + { + "epoch": 0.39, + "grad_norm": 7.922886029405353, + "learning_rate": 7.038186613121594e-06, + "loss": 0.8808, + "step": 4388 + }, + { + "epoch": 0.39, + "grad_norm": 2.681621866474274, + "learning_rate": 7.036887759297018e-06, + "loss": 0.502, + "step": 4389 + }, + { + "epoch": 0.39, + "grad_norm": 2.6871763385516347, + "learning_rate": 7.03558874065087e-06, + "loss": 0.5451, + "step": 4390 + }, + { + "epoch": 0.39, + "grad_norm": 7.034166422809769, + "learning_rate": 7.034289557288266e-06, + "loss": 0.7101, + "step": 4391 + }, + { + "epoch": 0.39, + "grad_norm": 10.632375251450952, + "learning_rate": 7.032990209314336e-06, + "loss": 0.788, + "step": 4392 + }, + { + "epoch": 0.39, + "grad_norm": 5.584633235933161, + "learning_rate": 7.031690696834222e-06, + "loss": 0.7659, + "step": 4393 + }, + { + "epoch": 0.39, + "grad_norm": 9.831528755941395, + "learning_rate": 7.030391019953073e-06, + "loss": 0.7298, + "step": 4394 + }, + { + "epoch": 
0.39, + "grad_norm": 10.367024470211662, + "learning_rate": 7.029091178776063e-06, + "loss": 0.8507, + "step": 4395 + }, + { + "epoch": 0.39, + "grad_norm": 7.027833378652201, + "learning_rate": 7.027791173408369e-06, + "loss": 0.7664, + "step": 4396 + }, + { + "epoch": 0.39, + "grad_norm": 6.662216369173146, + "learning_rate": 7.0264910039551856e-06, + "loss": 0.8597, + "step": 4397 + }, + { + "epoch": 0.39, + "grad_norm": 16.362816659977018, + "learning_rate": 7.025190670521722e-06, + "loss": 0.6462, + "step": 4398 + }, + { + "epoch": 0.39, + "grad_norm": 16.194740269429857, + "learning_rate": 7.023890173213198e-06, + "loss": 0.7335, + "step": 4399 + }, + { + "epoch": 0.39, + "grad_norm": 2.7807372493467284, + "learning_rate": 7.022589512134846e-06, + "loss": 0.5446, + "step": 4400 + }, + { + "epoch": 0.39, + "grad_norm": 8.51899848653866, + "learning_rate": 7.021288687391917e-06, + "loss": 0.7216, + "step": 4401 + }, + { + "epoch": 0.39, + "grad_norm": 12.211547483163018, + "learning_rate": 7.0199876990896695e-06, + "loss": 0.8435, + "step": 4402 + }, + { + "epoch": 0.39, + "grad_norm": 8.963310729456314, + "learning_rate": 7.018686547333377e-06, + "loss": 0.8504, + "step": 4403 + }, + { + "epoch": 0.39, + "grad_norm": 9.968116046673517, + "learning_rate": 7.017385232228327e-06, + "loss": 0.7755, + "step": 4404 + }, + { + "epoch": 0.39, + "grad_norm": 9.438786709907923, + "learning_rate": 7.0160837538798185e-06, + "loss": 0.7761, + "step": 4405 + }, + { + "epoch": 0.39, + "grad_norm": 21.60934461450094, + "learning_rate": 7.014782112393168e-06, + "loss": 0.8822, + "step": 4406 + }, + { + "epoch": 0.39, + "grad_norm": 7.884515963143021, + "learning_rate": 7.0134803078737015e-06, + "loss": 0.8859, + "step": 4407 + }, + { + "epoch": 0.39, + "grad_norm": 10.016886021795631, + "learning_rate": 7.012178340426755e-06, + "loss": 0.745, + "step": 4408 + }, + { + "epoch": 0.39, + "grad_norm": 7.103435491192196, + "learning_rate": 7.010876210157685e-06, + "loss": 0.862, + "step": 4409 + }, + { + "epoch": 0.39, + "grad_norm": 10.431343692950202, + "learning_rate": 7.009573917171859e-06, + "loss": 0.8231, + "step": 4410 + }, + { + "epoch": 0.39, + "grad_norm": 18.34552165209728, + "learning_rate": 7.008271461574652e-06, + "loss": 0.9564, + "step": 4411 + }, + { + "epoch": 0.39, + "grad_norm": 9.460702646671372, + "learning_rate": 7.00696884347146e-06, + "loss": 0.9416, + "step": 4412 + }, + { + "epoch": 0.39, + "grad_norm": 2.201015115236704, + "learning_rate": 7.005666062967687e-06, + "loss": 0.568, + "step": 4413 + }, + { + "epoch": 0.39, + "grad_norm": 6.875421975606681, + "learning_rate": 7.004363120168752e-06, + "loss": 0.7676, + "step": 4414 + }, + { + "epoch": 0.39, + "grad_norm": 6.853422136395108, + "learning_rate": 7.003060015180089e-06, + "loss": 0.7846, + "step": 4415 + }, + { + "epoch": 0.39, + "grad_norm": 11.54255015941181, + "learning_rate": 7.001756748107141e-06, + "loss": 0.6803, + "step": 4416 + }, + { + "epoch": 0.39, + "grad_norm": 7.704874143076503, + "learning_rate": 7.000453319055366e-06, + "loss": 0.9141, + "step": 4417 + }, + { + "epoch": 0.39, + "grad_norm": 7.570600562511412, + "learning_rate": 6.999149728130237e-06, + "loss": 0.7957, + "step": 4418 + }, + { + "epoch": 0.39, + "grad_norm": 5.208956517444776, + "learning_rate": 6.9978459754372365e-06, + "loss": 0.9286, + "step": 4419 + }, + { + "epoch": 0.39, + "grad_norm": 12.576487231417843, + "learning_rate": 6.996542061081862e-06, + "loss": 0.8413, + "step": 4420 + }, + { + "epoch": 0.39, + "grad_norm": 
7.662516885100971, + "learning_rate": 6.995237985169627e-06, + "loss": 0.8888, + "step": 4421 + }, + { + "epoch": 0.39, + "grad_norm": 6.043479264177179, + "learning_rate": 6.993933747806053e-06, + "loss": 0.6152, + "step": 4422 + }, + { + "epoch": 0.39, + "grad_norm": 12.620440692350428, + "learning_rate": 6.9926293490966755e-06, + "loss": 0.7854, + "step": 4423 + }, + { + "epoch": 0.39, + "grad_norm": 7.741701178813809, + "learning_rate": 6.991324789147047e-06, + "loss": 0.6891, + "step": 4424 + }, + { + "epoch": 0.39, + "grad_norm": 8.685802635404976, + "learning_rate": 6.990020068062727e-06, + "loss": 0.8377, + "step": 4425 + }, + { + "epoch": 0.39, + "grad_norm": 5.752529588433591, + "learning_rate": 6.988715185949295e-06, + "loss": 0.8404, + "step": 4426 + }, + { + "epoch": 0.39, + "grad_norm": 3.388810691922714, + "learning_rate": 6.987410142912338e-06, + "loss": 0.5465, + "step": 4427 + }, + { + "epoch": 0.39, + "grad_norm": 8.93083220066841, + "learning_rate": 6.986104939057458e-06, + "loss": 0.7897, + "step": 4428 + }, + { + "epoch": 0.39, + "grad_norm": 5.661946376943957, + "learning_rate": 6.984799574490271e-06, + "loss": 0.6852, + "step": 4429 + }, + { + "epoch": 0.39, + "grad_norm": 14.656375174248053, + "learning_rate": 6.9834940493164024e-06, + "loss": 0.726, + "step": 4430 + }, + { + "epoch": 0.39, + "grad_norm": 7.09750453166627, + "learning_rate": 6.982188363641495e-06, + "loss": 0.7124, + "step": 4431 + }, + { + "epoch": 0.39, + "grad_norm": 7.93509864075299, + "learning_rate": 6.980882517571204e-06, + "loss": 0.6274, + "step": 4432 + }, + { + "epoch": 0.39, + "grad_norm": 7.948380980976774, + "learning_rate": 6.9795765112111925e-06, + "loss": 0.8197, + "step": 4433 + }, + { + "epoch": 0.39, + "grad_norm": 18.954851387191972, + "learning_rate": 6.978270344667143e-06, + "loss": 0.7681, + "step": 4434 + }, + { + "epoch": 0.39, + "grad_norm": 11.837830348562482, + "learning_rate": 6.976964018044749e-06, + "loss": 0.7819, + "step": 4435 + }, + { + "epoch": 0.39, + "grad_norm": 16.559556509315126, + "learning_rate": 6.9756575314497145e-06, + "loss": 0.9106, + "step": 4436 + }, + { + "epoch": 0.39, + "grad_norm": 13.83920334844155, + "learning_rate": 6.974350884987758e-06, + "loss": 0.7125, + "step": 4437 + }, + { + "epoch": 0.39, + "grad_norm": 3.208268719260643, + "learning_rate": 6.973044078764613e-06, + "loss": 0.5213, + "step": 4438 + }, + { + "epoch": 0.39, + "grad_norm": 2.6658319572656173, + "learning_rate": 6.971737112886025e-06, + "loss": 0.5647, + "step": 4439 + }, + { + "epoch": 0.39, + "grad_norm": 7.314435293069487, + "learning_rate": 6.970429987457747e-06, + "loss": 0.7287, + "step": 4440 + }, + { + "epoch": 0.39, + "grad_norm": 7.563953100160512, + "learning_rate": 6.9691227025855525e-06, + "loss": 0.804, + "step": 4441 + }, + { + "epoch": 0.39, + "grad_norm": 10.627420454939372, + "learning_rate": 6.967815258375226e-06, + "loss": 0.754, + "step": 4442 + }, + { + "epoch": 0.39, + "grad_norm": 8.110923346855921, + "learning_rate": 6.96650765493256e-06, + "loss": 0.6708, + "step": 4443 + }, + { + "epoch": 0.39, + "grad_norm": 7.619405374509321, + "learning_rate": 6.965199892363366e-06, + "loss": 0.7915, + "step": 4444 + }, + { + "epoch": 0.39, + "grad_norm": 9.017596979510124, + "learning_rate": 6.963891970773465e-06, + "loss": 0.7176, + "step": 4445 + }, + { + "epoch": 0.39, + "grad_norm": 7.8634846211156795, + "learning_rate": 6.962583890268695e-06, + "loss": 0.8741, + "step": 4446 + }, + { + "epoch": 0.39, + "grad_norm": 17.33892941879066, + "learning_rate": 
6.961275650954899e-06, + "loss": 0.7357, + "step": 4447 + }, + { + "epoch": 0.39, + "grad_norm": 5.6350174967858315, + "learning_rate": 6.95996725293794e-06, + "loss": 0.8733, + "step": 4448 + }, + { + "epoch": 0.39, + "grad_norm": 3.3408517922756142, + "learning_rate": 6.958658696323692e-06, + "loss": 0.5377, + "step": 4449 + }, + { + "epoch": 0.39, + "grad_norm": 9.044866318113685, + "learning_rate": 6.957349981218041e-06, + "loss": 0.8834, + "step": 4450 + }, + { + "epoch": 0.39, + "grad_norm": 8.688095163303197, + "learning_rate": 6.956041107726884e-06, + "loss": 0.7928, + "step": 4451 + }, + { + "epoch": 0.39, + "grad_norm": 7.630419145447222, + "learning_rate": 6.954732075956134e-06, + "loss": 0.8421, + "step": 4452 + }, + { + "epoch": 0.39, + "grad_norm": 7.27321705132315, + "learning_rate": 6.953422886011718e-06, + "loss": 0.733, + "step": 4453 + }, + { + "epoch": 0.39, + "grad_norm": 8.597974729892012, + "learning_rate": 6.952113537999569e-06, + "loss": 0.8292, + "step": 4454 + }, + { + "epoch": 0.39, + "grad_norm": 6.62555092148915, + "learning_rate": 6.950804032025642e-06, + "loss": 0.8013, + "step": 4455 + }, + { + "epoch": 0.39, + "grad_norm": 2.91077079361972, + "learning_rate": 6.949494368195896e-06, + "loss": 0.5764, + "step": 4456 + }, + { + "epoch": 0.39, + "grad_norm": 17.039199617730944, + "learning_rate": 6.948184546616309e-06, + "loss": 0.9312, + "step": 4457 + }, + { + "epoch": 0.39, + "grad_norm": 4.899303303531948, + "learning_rate": 6.946874567392869e-06, + "loss": 0.674, + "step": 4458 + }, + { + "epoch": 0.39, + "grad_norm": 7.041618546185628, + "learning_rate": 6.945564430631578e-06, + "loss": 0.8433, + "step": 4459 + }, + { + "epoch": 0.39, + "grad_norm": 9.753665798423048, + "learning_rate": 6.944254136438448e-06, + "loss": 0.8307, + "step": 4460 + }, + { + "epoch": 0.39, + "grad_norm": 6.6969354022508725, + "learning_rate": 6.94294368491951e-06, + "loss": 0.8394, + "step": 4461 + }, + { + "epoch": 0.39, + "grad_norm": 12.78792407698553, + "learning_rate": 6.941633076180799e-06, + "loss": 0.6415, + "step": 4462 + }, + { + "epoch": 0.39, + "grad_norm": 7.705390243247103, + "learning_rate": 6.940322310328369e-06, + "loss": 0.9567, + "step": 4463 + }, + { + "epoch": 0.39, + "grad_norm": 7.5194497296776595, + "learning_rate": 6.939011387468286e-06, + "loss": 0.9357, + "step": 4464 + }, + { + "epoch": 0.39, + "grad_norm": 10.131349960578042, + "learning_rate": 6.937700307706627e-06, + "loss": 0.6208, + "step": 4465 + }, + { + "epoch": 0.39, + "grad_norm": 2.3396407010772484, + "learning_rate": 6.936389071149482e-06, + "loss": 0.5448, + "step": 4466 + }, + { + "epoch": 0.39, + "grad_norm": 9.906183667300747, + "learning_rate": 6.935077677902955e-06, + "loss": 0.7877, + "step": 4467 + }, + { + "epoch": 0.39, + "grad_norm": 6.990468715281866, + "learning_rate": 6.93376612807316e-06, + "loss": 0.7676, + "step": 4468 + }, + { + "epoch": 0.39, + "grad_norm": 6.597011091456861, + "learning_rate": 6.932454421766228e-06, + "loss": 0.7744, + "step": 4469 + }, + { + "epoch": 0.39, + "grad_norm": 9.121266363197666, + "learning_rate": 6.931142559088298e-06, + "loss": 0.684, + "step": 4470 + }, + { + "epoch": 0.39, + "grad_norm": 13.537676710991741, + "learning_rate": 6.929830540145523e-06, + "loss": 0.8041, + "step": 4471 + }, + { + "epoch": 0.39, + "grad_norm": 26.394742919690835, + "learning_rate": 6.928518365044072e-06, + "loss": 0.8196, + "step": 4472 + }, + { + "epoch": 0.39, + "grad_norm": 8.427070209221434, + "learning_rate": 6.927206033890124e-06, + "loss": 0.8078, + 
"step": 4473 + }, + { + "epoch": 0.39, + "grad_norm": 8.062876189308389, + "learning_rate": 6.925893546789868e-06, + "loss": 0.771, + "step": 4474 + }, + { + "epoch": 0.39, + "grad_norm": 8.909593195072985, + "learning_rate": 6.92458090384951e-06, + "loss": 0.7415, + "step": 4475 + }, + { + "epoch": 0.39, + "grad_norm": 8.168890457596687, + "learning_rate": 6.923268105175267e-06, + "loss": 0.834, + "step": 4476 + }, + { + "epoch": 0.39, + "grad_norm": 6.406047637590208, + "learning_rate": 6.921955150873367e-06, + "loss": 0.8039, + "step": 4477 + }, + { + "epoch": 0.39, + "grad_norm": 6.585763120895072, + "learning_rate": 6.920642041050055e-06, + "loss": 0.8102, + "step": 4478 + }, + { + "epoch": 0.39, + "grad_norm": 7.394393249783627, + "learning_rate": 6.919328775811584e-06, + "loss": 0.8396, + "step": 4479 + }, + { + "epoch": 0.39, + "grad_norm": 13.105604244571492, + "learning_rate": 6.91801535526422e-06, + "loss": 0.7877, + "step": 4480 + }, + { + "epoch": 0.39, + "grad_norm": 7.494586460354308, + "learning_rate": 6.916701779514243e-06, + "loss": 0.7277, + "step": 4481 + }, + { + "epoch": 0.39, + "grad_norm": 6.361192224319966, + "learning_rate": 6.915388048667948e-06, + "loss": 0.7868, + "step": 4482 + }, + { + "epoch": 0.39, + "grad_norm": 6.854527503381094, + "learning_rate": 6.914074162831637e-06, + "loss": 0.8578, + "step": 4483 + }, + { + "epoch": 0.39, + "grad_norm": 3.7642322549990324, + "learning_rate": 6.912760122111629e-06, + "loss": 0.5576, + "step": 4484 + }, + { + "epoch": 0.39, + "grad_norm": 19.232667773582715, + "learning_rate": 6.9114459266142534e-06, + "loss": 0.8613, + "step": 4485 + }, + { + "epoch": 0.39, + "grad_norm": 7.665538427674049, + "learning_rate": 6.910131576445853e-06, + "loss": 0.757, + "step": 4486 + }, + { + "epoch": 0.39, + "grad_norm": 11.277818069860224, + "learning_rate": 6.908817071712784e-06, + "loss": 0.6727, + "step": 4487 + }, + { + "epoch": 0.39, + "grad_norm": 3.026618945264744, + "learning_rate": 6.907502412521411e-06, + "loss": 0.5528, + "step": 4488 + }, + { + "epoch": 0.39, + "grad_norm": 6.00352775361295, + "learning_rate": 6.9061875989781165e-06, + "loss": 0.8249, + "step": 4489 + }, + { + "epoch": 0.39, + "grad_norm": 7.409841362763583, + "learning_rate": 6.9048726311892925e-06, + "loss": 0.8451, + "step": 4490 + }, + { + "epoch": 0.39, + "grad_norm": 2.882670413874495, + "learning_rate": 6.903557509261343e-06, + "loss": 0.5315, + "step": 4491 + }, + { + "epoch": 0.39, + "grad_norm": 6.307649062850968, + "learning_rate": 6.9022422333006865e-06, + "loss": 0.6776, + "step": 4492 + }, + { + "epoch": 0.39, + "grad_norm": 6.383484877719392, + "learning_rate": 6.900926803413755e-06, + "loss": 0.7759, + "step": 4493 + }, + { + "epoch": 0.39, + "grad_norm": 6.985843991604068, + "learning_rate": 6.8996112197069855e-06, + "loss": 0.8008, + "step": 4494 + }, + { + "epoch": 0.39, + "grad_norm": 5.3836569937750225, + "learning_rate": 6.898295482286837e-06, + "loss": 0.8011, + "step": 4495 + }, + { + "epoch": 0.39, + "grad_norm": 5.4019325870902914, + "learning_rate": 6.896979591259776e-06, + "loss": 0.8664, + "step": 4496 + }, + { + "epoch": 0.39, + "grad_norm": 7.999332344350127, + "learning_rate": 6.895663546732281e-06, + "loss": 0.8122, + "step": 4497 + }, + { + "epoch": 0.4, + "grad_norm": 11.8941421173826, + "learning_rate": 6.894347348810846e-06, + "loss": 0.8144, + "step": 4498 + }, + { + "epoch": 0.4, + "grad_norm": 6.702960949299591, + "learning_rate": 6.893030997601975e-06, + "loss": 0.8713, + "step": 4499 + }, + { + "epoch": 0.4, + 
"grad_norm": 22.913185528399197, + "learning_rate": 6.891714493212183e-06, + "loss": 0.8515, + "step": 4500 + }, + { + "epoch": 0.4, + "grad_norm": 8.009059324750833, + "learning_rate": 6.890397835748002e-06, + "loss": 0.953, + "step": 4501 + }, + { + "epoch": 0.4, + "grad_norm": 7.537232580519858, + "learning_rate": 6.889081025315974e-06, + "loss": 0.7818, + "step": 4502 + }, + { + "epoch": 0.4, + "grad_norm": 5.953973887134611, + "learning_rate": 6.887764062022649e-06, + "loss": 1.0119, + "step": 4503 + }, + { + "epoch": 0.4, + "grad_norm": 2.681138038055754, + "learning_rate": 6.886446945974597e-06, + "loss": 0.4993, + "step": 4504 + }, + { + "epoch": 0.4, + "grad_norm": 2.221203761714504, + "learning_rate": 6.885129677278397e-06, + "loss": 0.5036, + "step": 4505 + }, + { + "epoch": 0.4, + "grad_norm": 10.832330815906538, + "learning_rate": 6.8838122560406386e-06, + "loss": 0.8133, + "step": 4506 + }, + { + "epoch": 0.4, + "grad_norm": 7.408880089864309, + "learning_rate": 6.882494682367926e-06, + "loss": 0.704, + "step": 4507 + }, + { + "epoch": 0.4, + "grad_norm": 7.0817941069524295, + "learning_rate": 6.8811769563668755e-06, + "loss": 0.7232, + "step": 4508 + }, + { + "epoch": 0.4, + "grad_norm": 6.2673572127303965, + "learning_rate": 6.879859078144116e-06, + "loss": 0.7289, + "step": 4509 + }, + { + "epoch": 0.4, + "grad_norm": 7.762282822551626, + "learning_rate": 6.8785410478062845e-06, + "loss": 0.8908, + "step": 4510 + }, + { + "epoch": 0.4, + "grad_norm": 3.184186979445442, + "learning_rate": 6.877222865460037e-06, + "loss": 0.5461, + "step": 4511 + }, + { + "epoch": 0.4, + "grad_norm": 5.158450883502676, + "learning_rate": 6.875904531212039e-06, + "loss": 0.7476, + "step": 4512 + }, + { + "epoch": 0.4, + "grad_norm": 6.558537689133732, + "learning_rate": 6.874586045168965e-06, + "loss": 0.7669, + "step": 4513 + }, + { + "epoch": 0.4, + "grad_norm": 17.659848248466105, + "learning_rate": 6.873267407437507e-06, + "loss": 0.857, + "step": 4514 + }, + { + "epoch": 0.4, + "grad_norm": 8.565522554428988, + "learning_rate": 6.871948618124367e-06, + "loss": 0.7939, + "step": 4515 + }, + { + "epoch": 0.4, + "grad_norm": 6.697061856405658, + "learning_rate": 6.870629677336259e-06, + "loss": 0.719, + "step": 4516 + }, + { + "epoch": 0.4, + "grad_norm": 8.106996814420599, + "learning_rate": 6.869310585179908e-06, + "loss": 0.8675, + "step": 4517 + }, + { + "epoch": 0.4, + "grad_norm": 9.349229318431787, + "learning_rate": 6.867991341762054e-06, + "loss": 0.857, + "step": 4518 + }, + { + "epoch": 0.4, + "grad_norm": 14.861244380277864, + "learning_rate": 6.86667194718945e-06, + "loss": 0.8082, + "step": 4519 + }, + { + "epoch": 0.4, + "grad_norm": 12.70381297734772, + "learning_rate": 6.865352401568853e-06, + "loss": 0.7619, + "step": 4520 + }, + { + "epoch": 0.4, + "grad_norm": 7.548069797429883, + "learning_rate": 6.864032705007047e-06, + "loss": 0.7003, + "step": 4521 + }, + { + "epoch": 0.4, + "grad_norm": 12.812630375882438, + "learning_rate": 6.862712857610812e-06, + "loss": 0.6667, + "step": 4522 + }, + { + "epoch": 0.4, + "grad_norm": 31.052963429616256, + "learning_rate": 6.861392859486951e-06, + "loss": 0.7181, + "step": 4523 + }, + { + "epoch": 0.4, + "grad_norm": 43.11042655268576, + "learning_rate": 6.860072710742278e-06, + "loss": 0.7938, + "step": 4524 + }, + { + "epoch": 0.4, + "grad_norm": 8.957362535767436, + "learning_rate": 6.858752411483613e-06, + "loss": 0.7683, + "step": 4525 + }, + { + "epoch": 0.4, + "grad_norm": 9.495483107186548, + "learning_rate": 
6.857431961817798e-06, + "loss": 0.8605, + "step": 4526 + }, + { + "epoch": 0.4, + "grad_norm": 9.345267722849643, + "learning_rate": 6.856111361851676e-06, + "loss": 0.7716, + "step": 4527 + }, + { + "epoch": 0.4, + "grad_norm": 12.812487558985742, + "learning_rate": 6.8547906116921105e-06, + "loss": 0.774, + "step": 4528 + }, + { + "epoch": 0.4, + "grad_norm": 9.370714545064072, + "learning_rate": 6.853469711445974e-06, + "loss": 0.8527, + "step": 4529 + }, + { + "epoch": 0.4, + "grad_norm": 9.744475206864491, + "learning_rate": 6.8521486612201526e-06, + "loss": 0.7168, + "step": 4530 + }, + { + "epoch": 0.4, + "grad_norm": 7.254084993404733, + "learning_rate": 6.850827461121541e-06, + "loss": 0.7423, + "step": 4531 + }, + { + "epoch": 0.4, + "grad_norm": 7.642328930175775, + "learning_rate": 6.849506111257049e-06, + "loss": 0.9246, + "step": 4532 + }, + { + "epoch": 0.4, + "grad_norm": 11.912635921045446, + "learning_rate": 6.848184611733602e-06, + "loss": 0.7736, + "step": 4533 + }, + { + "epoch": 0.4, + "grad_norm": 1.9589077549906029, + "learning_rate": 6.846862962658127e-06, + "loss": 0.53, + "step": 4534 + }, + { + "epoch": 0.4, + "grad_norm": 7.941922210609474, + "learning_rate": 6.8455411641375765e-06, + "loss": 0.6921, + "step": 4535 + }, + { + "epoch": 0.4, + "grad_norm": 7.85999400518437, + "learning_rate": 6.844219216278903e-06, + "loss": 0.9383, + "step": 4536 + }, + { + "epoch": 0.4, + "grad_norm": 2.7836733680831625, + "learning_rate": 6.842897119189078e-06, + "loss": 0.5291, + "step": 4537 + }, + { + "epoch": 0.4, + "grad_norm": 7.017131118427734, + "learning_rate": 6.841574872975085e-06, + "loss": 0.8399, + "step": 4538 + }, + { + "epoch": 0.4, + "grad_norm": 9.811034168270206, + "learning_rate": 6.840252477743916e-06, + "loss": 0.6205, + "step": 4539 + }, + { + "epoch": 0.4, + "grad_norm": 2.1339303004155386, + "learning_rate": 6.8389299336025775e-06, + "loss": 0.5169, + "step": 4540 + }, + { + "epoch": 0.4, + "grad_norm": 8.970297071201824, + "learning_rate": 6.837607240658087e-06, + "loss": 0.6967, + "step": 4541 + }, + { + "epoch": 0.4, + "grad_norm": 2.69580691479686, + "learning_rate": 6.836284399017477e-06, + "loss": 0.5088, + "step": 4542 + }, + { + "epoch": 0.4, + "grad_norm": 11.391434836394847, + "learning_rate": 6.834961408787786e-06, + "loss": 0.7161, + "step": 4543 + }, + { + "epoch": 0.4, + "grad_norm": 3.4554869986282815, + "learning_rate": 6.833638270076071e-06, + "loss": 0.6615, + "step": 4544 + }, + { + "epoch": 0.4, + "grad_norm": 3.487318017120752, + "learning_rate": 6.8323149829893965e-06, + "loss": 0.5864, + "step": 4545 + }, + { + "epoch": 0.4, + "grad_norm": 13.056820925003093, + "learning_rate": 6.830991547634843e-06, + "loss": 0.936, + "step": 4546 + }, + { + "epoch": 0.4, + "grad_norm": 4.608218400175116, + "learning_rate": 6.829667964119499e-06, + "loss": 0.6416, + "step": 4547 + }, + { + "epoch": 0.4, + "grad_norm": 12.545999631740044, + "learning_rate": 6.828344232550465e-06, + "loss": 0.8303, + "step": 4548 + }, + { + "epoch": 0.4, + "grad_norm": 6.101464687051737, + "learning_rate": 6.827020353034859e-06, + "loss": 0.7545, + "step": 4549 + }, + { + "epoch": 0.4, + "grad_norm": 8.613802412009893, + "learning_rate": 6.825696325679805e-06, + "loss": 0.7347, + "step": 4550 + }, + { + "epoch": 0.4, + "grad_norm": 12.31143659516506, + "learning_rate": 6.82437215059244e-06, + "loss": 0.6734, + "step": 4551 + }, + { + "epoch": 0.4, + "grad_norm": 8.55969184140422, + "learning_rate": 6.823047827879916e-06, + "loss": 0.9353, + "step": 4552 + }, + { 
+ "epoch": 0.4, + "grad_norm": 11.87500262234787, + "learning_rate": 6.821723357649395e-06, + "loss": 0.8044, + "step": 4553 + }, + { + "epoch": 0.4, + "grad_norm": 8.960777093366303, + "learning_rate": 6.820398740008048e-06, + "loss": 0.8229, + "step": 4554 + }, + { + "epoch": 0.4, + "grad_norm": 27.97755907666541, + "learning_rate": 6.819073975063064e-06, + "loss": 0.7171, + "step": 4555 + }, + { + "epoch": 0.4, + "grad_norm": 10.27819513468487, + "learning_rate": 6.81774906292164e-06, + "loss": 0.7974, + "step": 4556 + }, + { + "epoch": 0.4, + "grad_norm": 7.23469055439265, + "learning_rate": 6.816424003690985e-06, + "loss": 0.7557, + "step": 4557 + }, + { + "epoch": 0.4, + "grad_norm": 9.680246133925927, + "learning_rate": 6.815098797478322e-06, + "loss": 0.8435, + "step": 4558 + }, + { + "epoch": 0.4, + "grad_norm": 8.378038788646816, + "learning_rate": 6.813773444390884e-06, + "loss": 0.8619, + "step": 4559 + }, + { + "epoch": 0.4, + "grad_norm": 15.204893132682937, + "learning_rate": 6.812447944535915e-06, + "loss": 0.7172, + "step": 4560 + }, + { + "epoch": 0.4, + "grad_norm": 8.87585730627427, + "learning_rate": 6.811122298020674e-06, + "loss": 0.7898, + "step": 4561 + }, + { + "epoch": 0.4, + "grad_norm": 12.332636511974263, + "learning_rate": 6.809796504952428e-06, + "loss": 0.6782, + "step": 4562 + }, + { + "epoch": 0.4, + "grad_norm": 9.16408062987632, + "learning_rate": 6.80847056543846e-06, + "loss": 0.7601, + "step": 4563 + }, + { + "epoch": 0.4, + "grad_norm": 15.810960905734122, + "learning_rate": 6.807144479586062e-06, + "loss": 0.683, + "step": 4564 + }, + { + "epoch": 0.4, + "grad_norm": 8.389582681670632, + "learning_rate": 6.805818247502538e-06, + "loss": 0.6599, + "step": 4565 + }, + { + "epoch": 0.4, + "grad_norm": 11.114592779317702, + "learning_rate": 6.804491869295207e-06, + "loss": 0.8182, + "step": 4566 + }, + { + "epoch": 0.4, + "grad_norm": 10.987046733628178, + "learning_rate": 6.803165345071394e-06, + "loss": 0.8912, + "step": 4567 + }, + { + "epoch": 0.4, + "grad_norm": 8.134410405890483, + "learning_rate": 6.8018386749384415e-06, + "loss": 0.8646, + "step": 4568 + }, + { + "epoch": 0.4, + "grad_norm": 11.646952248890532, + "learning_rate": 6.8005118590037e-06, + "loss": 0.8342, + "step": 4569 + }, + { + "epoch": 0.4, + "grad_norm": 2.897158040028266, + "learning_rate": 6.799184897374534e-06, + "loss": 0.5048, + "step": 4570 + }, + { + "epoch": 0.4, + "grad_norm": 3.247643164415676, + "learning_rate": 6.79785779015832e-06, + "loss": 0.5165, + "step": 4571 + }, + { + "epoch": 0.4, + "grad_norm": 6.813285561445752, + "learning_rate": 6.796530537462442e-06, + "loss": 0.781, + "step": 4572 + }, + { + "epoch": 0.4, + "grad_norm": 7.545952032037662, + "learning_rate": 6.795203139394304e-06, + "loss": 0.7028, + "step": 4573 + }, + { + "epoch": 0.4, + "grad_norm": 11.55561455366942, + "learning_rate": 6.793875596061312e-06, + "loss": 0.8924, + "step": 4574 + }, + { + "epoch": 0.4, + "grad_norm": 10.909632614268135, + "learning_rate": 6.792547907570893e-06, + "loss": 0.8757, + "step": 4575 + }, + { + "epoch": 0.4, + "grad_norm": 6.661870311715068, + "learning_rate": 6.791220074030477e-06, + "loss": 0.7085, + "step": 4576 + }, + { + "epoch": 0.4, + "grad_norm": 7.569192514546743, + "learning_rate": 6.789892095547511e-06, + "loss": 0.7389, + "step": 4577 + }, + { + "epoch": 0.4, + "grad_norm": 9.352348287388297, + "learning_rate": 6.788563972229456e-06, + "loss": 0.7193, + "step": 4578 + }, + { + "epoch": 0.4, + "grad_norm": 9.51533421608033, + "learning_rate": 
6.78723570418378e-06, + "loss": 0.8962, + "step": 4579 + }, + { + "epoch": 0.4, + "grad_norm": 9.527154041001022, + "learning_rate": 6.785907291517962e-06, + "loss": 0.7258, + "step": 4580 + }, + { + "epoch": 0.4, + "grad_norm": 2.7282360273580606, + "learning_rate": 6.784578734339498e-06, + "loss": 0.5336, + "step": 4581 + }, + { + "epoch": 0.4, + "grad_norm": 5.5525174735544285, + "learning_rate": 6.783250032755891e-06, + "loss": 0.6477, + "step": 4582 + }, + { + "epoch": 0.4, + "grad_norm": 15.845130584691823, + "learning_rate": 6.781921186874657e-06, + "loss": 0.9285, + "step": 4583 + }, + { + "epoch": 0.4, + "grad_norm": 7.682221258535735, + "learning_rate": 6.780592196803326e-06, + "loss": 0.9198, + "step": 4584 + }, + { + "epoch": 0.4, + "grad_norm": 7.426836519078272, + "learning_rate": 6.779263062649435e-06, + "loss": 0.6632, + "step": 4585 + }, + { + "epoch": 0.4, + "grad_norm": 7.816858471964451, + "learning_rate": 6.777933784520536e-06, + "loss": 0.7416, + "step": 4586 + }, + { + "epoch": 0.4, + "grad_norm": 7.734695467523761, + "learning_rate": 6.776604362524195e-06, + "loss": 0.7782, + "step": 4587 + }, + { + "epoch": 0.4, + "grad_norm": 9.951854007679364, + "learning_rate": 6.7752747967679825e-06, + "loss": 0.8303, + "step": 4588 + }, + { + "epoch": 0.4, + "grad_norm": 11.733088905870034, + "learning_rate": 6.773945087359487e-06, + "loss": 0.8336, + "step": 4589 + }, + { + "epoch": 0.4, + "grad_norm": 8.833530382881705, + "learning_rate": 6.772615234406306e-06, + "loss": 0.7515, + "step": 4590 + }, + { + "epoch": 0.4, + "grad_norm": 4.376299523145269, + "learning_rate": 6.771285238016048e-06, + "loss": 0.6014, + "step": 4591 + }, + { + "epoch": 0.4, + "grad_norm": 2.322977206571385, + "learning_rate": 6.769955098296334e-06, + "loss": 0.5477, + "step": 4592 + }, + { + "epoch": 0.4, + "grad_norm": 8.270428255875517, + "learning_rate": 6.768624815354801e-06, + "loss": 0.8061, + "step": 4593 + }, + { + "epoch": 0.4, + "grad_norm": 12.996905544040304, + "learning_rate": 6.767294389299086e-06, + "loss": 0.6906, + "step": 4594 + }, + { + "epoch": 0.4, + "grad_norm": 10.98460338458097, + "learning_rate": 6.765963820236851e-06, + "loss": 0.9838, + "step": 4595 + }, + { + "epoch": 0.4, + "grad_norm": 7.035865174837813, + "learning_rate": 6.764633108275762e-06, + "loss": 0.8248, + "step": 4596 + }, + { + "epoch": 0.4, + "grad_norm": 8.639699454121303, + "learning_rate": 6.763302253523496e-06, + "loss": 0.7907, + "step": 4597 + }, + { + "epoch": 0.4, + "grad_norm": 5.883651678833041, + "learning_rate": 6.761971256087745e-06, + "loss": 0.796, + "step": 4598 + }, + { + "epoch": 0.4, + "grad_norm": 8.460269740378816, + "learning_rate": 6.7606401160762105e-06, + "loss": 0.6476, + "step": 4599 + }, + { + "epoch": 0.4, + "grad_norm": 14.711174146691137, + "learning_rate": 6.759308833596606e-06, + "loss": 0.7507, + "step": 4600 + }, + { + "epoch": 0.4, + "grad_norm": 9.921848143674518, + "learning_rate": 6.7579774087566585e-06, + "loss": 0.7169, + "step": 4601 + }, + { + "epoch": 0.4, + "grad_norm": 11.643531690630933, + "learning_rate": 6.756645841664102e-06, + "loss": 0.8738, + "step": 4602 + }, + { + "epoch": 0.4, + "grad_norm": 3.1713516746445762, + "learning_rate": 6.755314132426687e-06, + "loss": 0.574, + "step": 4603 + }, + { + "epoch": 0.4, + "grad_norm": 14.814944914672978, + "learning_rate": 6.753982281152172e-06, + "loss": 0.7187, + "step": 4604 + }, + { + "epoch": 0.4, + "grad_norm": 10.472444170496006, + "learning_rate": 6.752650287948328e-06, + "loss": 0.7462, + "step": 4605 + }, 
+ { + "epoch": 0.4, + "grad_norm": 2.675102304750123, + "learning_rate": 6.751318152922939e-06, + "loss": 0.5545, + "step": 4606 + }, + { + "epoch": 0.4, + "grad_norm": 7.9526009756943585, + "learning_rate": 6.7499858761837974e-06, + "loss": 0.8201, + "step": 4607 + }, + { + "epoch": 0.4, + "grad_norm": 9.735220238284905, + "learning_rate": 6.748653457838709e-06, + "loss": 0.8074, + "step": 4608 + }, + { + "epoch": 0.4, + "grad_norm": 3.1439590892967724, + "learning_rate": 6.747320897995493e-06, + "loss": 0.575, + "step": 4609 + }, + { + "epoch": 0.4, + "grad_norm": 8.787852513566639, + "learning_rate": 6.745988196761976e-06, + "loss": 0.6282, + "step": 4610 + }, + { + "epoch": 0.4, + "grad_norm": 8.169283643239552, + "learning_rate": 6.744655354245997e-06, + "loss": 0.7628, + "step": 4611 + }, + { + "epoch": 0.41, + "grad_norm": 11.242689916265613, + "learning_rate": 6.7433223705554104e-06, + "loss": 0.7994, + "step": 4612 + }, + { + "epoch": 0.41, + "grad_norm": 11.814269238339683, + "learning_rate": 6.741989245798078e-06, + "loss": 0.7815, + "step": 4613 + }, + { + "epoch": 0.41, + "grad_norm": 13.026312574371186, + "learning_rate": 6.740655980081871e-06, + "loss": 0.7276, + "step": 4614 + }, + { + "epoch": 0.41, + "grad_norm": 7.1366874889311225, + "learning_rate": 6.73932257351468e-06, + "loss": 0.9168, + "step": 4615 + }, + { + "epoch": 0.41, + "grad_norm": 5.552825329991237, + "learning_rate": 6.737989026204399e-06, + "loss": 0.7576, + "step": 4616 + }, + { + "epoch": 0.41, + "grad_norm": 8.26497465094061, + "learning_rate": 6.736655338258936e-06, + "loss": 0.8107, + "step": 4617 + }, + { + "epoch": 0.41, + "grad_norm": 13.104740119586634, + "learning_rate": 6.735321509786212e-06, + "loss": 0.9147, + "step": 4618 + }, + { + "epoch": 0.41, + "grad_norm": 13.63151906009315, + "learning_rate": 6.7339875408941605e-06, + "loss": 0.6713, + "step": 4619 + }, + { + "epoch": 0.41, + "grad_norm": 12.42677636125553, + "learning_rate": 6.732653431690717e-06, + "loss": 0.7956, + "step": 4620 + }, + { + "epoch": 0.41, + "grad_norm": 10.10370676461406, + "learning_rate": 6.731319182283844e-06, + "loss": 0.8472, + "step": 4621 + }, + { + "epoch": 0.41, + "grad_norm": 2.596265952464411, + "learning_rate": 6.729984792781501e-06, + "loss": 0.4939, + "step": 4622 + }, + { + "epoch": 0.41, + "grad_norm": 10.371558018707695, + "learning_rate": 6.728650263291666e-06, + "loss": 0.8747, + "step": 4623 + }, + { + "epoch": 0.41, + "grad_norm": 9.484194166910912, + "learning_rate": 6.7273155939223275e-06, + "loss": 0.7659, + "step": 4624 + }, + { + "epoch": 0.41, + "grad_norm": 12.003210068481927, + "learning_rate": 6.725980784781484e-06, + "loss": 0.7024, + "step": 4625 + }, + { + "epoch": 0.41, + "grad_norm": 12.195139019311512, + "learning_rate": 6.724645835977147e-06, + "loss": 0.7402, + "step": 4626 + }, + { + "epoch": 0.41, + "grad_norm": 9.204228085000878, + "learning_rate": 6.723310747617338e-06, + "loss": 0.8216, + "step": 4627 + }, + { + "epoch": 0.41, + "grad_norm": 11.54702593588081, + "learning_rate": 6.721975519810089e-06, + "loss": 0.8073, + "step": 4628 + }, + { + "epoch": 0.41, + "grad_norm": 11.768493905397344, + "learning_rate": 6.720640152663446e-06, + "loss": 0.8802, + "step": 4629 + }, + { + "epoch": 0.41, + "grad_norm": 21.273126485941344, + "learning_rate": 6.719304646285464e-06, + "loss": 0.7522, + "step": 4630 + }, + { + "epoch": 0.41, + "grad_norm": 7.837700597024896, + "learning_rate": 6.7179690007842094e-06, + "loss": 0.8204, + "step": 4631 + }, + { + "epoch": 0.41, + "grad_norm": 
14.952994772980054, + "learning_rate": 6.71663321626776e-06, + "loss": 0.8583, + "step": 4632 + }, + { + "epoch": 0.41, + "grad_norm": 8.561770491959464, + "learning_rate": 6.71529729284421e-06, + "loss": 0.6727, + "step": 4633 + }, + { + "epoch": 0.41, + "grad_norm": 23.130314210252255, + "learning_rate": 6.713961230621652e-06, + "loss": 0.8149, + "step": 4634 + }, + { + "epoch": 0.41, + "grad_norm": 7.407139828020364, + "learning_rate": 6.712625029708204e-06, + "loss": 0.8183, + "step": 4635 + }, + { + "epoch": 0.41, + "grad_norm": 17.451046884798714, + "learning_rate": 6.711288690211988e-06, + "loss": 0.6912, + "step": 4636 + }, + { + "epoch": 0.41, + "grad_norm": 12.208927786407155, + "learning_rate": 6.709952212241137e-06, + "loss": 0.8733, + "step": 4637 + }, + { + "epoch": 0.41, + "grad_norm": 10.723141684122927, + "learning_rate": 6.708615595903798e-06, + "loss": 0.754, + "step": 4638 + }, + { + "epoch": 0.41, + "grad_norm": 11.007937113685093, + "learning_rate": 6.707278841308127e-06, + "loss": 0.7358, + "step": 4639 + }, + { + "epoch": 0.41, + "grad_norm": 7.4156028274704235, + "learning_rate": 6.70594194856229e-06, + "loss": 0.884, + "step": 4640 + }, + { + "epoch": 0.41, + "grad_norm": 27.80329019880578, + "learning_rate": 6.704604917774471e-06, + "loss": 0.8627, + "step": 4641 + }, + { + "epoch": 0.41, + "grad_norm": 15.697317398330378, + "learning_rate": 6.703267749052855e-06, + "loss": 0.8861, + "step": 4642 + }, + { + "epoch": 0.41, + "grad_norm": 9.48133469599976, + "learning_rate": 6.7019304425056484e-06, + "loss": 0.8147, + "step": 4643 + }, + { + "epoch": 0.41, + "grad_norm": 8.12132211268097, + "learning_rate": 6.70059299824106e-06, + "loss": 0.8496, + "step": 4644 + }, + { + "epoch": 0.41, + "grad_norm": 10.53420066222315, + "learning_rate": 6.6992554163673154e-06, + "loss": 0.7537, + "step": 4645 + }, + { + "epoch": 0.41, + "grad_norm": 21.699647717625204, + "learning_rate": 6.697917696992649e-06, + "loss": 0.6702, + "step": 4646 + }, + { + "epoch": 0.41, + "grad_norm": 10.340079372943851, + "learning_rate": 6.696579840225308e-06, + "loss": 0.9015, + "step": 4647 + }, + { + "epoch": 0.41, + "grad_norm": 18.38359542773394, + "learning_rate": 6.6952418461735454e-06, + "loss": 0.9389, + "step": 4648 + }, + { + "epoch": 0.41, + "grad_norm": 8.547435062441659, + "learning_rate": 6.693903714945635e-06, + "loss": 0.7329, + "step": 4649 + }, + { + "epoch": 0.41, + "grad_norm": 8.9169837352878, + "learning_rate": 6.692565446649855e-06, + "loss": 0.8259, + "step": 4650 + }, + { + "epoch": 0.41, + "grad_norm": 7.007920004657562, + "learning_rate": 6.691227041394491e-06, + "loss": 0.7953, + "step": 4651 + }, + { + "epoch": 0.41, + "grad_norm": 5.881235507858213, + "learning_rate": 6.68988849928785e-06, + "loss": 0.7083, + "step": 4652 + }, + { + "epoch": 0.41, + "grad_norm": 13.015757156188535, + "learning_rate": 6.6885498204382424e-06, + "loss": 0.8684, + "step": 4653 + }, + { + "epoch": 0.41, + "grad_norm": 8.733672481334036, + "learning_rate": 6.687211004953992e-06, + "loss": 0.7416, + "step": 4654 + }, + { + "epoch": 0.41, + "grad_norm": 11.439613506285307, + "learning_rate": 6.685872052943434e-06, + "loss": 0.9154, + "step": 4655 + }, + { + "epoch": 0.41, + "grad_norm": 12.55414625668234, + "learning_rate": 6.684532964514914e-06, + "loss": 0.6399, + "step": 4656 + }, + { + "epoch": 0.41, + "grad_norm": 9.93801827423805, + "learning_rate": 6.683193739776787e-06, + "loss": 0.8061, + "step": 4657 + }, + { + "epoch": 0.41, + "grad_norm": 37.74450489292121, + "learning_rate": 
6.681854378837423e-06, + "loss": 0.7467, + "step": 4658 + }, + { + "epoch": 0.41, + "grad_norm": 8.593723162235175, + "learning_rate": 6.680514881805201e-06, + "loss": 0.6452, + "step": 4659 + }, + { + "epoch": 0.41, + "grad_norm": 6.915776974604622, + "learning_rate": 6.67917524878851e-06, + "loss": 0.862, + "step": 4660 + }, + { + "epoch": 0.41, + "grad_norm": 16.896242773036416, + "learning_rate": 6.677835479895751e-06, + "loss": 0.8745, + "step": 4661 + }, + { + "epoch": 0.41, + "grad_norm": 10.448202103533227, + "learning_rate": 6.6764955752353355e-06, + "loss": 0.9763, + "step": 4662 + }, + { + "epoch": 0.41, + "grad_norm": 7.621686805590932, + "learning_rate": 6.675155534915687e-06, + "loss": 0.8617, + "step": 4663 + }, + { + "epoch": 0.41, + "grad_norm": 6.944254142177849, + "learning_rate": 6.673815359045239e-06, + "loss": 0.7994, + "step": 4664 + }, + { + "epoch": 0.41, + "grad_norm": 2.379936514396078, + "learning_rate": 6.672475047732436e-06, + "loss": 0.4951, + "step": 4665 + }, + { + "epoch": 0.41, + "grad_norm": 2.820337799048046, + "learning_rate": 6.671134601085736e-06, + "loss": 0.5533, + "step": 4666 + }, + { + "epoch": 0.41, + "grad_norm": 9.778208530408529, + "learning_rate": 6.669794019213603e-06, + "loss": 0.8132, + "step": 4667 + }, + { + "epoch": 0.41, + "grad_norm": 11.88796087727865, + "learning_rate": 6.668453302224514e-06, + "loss": 0.8361, + "step": 4668 + }, + { + "epoch": 0.41, + "grad_norm": 13.473446888599463, + "learning_rate": 6.66711245022696e-06, + "loss": 0.7234, + "step": 4669 + }, + { + "epoch": 0.41, + "grad_norm": 15.795578320742566, + "learning_rate": 6.66577146332944e-06, + "loss": 0.8351, + "step": 4670 + }, + { + "epoch": 0.41, + "grad_norm": 11.991527874461841, + "learning_rate": 6.6644303416404636e-06, + "loss": 0.6362, + "step": 4671 + }, + { + "epoch": 0.41, + "grad_norm": 10.639785219449639, + "learning_rate": 6.6630890852685525e-06, + "loss": 0.9046, + "step": 4672 + }, + { + "epoch": 0.41, + "grad_norm": 11.717860183698763, + "learning_rate": 6.661747694322241e-06, + "loss": 0.968, + "step": 4673 + }, + { + "epoch": 0.41, + "grad_norm": 14.018611286061766, + "learning_rate": 6.660406168910068e-06, + "loss": 0.6451, + "step": 4674 + }, + { + "epoch": 0.41, + "grad_norm": 11.388620411134397, + "learning_rate": 6.659064509140589e-06, + "loss": 0.6217, + "step": 4675 + }, + { + "epoch": 0.41, + "grad_norm": 10.85089083078717, + "learning_rate": 6.657722715122372e-06, + "loss": 0.8074, + "step": 4676 + }, + { + "epoch": 0.41, + "grad_norm": 6.924897499479003, + "learning_rate": 6.6563807869639894e-06, + "loss": 0.8217, + "step": 4677 + }, + { + "epoch": 0.41, + "grad_norm": 6.289701972624333, + "learning_rate": 6.655038724774029e-06, + "loss": 0.6837, + "step": 4678 + }, + { + "epoch": 0.41, + "grad_norm": 11.772457960900587, + "learning_rate": 6.653696528661088e-06, + "loss": 0.8962, + "step": 4679 + }, + { + "epoch": 0.41, + "grad_norm": 7.176390914258338, + "learning_rate": 6.652354198733774e-06, + "loss": 0.8093, + "step": 4680 + }, + { + "epoch": 0.41, + "grad_norm": 9.697697969028509, + "learning_rate": 6.651011735100707e-06, + "loss": 0.7879, + "step": 4681 + }, + { + "epoch": 0.41, + "grad_norm": 15.80963728920102, + "learning_rate": 6.6496691378705155e-06, + "loss": 0.7619, + "step": 4682 + }, + { + "epoch": 0.41, + "grad_norm": 13.790805430845143, + "learning_rate": 6.6483264071518435e-06, + "loss": 0.7294, + "step": 4683 + }, + { + "epoch": 0.41, + "grad_norm": 12.17272722920267, + "learning_rate": 6.646983543053341e-06, + "loss": 
0.7995, + "step": 4684 + }, + { + "epoch": 0.41, + "grad_norm": 6.5656087597082085, + "learning_rate": 6.645640545683668e-06, + "loss": 0.7449, + "step": 4685 + }, + { + "epoch": 0.41, + "grad_norm": 13.16019113879846, + "learning_rate": 6.644297415151501e-06, + "loss": 0.7735, + "step": 4686 + }, + { + "epoch": 0.41, + "grad_norm": 7.978587131729856, + "learning_rate": 6.6429541515655215e-06, + "loss": 0.9228, + "step": 4687 + }, + { + "epoch": 0.41, + "grad_norm": 9.233639021198618, + "learning_rate": 6.6416107550344245e-06, + "loss": 0.649, + "step": 4688 + }, + { + "epoch": 0.41, + "grad_norm": 7.002334302611503, + "learning_rate": 6.640267225666916e-06, + "loss": 0.6959, + "step": 4689 + }, + { + "epoch": 0.41, + "grad_norm": 6.169419076261253, + "learning_rate": 6.6389235635717135e-06, + "loss": 0.8079, + "step": 4690 + }, + { + "epoch": 0.41, + "grad_norm": 10.860225555279351, + "learning_rate": 6.6375797688575405e-06, + "loss": 0.6501, + "step": 4691 + }, + { + "epoch": 0.41, + "grad_norm": 7.5997866084736945, + "learning_rate": 6.636235841633138e-06, + "loss": 0.8617, + "step": 4692 + }, + { + "epoch": 0.41, + "grad_norm": 9.6916867150959, + "learning_rate": 6.634891782007251e-06, + "loss": 0.8277, + "step": 4693 + }, + { + "epoch": 0.41, + "grad_norm": 6.553589414235121, + "learning_rate": 6.633547590088643e-06, + "loss": 0.8006, + "step": 4694 + }, + { + "epoch": 0.41, + "grad_norm": 2.6004108285168783, + "learning_rate": 6.6322032659860794e-06, + "loss": 0.5798, + "step": 4695 + }, + { + "epoch": 0.41, + "grad_norm": 7.870778381919343, + "learning_rate": 6.630858809808343e-06, + "loss": 0.8476, + "step": 4696 + }, + { + "epoch": 0.41, + "grad_norm": 8.133931920409125, + "learning_rate": 6.629514221664224e-06, + "loss": 0.8776, + "step": 4697 + }, + { + "epoch": 0.41, + "grad_norm": 7.106798117606666, + "learning_rate": 6.628169501662527e-06, + "loss": 0.7791, + "step": 4698 + }, + { + "epoch": 0.41, + "grad_norm": 13.253924829100976, + "learning_rate": 6.62682464991206e-06, + "loss": 0.7803, + "step": 4699 + }, + { + "epoch": 0.41, + "grad_norm": 10.07883392675708, + "learning_rate": 6.6254796665216495e-06, + "loss": 0.9026, + "step": 4700 + }, + { + "epoch": 0.41, + "grad_norm": 8.792968969217904, + "learning_rate": 6.624134551600127e-06, + "loss": 0.6782, + "step": 4701 + }, + { + "epoch": 0.41, + "grad_norm": 11.682537817991737, + "learning_rate": 6.622789305256338e-06, + "loss": 0.7117, + "step": 4702 + }, + { + "epoch": 0.41, + "grad_norm": 2.4116098026172113, + "learning_rate": 6.621443927599141e-06, + "loss": 0.575, + "step": 4703 + }, + { + "epoch": 0.41, + "grad_norm": 8.247156959142453, + "learning_rate": 6.620098418737396e-06, + "loss": 0.8719, + "step": 4704 + }, + { + "epoch": 0.41, + "grad_norm": 9.20210329692776, + "learning_rate": 6.618752778779981e-06, + "loss": 0.7978, + "step": 4705 + }, + { + "epoch": 0.41, + "grad_norm": 10.245532596729323, + "learning_rate": 6.617407007835785e-06, + "loss": 0.8851, + "step": 4706 + }, + { + "epoch": 0.41, + "grad_norm": 16.83248853088187, + "learning_rate": 6.616061106013706e-06, + "loss": 0.841, + "step": 4707 + }, + { + "epoch": 0.41, + "grad_norm": 9.516826058794132, + "learning_rate": 6.614715073422647e-06, + "loss": 0.8187, + "step": 4708 + }, + { + "epoch": 0.41, + "grad_norm": 37.22873816986429, + "learning_rate": 6.613368910171533e-06, + "loss": 0.7948, + "step": 4709 + }, + { + "epoch": 0.41, + "grad_norm": 12.683590825950311, + "learning_rate": 6.6120226163692905e-06, + "loss": 0.814, + "step": 4710 + }, + { + 
"epoch": 0.41, + "grad_norm": 21.211582935032336, + "learning_rate": 6.610676192124858e-06, + "loss": 0.9362, + "step": 4711 + }, + { + "epoch": 0.41, + "grad_norm": 10.053434188069469, + "learning_rate": 6.609329637547187e-06, + "loss": 0.8539, + "step": 4712 + }, + { + "epoch": 0.41, + "grad_norm": 6.4561459442244935, + "learning_rate": 6.60798295274524e-06, + "loss": 0.7667, + "step": 4713 + }, + { + "epoch": 0.41, + "grad_norm": 9.253763955209635, + "learning_rate": 6.606636137827985e-06, + "loss": 0.7947, + "step": 4714 + }, + { + "epoch": 0.41, + "grad_norm": 9.816916972780524, + "learning_rate": 6.605289192904408e-06, + "loss": 0.8782, + "step": 4715 + }, + { + "epoch": 0.41, + "grad_norm": 9.546357259098617, + "learning_rate": 6.603942118083501e-06, + "loss": 0.8528, + "step": 4716 + }, + { + "epoch": 0.41, + "grad_norm": 4.1584213173908084, + "learning_rate": 6.602594913474263e-06, + "loss": 0.5527, + "step": 4717 + }, + { + "epoch": 0.41, + "grad_norm": 22.96947952330664, + "learning_rate": 6.601247579185712e-06, + "loss": 0.9447, + "step": 4718 + }, + { + "epoch": 0.41, + "grad_norm": 9.048997490575971, + "learning_rate": 6.5999001153268715e-06, + "loss": 0.9381, + "step": 4719 + }, + { + "epoch": 0.41, + "grad_norm": 18.42973540089383, + "learning_rate": 6.598552522006772e-06, + "loss": 0.8287, + "step": 4720 + }, + { + "epoch": 0.41, + "grad_norm": 27.381045645124708, + "learning_rate": 6.597204799334463e-06, + "loss": 0.7224, + "step": 4721 + }, + { + "epoch": 0.41, + "grad_norm": 9.801163354975081, + "learning_rate": 6.595856947419e-06, + "loss": 0.9876, + "step": 4722 + }, + { + "epoch": 0.41, + "grad_norm": 13.495915968036586, + "learning_rate": 6.594508966369445e-06, + "loss": 0.7129, + "step": 4723 + }, + { + "epoch": 0.41, + "grad_norm": 8.903250314427844, + "learning_rate": 6.593160856294878e-06, + "loss": 0.8175, + "step": 4724 + }, + { + "epoch": 0.41, + "grad_norm": 2.3771926903019307, + "learning_rate": 6.591812617304384e-06, + "loss": 0.5266, + "step": 4725 + }, + { + "epoch": 0.42, + "grad_norm": 12.448898431808898, + "learning_rate": 6.590464249507061e-06, + "loss": 0.8062, + "step": 4726 + }, + { + "epoch": 0.42, + "grad_norm": 8.218811368243047, + "learning_rate": 6.589115753012018e-06, + "loss": 0.9054, + "step": 4727 + }, + { + "epoch": 0.42, + "grad_norm": 8.692979998511426, + "learning_rate": 6.587767127928368e-06, + "loss": 0.8045, + "step": 4728 + }, + { + "epoch": 0.42, + "grad_norm": 8.000558381342687, + "learning_rate": 6.586418374365247e-06, + "loss": 0.745, + "step": 4729 + }, + { + "epoch": 0.42, + "grad_norm": 6.825543849252954, + "learning_rate": 6.585069492431788e-06, + "loss": 0.865, + "step": 4730 + }, + { + "epoch": 0.42, + "grad_norm": 14.03190712878701, + "learning_rate": 6.583720482237143e-06, + "loss": 0.947, + "step": 4731 + }, + { + "epoch": 0.42, + "grad_norm": 7.3344264351659865, + "learning_rate": 6.58237134389047e-06, + "loss": 0.7682, + "step": 4732 + }, + { + "epoch": 0.42, + "grad_norm": 2.275280894309381, + "learning_rate": 6.581022077500943e-06, + "loss": 0.4904, + "step": 4733 + }, + { + "epoch": 0.42, + "grad_norm": 8.349928173036133, + "learning_rate": 6.5796726831777355e-06, + "loss": 0.9836, + "step": 4734 + }, + { + "epoch": 0.42, + "grad_norm": 3.9265999658612714, + "learning_rate": 6.578323161030043e-06, + "loss": 0.5069, + "step": 4735 + }, + { + "epoch": 0.42, + "grad_norm": 8.9973008638135, + "learning_rate": 6.576973511167067e-06, + "loss": 0.633, + "step": 4736 + }, + { + "epoch": 0.42, + "grad_norm": 
10.325946451817126, + "learning_rate": 6.575623733698018e-06, + "loss": 0.8605, + "step": 4737 + }, + { + "epoch": 0.42, + "grad_norm": 10.015094285053761, + "learning_rate": 6.574273828732116e-06, + "loss": 0.7675, + "step": 4738 + }, + { + "epoch": 0.42, + "grad_norm": 7.659962097286774, + "learning_rate": 6.5729237963785955e-06, + "loss": 0.8272, + "step": 4739 + }, + { + "epoch": 0.42, + "grad_norm": 6.800201909484501, + "learning_rate": 6.571573636746696e-06, + "loss": 0.7888, + "step": 4740 + }, + { + "epoch": 0.42, + "grad_norm": 14.135761103032035, + "learning_rate": 6.5702233499456734e-06, + "loss": 0.9482, + "step": 4741 + }, + { + "epoch": 0.42, + "grad_norm": 13.448362805342589, + "learning_rate": 6.568872936084789e-06, + "loss": 0.7423, + "step": 4742 + }, + { + "epoch": 0.42, + "grad_norm": 11.986097118386457, + "learning_rate": 6.567522395273317e-06, + "loss": 0.8242, + "step": 4743 + }, + { + "epoch": 0.42, + "grad_norm": 7.773065198544609, + "learning_rate": 6.566171727620539e-06, + "loss": 0.7732, + "step": 4744 + }, + { + "epoch": 0.42, + "grad_norm": 12.10004099018371, + "learning_rate": 6.56482093323575e-06, + "loss": 0.7838, + "step": 4745 + }, + { + "epoch": 0.42, + "grad_norm": 6.4855176013327105, + "learning_rate": 6.563470012228256e-06, + "loss": 0.6439, + "step": 4746 + }, + { + "epoch": 0.42, + "grad_norm": 12.332410616612352, + "learning_rate": 6.562118964707368e-06, + "loss": 0.8778, + "step": 4747 + }, + { + "epoch": 0.42, + "grad_norm": 7.780097767312517, + "learning_rate": 6.5607677907824105e-06, + "loss": 0.7448, + "step": 4748 + }, + { + "epoch": 0.42, + "grad_norm": 3.3870265597544873, + "learning_rate": 6.559416490562722e-06, + "loss": 0.5409, + "step": 4749 + }, + { + "epoch": 0.42, + "grad_norm": 2.9796446968721186, + "learning_rate": 6.558065064157644e-06, + "loss": 0.5835, + "step": 4750 + }, + { + "epoch": 0.42, + "grad_norm": 8.258423832921062, + "learning_rate": 6.556713511676531e-06, + "loss": 0.7093, + "step": 4751 + }, + { + "epoch": 0.42, + "grad_norm": 9.671850883122842, + "learning_rate": 6.555361833228753e-06, + "loss": 0.7726, + "step": 4752 + }, + { + "epoch": 0.42, + "grad_norm": 7.555798513631139, + "learning_rate": 6.554010028923682e-06, + "loss": 1.0024, + "step": 4753 + }, + { + "epoch": 0.42, + "grad_norm": 8.487160591286294, + "learning_rate": 6.552658098870703e-06, + "loss": 0.8013, + "step": 4754 + }, + { + "epoch": 0.42, + "grad_norm": 12.40403370724538, + "learning_rate": 6.551306043179214e-06, + "loss": 0.8296, + "step": 4755 + }, + { + "epoch": 0.42, + "grad_norm": 7.675278765304566, + "learning_rate": 6.549953861958621e-06, + "loss": 0.9192, + "step": 4756 + }, + { + "epoch": 0.42, + "grad_norm": 6.600693264057409, + "learning_rate": 6.548601555318339e-06, + "loss": 0.8488, + "step": 4757 + }, + { + "epoch": 0.42, + "grad_norm": 14.710036527865475, + "learning_rate": 6.547249123367796e-06, + "loss": 0.7543, + "step": 4758 + }, + { + "epoch": 0.42, + "grad_norm": 8.187537099485148, + "learning_rate": 6.545896566216428e-06, + "loss": 0.7147, + "step": 4759 + }, + { + "epoch": 0.42, + "grad_norm": 7.189817230377963, + "learning_rate": 6.544543883973679e-06, + "loss": 0.7669, + "step": 4760 + }, + { + "epoch": 0.42, + "grad_norm": 10.619725554616577, + "learning_rate": 6.543191076749009e-06, + "loss": 0.8028, + "step": 4761 + }, + { + "epoch": 0.42, + "grad_norm": 3.0948362169055614, + "learning_rate": 6.541838144651884e-06, + "loss": 0.5862, + "step": 4762 + }, + { + "epoch": 0.42, + "grad_norm": 9.573516403012839, + 
"learning_rate": 6.540485087791783e-06, + "loss": 0.7329, + "step": 4763 + }, + { + "epoch": 0.42, + "grad_norm": 6.21249972343167, + "learning_rate": 6.539131906278189e-06, + "loss": 0.7369, + "step": 4764 + }, + { + "epoch": 0.42, + "grad_norm": 11.264153684281268, + "learning_rate": 6.537778600220601e-06, + "loss": 0.8317, + "step": 4765 + }, + { + "epoch": 0.42, + "grad_norm": 6.249898520412217, + "learning_rate": 6.536425169728529e-06, + "loss": 0.6823, + "step": 4766 + }, + { + "epoch": 0.42, + "grad_norm": 9.67116251068737, + "learning_rate": 6.535071614911487e-06, + "loss": 0.7753, + "step": 4767 + }, + { + "epoch": 0.42, + "grad_norm": 10.99916429156585, + "learning_rate": 6.533717935879003e-06, + "loss": 0.932, + "step": 4768 + }, + { + "epoch": 0.42, + "grad_norm": 2.72134194263893, + "learning_rate": 6.532364132740615e-06, + "loss": 0.549, + "step": 4769 + }, + { + "epoch": 0.42, + "grad_norm": 8.227080931441527, + "learning_rate": 6.53101020560587e-06, + "loss": 0.8954, + "step": 4770 + }, + { + "epoch": 0.42, + "grad_norm": 10.80405093118873, + "learning_rate": 6.529656154584326e-06, + "loss": 0.868, + "step": 4771 + }, + { + "epoch": 0.42, + "grad_norm": 11.841562712656165, + "learning_rate": 6.5283019797855505e-06, + "loss": 0.8232, + "step": 4772 + }, + { + "epoch": 0.42, + "grad_norm": 8.002837718516462, + "learning_rate": 6.526947681319122e-06, + "loss": 0.6601, + "step": 4773 + }, + { + "epoch": 0.42, + "grad_norm": 8.09858140122482, + "learning_rate": 6.525593259294624e-06, + "loss": 0.6028, + "step": 4774 + }, + { + "epoch": 0.42, + "grad_norm": 3.362314132269189, + "learning_rate": 6.524238713821661e-06, + "loss": 0.5799, + "step": 4775 + }, + { + "epoch": 0.42, + "grad_norm": 6.292200693366516, + "learning_rate": 6.522884045009835e-06, + "loss": 0.6386, + "step": 4776 + }, + { + "epoch": 0.42, + "grad_norm": 11.703165649037256, + "learning_rate": 6.521529252968764e-06, + "loss": 0.7076, + "step": 4777 + }, + { + "epoch": 0.42, + "grad_norm": 3.387975595460863, + "learning_rate": 6.52017433780808e-06, + "loss": 0.5632, + "step": 4778 + }, + { + "epoch": 0.42, + "grad_norm": 5.792041592680619, + "learning_rate": 6.518819299637415e-06, + "loss": 0.7304, + "step": 4779 + }, + { + "epoch": 0.42, + "grad_norm": 7.158848527950088, + "learning_rate": 6.5174641385664186e-06, + "loss": 0.7545, + "step": 4780 + }, + { + "epoch": 0.42, + "grad_norm": 2.9843371525503426, + "learning_rate": 6.516108854704749e-06, + "loss": 0.5305, + "step": 4781 + }, + { + "epoch": 0.42, + "grad_norm": 9.040739790780204, + "learning_rate": 6.5147534481620725e-06, + "loss": 0.8608, + "step": 4782 + }, + { + "epoch": 0.42, + "grad_norm": 5.034367836304729, + "learning_rate": 6.513397919048066e-06, + "loss": 0.6369, + "step": 4783 + }, + { + "epoch": 0.42, + "grad_norm": 6.6359398751826095, + "learning_rate": 6.512042267472418e-06, + "loss": 0.7915, + "step": 4784 + }, + { + "epoch": 0.42, + "grad_norm": 13.765279302060298, + "learning_rate": 6.5106864935448245e-06, + "loss": 0.8376, + "step": 4785 + }, + { + "epoch": 0.42, + "grad_norm": 7.0994153305177345, + "learning_rate": 6.509330597374993e-06, + "loss": 0.9432, + "step": 4786 + }, + { + "epoch": 0.42, + "grad_norm": 6.579814452506254, + "learning_rate": 6.507974579072641e-06, + "loss": 0.9044, + "step": 4787 + }, + { + "epoch": 0.42, + "grad_norm": 6.405575347635375, + "learning_rate": 6.506618438747494e-06, + "loss": 0.7659, + "step": 4788 + }, + { + "epoch": 0.42, + "grad_norm": 5.831204036317382, + "learning_rate": 6.505262176509288e-06, + 
"loss": 0.6525, + "step": 4789 + }, + { + "epoch": 0.42, + "grad_norm": 8.92113792948247, + "learning_rate": 6.503905792467773e-06, + "loss": 0.8775, + "step": 4790 + }, + { + "epoch": 0.42, + "grad_norm": 7.735153773779657, + "learning_rate": 6.5025492867327e-06, + "loss": 0.7305, + "step": 4791 + }, + { + "epoch": 0.42, + "grad_norm": 7.3854118010446275, + "learning_rate": 6.501192659413839e-06, + "loss": 0.6634, + "step": 4792 + }, + { + "epoch": 0.42, + "grad_norm": 5.390351850334179, + "learning_rate": 6.499835910620967e-06, + "loss": 0.7363, + "step": 4793 + }, + { + "epoch": 0.42, + "grad_norm": 12.014213876412342, + "learning_rate": 6.498479040463866e-06, + "loss": 0.7616, + "step": 4794 + }, + { + "epoch": 0.42, + "grad_norm": 9.942767199673392, + "learning_rate": 6.497122049052334e-06, + "loss": 0.7693, + "step": 4795 + }, + { + "epoch": 0.42, + "grad_norm": 7.782716889940684, + "learning_rate": 6.495764936496176e-06, + "loss": 0.7904, + "step": 4796 + }, + { + "epoch": 0.42, + "grad_norm": 9.122088570480757, + "learning_rate": 6.494407702905207e-06, + "loss": 0.795, + "step": 4797 + }, + { + "epoch": 0.42, + "grad_norm": 12.960281328564363, + "learning_rate": 6.493050348389254e-06, + "loss": 0.7957, + "step": 4798 + }, + { + "epoch": 0.42, + "grad_norm": 6.471737686681379, + "learning_rate": 6.4916928730581496e-06, + "loss": 0.7943, + "step": 4799 + }, + { + "epoch": 0.42, + "grad_norm": 12.528930108840102, + "learning_rate": 6.4903352770217386e-06, + "loss": 0.7072, + "step": 4800 + }, + { + "epoch": 0.42, + "grad_norm": 10.087807778314614, + "learning_rate": 6.488977560389877e-06, + "loss": 0.7161, + "step": 4801 + }, + { + "epoch": 0.42, + "grad_norm": 12.879785396661429, + "learning_rate": 6.487619723272427e-06, + "loss": 0.9024, + "step": 4802 + }, + { + "epoch": 0.42, + "grad_norm": 6.428892722565888, + "learning_rate": 6.4862617657792645e-06, + "loss": 0.817, + "step": 4803 + }, + { + "epoch": 0.42, + "grad_norm": 10.039237100365094, + "learning_rate": 6.484903688020272e-06, + "loss": 0.8092, + "step": 4804 + }, + { + "epoch": 0.42, + "grad_norm": 7.943701107535146, + "learning_rate": 6.4835454901053416e-06, + "loss": 0.927, + "step": 4805 + }, + { + "epoch": 0.42, + "grad_norm": 5.122253410727563, + "learning_rate": 6.482187172144379e-06, + "loss": 0.7614, + "step": 4806 + }, + { + "epoch": 0.42, + "grad_norm": 3.0604454308800317, + "learning_rate": 6.480828734247297e-06, + "loss": 0.5405, + "step": 4807 + }, + { + "epoch": 0.42, + "grad_norm": 10.754994020551383, + "learning_rate": 6.479470176524015e-06, + "loss": 0.8533, + "step": 4808 + }, + { + "epoch": 0.42, + "grad_norm": 7.831871085518153, + "learning_rate": 6.47811149908447e-06, + "loss": 0.7811, + "step": 4809 + }, + { + "epoch": 0.42, + "grad_norm": 4.110751435384667, + "learning_rate": 6.4767527020386e-06, + "loss": 0.5455, + "step": 4810 + }, + { + "epoch": 0.42, + "grad_norm": 7.452219167716631, + "learning_rate": 6.475393785496356e-06, + "loss": 0.7975, + "step": 4811 + }, + { + "epoch": 0.42, + "grad_norm": 13.820164442300335, + "learning_rate": 6.4740347495677034e-06, + "loss": 0.8531, + "step": 4812 + }, + { + "epoch": 0.42, + "grad_norm": 8.32283594905479, + "learning_rate": 6.47267559436261e-06, + "loss": 0.8287, + "step": 4813 + }, + { + "epoch": 0.42, + "grad_norm": 2.062368483415398, + "learning_rate": 6.471316319991056e-06, + "loss": 0.5714, + "step": 4814 + }, + { + "epoch": 0.42, + "grad_norm": 8.125113388877757, + "learning_rate": 6.469956926563034e-06, + "loss": 0.9403, + "step": 4815 + }, + { 
+ "epoch": 0.42, + "grad_norm": 9.97056181619599, + "learning_rate": 6.468597414188543e-06, + "loss": 0.7093, + "step": 4816 + }, + { + "epoch": 0.42, + "grad_norm": 3.834455852662269, + "learning_rate": 6.467237782977591e-06, + "loss": 0.5659, + "step": 4817 + }, + { + "epoch": 0.42, + "grad_norm": 8.888031503971945, + "learning_rate": 6.465878033040199e-06, + "loss": 0.7288, + "step": 4818 + }, + { + "epoch": 0.42, + "grad_norm": 13.725231202844462, + "learning_rate": 6.464518164486395e-06, + "loss": 0.8391, + "step": 4819 + }, + { + "epoch": 0.42, + "grad_norm": 8.21702137352497, + "learning_rate": 6.463158177426216e-06, + "loss": 0.708, + "step": 4820 + }, + { + "epoch": 0.42, + "grad_norm": 6.089777589790747, + "learning_rate": 6.461798071969714e-06, + "loss": 0.6658, + "step": 4821 + }, + { + "epoch": 0.42, + "grad_norm": 14.635836598002006, + "learning_rate": 6.4604378482269405e-06, + "loss": 0.9308, + "step": 4822 + }, + { + "epoch": 0.42, + "grad_norm": 7.812554730192831, + "learning_rate": 6.459077506307967e-06, + "loss": 0.6682, + "step": 4823 + }, + { + "epoch": 0.42, + "grad_norm": 8.403531814736201, + "learning_rate": 6.457717046322869e-06, + "loss": 0.6589, + "step": 4824 + }, + { + "epoch": 0.42, + "grad_norm": 7.970407038149947, + "learning_rate": 6.456356468381731e-06, + "loss": 0.9161, + "step": 4825 + }, + { + "epoch": 0.42, + "grad_norm": 8.004673829951448, + "learning_rate": 6.454995772594652e-06, + "loss": 0.7377, + "step": 4826 + }, + { + "epoch": 0.42, + "grad_norm": 8.026955111335235, + "learning_rate": 6.453634959071735e-06, + "loss": 0.782, + "step": 4827 + }, + { + "epoch": 0.42, + "grad_norm": 8.572968263492816, + "learning_rate": 6.452274027923093e-06, + "loss": 0.68, + "step": 4828 + }, + { + "epoch": 0.42, + "grad_norm": 5.731735490633474, + "learning_rate": 6.450912979258854e-06, + "loss": 0.8164, + "step": 4829 + }, + { + "epoch": 0.42, + "grad_norm": 7.0904577412657055, + "learning_rate": 6.44955181318915e-06, + "loss": 0.7405, + "step": 4830 + }, + { + "epoch": 0.42, + "grad_norm": 10.367007671999644, + "learning_rate": 6.448190529824125e-06, + "loss": 0.7376, + "step": 4831 + }, + { + "epoch": 0.42, + "grad_norm": 2.809299076561459, + "learning_rate": 6.44682912927393e-06, + "loss": 0.5146, + "step": 4832 + }, + { + "epoch": 0.42, + "grad_norm": 2.253385800357446, + "learning_rate": 6.4454676116487306e-06, + "loss": 0.5294, + "step": 4833 + }, + { + "epoch": 0.42, + "grad_norm": 21.05720719603002, + "learning_rate": 6.444105977058694e-06, + "loss": 0.868, + "step": 4834 + }, + { + "epoch": 0.42, + "grad_norm": 7.8103995018625385, + "learning_rate": 6.442744225614006e-06, + "loss": 0.7966, + "step": 4835 + }, + { + "epoch": 0.42, + "grad_norm": 11.423221099650107, + "learning_rate": 6.441382357424855e-06, + "loss": 0.9958, + "step": 4836 + }, + { + "epoch": 0.42, + "grad_norm": 11.718077320611755, + "learning_rate": 6.440020372601439e-06, + "loss": 0.997, + "step": 4837 + }, + { + "epoch": 0.42, + "grad_norm": 7.420899407824005, + "learning_rate": 6.4386582712539725e-06, + "loss": 0.8614, + "step": 4838 + }, + { + "epoch": 0.42, + "grad_norm": 11.289921495579563, + "learning_rate": 6.437296053492671e-06, + "loss": 0.7416, + "step": 4839 + }, + { + "epoch": 0.43, + "grad_norm": 6.45449711804917, + "learning_rate": 6.435933719427762e-06, + "loss": 0.7328, + "step": 4840 + }, + { + "epoch": 0.43, + "grad_norm": 7.831641880652262, + "learning_rate": 6.434571269169487e-06, + "loss": 0.8078, + "step": 4841 + }, + { + "epoch": 0.43, + "grad_norm": 
8.166306050715635, + "learning_rate": 6.433208702828091e-06, + "loss": 0.8212, + "step": 4842 + }, + { + "epoch": 0.43, + "grad_norm": 9.702416922807418, + "learning_rate": 6.43184602051383e-06, + "loss": 0.8928, + "step": 4843 + }, + { + "epoch": 0.43, + "grad_norm": 11.921071130865803, + "learning_rate": 6.430483222336971e-06, + "loss": 0.8885, + "step": 4844 + }, + { + "epoch": 0.43, + "grad_norm": 7.898084681197369, + "learning_rate": 6.429120308407789e-06, + "loss": 0.6533, + "step": 4845 + }, + { + "epoch": 0.43, + "grad_norm": 6.00346584325033, + "learning_rate": 6.427757278836569e-06, + "loss": 0.7578, + "step": 4846 + }, + { + "epoch": 0.43, + "grad_norm": 5.92316386061347, + "learning_rate": 6.426394133733606e-06, + "loss": 0.7949, + "step": 4847 + }, + { + "epoch": 0.43, + "grad_norm": 6.135542917606759, + "learning_rate": 6.425030873209202e-06, + "loss": 0.7521, + "step": 4848 + }, + { + "epoch": 0.43, + "grad_norm": 7.285619332834274, + "learning_rate": 6.423667497373668e-06, + "loss": 0.9022, + "step": 4849 + }, + { + "epoch": 0.43, + "grad_norm": 7.9993213603101925, + "learning_rate": 6.42230400633733e-06, + "loss": 0.7687, + "step": 4850 + }, + { + "epoch": 0.43, + "grad_norm": 13.76549820302982, + "learning_rate": 6.420940400210518e-06, + "loss": 0.8436, + "step": 4851 + }, + { + "epoch": 0.43, + "grad_norm": 4.653832949099206, + "learning_rate": 6.419576679103571e-06, + "loss": 0.7207, + "step": 4852 + }, + { + "epoch": 0.43, + "grad_norm": 8.080184668282419, + "learning_rate": 6.418212843126842e-06, + "loss": 0.9386, + "step": 4853 + }, + { + "epoch": 0.43, + "grad_norm": 6.378218703309807, + "learning_rate": 6.416848892390687e-06, + "loss": 0.6486, + "step": 4854 + }, + { + "epoch": 0.43, + "grad_norm": 7.3198832568479135, + "learning_rate": 6.4154848270054785e-06, + "loss": 0.7878, + "step": 4855 + }, + { + "epoch": 0.43, + "grad_norm": 7.0491543641059335, + "learning_rate": 6.414120647081592e-06, + "loss": 0.8573, + "step": 4856 + }, + { + "epoch": 0.43, + "grad_norm": 6.400461394910423, + "learning_rate": 6.412756352729414e-06, + "loss": 0.6382, + "step": 4857 + }, + { + "epoch": 0.43, + "grad_norm": 7.8505460434474035, + "learning_rate": 6.411391944059342e-06, + "loss": 0.8973, + "step": 4858 + }, + { + "epoch": 0.43, + "grad_norm": 9.387844435236248, + "learning_rate": 6.4100274211817825e-06, + "loss": 0.7482, + "step": 4859 + }, + { + "epoch": 0.43, + "grad_norm": 13.239479113995285, + "learning_rate": 6.408662784207149e-06, + "loss": 0.7974, + "step": 4860 + }, + { + "epoch": 0.43, + "grad_norm": 9.190529528058198, + "learning_rate": 6.407298033245865e-06, + "loss": 0.9125, + "step": 4861 + }, + { + "epoch": 0.43, + "grad_norm": 9.254878151053234, + "learning_rate": 6.405933168408368e-06, + "loss": 0.847, + "step": 4862 + }, + { + "epoch": 0.43, + "grad_norm": 5.811792492385842, + "learning_rate": 6.404568189805095e-06, + "loss": 0.6555, + "step": 4863 + }, + { + "epoch": 0.43, + "grad_norm": 6.751771301603102, + "learning_rate": 6.403203097546502e-06, + "loss": 0.8079, + "step": 4864 + }, + { + "epoch": 0.43, + "grad_norm": 6.67953511045026, + "learning_rate": 6.4018378917430476e-06, + "loss": 0.6977, + "step": 4865 + }, + { + "epoch": 0.43, + "grad_norm": 17.349006035719096, + "learning_rate": 6.400472572505203e-06, + "loss": 0.8419, + "step": 4866 + }, + { + "epoch": 0.43, + "grad_norm": 8.639412115204943, + "learning_rate": 6.399107139943448e-06, + "loss": 0.7009, + "step": 4867 + }, + { + "epoch": 0.43, + "grad_norm": 6.643862807285892, + "learning_rate": 
6.39774159416827e-06, + "loss": 0.8083, + "step": 4868 + }, + { + "epoch": 0.43, + "grad_norm": 12.273382467364733, + "learning_rate": 6.396375935290166e-06, + "loss": 0.8372, + "step": 4869 + }, + { + "epoch": 0.43, + "grad_norm": 10.351372987401248, + "learning_rate": 6.395010163419647e-06, + "loss": 0.775, + "step": 4870 + }, + { + "epoch": 0.43, + "grad_norm": 6.843586298826455, + "learning_rate": 6.393644278667225e-06, + "loss": 0.7585, + "step": 4871 + }, + { + "epoch": 0.43, + "grad_norm": 8.709297627768843, + "learning_rate": 6.392278281143425e-06, + "loss": 0.704, + "step": 4872 + }, + { + "epoch": 0.43, + "grad_norm": 5.749998704845602, + "learning_rate": 6.390912170958783e-06, + "loss": 0.7426, + "step": 4873 + }, + { + "epoch": 0.43, + "grad_norm": 6.930354727319176, + "learning_rate": 6.389545948223841e-06, + "loss": 0.809, + "step": 4874 + }, + { + "epoch": 0.43, + "grad_norm": 2.904920835273837, + "learning_rate": 6.388179613049154e-06, + "loss": 0.5147, + "step": 4875 + }, + { + "epoch": 0.43, + "grad_norm": 7.193070901901449, + "learning_rate": 6.386813165545281e-06, + "loss": 0.8927, + "step": 4876 + }, + { + "epoch": 0.43, + "grad_norm": 9.615062488121804, + "learning_rate": 6.385446605822793e-06, + "loss": 0.8185, + "step": 4877 + }, + { + "epoch": 0.43, + "grad_norm": 7.0145955409823255, + "learning_rate": 6.384079933992271e-06, + "loss": 0.7475, + "step": 4878 + }, + { + "epoch": 0.43, + "grad_norm": 5.825993006609767, + "learning_rate": 6.382713150164303e-06, + "loss": 0.7124, + "step": 4879 + }, + { + "epoch": 0.43, + "grad_norm": 12.224387630829352, + "learning_rate": 6.381346254449485e-06, + "loss": 0.6718, + "step": 4880 + }, + { + "epoch": 0.43, + "grad_norm": 3.0262841544184473, + "learning_rate": 6.379979246958427e-06, + "loss": 0.6524, + "step": 4881 + }, + { + "epoch": 0.43, + "grad_norm": 9.005873690705368, + "learning_rate": 6.378612127801746e-06, + "loss": 0.8396, + "step": 4882 + }, + { + "epoch": 0.43, + "grad_norm": 12.663790285110176, + "learning_rate": 6.377244897090063e-06, + "loss": 0.9176, + "step": 4883 + }, + { + "epoch": 0.43, + "grad_norm": 13.028367814128273, + "learning_rate": 6.375877554934014e-06, + "loss": 0.869, + "step": 4884 + }, + { + "epoch": 0.43, + "grad_norm": 8.934888416880224, + "learning_rate": 6.374510101444242e-06, + "loss": 0.8441, + "step": 4885 + }, + { + "epoch": 0.43, + "grad_norm": 13.88111888365971, + "learning_rate": 6.373142536731398e-06, + "loss": 0.9434, + "step": 4886 + }, + { + "epoch": 0.43, + "grad_norm": 7.239381835640128, + "learning_rate": 6.371774860906144e-06, + "loss": 0.6575, + "step": 4887 + }, + { + "epoch": 0.43, + "grad_norm": 12.961125599450735, + "learning_rate": 6.370407074079153e-06, + "loss": 0.6194, + "step": 4888 + }, + { + "epoch": 0.43, + "grad_norm": 24.986495833960554, + "learning_rate": 6.369039176361098e-06, + "loss": 0.7776, + "step": 4889 + }, + { + "epoch": 0.43, + "grad_norm": 11.44613958387622, + "learning_rate": 6.367671167862671e-06, + "loss": 0.8219, + "step": 4890 + }, + { + "epoch": 0.43, + "grad_norm": 5.531638312847099, + "learning_rate": 6.366303048694569e-06, + "loss": 0.7401, + "step": 4891 + }, + { + "epoch": 0.43, + "grad_norm": 6.99442104127846, + "learning_rate": 6.364934818967495e-06, + "loss": 0.8551, + "step": 4892 + }, + { + "epoch": 0.43, + "grad_norm": 6.42286918939165, + "learning_rate": 6.363566478792168e-06, + "loss": 0.8848, + "step": 4893 + }, + { + "epoch": 0.43, + "grad_norm": 12.277958024277977, + "learning_rate": 6.362198028279308e-06, + "loss": 0.8121, 
+ "step": 4894 + }, + { + "epoch": 0.43, + "grad_norm": 11.746005018997216, + "learning_rate": 6.360829467539651e-06, + "loss": 0.8966, + "step": 4895 + }, + { + "epoch": 0.43, + "grad_norm": 7.187098916471652, + "learning_rate": 6.359460796683937e-06, + "loss": 0.8294, + "step": 4896 + }, + { + "epoch": 0.43, + "grad_norm": 8.068066201049433, + "learning_rate": 6.358092015822915e-06, + "loss": 0.6795, + "step": 4897 + }, + { + "epoch": 0.43, + "grad_norm": 7.060300665639662, + "learning_rate": 6.356723125067348e-06, + "loss": 0.7239, + "step": 4898 + }, + { + "epoch": 0.43, + "grad_norm": 9.942977360798318, + "learning_rate": 6.3553541245280016e-06, + "loss": 0.729, + "step": 4899 + }, + { + "epoch": 0.43, + "grad_norm": 6.129250422647321, + "learning_rate": 6.353985014315653e-06, + "loss": 0.8981, + "step": 4900 + }, + { + "epoch": 0.43, + "grad_norm": 2.6152388833817466, + "learning_rate": 6.352615794541092e-06, + "loss": 0.5901, + "step": 4901 + }, + { + "epoch": 0.43, + "grad_norm": 5.610830553276552, + "learning_rate": 6.3512464653151096e-06, + "loss": 0.8532, + "step": 4902 + }, + { + "epoch": 0.43, + "grad_norm": 6.352378501520167, + "learning_rate": 6.34987702674851e-06, + "loss": 0.7755, + "step": 4903 + }, + { + "epoch": 0.43, + "grad_norm": 2.5563458187714123, + "learning_rate": 6.348507478952109e-06, + "loss": 0.4674, + "step": 4904 + }, + { + "epoch": 0.43, + "grad_norm": 9.62149708112033, + "learning_rate": 6.347137822036724e-06, + "loss": 0.7461, + "step": 4905 + }, + { + "epoch": 0.43, + "grad_norm": 7.671270285929403, + "learning_rate": 6.345768056113186e-06, + "loss": 0.845, + "step": 4906 + }, + { + "epoch": 0.43, + "grad_norm": 10.19862303882386, + "learning_rate": 6.344398181292338e-06, + "loss": 0.7742, + "step": 4907 + }, + { + "epoch": 0.43, + "grad_norm": 5.167301927833582, + "learning_rate": 6.343028197685025e-06, + "loss": 0.8528, + "step": 4908 + }, + { + "epoch": 0.43, + "grad_norm": 23.218052219839482, + "learning_rate": 6.341658105402103e-06, + "loss": 0.8593, + "step": 4909 + }, + { + "epoch": 0.43, + "grad_norm": 5.722558255283988, + "learning_rate": 6.340287904554441e-06, + "loss": 0.7686, + "step": 4910 + }, + { + "epoch": 0.43, + "grad_norm": 8.33795174474914, + "learning_rate": 6.338917595252909e-06, + "loss": 0.6859, + "step": 4911 + }, + { + "epoch": 0.43, + "grad_norm": 9.58599950736582, + "learning_rate": 6.337547177608393e-06, + "loss": 0.7036, + "step": 4912 + }, + { + "epoch": 0.43, + "grad_norm": 10.833190996305783, + "learning_rate": 6.3361766517317845e-06, + "loss": 0.7191, + "step": 4913 + }, + { + "epoch": 0.43, + "grad_norm": 32.5424542403203, + "learning_rate": 6.334806017733983e-06, + "loss": 0.7309, + "step": 4914 + }, + { + "epoch": 0.43, + "grad_norm": 7.082318499849796, + "learning_rate": 6.333435275725901e-06, + "loss": 0.7827, + "step": 4915 + }, + { + "epoch": 0.43, + "grad_norm": 6.245213391677762, + "learning_rate": 6.332064425818454e-06, + "loss": 0.7869, + "step": 4916 + }, + { + "epoch": 0.43, + "grad_norm": 7.386261332209018, + "learning_rate": 6.330693468122569e-06, + "loss": 0.653, + "step": 4917 + }, + { + "epoch": 0.43, + "grad_norm": 4.183459014098514, + "learning_rate": 6.329322402749181e-06, + "loss": 0.8017, + "step": 4918 + }, + { + "epoch": 0.43, + "grad_norm": 7.001059280070093, + "learning_rate": 6.327951229809237e-06, + "loss": 0.6617, + "step": 4919 + }, + { + "epoch": 0.43, + "grad_norm": 3.4339034194369726, + "learning_rate": 6.326579949413687e-06, + "loss": 0.5974, + "step": 4920 + }, + { + "epoch": 0.43, + 
"grad_norm": 13.186393017831621, + "learning_rate": 6.325208561673495e-06, + "loss": 0.5892, + "step": 4921 + }, + { + "epoch": 0.43, + "grad_norm": 9.6443668512463, + "learning_rate": 6.3238370666996305e-06, + "loss": 0.8185, + "step": 4922 + }, + { + "epoch": 0.43, + "grad_norm": 2.9631747327515288, + "learning_rate": 6.322465464603072e-06, + "loss": 0.5245, + "step": 4923 + }, + { + "epoch": 0.43, + "grad_norm": 7.373875434607973, + "learning_rate": 6.321093755494809e-06, + "loss": 0.729, + "step": 4924 + }, + { + "epoch": 0.43, + "grad_norm": 5.287453486938416, + "learning_rate": 6.319721939485837e-06, + "loss": 0.6967, + "step": 4925 + }, + { + "epoch": 0.43, + "grad_norm": 11.330762976921928, + "learning_rate": 6.318350016687159e-06, + "loss": 0.7679, + "step": 4926 + }, + { + "epoch": 0.43, + "grad_norm": 6.163242839684757, + "learning_rate": 6.316977987209793e-06, + "loss": 0.7939, + "step": 4927 + }, + { + "epoch": 0.43, + "grad_norm": 8.267936831930106, + "learning_rate": 6.3156058511647575e-06, + "loss": 0.9654, + "step": 4928 + }, + { + "epoch": 0.43, + "grad_norm": 5.651265644373009, + "learning_rate": 6.314233608663085e-06, + "loss": 0.895, + "step": 4929 + }, + { + "epoch": 0.43, + "grad_norm": 2.6099194960412704, + "learning_rate": 6.312861259815816e-06, + "loss": 0.5073, + "step": 4930 + }, + { + "epoch": 0.43, + "grad_norm": 3.7266569332404074, + "learning_rate": 6.311488804733997e-06, + "loss": 0.5647, + "step": 4931 + }, + { + "epoch": 0.43, + "grad_norm": 3.0705393103633334, + "learning_rate": 6.310116243528685e-06, + "loss": 0.633, + "step": 4932 + }, + { + "epoch": 0.43, + "grad_norm": 7.27438417881033, + "learning_rate": 6.308743576310946e-06, + "loss": 0.75, + "step": 4933 + }, + { + "epoch": 0.43, + "grad_norm": 11.30058261661784, + "learning_rate": 6.307370803191854e-06, + "loss": 0.7301, + "step": 4934 + }, + { + "epoch": 0.43, + "grad_norm": 8.55030578897673, + "learning_rate": 6.3059979242824905e-06, + "loss": 0.798, + "step": 4935 + }, + { + "epoch": 0.43, + "grad_norm": 10.493410830472312, + "learning_rate": 6.304624939693949e-06, + "loss": 0.798, + "step": 4936 + }, + { + "epoch": 0.43, + "grad_norm": 9.82825406192289, + "learning_rate": 6.303251849537326e-06, + "loss": 1.0265, + "step": 4937 + }, + { + "epoch": 0.43, + "grad_norm": 12.582305698186483, + "learning_rate": 6.301878653923732e-06, + "loss": 1.0314, + "step": 4938 + }, + { + "epoch": 0.43, + "grad_norm": 10.782487865526162, + "learning_rate": 6.3005053529642835e-06, + "loss": 0.8809, + "step": 4939 + }, + { + "epoch": 0.43, + "grad_norm": 5.547545426196373, + "learning_rate": 6.299131946770104e-06, + "loss": 0.8288, + "step": 4940 + }, + { + "epoch": 0.43, + "grad_norm": 2.152554077355234, + "learning_rate": 6.297758435452329e-06, + "loss": 0.4884, + "step": 4941 + }, + { + "epoch": 0.43, + "grad_norm": 2.133118939725559, + "learning_rate": 6.2963848191221e-06, + "loss": 0.5093, + "step": 4942 + }, + { + "epoch": 0.43, + "grad_norm": 7.198717926791442, + "learning_rate": 6.295011097890568e-06, + "loss": 0.7829, + "step": 4943 + }, + { + "epoch": 0.43, + "grad_norm": 13.29533640438176, + "learning_rate": 6.293637271868892e-06, + "loss": 0.6614, + "step": 4944 + }, + { + "epoch": 0.43, + "grad_norm": 7.724350283575093, + "learning_rate": 6.29226334116824e-06, + "loss": 0.8496, + "step": 4945 + }, + { + "epoch": 0.43, + "grad_norm": 8.077976379127868, + "learning_rate": 6.290889305899787e-06, + "loss": 0.9094, + "step": 4946 + }, + { + "epoch": 0.43, + "grad_norm": 4.9339386138931545, + 
"learning_rate": 6.28951516617472e-06, + "loss": 0.5436, + "step": 4947 + }, + { + "epoch": 0.43, + "grad_norm": 7.5685301445916515, + "learning_rate": 6.28814092210423e-06, + "loss": 0.826, + "step": 4948 + }, + { + "epoch": 0.43, + "grad_norm": 7.834012704532941, + "learning_rate": 6.286766573799519e-06, + "loss": 1.0944, + "step": 4949 + }, + { + "epoch": 0.43, + "grad_norm": 6.1417979439310715, + "learning_rate": 6.285392121371797e-06, + "loss": 0.7122, + "step": 4950 + }, + { + "epoch": 0.43, + "grad_norm": 5.334176727223197, + "learning_rate": 6.284017564932284e-06, + "loss": 0.8338, + "step": 4951 + }, + { + "epoch": 0.43, + "grad_norm": 7.868495181323629, + "learning_rate": 6.282642904592204e-06, + "loss": 0.8933, + "step": 4952 + }, + { + "epoch": 0.43, + "grad_norm": 7.022458330201518, + "learning_rate": 6.2812681404627955e-06, + "loss": 0.7711, + "step": 4953 + }, + { + "epoch": 0.44, + "grad_norm": 9.129517099470679, + "learning_rate": 6.2798932726552985e-06, + "loss": 0.7602, + "step": 4954 + }, + { + "epoch": 0.44, + "grad_norm": 6.279924817047215, + "learning_rate": 6.278518301280968e-06, + "loss": 0.6968, + "step": 4955 + }, + { + "epoch": 0.44, + "grad_norm": 15.61217861418572, + "learning_rate": 6.277143226451063e-06, + "loss": 0.6912, + "step": 4956 + }, + { + "epoch": 0.44, + "grad_norm": 8.680894725430635, + "learning_rate": 6.275768048276852e-06, + "loss": 0.7752, + "step": 4957 + }, + { + "epoch": 0.44, + "grad_norm": 21.512757225873663, + "learning_rate": 6.274392766869614e-06, + "loss": 0.9109, + "step": 4958 + }, + { + "epoch": 0.44, + "grad_norm": 6.194855234216463, + "learning_rate": 6.273017382340632e-06, + "loss": 0.8106, + "step": 4959 + }, + { + "epoch": 0.44, + "grad_norm": 19.541503427644372, + "learning_rate": 6.271641894801202e-06, + "loss": 0.9161, + "step": 4960 + }, + { + "epoch": 0.44, + "grad_norm": 8.432280206945029, + "learning_rate": 6.270266304362625e-06, + "loss": 0.7043, + "step": 4961 + }, + { + "epoch": 0.44, + "grad_norm": 3.646363059278552, + "learning_rate": 6.2688906111362115e-06, + "loss": 0.5364, + "step": 4962 + }, + { + "epoch": 0.44, + "grad_norm": 8.967890780993926, + "learning_rate": 6.2675148152332786e-06, + "loss": 0.7784, + "step": 4963 + }, + { + "epoch": 0.44, + "grad_norm": 9.048028013139312, + "learning_rate": 6.266138916765156e-06, + "loss": 0.7667, + "step": 4964 + }, + { + "epoch": 0.44, + "grad_norm": 2.7904320838355563, + "learning_rate": 6.2647629158431815e-06, + "loss": 0.5613, + "step": 4965 + }, + { + "epoch": 0.44, + "grad_norm": 8.098333448517304, + "learning_rate": 6.263386812578691e-06, + "loss": 0.7606, + "step": 4966 + }, + { + "epoch": 0.44, + "grad_norm": 9.359090847723932, + "learning_rate": 6.262010607083044e-06, + "loss": 0.8231, + "step": 4967 + }, + { + "epoch": 0.44, + "grad_norm": 3.5482690264071297, + "learning_rate": 6.2606342994675984e-06, + "loss": 0.5356, + "step": 4968 + }, + { + "epoch": 0.44, + "grad_norm": 7.460768249125805, + "learning_rate": 6.259257889843721e-06, + "loss": 0.8251, + "step": 4969 + }, + { + "epoch": 0.44, + "grad_norm": 13.332723111083535, + "learning_rate": 6.25788137832279e-06, + "loss": 0.7746, + "step": 4970 + }, + { + "epoch": 0.44, + "grad_norm": 8.737050817950768, + "learning_rate": 6.256504765016189e-06, + "loss": 0.6586, + "step": 4971 + }, + { + "epoch": 0.44, + "grad_norm": 14.718350872174092, + "learning_rate": 6.255128050035314e-06, + "loss": 0.6857, + "step": 4972 + }, + { + "epoch": 0.44, + "grad_norm": 8.566123476168766, + "learning_rate": 
6.253751233491565e-06, + "loss": 0.7261, + "step": 4973 + }, + { + "epoch": 0.44, + "grad_norm": 5.652021900921063, + "learning_rate": 6.252374315496351e-06, + "loss": 0.6323, + "step": 4974 + }, + { + "epoch": 0.44, + "grad_norm": 7.7960236060115315, + "learning_rate": 6.250997296161092e-06, + "loss": 0.7113, + "step": 4975 + }, + { + "epoch": 0.44, + "grad_norm": 14.380930903457855, + "learning_rate": 6.249620175597212e-06, + "loss": 0.6735, + "step": 4976 + }, + { + "epoch": 0.44, + "grad_norm": 16.345975175103643, + "learning_rate": 6.248242953916145e-06, + "loss": 0.8857, + "step": 4977 + }, + { + "epoch": 0.44, + "grad_norm": 9.153360821891807, + "learning_rate": 6.246865631229335e-06, + "loss": 0.6527, + "step": 4978 + }, + { + "epoch": 0.44, + "grad_norm": 8.046791658230628, + "learning_rate": 6.245488207648233e-06, + "loss": 0.7497, + "step": 4979 + }, + { + "epoch": 0.44, + "grad_norm": 6.081184040225459, + "learning_rate": 6.244110683284296e-06, + "loss": 0.7021, + "step": 4980 + }, + { + "epoch": 0.44, + "grad_norm": 13.881511058312551, + "learning_rate": 6.242733058248993e-06, + "loss": 0.67, + "step": 4981 + }, + { + "epoch": 0.44, + "grad_norm": 16.152793760888937, + "learning_rate": 6.241355332653798e-06, + "loss": 0.8054, + "step": 4982 + }, + { + "epoch": 0.44, + "grad_norm": 8.921756381128002, + "learning_rate": 6.239977506610193e-06, + "loss": 0.6941, + "step": 4983 + }, + { + "epoch": 0.44, + "grad_norm": 7.6896074401509384, + "learning_rate": 6.238599580229673e-06, + "loss": 0.8338, + "step": 4984 + }, + { + "epoch": 0.44, + "grad_norm": 10.97394551814242, + "learning_rate": 6.237221553623734e-06, + "loss": 0.745, + "step": 4985 + }, + { + "epoch": 0.44, + "grad_norm": 6.495699004079808, + "learning_rate": 6.2358434269038835e-06, + "loss": 0.9954, + "step": 4986 + }, + { + "epoch": 0.44, + "grad_norm": 5.35906162808834, + "learning_rate": 6.2344652001816405e-06, + "loss": 0.7279, + "step": 4987 + }, + { + "epoch": 0.44, + "grad_norm": 3.2829289389932144, + "learning_rate": 6.233086873568527e-06, + "loss": 0.545, + "step": 4988 + }, + { + "epoch": 0.44, + "grad_norm": 5.125593807242373, + "learning_rate": 6.231708447176073e-06, + "loss": 0.8292, + "step": 4989 + }, + { + "epoch": 0.44, + "grad_norm": 9.276833322241663, + "learning_rate": 6.230329921115823e-06, + "loss": 0.6378, + "step": 4990 + }, + { + "epoch": 0.44, + "grad_norm": 5.369179148607482, + "learning_rate": 6.228951295499319e-06, + "loss": 0.7019, + "step": 4991 + }, + { + "epoch": 0.44, + "grad_norm": 6.579257022243241, + "learning_rate": 6.227572570438123e-06, + "loss": 0.6831, + "step": 4992 + }, + { + "epoch": 0.44, + "grad_norm": 10.359794414083645, + "learning_rate": 6.226193746043797e-06, + "loss": 0.6369, + "step": 4993 + }, + { + "epoch": 0.44, + "grad_norm": 6.235665764195764, + "learning_rate": 6.224814822427911e-06, + "loss": 0.6848, + "step": 4994 + }, + { + "epoch": 0.44, + "grad_norm": 17.333455071146826, + "learning_rate": 6.2234357997020475e-06, + "loss": 0.7088, + "step": 4995 + }, + { + "epoch": 0.44, + "grad_norm": 5.67704894833611, + "learning_rate": 6.222056677977795e-06, + "loss": 0.8098, + "step": 4996 + }, + { + "epoch": 0.44, + "grad_norm": 13.324332959563606, + "learning_rate": 6.220677457366747e-06, + "loss": 0.9315, + "step": 4997 + }, + { + "epoch": 0.44, + "grad_norm": 6.839316363418515, + "learning_rate": 6.219298137980512e-06, + "loss": 0.7421, + "step": 4998 + }, + { + "epoch": 0.44, + "grad_norm": 8.980373873338406, + "learning_rate": 6.217918719930698e-06, + "loss": 
0.728, + "step": 4999 + }, + { + "epoch": 0.44, + "grad_norm": 7.063152388985364, + "learning_rate": 6.216539203328927e-06, + "loss": 0.7366, + "step": 5000 + }, + { + "epoch": 0.44, + "grad_norm": 3.7850384594387916, + "learning_rate": 6.215159588286829e-06, + "loss": 0.5337, + "step": 5001 + }, + { + "epoch": 0.44, + "grad_norm": 8.838737244958491, + "learning_rate": 6.2137798749160375e-06, + "loss": 0.8057, + "step": 5002 + }, + { + "epoch": 0.44, + "grad_norm": 7.06526139903672, + "learning_rate": 6.212400063328196e-06, + "loss": 0.7333, + "step": 5003 + }, + { + "epoch": 0.44, + "grad_norm": 10.349690908321215, + "learning_rate": 6.211020153634961e-06, + "loss": 0.73, + "step": 5004 + }, + { + "epoch": 0.44, + "grad_norm": 9.766302565697558, + "learning_rate": 6.209640145947988e-06, + "loss": 0.8504, + "step": 5005 + }, + { + "epoch": 0.44, + "grad_norm": 16.931751861505855, + "learning_rate": 6.208260040378946e-06, + "loss": 0.9255, + "step": 5006 + }, + { + "epoch": 0.44, + "grad_norm": 8.417659040296, + "learning_rate": 6.206879837039513e-06, + "loss": 0.9139, + "step": 5007 + }, + { + "epoch": 0.44, + "grad_norm": 8.756262752404679, + "learning_rate": 6.205499536041371e-06, + "loss": 0.8049, + "step": 5008 + }, + { + "epoch": 0.44, + "grad_norm": 6.225378982333944, + "learning_rate": 6.2041191374962096e-06, + "loss": 0.7299, + "step": 5009 + }, + { + "epoch": 0.44, + "grad_norm": 7.572027673927364, + "learning_rate": 6.202738641515732e-06, + "loss": 0.9322, + "step": 5010 + }, + { + "epoch": 0.44, + "grad_norm": 10.572830891273744, + "learning_rate": 6.201358048211643e-06, + "loss": 0.7037, + "step": 5011 + }, + { + "epoch": 0.44, + "grad_norm": 8.823890245048766, + "learning_rate": 6.199977357695661e-06, + "loss": 0.7557, + "step": 5012 + }, + { + "epoch": 0.44, + "grad_norm": 7.325564835076637, + "learning_rate": 6.198596570079507e-06, + "loss": 0.7057, + "step": 5013 + }, + { + "epoch": 0.44, + "grad_norm": 10.294143541419222, + "learning_rate": 6.197215685474911e-06, + "loss": 0.7573, + "step": 5014 + }, + { + "epoch": 0.44, + "grad_norm": 9.030496638660706, + "learning_rate": 6.195834703993615e-06, + "loss": 0.8274, + "step": 5015 + }, + { + "epoch": 0.44, + "grad_norm": 5.176252043384722, + "learning_rate": 6.194453625747363e-06, + "loss": 0.7094, + "step": 5016 + }, + { + "epoch": 0.44, + "grad_norm": 6.033931433906542, + "learning_rate": 6.193072450847909e-06, + "loss": 0.9337, + "step": 5017 + }, + { + "epoch": 0.44, + "grad_norm": 7.367609106795373, + "learning_rate": 6.191691179407019e-06, + "loss": 0.7314, + "step": 5018 + }, + { + "epoch": 0.44, + "grad_norm": 12.269137302610964, + "learning_rate": 6.1903098115364615e-06, + "loss": 0.7374, + "step": 5019 + }, + { + "epoch": 0.44, + "grad_norm": 8.417173201123397, + "learning_rate": 6.188928347348011e-06, + "loss": 0.7133, + "step": 5020 + }, + { + "epoch": 0.44, + "grad_norm": 11.411888713744268, + "learning_rate": 6.187546786953459e-06, + "loss": 0.7823, + "step": 5021 + }, + { + "epoch": 0.44, + "grad_norm": 4.759922618555399, + "learning_rate": 6.1861651304645955e-06, + "loss": 0.5994, + "step": 5022 + }, + { + "epoch": 0.44, + "grad_norm": 6.272591092271707, + "learning_rate": 6.184783377993222e-06, + "loss": 0.8748, + "step": 5023 + }, + { + "epoch": 0.44, + "grad_norm": 8.190664324592984, + "learning_rate": 6.18340152965115e-06, + "loss": 0.8176, + "step": 5024 + }, + { + "epoch": 0.44, + "grad_norm": 9.384349388117244, + "learning_rate": 6.182019585550194e-06, + "loss": 0.8, + "step": 5025 + }, + { + "epoch": 
0.44, + "grad_norm": 8.62698715289998, + "learning_rate": 6.180637545802178e-06, + "loss": 0.7866, + "step": 5026 + }, + { + "epoch": 0.44, + "grad_norm": 11.268971794306513, + "learning_rate": 6.179255410518937e-06, + "loss": 0.7893, + "step": 5027 + }, + { + "epoch": 0.44, + "grad_norm": 13.019464363899898, + "learning_rate": 6.1778731798123105e-06, + "loss": 0.8442, + "step": 5028 + }, + { + "epoch": 0.44, + "grad_norm": 8.838802183471955, + "learning_rate": 6.176490853794144e-06, + "loss": 0.7917, + "step": 5029 + }, + { + "epoch": 0.44, + "grad_norm": 7.543077721643445, + "learning_rate": 6.1751084325762954e-06, + "loss": 0.7238, + "step": 5030 + }, + { + "epoch": 0.44, + "grad_norm": 11.40314931988829, + "learning_rate": 6.1737259162706265e-06, + "loss": 0.7482, + "step": 5031 + }, + { + "epoch": 0.44, + "grad_norm": 8.700389363235251, + "learning_rate": 6.172343304989009e-06, + "loss": 0.7972, + "step": 5032 + }, + { + "epoch": 0.44, + "grad_norm": 7.06593871315047, + "learning_rate": 6.170960598843323e-06, + "loss": 0.7224, + "step": 5033 + }, + { + "epoch": 0.44, + "grad_norm": 5.911120881928557, + "learning_rate": 6.1695777979454506e-06, + "loss": 0.8282, + "step": 5034 + }, + { + "epoch": 0.44, + "grad_norm": 3.113504361572462, + "learning_rate": 6.16819490240729e-06, + "loss": 0.4891, + "step": 5035 + }, + { + "epoch": 0.44, + "grad_norm": 2.0904448084365557, + "learning_rate": 6.166811912340741e-06, + "loss": 0.5116, + "step": 5036 + }, + { + "epoch": 0.44, + "grad_norm": 10.976894473400991, + "learning_rate": 6.1654288278577114e-06, + "loss": 0.7973, + "step": 5037 + }, + { + "epoch": 0.44, + "grad_norm": 22.166436766719873, + "learning_rate": 6.164045649070122e-06, + "loss": 0.8274, + "step": 5038 + }, + { + "epoch": 0.44, + "grad_norm": 5.322631901929777, + "learning_rate": 6.162662376089894e-06, + "loss": 0.8451, + "step": 5039 + }, + { + "epoch": 0.44, + "grad_norm": 5.5095045342063385, + "learning_rate": 6.1612790090289585e-06, + "loss": 0.6507, + "step": 5040 + }, + { + "epoch": 0.44, + "grad_norm": 7.947007320181074, + "learning_rate": 6.159895547999259e-06, + "loss": 0.7963, + "step": 5041 + }, + { + "epoch": 0.44, + "grad_norm": 14.95375823794031, + "learning_rate": 6.158511993112743e-06, + "loss": 0.7735, + "step": 5042 + }, + { + "epoch": 0.44, + "grad_norm": 6.150514918600729, + "learning_rate": 6.15712834448136e-06, + "loss": 0.9481, + "step": 5043 + }, + { + "epoch": 0.44, + "grad_norm": 10.819380059078467, + "learning_rate": 6.1557446022170785e-06, + "loss": 0.7925, + "step": 5044 + }, + { + "epoch": 0.44, + "grad_norm": 8.19465890995624, + "learning_rate": 6.154360766431866e-06, + "loss": 0.8404, + "step": 5045 + }, + { + "epoch": 0.44, + "grad_norm": 2.5992385558721387, + "learning_rate": 6.152976837237699e-06, + "loss": 0.4909, + "step": 5046 + }, + { + "epoch": 0.44, + "grad_norm": 8.212952366319465, + "learning_rate": 6.1515928147465665e-06, + "loss": 0.7992, + "step": 5047 + }, + { + "epoch": 0.44, + "grad_norm": 6.951617730836396, + "learning_rate": 6.150208699070458e-06, + "loss": 0.6801, + "step": 5048 + }, + { + "epoch": 0.44, + "grad_norm": 13.42730922645401, + "learning_rate": 6.1488244903213745e-06, + "loss": 0.7687, + "step": 5049 + }, + { + "epoch": 0.44, + "grad_norm": 9.916043751155554, + "learning_rate": 6.147440188611324e-06, + "loss": 0.8815, + "step": 5050 + }, + { + "epoch": 0.44, + "grad_norm": 7.175278003582041, + "learning_rate": 6.146055794052323e-06, + "loss": 0.6919, + "step": 5051 + }, + { + "epoch": 0.44, + "grad_norm": 
5.04846735174646, + "learning_rate": 6.144671306756393e-06, + "loss": 0.6777, + "step": 5052 + }, + { + "epoch": 0.44, + "grad_norm": 6.947623251747073, + "learning_rate": 6.1432867268355654e-06, + "loss": 0.9298, + "step": 5053 + }, + { + "epoch": 0.44, + "grad_norm": 7.200168322710256, + "learning_rate": 6.141902054401876e-06, + "loss": 0.9459, + "step": 5054 + }, + { + "epoch": 0.44, + "grad_norm": 8.98825224262858, + "learning_rate": 6.140517289567373e-06, + "loss": 0.8304, + "step": 5055 + }, + { + "epoch": 0.44, + "grad_norm": 6.210883090264091, + "learning_rate": 6.13913243244411e-06, + "loss": 0.8033, + "step": 5056 + }, + { + "epoch": 0.44, + "grad_norm": 6.065555617422564, + "learning_rate": 6.137747483144142e-06, + "loss": 0.847, + "step": 5057 + }, + { + "epoch": 0.44, + "grad_norm": 6.376055645512641, + "learning_rate": 6.136362441779542e-06, + "loss": 0.6119, + "step": 5058 + }, + { + "epoch": 0.44, + "grad_norm": 5.5416531156793205, + "learning_rate": 6.134977308462383e-06, + "loss": 0.7734, + "step": 5059 + }, + { + "epoch": 0.44, + "grad_norm": 4.439787192252617, + "learning_rate": 6.133592083304748e-06, + "loss": 0.7693, + "step": 5060 + }, + { + "epoch": 0.44, + "grad_norm": 20.608248082179042, + "learning_rate": 6.132206766418728e-06, + "loss": 0.8453, + "step": 5061 + }, + { + "epoch": 0.44, + "grad_norm": 6.118820072057558, + "learning_rate": 6.13082135791642e-06, + "loss": 0.7753, + "step": 5062 + }, + { + "epoch": 0.44, + "grad_norm": 7.951646144980656, + "learning_rate": 6.129435857909927e-06, + "loss": 0.7379, + "step": 5063 + }, + { + "epoch": 0.44, + "grad_norm": 7.7254308304221, + "learning_rate": 6.128050266511365e-06, + "loss": 0.7136, + "step": 5064 + }, + { + "epoch": 0.44, + "grad_norm": 7.66093943263933, + "learning_rate": 6.126664583832851e-06, + "loss": 0.8659, + "step": 5065 + }, + { + "epoch": 0.44, + "grad_norm": 7.930309202289793, + "learning_rate": 6.125278809986512e-06, + "loss": 0.926, + "step": 5066 + }, + { + "epoch": 0.44, + "grad_norm": 10.188429626875068, + "learning_rate": 6.123892945084486e-06, + "loss": 0.7729, + "step": 5067 + }, + { + "epoch": 0.45, + "grad_norm": 8.487341987183875, + "learning_rate": 6.122506989238911e-06, + "loss": 0.8196, + "step": 5068 + }, + { + "epoch": 0.45, + "grad_norm": 9.03241864523877, + "learning_rate": 6.1211209425619366e-06, + "loss": 0.8028, + "step": 5069 + }, + { + "epoch": 0.45, + "grad_norm": 8.190237423382857, + "learning_rate": 6.119734805165721e-06, + "loss": 0.8172, + "step": 5070 + }, + { + "epoch": 0.45, + "grad_norm": 12.891434487615003, + "learning_rate": 6.118348577162428e-06, + "loss": 0.9686, + "step": 5071 + }, + { + "epoch": 0.45, + "grad_norm": 15.098087470195377, + "learning_rate": 6.116962258664228e-06, + "loss": 0.9392, + "step": 5072 + }, + { + "epoch": 0.45, + "grad_norm": 15.08437653273864, + "learning_rate": 6.115575849783302e-06, + "loss": 0.6589, + "step": 5073 + }, + { + "epoch": 0.45, + "grad_norm": 8.377537295338733, + "learning_rate": 6.11418935063183e-06, + "loss": 0.8638, + "step": 5074 + }, + { + "epoch": 0.45, + "grad_norm": 11.053237634263073, + "learning_rate": 6.112802761322012e-06, + "loss": 0.8506, + "step": 5075 + }, + { + "epoch": 0.45, + "grad_norm": 6.391092318656821, + "learning_rate": 6.111416081966046e-06, + "loss": 0.8791, + "step": 5076 + }, + { + "epoch": 0.45, + "grad_norm": 10.927803436194706, + "learning_rate": 6.110029312676138e-06, + "loss": 0.7618, + "step": 5077 + }, + { + "epoch": 0.45, + "grad_norm": 26.705632991289967, + "learning_rate": 
6.108642453564505e-06, + "loss": 0.7559, + "step": 5078 + }, + { + "epoch": 0.45, + "grad_norm": 6.27794448258325, + "learning_rate": 6.107255504743369e-06, + "loss": 0.7004, + "step": 5079 + }, + { + "epoch": 0.45, + "grad_norm": 10.86680032735345, + "learning_rate": 6.105868466324958e-06, + "loss": 0.5901, + "step": 5080 + }, + { + "epoch": 0.45, + "grad_norm": 18.41788959063535, + "learning_rate": 6.104481338421512e-06, + "loss": 0.8561, + "step": 5081 + }, + { + "epoch": 0.45, + "grad_norm": 7.547331918297749, + "learning_rate": 6.103094121145272e-06, + "loss": 0.7521, + "step": 5082 + }, + { + "epoch": 0.45, + "grad_norm": 10.129866572849224, + "learning_rate": 6.10170681460849e-06, + "loss": 0.7206, + "step": 5083 + }, + { + "epoch": 0.45, + "grad_norm": 19.27651016093193, + "learning_rate": 6.1003194189234275e-06, + "loss": 0.7332, + "step": 5084 + }, + { + "epoch": 0.45, + "grad_norm": 9.041505704451103, + "learning_rate": 6.098931934202347e-06, + "loss": 0.7188, + "step": 5085 + }, + { + "epoch": 0.45, + "grad_norm": 7.654603872656157, + "learning_rate": 6.097544360557521e-06, + "loss": 0.8412, + "step": 5086 + }, + { + "epoch": 0.45, + "grad_norm": 11.48934517592477, + "learning_rate": 6.096156698101232e-06, + "loss": 0.9779, + "step": 5087 + }, + { + "epoch": 0.45, + "grad_norm": 11.475957683306431, + "learning_rate": 6.094768946945767e-06, + "loss": 0.8512, + "step": 5088 + }, + { + "epoch": 0.45, + "grad_norm": 9.957662192769588, + "learning_rate": 6.0933811072034185e-06, + "loss": 0.7814, + "step": 5089 + }, + { + "epoch": 0.45, + "grad_norm": 9.262208307478952, + "learning_rate": 6.091993178986491e-06, + "loss": 0.6303, + "step": 5090 + }, + { + "epoch": 0.45, + "grad_norm": 6.74727868746976, + "learning_rate": 6.09060516240729e-06, + "loss": 0.7639, + "step": 5091 + }, + { + "epoch": 0.45, + "grad_norm": 14.157255712740387, + "learning_rate": 6.089217057578135e-06, + "loss": 0.728, + "step": 5092 + }, + { + "epoch": 0.45, + "grad_norm": 9.741368662699651, + "learning_rate": 6.087828864611348e-06, + "loss": 0.8517, + "step": 5093 + }, + { + "epoch": 0.45, + "grad_norm": 3.3422985831715843, + "learning_rate": 6.0864405836192575e-06, + "loss": 0.6278, + "step": 5094 + }, + { + "epoch": 0.45, + "grad_norm": 6.727340188688103, + "learning_rate": 6.085052214714202e-06, + "loss": 0.6909, + "step": 5095 + }, + { + "epoch": 0.45, + "grad_norm": 2.5260815980732128, + "learning_rate": 6.083663758008528e-06, + "loss": 0.5048, + "step": 5096 + }, + { + "epoch": 0.45, + "grad_norm": 11.455849001172068, + "learning_rate": 6.082275213614583e-06, + "loss": 0.8791, + "step": 5097 + }, + { + "epoch": 0.45, + "grad_norm": 7.053771261392564, + "learning_rate": 6.080886581644729e-06, + "loss": 0.688, + "step": 5098 + }, + { + "epoch": 0.45, + "grad_norm": 13.077822665246764, + "learning_rate": 6.079497862211332e-06, + "loss": 0.7646, + "step": 5099 + }, + { + "epoch": 0.45, + "grad_norm": 11.65608766620714, + "learning_rate": 6.078109055426762e-06, + "loss": 0.8301, + "step": 5100 + }, + { + "epoch": 0.45, + "grad_norm": 8.622615315627222, + "learning_rate": 6.076720161403401e-06, + "loss": 0.7117, + "step": 5101 + }, + { + "epoch": 0.45, + "grad_norm": 5.804266112200034, + "learning_rate": 6.0753311802536365e-06, + "loss": 0.682, + "step": 5102 + }, + { + "epoch": 0.45, + "grad_norm": 7.9544042833513995, + "learning_rate": 6.073942112089859e-06, + "loss": 0.8305, + "step": 5103 + }, + { + "epoch": 0.45, + "grad_norm": 10.152492450080448, + "learning_rate": 6.072552957024475e-06, + "loss": 
0.7864, + "step": 5104 + }, + { + "epoch": 0.45, + "grad_norm": 5.569114232632865, + "learning_rate": 6.071163715169889e-06, + "loss": 0.8319, + "step": 5105 + }, + { + "epoch": 0.45, + "grad_norm": 8.58535660660071, + "learning_rate": 6.069774386638516e-06, + "loss": 0.8775, + "step": 5106 + }, + { + "epoch": 0.45, + "grad_norm": 6.683245496963363, + "learning_rate": 6.06838497154278e-06, + "loss": 0.8762, + "step": 5107 + }, + { + "epoch": 0.45, + "grad_norm": 8.028814273988013, + "learning_rate": 6.06699546999511e-06, + "loss": 0.8339, + "step": 5108 + }, + { + "epoch": 0.45, + "grad_norm": 6.101815661584444, + "learning_rate": 6.065605882107939e-06, + "loss": 0.826, + "step": 5109 + }, + { + "epoch": 0.45, + "grad_norm": 8.766182069982312, + "learning_rate": 6.064216207993714e-06, + "loss": 0.7857, + "step": 5110 + }, + { + "epoch": 0.45, + "grad_norm": 2.957172494864544, + "learning_rate": 6.062826447764883e-06, + "loss": 0.568, + "step": 5111 + }, + { + "epoch": 0.45, + "grad_norm": 11.357058487402172, + "learning_rate": 6.061436601533905e-06, + "loss": 0.8482, + "step": 5112 + }, + { + "epoch": 0.45, + "grad_norm": 3.2053284112549014, + "learning_rate": 6.060046669413243e-06, + "loss": 0.5652, + "step": 5113 + }, + { + "epoch": 0.45, + "grad_norm": 7.401043318773031, + "learning_rate": 6.058656651515367e-06, + "loss": 0.8275, + "step": 5114 + }, + { + "epoch": 0.45, + "grad_norm": 2.183498205788231, + "learning_rate": 6.057266547952756e-06, + "loss": 0.5291, + "step": 5115 + }, + { + "epoch": 0.45, + "grad_norm": 9.638876132699277, + "learning_rate": 6.055876358837894e-06, + "loss": 0.6896, + "step": 5116 + }, + { + "epoch": 0.45, + "grad_norm": 8.317420726329354, + "learning_rate": 6.054486084283274e-06, + "loss": 0.7838, + "step": 5117 + }, + { + "epoch": 0.45, + "grad_norm": 12.467598167307523, + "learning_rate": 6.053095724401394e-06, + "loss": 0.9303, + "step": 5118 + }, + { + "epoch": 0.45, + "grad_norm": 8.93139806968995, + "learning_rate": 6.05170527930476e-06, + "loss": 0.7492, + "step": 5119 + }, + { + "epoch": 0.45, + "grad_norm": 7.892452716111707, + "learning_rate": 6.050314749105884e-06, + "loss": 0.8944, + "step": 5120 + }, + { + "epoch": 0.45, + "grad_norm": 2.46321965425462, + "learning_rate": 6.048924133917286e-06, + "loss": 0.5312, + "step": 5121 + }, + { + "epoch": 0.45, + "grad_norm": 12.131103864808562, + "learning_rate": 6.0475334338514915e-06, + "loss": 0.7407, + "step": 5122 + }, + { + "epoch": 0.45, + "grad_norm": 3.48152493943969, + "learning_rate": 6.046142649021033e-06, + "loss": 0.5472, + "step": 5123 + }, + { + "epoch": 0.45, + "grad_norm": 5.454030896702557, + "learning_rate": 6.044751779538453e-06, + "loss": 0.6311, + "step": 5124 + }, + { + "epoch": 0.45, + "grad_norm": 7.696515068172094, + "learning_rate": 6.043360825516297e-06, + "loss": 0.887, + "step": 5125 + }, + { + "epoch": 0.45, + "grad_norm": 9.426449562347846, + "learning_rate": 6.041969787067117e-06, + "loss": 0.9064, + "step": 5126 + }, + { + "epoch": 0.45, + "grad_norm": 6.704065774969496, + "learning_rate": 6.040578664303476e-06, + "loss": 0.6378, + "step": 5127 + }, + { + "epoch": 0.45, + "grad_norm": 2.4686749954071145, + "learning_rate": 6.03918745733794e-06, + "loss": 0.5424, + "step": 5128 + }, + { + "epoch": 0.45, + "grad_norm": 7.619517350260915, + "learning_rate": 6.037796166283083e-06, + "loss": 0.6994, + "step": 5129 + }, + { + "epoch": 0.45, + "grad_norm": 3.8317693834520643, + "learning_rate": 6.036404791251489e-06, + "loss": 0.5877, + "step": 5130 + }, + { + "epoch": 0.45, 
+ "grad_norm": 12.874718086872749, + "learning_rate": 6.035013332355741e-06, + "loss": 0.7381, + "step": 5131 + }, + { + "epoch": 0.45, + "grad_norm": 11.29801498773492, + "learning_rate": 6.033621789708436e-06, + "loss": 0.6883, + "step": 5132 + }, + { + "epoch": 0.45, + "grad_norm": 9.317261473167385, + "learning_rate": 6.0322301634221755e-06, + "loss": 0.8493, + "step": 5133 + }, + { + "epoch": 0.45, + "grad_norm": 7.500248669327494, + "learning_rate": 6.030838453609567e-06, + "loss": 0.8639, + "step": 5134 + }, + { + "epoch": 0.45, + "grad_norm": 6.393221018278091, + "learning_rate": 6.029446660383226e-06, + "loss": 0.6297, + "step": 5135 + }, + { + "epoch": 0.45, + "grad_norm": 10.203457799683086, + "learning_rate": 6.028054783855774e-06, + "loss": 0.732, + "step": 5136 + }, + { + "epoch": 0.45, + "grad_norm": 9.536296221817146, + "learning_rate": 6.026662824139838e-06, + "loss": 0.8054, + "step": 5137 + }, + { + "epoch": 0.45, + "grad_norm": 7.656467567822687, + "learning_rate": 6.025270781348055e-06, + "loss": 0.7152, + "step": 5138 + }, + { + "epoch": 0.45, + "grad_norm": 43.89988901664711, + "learning_rate": 6.023878655593065e-06, + "loss": 0.764, + "step": 5139 + }, + { + "epoch": 0.45, + "grad_norm": 7.399538456978111, + "learning_rate": 6.0224864469875164e-06, + "loss": 0.8551, + "step": 5140 + }, + { + "epoch": 0.45, + "grad_norm": 6.147062045034401, + "learning_rate": 6.021094155644067e-06, + "loss": 0.8091, + "step": 5141 + }, + { + "epoch": 0.45, + "grad_norm": 4.675540786020213, + "learning_rate": 6.019701781675378e-06, + "loss": 0.6068, + "step": 5142 + }, + { + "epoch": 0.45, + "grad_norm": 6.550199165637611, + "learning_rate": 6.018309325194114e-06, + "loss": 0.8099, + "step": 5143 + }, + { + "epoch": 0.45, + "grad_norm": 8.26823163848474, + "learning_rate": 6.0169167863129554e-06, + "loss": 0.645, + "step": 5144 + }, + { + "epoch": 0.45, + "grad_norm": 6.779022230609357, + "learning_rate": 6.0155241651445815e-06, + "loss": 0.7061, + "step": 5145 + }, + { + "epoch": 0.45, + "grad_norm": 8.101683351393257, + "learning_rate": 6.014131461801681e-06, + "loss": 0.7905, + "step": 5146 + }, + { + "epoch": 0.45, + "grad_norm": 11.70551908978532, + "learning_rate": 6.012738676396951e-06, + "loss": 0.8587, + "step": 5147 + }, + { + "epoch": 0.45, + "grad_norm": 8.877443807605555, + "learning_rate": 6.011345809043091e-06, + "loss": 0.6303, + "step": 5148 + }, + { + "epoch": 0.45, + "grad_norm": 8.297255057141667, + "learning_rate": 6.009952859852809e-06, + "loss": 0.7805, + "step": 5149 + }, + { + "epoch": 0.45, + "grad_norm": 4.109596957813839, + "learning_rate": 6.008559828938823e-06, + "loss": 0.7575, + "step": 5150 + }, + { + "epoch": 0.45, + "grad_norm": 6.938887784330655, + "learning_rate": 6.0071667164138525e-06, + "loss": 0.7434, + "step": 5151 + }, + { + "epoch": 0.45, + "grad_norm": 6.3736813584597005, + "learning_rate": 6.0057735223906275e-06, + "loss": 0.656, + "step": 5152 + }, + { + "epoch": 0.45, + "grad_norm": 8.128088984007281, + "learning_rate": 6.0043802469818825e-06, + "loss": 0.6389, + "step": 5153 + }, + { + "epoch": 0.45, + "grad_norm": 19.36272067953935, + "learning_rate": 6.0029868903003575e-06, + "loss": 0.7195, + "step": 5154 + }, + { + "epoch": 0.45, + "grad_norm": 6.6612074649085455, + "learning_rate": 6.001593452458801e-06, + "loss": 0.8444, + "step": 5155 + }, + { + "epoch": 0.45, + "grad_norm": 6.665433270803927, + "learning_rate": 6.000199933569971e-06, + "loss": 0.7219, + "step": 5156 + }, + { + "epoch": 0.45, + "grad_norm": 6.779120250123763, + 
"learning_rate": 5.998806333746624e-06, + "loss": 0.6315, + "step": 5157 + }, + { + "epoch": 0.45, + "grad_norm": 7.868810296620423, + "learning_rate": 5.997412653101531e-06, + "loss": 0.7229, + "step": 5158 + }, + { + "epoch": 0.45, + "grad_norm": 5.059702714277412, + "learning_rate": 5.996018891747465e-06, + "loss": 0.5894, + "step": 5159 + }, + { + "epoch": 0.45, + "grad_norm": 11.20785675680035, + "learning_rate": 5.994625049797206e-06, + "loss": 0.5829, + "step": 5160 + }, + { + "epoch": 0.45, + "grad_norm": 9.342157615255946, + "learning_rate": 5.993231127363544e-06, + "loss": 0.8717, + "step": 5161 + }, + { + "epoch": 0.45, + "grad_norm": 3.8589890441820462, + "learning_rate": 5.99183712455927e-06, + "loss": 0.5672, + "step": 5162 + }, + { + "epoch": 0.45, + "grad_norm": 7.587584971062362, + "learning_rate": 5.9904430414971855e-06, + "loss": 0.7709, + "step": 5163 + }, + { + "epoch": 0.45, + "grad_norm": 6.350872004622523, + "learning_rate": 5.989048878290099e-06, + "loss": 0.879, + "step": 5164 + }, + { + "epoch": 0.45, + "grad_norm": 8.631942243608044, + "learning_rate": 5.987654635050822e-06, + "loss": 0.8166, + "step": 5165 + }, + { + "epoch": 0.45, + "grad_norm": 5.89851584472512, + "learning_rate": 5.986260311892175e-06, + "loss": 0.6787, + "step": 5166 + }, + { + "epoch": 0.45, + "grad_norm": 7.145982783802666, + "learning_rate": 5.984865908926983e-06, + "loss": 0.7875, + "step": 5167 + }, + { + "epoch": 0.45, + "grad_norm": 3.963785506087981, + "learning_rate": 5.9834714262680816e-06, + "loss": 0.6734, + "step": 5168 + }, + { + "epoch": 0.45, + "grad_norm": 6.793545789991069, + "learning_rate": 5.982076864028307e-06, + "loss": 0.7746, + "step": 5169 + }, + { + "epoch": 0.45, + "grad_norm": 8.813027330704479, + "learning_rate": 5.9806822223205065e-06, + "loss": 0.8409, + "step": 5170 + }, + { + "epoch": 0.45, + "grad_norm": 10.266942251432903, + "learning_rate": 5.979287501257531e-06, + "loss": 0.7116, + "step": 5171 + }, + { + "epoch": 0.45, + "grad_norm": 5.845915078696601, + "learning_rate": 5.977892700952241e-06, + "loss": 0.7562, + "step": 5172 + }, + { + "epoch": 0.45, + "grad_norm": 6.906865160295786, + "learning_rate": 5.9764978215175015e-06, + "loss": 0.8146, + "step": 5173 + }, + { + "epoch": 0.45, + "grad_norm": 15.373271626275185, + "learning_rate": 5.97510286306618e-06, + "loss": 0.9105, + "step": 5174 + }, + { + "epoch": 0.45, + "grad_norm": 7.706708254274169, + "learning_rate": 5.973707825711158e-06, + "loss": 0.8751, + "step": 5175 + }, + { + "epoch": 0.45, + "grad_norm": 15.202601072291666, + "learning_rate": 5.972312709565319e-06, + "loss": 0.7004, + "step": 5176 + }, + { + "epoch": 0.45, + "grad_norm": 12.537590435971556, + "learning_rate": 5.9709175147415524e-06, + "loss": 0.8946, + "step": 5177 + }, + { + "epoch": 0.45, + "grad_norm": 6.429511396743959, + "learning_rate": 5.969522241352755e-06, + "loss": 0.7823, + "step": 5178 + }, + { + "epoch": 0.45, + "grad_norm": 2.8115623557121596, + "learning_rate": 5.968126889511832e-06, + "loss": 0.6979, + "step": 5179 + }, + { + "epoch": 0.45, + "grad_norm": 25.459118981767702, + "learning_rate": 5.9667314593316895e-06, + "loss": 0.7318, + "step": 5180 + }, + { + "epoch": 0.46, + "grad_norm": 7.427481924025928, + "learning_rate": 5.965335950925246e-06, + "loss": 0.8651, + "step": 5181 + }, + { + "epoch": 0.46, + "grad_norm": 8.283641952975566, + "learning_rate": 5.963940364405425e-06, + "loss": 0.8238, + "step": 5182 + }, + { + "epoch": 0.46, + "grad_norm": 5.166315243185707, + "learning_rate": 
5.962544699885151e-06, + "loss": 0.802, + "step": 5183 + }, + { + "epoch": 0.46, + "grad_norm": 10.945080476530674, + "learning_rate": 5.9611489574773615e-06, + "loss": 0.6692, + "step": 5184 + }, + { + "epoch": 0.46, + "grad_norm": 5.403805677097654, + "learning_rate": 5.959753137294998e-06, + "loss": 0.8312, + "step": 5185 + }, + { + "epoch": 0.46, + "grad_norm": 16.987865824897806, + "learning_rate": 5.958357239451006e-06, + "loss": 0.6242, + "step": 5186 + }, + { + "epoch": 0.46, + "grad_norm": 8.046401900621712, + "learning_rate": 5.956961264058341e-06, + "loss": 0.8677, + "step": 5187 + }, + { + "epoch": 0.46, + "grad_norm": 5.613372104676428, + "learning_rate": 5.955565211229962e-06, + "loss": 0.6834, + "step": 5188 + }, + { + "epoch": 0.46, + "grad_norm": 2.6545311642686413, + "learning_rate": 5.954169081078835e-06, + "loss": 0.6089, + "step": 5189 + }, + { + "epoch": 0.46, + "grad_norm": 20.270984227654722, + "learning_rate": 5.9527728737179334e-06, + "loss": 0.911, + "step": 5190 + }, + { + "epoch": 0.46, + "grad_norm": 9.254242332919155, + "learning_rate": 5.951376589260236e-06, + "loss": 0.7668, + "step": 5191 + }, + { + "epoch": 0.46, + "grad_norm": 2.1414896350419776, + "learning_rate": 5.949980227818728e-06, + "loss": 0.5113, + "step": 5192 + }, + { + "epoch": 0.46, + "grad_norm": 9.257697232555552, + "learning_rate": 5.9485837895064e-06, + "loss": 0.7292, + "step": 5193 + }, + { + "epoch": 0.46, + "grad_norm": 8.281891921967816, + "learning_rate": 5.947187274436249e-06, + "loss": 0.6699, + "step": 5194 + }, + { + "epoch": 0.46, + "grad_norm": 9.053743900176357, + "learning_rate": 5.94579068272128e-06, + "loss": 0.7903, + "step": 5195 + }, + { + "epoch": 0.46, + "grad_norm": 8.08744586902519, + "learning_rate": 5.944394014474503e-06, + "loss": 0.7061, + "step": 5196 + }, + { + "epoch": 0.46, + "grad_norm": 3.1632164079335596, + "learning_rate": 5.942997269808931e-06, + "loss": 0.5404, + "step": 5197 + }, + { + "epoch": 0.46, + "grad_norm": 8.427068358378092, + "learning_rate": 5.94160044883759e-06, + "loss": 0.7312, + "step": 5198 + }, + { + "epoch": 0.46, + "grad_norm": 9.733132280089936, + "learning_rate": 5.9402035516735066e-06, + "loss": 0.774, + "step": 5199 + }, + { + "epoch": 0.46, + "grad_norm": 8.018753340625807, + "learning_rate": 5.938806578429716e-06, + "loss": 0.7726, + "step": 5200 + }, + { + "epoch": 0.46, + "grad_norm": 11.195435264566363, + "learning_rate": 5.937409529219259e-06, + "loss": 0.7522, + "step": 5201 + }, + { + "epoch": 0.46, + "grad_norm": 7.959556792830696, + "learning_rate": 5.936012404155182e-06, + "loss": 0.7849, + "step": 5202 + }, + { + "epoch": 0.46, + "grad_norm": 9.063775938095015, + "learning_rate": 5.934615203350538e-06, + "loss": 0.7299, + "step": 5203 + }, + { + "epoch": 0.46, + "grad_norm": 7.67339900364958, + "learning_rate": 5.933217926918386e-06, + "loss": 0.7617, + "step": 5204 + }, + { + "epoch": 0.46, + "grad_norm": 6.249222908835862, + "learning_rate": 5.931820574971792e-06, + "loss": 0.7365, + "step": 5205 + }, + { + "epoch": 0.46, + "grad_norm": 5.2257997143143955, + "learning_rate": 5.930423147623827e-06, + "loss": 1.0064, + "step": 5206 + }, + { + "epoch": 0.46, + "grad_norm": 9.094700895015153, + "learning_rate": 5.929025644987569e-06, + "loss": 0.6959, + "step": 5207 + }, + { + "epoch": 0.46, + "grad_norm": 15.875771855069688, + "learning_rate": 5.927628067176101e-06, + "loss": 0.6792, + "step": 5208 + }, + { + "epoch": 0.46, + "grad_norm": 12.304541367874233, + "learning_rate": 5.926230414302512e-06, + "loss": 0.8794, 
+ "step": 5209 + }, + { + "epoch": 0.46, + "grad_norm": 10.625057075796002, + "learning_rate": 5.9248326864799e-06, + "loss": 0.8772, + "step": 5210 + }, + { + "epoch": 0.46, + "grad_norm": 6.9343876267659414, + "learning_rate": 5.923434883821364e-06, + "loss": 0.872, + "step": 5211 + }, + { + "epoch": 0.46, + "grad_norm": 3.0778574873714324, + "learning_rate": 5.922037006440015e-06, + "loss": 0.5461, + "step": 5212 + }, + { + "epoch": 0.46, + "grad_norm": 11.452428566445706, + "learning_rate": 5.920639054448963e-06, + "loss": 0.733, + "step": 5213 + }, + { + "epoch": 0.46, + "grad_norm": 8.057945595159216, + "learning_rate": 5.9192410279613314e-06, + "loss": 0.7592, + "step": 5214 + }, + { + "epoch": 0.46, + "grad_norm": 8.577751805430829, + "learning_rate": 5.9178429270902445e-06, + "loss": 0.9348, + "step": 5215 + }, + { + "epoch": 0.46, + "grad_norm": 8.134230086061391, + "learning_rate": 5.916444751948835e-06, + "loss": 0.6791, + "step": 5216 + }, + { + "epoch": 0.46, + "grad_norm": 9.671310202075558, + "learning_rate": 5.9150465026502405e-06, + "loss": 0.7413, + "step": 5217 + }, + { + "epoch": 0.46, + "grad_norm": 4.914393536684944, + "learning_rate": 5.913648179307605e-06, + "loss": 0.9018, + "step": 5218 + }, + { + "epoch": 0.46, + "grad_norm": 5.0661238286352805, + "learning_rate": 5.91224978203408e-06, + "loss": 0.6539, + "step": 5219 + }, + { + "epoch": 0.46, + "grad_norm": 15.035145121192203, + "learning_rate": 5.910851310942818e-06, + "loss": 0.9238, + "step": 5220 + }, + { + "epoch": 0.46, + "grad_norm": 4.0959246641431815, + "learning_rate": 5.909452766146984e-06, + "loss": 0.6348, + "step": 5221 + }, + { + "epoch": 0.46, + "grad_norm": 8.065825725744517, + "learning_rate": 5.908054147759745e-06, + "loss": 0.7819, + "step": 5222 + }, + { + "epoch": 0.46, + "grad_norm": 11.230193474941776, + "learning_rate": 5.906655455894275e-06, + "loss": 0.8451, + "step": 5223 + }, + { + "epoch": 0.46, + "grad_norm": 7.096055236721539, + "learning_rate": 5.905256690663753e-06, + "loss": 0.7107, + "step": 5224 + }, + { + "epoch": 0.46, + "grad_norm": 8.813627359193081, + "learning_rate": 5.903857852181366e-06, + "loss": 0.7072, + "step": 5225 + }, + { + "epoch": 0.46, + "grad_norm": 5.134849463673547, + "learning_rate": 5.902458940560304e-06, + "loss": 0.7884, + "step": 5226 + }, + { + "epoch": 0.46, + "grad_norm": 28.66980212874968, + "learning_rate": 5.901059955913767e-06, + "loss": 0.9034, + "step": 5227 + }, + { + "epoch": 0.46, + "grad_norm": 6.870012142203137, + "learning_rate": 5.8996608983549575e-06, + "loss": 0.7846, + "step": 5228 + }, + { + "epoch": 0.46, + "grad_norm": 11.298758953383958, + "learning_rate": 5.898261767997082e-06, + "loss": 0.7597, + "step": 5229 + }, + { + "epoch": 0.46, + "grad_norm": 2.617801432762764, + "learning_rate": 5.8968625649533605e-06, + "loss": 0.5487, + "step": 5230 + }, + { + "epoch": 0.46, + "grad_norm": 6.965653812005439, + "learning_rate": 5.895463289337011e-06, + "loss": 0.8071, + "step": 5231 + }, + { + "epoch": 0.46, + "grad_norm": 12.514464129183311, + "learning_rate": 5.8940639412612635e-06, + "loss": 0.8548, + "step": 5232 + }, + { + "epoch": 0.46, + "grad_norm": 14.526799322542034, + "learning_rate": 5.892664520839348e-06, + "loss": 0.7777, + "step": 5233 + }, + { + "epoch": 0.46, + "grad_norm": 6.440564857315899, + "learning_rate": 5.891265028184504e-06, + "loss": 0.8458, + "step": 5234 + }, + { + "epoch": 0.46, + "grad_norm": 8.563091043240808, + "learning_rate": 5.889865463409977e-06, + "loss": 0.7901, + "step": 5235 + }, + { + 
"epoch": 0.46, + "grad_norm": 8.769006942180258, + "learning_rate": 5.888465826629018e-06, + "loss": 0.7795, + "step": 5236 + }, + { + "epoch": 0.46, + "grad_norm": 7.84440723406311, + "learning_rate": 5.88706611795488e-06, + "loss": 0.7166, + "step": 5237 + }, + { + "epoch": 0.46, + "grad_norm": 2.442264668901082, + "learning_rate": 5.885666337500828e-06, + "loss": 0.556, + "step": 5238 + }, + { + "epoch": 0.46, + "grad_norm": 11.781612705805035, + "learning_rate": 5.88426648538013e-06, + "loss": 0.7889, + "step": 5239 + }, + { + "epoch": 0.46, + "grad_norm": 8.225542939789769, + "learning_rate": 5.882866561706058e-06, + "loss": 0.7035, + "step": 5240 + }, + { + "epoch": 0.46, + "grad_norm": 10.76097632215231, + "learning_rate": 5.8814665665918935e-06, + "loss": 0.6207, + "step": 5241 + }, + { + "epoch": 0.46, + "grad_norm": 8.776860471640921, + "learning_rate": 5.880066500150921e-06, + "loss": 0.7558, + "step": 5242 + }, + { + "epoch": 0.46, + "grad_norm": 12.038627233053912, + "learning_rate": 5.878666362496428e-06, + "loss": 0.7017, + "step": 5243 + }, + { + "epoch": 0.46, + "grad_norm": 10.277233981192705, + "learning_rate": 5.877266153741717e-06, + "loss": 0.7236, + "step": 5244 + }, + { + "epoch": 0.46, + "grad_norm": 6.668059655072398, + "learning_rate": 5.8758658740000885e-06, + "loss": 0.7657, + "step": 5245 + }, + { + "epoch": 0.46, + "grad_norm": 10.41969593585232, + "learning_rate": 5.874465523384849e-06, + "loss": 0.8061, + "step": 5246 + }, + { + "epoch": 0.46, + "grad_norm": 8.4723825910308, + "learning_rate": 5.873065102009315e-06, + "loss": 0.6488, + "step": 5247 + }, + { + "epoch": 0.46, + "grad_norm": 7.384448706054025, + "learning_rate": 5.871664609986804e-06, + "loss": 0.6081, + "step": 5248 + }, + { + "epoch": 0.46, + "grad_norm": 6.665965025434519, + "learning_rate": 5.870264047430645e-06, + "loss": 0.7831, + "step": 5249 + }, + { + "epoch": 0.46, + "grad_norm": 6.801337858567972, + "learning_rate": 5.868863414454166e-06, + "loss": 0.6529, + "step": 5250 + }, + { + "epoch": 0.46, + "grad_norm": 12.368544211830372, + "learning_rate": 5.867462711170703e-06, + "loss": 0.8346, + "step": 5251 + }, + { + "epoch": 0.46, + "grad_norm": 11.522185315047784, + "learning_rate": 5.866061937693602e-06, + "loss": 0.6619, + "step": 5252 + }, + { + "epoch": 0.46, + "grad_norm": 7.553427054846712, + "learning_rate": 5.864661094136209e-06, + "loss": 0.8811, + "step": 5253 + }, + { + "epoch": 0.46, + "grad_norm": 13.29592966535699, + "learning_rate": 5.863260180611878e-06, + "loss": 0.6662, + "step": 5254 + }, + { + "epoch": 0.46, + "grad_norm": 6.328932438189753, + "learning_rate": 5.8618591972339715e-06, + "loss": 0.6962, + "step": 5255 + }, + { + "epoch": 0.46, + "grad_norm": 11.853216797800535, + "learning_rate": 5.86045814411585e-06, + "loss": 0.7505, + "step": 5256 + }, + { + "epoch": 0.46, + "grad_norm": 11.365663561394715, + "learning_rate": 5.859057021370887e-06, + "loss": 0.6909, + "step": 5257 + }, + { + "epoch": 0.46, + "grad_norm": 25.42592401245823, + "learning_rate": 5.857655829112461e-06, + "loss": 0.8301, + "step": 5258 + }, + { + "epoch": 0.46, + "grad_norm": 7.102035831616182, + "learning_rate": 5.85625456745395e-06, + "loss": 0.7496, + "step": 5259 + }, + { + "epoch": 0.46, + "grad_norm": 7.726009662456242, + "learning_rate": 5.854853236508743e-06, + "loss": 0.8647, + "step": 5260 + }, + { + "epoch": 0.46, + "grad_norm": 8.068990138078798, + "learning_rate": 5.853451836390236e-06, + "loss": 0.6913, + "step": 5261 + }, + { + "epoch": 0.46, + "grad_norm": 
7.223887704057652, + "learning_rate": 5.8520503672118255e-06, + "loss": 0.8264, + "step": 5262 + }, + { + "epoch": 0.46, + "grad_norm": 6.0516342364922515, + "learning_rate": 5.850648829086915e-06, + "loss": 0.7941, + "step": 5263 + }, + { + "epoch": 0.46, + "grad_norm": 9.130050461127441, + "learning_rate": 5.849247222128919e-06, + "loss": 0.7526, + "step": 5264 + }, + { + "epoch": 0.46, + "grad_norm": 5.1062220680711015, + "learning_rate": 5.847845546451248e-06, + "loss": 0.8583, + "step": 5265 + }, + { + "epoch": 0.46, + "grad_norm": 9.896366472080857, + "learning_rate": 5.846443802167326e-06, + "loss": 0.8352, + "step": 5266 + }, + { + "epoch": 0.46, + "grad_norm": 2.3677569860403542, + "learning_rate": 5.845041989390581e-06, + "loss": 0.4682, + "step": 5267 + }, + { + "epoch": 0.46, + "grad_norm": 6.733465462641614, + "learning_rate": 5.843640108234441e-06, + "loss": 0.8279, + "step": 5268 + }, + { + "epoch": 0.46, + "grad_norm": 11.506271125042929, + "learning_rate": 5.84223815881235e-06, + "loss": 0.8024, + "step": 5269 + }, + { + "epoch": 0.46, + "grad_norm": 46.854080619839685, + "learning_rate": 5.8408361412377475e-06, + "loss": 0.6668, + "step": 5270 + }, + { + "epoch": 0.46, + "grad_norm": 9.972428467480235, + "learning_rate": 5.839434055624083e-06, + "loss": 0.8775, + "step": 5271 + }, + { + "epoch": 0.46, + "grad_norm": 5.378496560068299, + "learning_rate": 5.83803190208481e-06, + "loss": 0.8022, + "step": 5272 + }, + { + "epoch": 0.46, + "grad_norm": 7.552139220060958, + "learning_rate": 5.836629680733391e-06, + "loss": 0.8733, + "step": 5273 + }, + { + "epoch": 0.46, + "grad_norm": 2.8204632130903278, + "learning_rate": 5.835227391683288e-06, + "loss": 0.5197, + "step": 5274 + }, + { + "epoch": 0.46, + "grad_norm": 8.619914594257907, + "learning_rate": 5.833825035047975e-06, + "loss": 0.8422, + "step": 5275 + }, + { + "epoch": 0.46, + "grad_norm": 10.872977918717307, + "learning_rate": 5.832422610940927e-06, + "loss": 0.7474, + "step": 5276 + }, + { + "epoch": 0.46, + "grad_norm": 16.576209516682255, + "learning_rate": 5.8310201194756255e-06, + "loss": 0.8729, + "step": 5277 + }, + { + "epoch": 0.46, + "grad_norm": 10.02955066502399, + "learning_rate": 5.829617560765559e-06, + "loss": 0.8593, + "step": 5278 + }, + { + "epoch": 0.46, + "grad_norm": 2.0324482633002368, + "learning_rate": 5.82821493492422e-06, + "loss": 0.4526, + "step": 5279 + }, + { + "epoch": 0.46, + "grad_norm": 7.292536080030734, + "learning_rate": 5.826812242065104e-06, + "loss": 0.8565, + "step": 5280 + }, + { + "epoch": 0.46, + "grad_norm": 8.911906005627541, + "learning_rate": 5.8254094823017195e-06, + "loss": 0.9024, + "step": 5281 + }, + { + "epoch": 0.46, + "grad_norm": 9.726016086522375, + "learning_rate": 5.824006655747571e-06, + "loss": 0.7938, + "step": 5282 + }, + { + "epoch": 0.46, + "grad_norm": 8.523890688809523, + "learning_rate": 5.822603762516173e-06, + "loss": 0.7712, + "step": 5283 + }, + { + "epoch": 0.46, + "grad_norm": 13.29368826149234, + "learning_rate": 5.82120080272105e-06, + "loss": 0.7818, + "step": 5284 + }, + { + "epoch": 0.46, + "grad_norm": 2.7145595259190953, + "learning_rate": 5.819797776475723e-06, + "loss": 0.5203, + "step": 5285 + }, + { + "epoch": 0.46, + "grad_norm": 9.684083418329495, + "learning_rate": 5.818394683893723e-06, + "loss": 0.6348, + "step": 5286 + }, + { + "epoch": 0.46, + "grad_norm": 6.086761046724396, + "learning_rate": 5.816991525088587e-06, + "loss": 0.6746, + "step": 5287 + }, + { + "epoch": 0.46, + "grad_norm": 7.873368555271907, + 
"learning_rate": 5.8155883001738545e-06, + "loss": 0.7218, + "step": 5288 + }, + { + "epoch": 0.46, + "grad_norm": 5.906297609914621, + "learning_rate": 5.814185009263074e-06, + "loss": 0.8597, + "step": 5289 + }, + { + "epoch": 0.46, + "grad_norm": 8.062831751597656, + "learning_rate": 5.812781652469799e-06, + "loss": 0.9627, + "step": 5290 + }, + { + "epoch": 0.46, + "grad_norm": 6.83380571617286, + "learning_rate": 5.811378229907583e-06, + "loss": 0.8317, + "step": 5291 + }, + { + "epoch": 0.46, + "grad_norm": 9.169187412808627, + "learning_rate": 5.80997474168999e-06, + "loss": 0.7901, + "step": 5292 + }, + { + "epoch": 0.46, + "grad_norm": 2.6965177362252155, + "learning_rate": 5.80857118793059e-06, + "loss": 0.5455, + "step": 5293 + }, + { + "epoch": 0.46, + "grad_norm": 12.996009923740825, + "learning_rate": 5.8071675687429505e-06, + "loss": 0.7659, + "step": 5294 + }, + { + "epoch": 0.47, + "grad_norm": 9.69254433882528, + "learning_rate": 5.8057638842406575e-06, + "loss": 0.7574, + "step": 5295 + }, + { + "epoch": 0.47, + "grad_norm": 6.7627242871754065, + "learning_rate": 5.804360134537291e-06, + "loss": 0.7285, + "step": 5296 + }, + { + "epoch": 0.47, + "grad_norm": 5.998570130249606, + "learning_rate": 5.802956319746439e-06, + "loss": 0.7431, + "step": 5297 + }, + { + "epoch": 0.47, + "grad_norm": 16.16831261994843, + "learning_rate": 5.801552439981698e-06, + "loss": 0.6652, + "step": 5298 + }, + { + "epoch": 0.47, + "grad_norm": 7.878956514979974, + "learning_rate": 5.800148495356668e-06, + "loss": 0.7469, + "step": 5299 + }, + { + "epoch": 0.47, + "grad_norm": 6.385271655520846, + "learning_rate": 5.798744485984951e-06, + "loss": 0.8011, + "step": 5300 + }, + { + "epoch": 0.47, + "grad_norm": 7.674047895359349, + "learning_rate": 5.797340411980159e-06, + "loss": 0.8404, + "step": 5301 + }, + { + "epoch": 0.47, + "grad_norm": 6.085419603370786, + "learning_rate": 5.7959362734559075e-06, + "loss": 0.8444, + "step": 5302 + }, + { + "epoch": 0.47, + "grad_norm": 2.474941649221201, + "learning_rate": 5.794532070525817e-06, + "loss": 0.5008, + "step": 5303 + }, + { + "epoch": 0.47, + "grad_norm": 12.067182514978176, + "learning_rate": 5.793127803303513e-06, + "loss": 0.8093, + "step": 5304 + }, + { + "epoch": 0.47, + "grad_norm": 2.976193195625, + "learning_rate": 5.791723471902626e-06, + "loss": 0.5203, + "step": 5305 + }, + { + "epoch": 0.47, + "grad_norm": 7.265172882382275, + "learning_rate": 5.7903190764367925e-06, + "loss": 0.7454, + "step": 5306 + }, + { + "epoch": 0.47, + "grad_norm": 8.033276870513761, + "learning_rate": 5.788914617019655e-06, + "loss": 0.7532, + "step": 5307 + }, + { + "epoch": 0.47, + "grad_norm": 7.078160082162302, + "learning_rate": 5.787510093764857e-06, + "loss": 0.662, + "step": 5308 + }, + { + "epoch": 0.47, + "grad_norm": 6.361317751224247, + "learning_rate": 5.786105506786055e-06, + "loss": 0.8726, + "step": 5309 + }, + { + "epoch": 0.47, + "grad_norm": 21.52396699022316, + "learning_rate": 5.784700856196901e-06, + "loss": 0.8223, + "step": 5310 + }, + { + "epoch": 0.47, + "grad_norm": 6.941427667029811, + "learning_rate": 5.783296142111059e-06, + "loss": 0.7511, + "step": 5311 + }, + { + "epoch": 0.47, + "grad_norm": 13.923091784708811, + "learning_rate": 5.781891364642197e-06, + "loss": 0.852, + "step": 5312 + }, + { + "epoch": 0.47, + "grad_norm": 9.714195672994556, + "learning_rate": 5.780486523903986e-06, + "loss": 0.7993, + "step": 5313 + }, + { + "epoch": 0.47, + "grad_norm": 8.32865003357608, + "learning_rate": 5.779081620010104e-06, + 
"loss": 0.6781, + "step": 5314 + }, + { + "epoch": 0.47, + "grad_norm": 7.427228167754433, + "learning_rate": 5.777676653074234e-06, + "loss": 0.8498, + "step": 5315 + }, + { + "epoch": 0.47, + "grad_norm": 7.3208966474145045, + "learning_rate": 5.776271623210061e-06, + "loss": 0.6572, + "step": 5316 + }, + { + "epoch": 0.47, + "grad_norm": 5.715111629813935, + "learning_rate": 5.77486653053128e-06, + "loss": 0.8093, + "step": 5317 + }, + { + "epoch": 0.47, + "grad_norm": 11.148352883665742, + "learning_rate": 5.773461375151589e-06, + "loss": 0.712, + "step": 5318 + }, + { + "epoch": 0.47, + "grad_norm": 8.258549342713257, + "learning_rate": 5.77205615718469e-06, + "loss": 0.7973, + "step": 5319 + }, + { + "epoch": 0.47, + "grad_norm": 7.257569588944699, + "learning_rate": 5.77065087674429e-06, + "loss": 0.7204, + "step": 5320 + }, + { + "epoch": 0.47, + "grad_norm": 13.095618492714003, + "learning_rate": 5.769245533944104e-06, + "loss": 0.6615, + "step": 5321 + }, + { + "epoch": 0.47, + "grad_norm": 11.318771744128384, + "learning_rate": 5.767840128897849e-06, + "loss": 0.7654, + "step": 5322 + }, + { + "epoch": 0.47, + "grad_norm": 5.628149568222793, + "learning_rate": 5.766434661719247e-06, + "loss": 0.8887, + "step": 5323 + }, + { + "epoch": 0.47, + "grad_norm": 9.963512045673832, + "learning_rate": 5.765029132522029e-06, + "loss": 0.8229, + "step": 5324 + }, + { + "epoch": 0.47, + "grad_norm": 10.937963215409473, + "learning_rate": 5.763623541419925e-06, + "loss": 0.8868, + "step": 5325 + }, + { + "epoch": 0.47, + "grad_norm": 6.539308101419208, + "learning_rate": 5.762217888526673e-06, + "loss": 0.8801, + "step": 5326 + }, + { + "epoch": 0.47, + "grad_norm": 10.066423557999114, + "learning_rate": 5.76081217395602e-06, + "loss": 0.8447, + "step": 5327 + }, + { + "epoch": 0.47, + "grad_norm": 7.119495768210432, + "learning_rate": 5.759406397821709e-06, + "loss": 0.7739, + "step": 5328 + }, + { + "epoch": 0.47, + "grad_norm": 7.228494555074528, + "learning_rate": 5.758000560237496e-06, + "loss": 0.7429, + "step": 5329 + }, + { + "epoch": 0.47, + "grad_norm": 14.744669663011276, + "learning_rate": 5.756594661317139e-06, + "loss": 0.8925, + "step": 5330 + }, + { + "epoch": 0.47, + "grad_norm": 16.933371768433354, + "learning_rate": 5.7551887011744e-06, + "loss": 0.8011, + "step": 5331 + }, + { + "epoch": 0.47, + "grad_norm": 9.218419881957788, + "learning_rate": 5.753782679923049e-06, + "loss": 0.791, + "step": 5332 + }, + { + "epoch": 0.47, + "grad_norm": 5.044364579025961, + "learning_rate": 5.752376597676856e-06, + "loss": 0.7085, + "step": 5333 + }, + { + "epoch": 0.47, + "grad_norm": 4.630580744531248, + "learning_rate": 5.7509704545496e-06, + "loss": 0.8287, + "step": 5334 + }, + { + "epoch": 0.47, + "grad_norm": 19.35658696961735, + "learning_rate": 5.749564250655065e-06, + "loss": 0.8141, + "step": 5335 + }, + { + "epoch": 0.47, + "grad_norm": 4.558708788851448, + "learning_rate": 5.748157986107038e-06, + "loss": 0.7524, + "step": 5336 + }, + { + "epoch": 0.47, + "grad_norm": 11.467873749524255, + "learning_rate": 5.7467516610193095e-06, + "loss": 0.886, + "step": 5337 + }, + { + "epoch": 0.47, + "grad_norm": 2.6232931073760426, + "learning_rate": 5.74534527550568e-06, + "loss": 0.5761, + "step": 5338 + }, + { + "epoch": 0.47, + "grad_norm": 8.457438121657846, + "learning_rate": 5.743938829679952e-06, + "loss": 0.7295, + "step": 5339 + }, + { + "epoch": 0.47, + "grad_norm": 8.323667394025524, + "learning_rate": 5.742532323655928e-06, + "loss": 0.7323, + "step": 5340 + }, + { + 
"epoch": 0.47, + "grad_norm": 8.466584093936214, + "learning_rate": 5.7411257575474265e-06, + "loss": 0.7044, + "step": 5341 + }, + { + "epoch": 0.47, + "grad_norm": 10.306489806754339, + "learning_rate": 5.739719131468261e-06, + "loss": 0.708, + "step": 5342 + }, + { + "epoch": 0.47, + "grad_norm": 6.821898359510716, + "learning_rate": 5.7383124455322525e-06, + "loss": 0.76, + "step": 5343 + }, + { + "epoch": 0.47, + "grad_norm": 21.390571547127934, + "learning_rate": 5.73690569985323e-06, + "loss": 0.8763, + "step": 5344 + }, + { + "epoch": 0.47, + "grad_norm": 6.851171787786047, + "learning_rate": 5.735498894545024e-06, + "loss": 0.8537, + "step": 5345 + }, + { + "epoch": 0.47, + "grad_norm": 12.208779308368724, + "learning_rate": 5.73409202972147e-06, + "loss": 0.8263, + "step": 5346 + }, + { + "epoch": 0.47, + "grad_norm": 7.508953707337459, + "learning_rate": 5.73268510549641e-06, + "loss": 0.837, + "step": 5347 + }, + { + "epoch": 0.47, + "grad_norm": 7.5345422526019545, + "learning_rate": 5.731278121983691e-06, + "loss": 0.7929, + "step": 5348 + }, + { + "epoch": 0.47, + "grad_norm": 8.662377814030295, + "learning_rate": 5.729871079297163e-06, + "loss": 0.7542, + "step": 5349 + }, + { + "epoch": 0.47, + "grad_norm": 7.018463108328036, + "learning_rate": 5.728463977550681e-06, + "loss": 0.8118, + "step": 5350 + }, + { + "epoch": 0.47, + "grad_norm": 12.996414927500087, + "learning_rate": 5.727056816858105e-06, + "loss": 0.9123, + "step": 5351 + }, + { + "epoch": 0.47, + "grad_norm": 10.255471349004672, + "learning_rate": 5.725649597333302e-06, + "loss": 0.8772, + "step": 5352 + }, + { + "epoch": 0.47, + "grad_norm": 9.199266967331736, + "learning_rate": 5.72424231909014e-06, + "loss": 0.8473, + "step": 5353 + }, + { + "epoch": 0.47, + "grad_norm": 6.956075781716095, + "learning_rate": 5.722834982242493e-06, + "loss": 0.77, + "step": 5354 + }, + { + "epoch": 0.47, + "grad_norm": 6.047080225646225, + "learning_rate": 5.721427586904243e-06, + "loss": 0.8105, + "step": 5355 + }, + { + "epoch": 0.47, + "grad_norm": 2.8606328565756214, + "learning_rate": 5.720020133189273e-06, + "loss": 0.5125, + "step": 5356 + }, + { + "epoch": 0.47, + "grad_norm": 5.633223207905812, + "learning_rate": 5.71861262121147e-06, + "loss": 0.8153, + "step": 5357 + }, + { + "epoch": 0.47, + "grad_norm": 8.13435620875894, + "learning_rate": 5.717205051084731e-06, + "loss": 0.8963, + "step": 5358 + }, + { + "epoch": 0.47, + "grad_norm": 7.689723581974963, + "learning_rate": 5.715797422922951e-06, + "loss": 0.7481, + "step": 5359 + }, + { + "epoch": 0.47, + "grad_norm": 7.576643300659309, + "learning_rate": 5.714389736840035e-06, + "loss": 0.7216, + "step": 5360 + }, + { + "epoch": 0.47, + "grad_norm": 10.280510264562952, + "learning_rate": 5.71298199294989e-06, + "loss": 0.9262, + "step": 5361 + }, + { + "epoch": 0.47, + "grad_norm": 7.112403235899684, + "learning_rate": 5.711574191366427e-06, + "loss": 0.8279, + "step": 5362 + }, + { + "epoch": 0.47, + "grad_norm": 8.73499410302968, + "learning_rate": 5.710166332203564e-06, + "loss": 0.8431, + "step": 5363 + }, + { + "epoch": 0.47, + "grad_norm": 6.226232245984905, + "learning_rate": 5.708758415575224e-06, + "loss": 0.7149, + "step": 5364 + }, + { + "epoch": 0.47, + "grad_norm": 8.286779361806284, + "learning_rate": 5.707350441595332e-06, + "loss": 0.9395, + "step": 5365 + }, + { + "epoch": 0.47, + "grad_norm": 8.429498018928848, + "learning_rate": 5.705942410377816e-06, + "loss": 0.8082, + "step": 5366 + }, + { + "epoch": 0.47, + "grad_norm": 3.074286076555336, 
+ "learning_rate": 5.704534322036618e-06, + "loss": 0.5767, + "step": 5367 + }, + { + "epoch": 0.47, + "grad_norm": 14.405376525264776, + "learning_rate": 5.703126176685671e-06, + "loss": 0.7692, + "step": 5368 + }, + { + "epoch": 0.47, + "grad_norm": 8.369760055634348, + "learning_rate": 5.7017179744389276e-06, + "loss": 0.7456, + "step": 5369 + }, + { + "epoch": 0.47, + "grad_norm": 9.898670139476009, + "learning_rate": 5.700309715410331e-06, + "loss": 0.7632, + "step": 5370 + }, + { + "epoch": 0.47, + "grad_norm": 8.012628764972629, + "learning_rate": 5.698901399713836e-06, + "loss": 0.6397, + "step": 5371 + }, + { + "epoch": 0.47, + "grad_norm": 13.062473331655973, + "learning_rate": 5.6974930274634036e-06, + "loss": 0.8799, + "step": 5372 + }, + { + "epoch": 0.47, + "grad_norm": 11.95672185491936, + "learning_rate": 5.696084598772995e-06, + "loss": 0.7359, + "step": 5373 + }, + { + "epoch": 0.47, + "grad_norm": 13.560373790617104, + "learning_rate": 5.694676113756579e-06, + "loss": 0.7313, + "step": 5374 + }, + { + "epoch": 0.47, + "grad_norm": 10.509971492052957, + "learning_rate": 5.6932675725281275e-06, + "loss": 0.8533, + "step": 5375 + }, + { + "epoch": 0.47, + "grad_norm": 9.694980574200548, + "learning_rate": 5.691858975201617e-06, + "loss": 0.915, + "step": 5376 + }, + { + "epoch": 0.47, + "grad_norm": 7.422057861590197, + "learning_rate": 5.690450321891029e-06, + "loss": 0.7131, + "step": 5377 + }, + { + "epoch": 0.47, + "grad_norm": 4.953713990936256, + "learning_rate": 5.68904161271035e-06, + "loss": 0.7095, + "step": 5378 + }, + { + "epoch": 0.47, + "grad_norm": 6.457639590791356, + "learning_rate": 5.68763284777357e-06, + "loss": 0.7289, + "step": 5379 + }, + { + "epoch": 0.47, + "grad_norm": 2.9010162625895144, + "learning_rate": 5.686224027194682e-06, + "loss": 0.5535, + "step": 5380 + }, + { + "epoch": 0.47, + "grad_norm": 9.070665561231367, + "learning_rate": 5.684815151087688e-06, + "loss": 0.65, + "step": 5381 + }, + { + "epoch": 0.47, + "grad_norm": 8.073848447932901, + "learning_rate": 5.683406219566592e-06, + "loss": 0.8696, + "step": 5382 + }, + { + "epoch": 0.47, + "grad_norm": 10.186750539058604, + "learning_rate": 5.681997232745401e-06, + "loss": 0.9384, + "step": 5383 + }, + { + "epoch": 0.47, + "grad_norm": 7.045984969520804, + "learning_rate": 5.680588190738127e-06, + "loss": 0.8188, + "step": 5384 + }, + { + "epoch": 0.47, + "grad_norm": 11.29917245907064, + "learning_rate": 5.679179093658792e-06, + "loss": 0.9039, + "step": 5385 + }, + { + "epoch": 0.47, + "grad_norm": 8.117980352953023, + "learning_rate": 5.6777699416214115e-06, + "loss": 0.703, + "step": 5386 + }, + { + "epoch": 0.47, + "grad_norm": 6.466831676486336, + "learning_rate": 5.676360734740016e-06, + "loss": 0.7636, + "step": 5387 + }, + { + "epoch": 0.47, + "grad_norm": 11.71813890169913, + "learning_rate": 5.674951473128633e-06, + "loss": 0.7798, + "step": 5388 + }, + { + "epoch": 0.47, + "grad_norm": 12.047540210070567, + "learning_rate": 5.673542156901302e-06, + "loss": 0.8013, + "step": 5389 + }, + { + "epoch": 0.47, + "grad_norm": 5.714785978044927, + "learning_rate": 5.672132786172061e-06, + "loss": 0.558, + "step": 5390 + }, + { + "epoch": 0.47, + "grad_norm": 10.332678037262866, + "learning_rate": 5.6707233610549505e-06, + "loss": 0.7579, + "step": 5391 + }, + { + "epoch": 0.47, + "grad_norm": 10.094217282027108, + "learning_rate": 5.669313881664023e-06, + "loss": 0.9664, + "step": 5392 + }, + { + "epoch": 0.47, + "grad_norm": 10.810171437367249, + "learning_rate": 
5.667904348113328e-06, + "loss": 0.7843, + "step": 5393 + }, + { + "epoch": 0.47, + "grad_norm": 5.771780721857128, + "learning_rate": 5.666494760516924e-06, + "loss": 0.7694, + "step": 5394 + }, + { + "epoch": 0.47, + "grad_norm": 6.777107410242367, + "learning_rate": 5.6650851189888735e-06, + "loss": 0.7811, + "step": 5395 + }, + { + "epoch": 0.47, + "grad_norm": 7.792338372236433, + "learning_rate": 5.663675423643242e-06, + "loss": 0.7159, + "step": 5396 + }, + { + "epoch": 0.47, + "grad_norm": 5.899516566081188, + "learning_rate": 5.662265674594098e-06, + "loss": 0.6604, + "step": 5397 + }, + { + "epoch": 0.47, + "grad_norm": 11.358711607663864, + "learning_rate": 5.660855871955518e-06, + "loss": 0.7955, + "step": 5398 + }, + { + "epoch": 0.47, + "grad_norm": 11.510573954755193, + "learning_rate": 5.65944601584158e-06, + "loss": 0.8517, + "step": 5399 + }, + { + "epoch": 0.47, + "grad_norm": 5.810845906970608, + "learning_rate": 5.6580361063663665e-06, + "loss": 0.8234, + "step": 5400 + }, + { + "epoch": 0.47, + "grad_norm": 4.799120921830698, + "learning_rate": 5.656626143643968e-06, + "loss": 0.5697, + "step": 5401 + }, + { + "epoch": 0.47, + "grad_norm": 7.63632896156045, + "learning_rate": 5.655216127788472e-06, + "loss": 0.6751, + "step": 5402 + }, + { + "epoch": 0.47, + "grad_norm": 2.0225271985067814, + "learning_rate": 5.653806058913978e-06, + "loss": 0.4917, + "step": 5403 + }, + { + "epoch": 0.47, + "grad_norm": 14.003389141249357, + "learning_rate": 5.6523959371345845e-06, + "loss": 0.7651, + "step": 5404 + }, + { + "epoch": 0.47, + "grad_norm": 13.549591653952858, + "learning_rate": 5.650985762564398e-06, + "loss": 0.7446, + "step": 5405 + }, + { + "epoch": 0.47, + "grad_norm": 7.676245538350061, + "learning_rate": 5.6495755353175255e-06, + "loss": 0.7951, + "step": 5406 + }, + { + "epoch": 0.47, + "grad_norm": 12.507183493916175, + "learning_rate": 5.648165255508081e-06, + "loss": 0.7588, + "step": 5407 + }, + { + "epoch": 0.47, + "grad_norm": 8.877073976772031, + "learning_rate": 5.646754923250183e-06, + "loss": 0.8755, + "step": 5408 + }, + { + "epoch": 0.48, + "grad_norm": 13.218434777282997, + "learning_rate": 5.645344538657952e-06, + "loss": 0.8543, + "step": 5409 + }, + { + "epoch": 0.48, + "grad_norm": 8.539865993720833, + "learning_rate": 5.643934101845515e-06, + "loss": 0.786, + "step": 5410 + }, + { + "epoch": 0.48, + "grad_norm": 10.792114113822032, + "learning_rate": 5.642523612927e-06, + "loss": 1.0385, + "step": 5411 + }, + { + "epoch": 0.48, + "grad_norm": 6.5329130633615415, + "learning_rate": 5.641113072016545e-06, + "loss": 0.6587, + "step": 5412 + }, + { + "epoch": 0.48, + "grad_norm": 5.280745338799054, + "learning_rate": 5.639702479228286e-06, + "loss": 0.7801, + "step": 5413 + }, + { + "epoch": 0.48, + "grad_norm": 5.8714248082864025, + "learning_rate": 5.638291834676366e-06, + "loss": 0.874, + "step": 5414 + }, + { + "epoch": 0.48, + "grad_norm": 9.693227582794034, + "learning_rate": 5.636881138474932e-06, + "loss": 0.7909, + "step": 5415 + }, + { + "epoch": 0.48, + "grad_norm": 3.2195391080005615, + "learning_rate": 5.6354703907381384e-06, + "loss": 0.5406, + "step": 5416 + }, + { + "epoch": 0.48, + "grad_norm": 5.6770464640174465, + "learning_rate": 5.634059591580134e-06, + "loss": 0.7361, + "step": 5417 + }, + { + "epoch": 0.48, + "grad_norm": 7.66679563293829, + "learning_rate": 5.632648741115085e-06, + "loss": 0.673, + "step": 5418 + }, + { + "epoch": 0.48, + "grad_norm": 6.321445738281808, + "learning_rate": 5.631237839457151e-06, + "loss": 
0.7844, + "step": 5419 + }, + { + "epoch": 0.48, + "grad_norm": 11.282175822273192, + "learning_rate": 5.629826886720501e-06, + "loss": 0.8744, + "step": 5420 + }, + { + "epoch": 0.48, + "grad_norm": 7.854614371275008, + "learning_rate": 5.6284158830193084e-06, + "loss": 0.8764, + "step": 5421 + }, + { + "epoch": 0.48, + "grad_norm": 7.489740564073897, + "learning_rate": 5.627004828467747e-06, + "loss": 0.6735, + "step": 5422 + }, + { + "epoch": 0.48, + "grad_norm": 12.639394409522758, + "learning_rate": 5.625593723179996e-06, + "loss": 0.7368, + "step": 5423 + }, + { + "epoch": 0.48, + "grad_norm": 2.5977464624527684, + "learning_rate": 5.6241825672702444e-06, + "loss": 0.5318, + "step": 5424 + }, + { + "epoch": 0.48, + "grad_norm": 10.331685603798103, + "learning_rate": 5.622771360852677e-06, + "loss": 0.9074, + "step": 5425 + }, + { + "epoch": 0.48, + "grad_norm": 9.541799427583403, + "learning_rate": 5.621360104041485e-06, + "loss": 0.8016, + "step": 5426 + }, + { + "epoch": 0.48, + "grad_norm": 7.077700192256886, + "learning_rate": 5.61994879695087e-06, + "loss": 0.5719, + "step": 5427 + }, + { + "epoch": 0.48, + "grad_norm": 10.989830724705902, + "learning_rate": 5.618537439695027e-06, + "loss": 0.7704, + "step": 5428 + }, + { + "epoch": 0.48, + "grad_norm": 4.0265833054759685, + "learning_rate": 5.617126032388163e-06, + "loss": 0.7351, + "step": 5429 + }, + { + "epoch": 0.48, + "grad_norm": 5.675246641103588, + "learning_rate": 5.61571457514449e-06, + "loss": 0.6709, + "step": 5430 + }, + { + "epoch": 0.48, + "grad_norm": 9.051842484712099, + "learning_rate": 5.614303068078215e-06, + "loss": 0.6998, + "step": 5431 + }, + { + "epoch": 0.48, + "grad_norm": 9.690841526823826, + "learning_rate": 5.612891511303558e-06, + "loss": 0.8113, + "step": 5432 + }, + { + "epoch": 0.48, + "grad_norm": 5.395588569711698, + "learning_rate": 5.611479904934741e-06, + "loss": 0.6749, + "step": 5433 + }, + { + "epoch": 0.48, + "grad_norm": 5.213245648466657, + "learning_rate": 5.610068249085986e-06, + "loss": 0.6995, + "step": 5434 + }, + { + "epoch": 0.48, + "grad_norm": 3.2656017613765296, + "learning_rate": 5.608656543871524e-06, + "loss": 0.5668, + "step": 5435 + }, + { + "epoch": 0.48, + "grad_norm": 8.628016246611997, + "learning_rate": 5.607244789405586e-06, + "loss": 0.8196, + "step": 5436 + }, + { + "epoch": 0.48, + "grad_norm": 9.76237563880752, + "learning_rate": 5.60583298580241e-06, + "loss": 0.7545, + "step": 5437 + }, + { + "epoch": 0.48, + "grad_norm": 8.279493118941662, + "learning_rate": 5.604421133176237e-06, + "loss": 0.9038, + "step": 5438 + }, + { + "epoch": 0.48, + "grad_norm": 14.680820452235777, + "learning_rate": 5.603009231641311e-06, + "loss": 0.9009, + "step": 5439 + }, + { + "epoch": 0.48, + "grad_norm": 5.783472761177973, + "learning_rate": 5.6015972813118806e-06, + "loss": 0.6808, + "step": 5440 + }, + { + "epoch": 0.48, + "grad_norm": 9.057900184697385, + "learning_rate": 5.6001852823021984e-06, + "loss": 0.795, + "step": 5441 + }, + { + "epoch": 0.48, + "grad_norm": 8.914501557078436, + "learning_rate": 5.598773234726524e-06, + "loss": 0.9261, + "step": 5442 + }, + { + "epoch": 0.48, + "grad_norm": 6.3857316933559405, + "learning_rate": 5.5973611386991124e-06, + "loss": 0.8332, + "step": 5443 + }, + { + "epoch": 0.48, + "grad_norm": 6.375749214418185, + "learning_rate": 5.595948994334233e-06, + "loss": 0.7477, + "step": 5444 + }, + { + "epoch": 0.48, + "grad_norm": 9.593757654627627, + "learning_rate": 5.594536801746152e-06, + "loss": 0.7864, + "step": 5445 + }, + { + 
"epoch": 0.48, + "grad_norm": 5.648648809809286, + "learning_rate": 5.593124561049141e-06, + "loss": 0.716, + "step": 5446 + }, + { + "epoch": 0.48, + "grad_norm": 10.649165650464457, + "learning_rate": 5.591712272357478e-06, + "loss": 0.7655, + "step": 5447 + }, + { + "epoch": 0.48, + "grad_norm": 7.639187832525078, + "learning_rate": 5.590299935785442e-06, + "loss": 0.6703, + "step": 5448 + }, + { + "epoch": 0.48, + "grad_norm": 11.680976307997827, + "learning_rate": 5.5888875514473175e-06, + "loss": 0.7584, + "step": 5449 + }, + { + "epoch": 0.48, + "grad_norm": 12.581326601397928, + "learning_rate": 5.587475119457391e-06, + "loss": 0.6814, + "step": 5450 + }, + { + "epoch": 0.48, + "grad_norm": 16.260515551749503, + "learning_rate": 5.586062639929955e-06, + "loss": 0.6887, + "step": 5451 + }, + { + "epoch": 0.48, + "grad_norm": 14.320863772501765, + "learning_rate": 5.584650112979306e-06, + "loss": 0.7974, + "step": 5452 + }, + { + "epoch": 0.48, + "grad_norm": 8.937841580674263, + "learning_rate": 5.583237538719742e-06, + "loss": 0.7889, + "step": 5453 + }, + { + "epoch": 0.48, + "grad_norm": 12.421765172598944, + "learning_rate": 5.581824917265567e-06, + "loss": 0.8313, + "step": 5454 + }, + { + "epoch": 0.48, + "grad_norm": 7.34897912505775, + "learning_rate": 5.580412248731087e-06, + "loss": 0.7578, + "step": 5455 + }, + { + "epoch": 0.48, + "grad_norm": 2.1076704829208044, + "learning_rate": 5.578999533230613e-06, + "loss": 0.5161, + "step": 5456 + }, + { + "epoch": 0.48, + "grad_norm": 26.909028356531834, + "learning_rate": 5.57758677087846e-06, + "loss": 0.6493, + "step": 5457 + }, + { + "epoch": 0.48, + "grad_norm": 17.756707934717387, + "learning_rate": 5.576173961788947e-06, + "loss": 0.7678, + "step": 5458 + }, + { + "epoch": 0.48, + "grad_norm": 10.266221559701604, + "learning_rate": 5.574761106076395e-06, + "loss": 0.7046, + "step": 5459 + }, + { + "epoch": 0.48, + "grad_norm": 9.706642332031349, + "learning_rate": 5.57334820385513e-06, + "loss": 0.9076, + "step": 5460 + }, + { + "epoch": 0.48, + "grad_norm": 21.532098845811838, + "learning_rate": 5.571935255239482e-06, + "loss": 0.8382, + "step": 5461 + }, + { + "epoch": 0.48, + "grad_norm": 4.974631738731587, + "learning_rate": 5.570522260343786e-06, + "loss": 0.5823, + "step": 5462 + }, + { + "epoch": 0.48, + "grad_norm": 6.579779922962061, + "learning_rate": 5.569109219282376e-06, + "loss": 0.8414, + "step": 5463 + }, + { + "epoch": 0.48, + "grad_norm": 7.970981070858852, + "learning_rate": 5.5676961321695955e-06, + "loss": 0.84, + "step": 5464 + }, + { + "epoch": 0.48, + "grad_norm": 5.671578228354992, + "learning_rate": 5.5662829991197885e-06, + "loss": 0.7688, + "step": 5465 + }, + { + "epoch": 0.48, + "grad_norm": 7.045307829623208, + "learning_rate": 5.5648698202473015e-06, + "loss": 0.8157, + "step": 5466 + }, + { + "epoch": 0.48, + "grad_norm": 7.225592544735985, + "learning_rate": 5.56345659566649e-06, + "loss": 0.7707, + "step": 5467 + }, + { + "epoch": 0.48, + "grad_norm": 10.841544457102817, + "learning_rate": 5.5620433254917075e-06, + "loss": 0.7846, + "step": 5468 + }, + { + "epoch": 0.48, + "grad_norm": 9.605692261608382, + "learning_rate": 5.560630009837314e-06, + "loss": 0.7987, + "step": 5469 + }, + { + "epoch": 0.48, + "grad_norm": 7.729532486861728, + "learning_rate": 5.5592166488176726e-06, + "loss": 0.9233, + "step": 5470 + }, + { + "epoch": 0.48, + "grad_norm": 2.321321117907425, + "learning_rate": 5.5578032425471495e-06, + "loss": 0.5452, + "step": 5471 + }, + { + "epoch": 0.48, + "grad_norm": 
8.908748583202486, + "learning_rate": 5.556389791140118e-06, + "loss": 0.8174, + "step": 5472 + }, + { + "epoch": 0.48, + "grad_norm": 11.718087051053741, + "learning_rate": 5.554976294710949e-06, + "loss": 0.7496, + "step": 5473 + }, + { + "epoch": 0.48, + "grad_norm": 8.059128448612698, + "learning_rate": 5.55356275337402e-06, + "loss": 0.6289, + "step": 5474 + }, + { + "epoch": 0.48, + "grad_norm": 7.288267063037581, + "learning_rate": 5.552149167243716e-06, + "loss": 0.7343, + "step": 5475 + }, + { + "epoch": 0.48, + "grad_norm": 2.5253082062795293, + "learning_rate": 5.550735536434418e-06, + "loss": 0.5148, + "step": 5476 + }, + { + "epoch": 0.48, + "grad_norm": 6.336740657879335, + "learning_rate": 5.549321861060517e-06, + "loss": 0.7043, + "step": 5477 + }, + { + "epoch": 0.48, + "grad_norm": 12.352531920227388, + "learning_rate": 5.547908141236406e-06, + "loss": 0.6458, + "step": 5478 + }, + { + "epoch": 0.48, + "grad_norm": 9.13562041014916, + "learning_rate": 5.546494377076478e-06, + "loss": 0.8492, + "step": 5479 + }, + { + "epoch": 0.48, + "grad_norm": 9.684757251150856, + "learning_rate": 5.545080568695134e-06, + "loss": 0.7781, + "step": 5480 + }, + { + "epoch": 0.48, + "grad_norm": 28.02965502613709, + "learning_rate": 5.5436667162067785e-06, + "loss": 0.928, + "step": 5481 + }, + { + "epoch": 0.48, + "grad_norm": 9.349859650262301, + "learning_rate": 5.542252819725816e-06, + "loss": 0.8058, + "step": 5482 + }, + { + "epoch": 0.48, + "grad_norm": 11.796360202961417, + "learning_rate": 5.5408388793666565e-06, + "loss": 0.7473, + "step": 5483 + }, + { + "epoch": 0.48, + "grad_norm": 9.540950643033511, + "learning_rate": 5.539424895243716e-06, + "loss": 0.6274, + "step": 5484 + }, + { + "epoch": 0.48, + "grad_norm": 7.2113859076521605, + "learning_rate": 5.538010867471409e-06, + "loss": 0.8137, + "step": 5485 + }, + { + "epoch": 0.48, + "grad_norm": 18.56711289888833, + "learning_rate": 5.536596796164158e-06, + "loss": 0.8266, + "step": 5486 + }, + { + "epoch": 0.48, + "grad_norm": 9.912212007937395, + "learning_rate": 5.535182681436387e-06, + "loss": 0.7123, + "step": 5487 + }, + { + "epoch": 0.48, + "grad_norm": 4.715950585078359, + "learning_rate": 5.533768523402523e-06, + "loss": 0.8309, + "step": 5488 + }, + { + "epoch": 0.48, + "grad_norm": 4.923178500619133, + "learning_rate": 5.532354322176999e-06, + "loss": 0.6582, + "step": 5489 + }, + { + "epoch": 0.48, + "grad_norm": 3.40485710777311, + "learning_rate": 5.530940077874248e-06, + "loss": 0.5011, + "step": 5490 + }, + { + "epoch": 0.48, + "grad_norm": 5.778499324916399, + "learning_rate": 5.52952579060871e-06, + "loss": 0.645, + "step": 5491 + }, + { + "epoch": 0.48, + "grad_norm": 5.299540081055213, + "learning_rate": 5.528111460494825e-06, + "loss": 0.7067, + "step": 5492 + }, + { + "epoch": 0.48, + "grad_norm": 9.32297282620908, + "learning_rate": 5.526697087647039e-06, + "loss": 0.6546, + "step": 5493 + }, + { + "epoch": 0.48, + "grad_norm": 7.506883851261621, + "learning_rate": 5.5252826721798015e-06, + "loss": 0.7131, + "step": 5494 + }, + { + "epoch": 0.48, + "grad_norm": 9.9243492328384, + "learning_rate": 5.523868214207564e-06, + "loss": 0.7983, + "step": 5495 + }, + { + "epoch": 0.48, + "grad_norm": 10.698018641879402, + "learning_rate": 5.522453713844781e-06, + "loss": 0.7771, + "step": 5496 + }, + { + "epoch": 0.48, + "grad_norm": 6.8413145042026615, + "learning_rate": 5.521039171205913e-06, + "loss": 0.8081, + "step": 5497 + }, + { + "epoch": 0.48, + "grad_norm": 6.896730798670407, + "learning_rate": 
5.519624586405422e-06, + "loss": 0.7393, + "step": 5498 + }, + { + "epoch": 0.48, + "grad_norm": 9.713583978096686, + "learning_rate": 5.518209959557774e-06, + "loss": 0.9305, + "step": 5499 + }, + { + "epoch": 0.48, + "grad_norm": 6.884306290152862, + "learning_rate": 5.516795290777437e-06, + "loss": 0.7791, + "step": 5500 + }, + { + "epoch": 0.48, + "grad_norm": 6.930996350938453, + "learning_rate": 5.515380580178887e-06, + "loss": 0.9083, + "step": 5501 + }, + { + "epoch": 0.48, + "grad_norm": 6.92063980221516, + "learning_rate": 5.513965827876596e-06, + "loss": 0.706, + "step": 5502 + }, + { + "epoch": 0.48, + "grad_norm": 2.5559485114853024, + "learning_rate": 5.512551033985045e-06, + "loss": 0.4359, + "step": 5503 + }, + { + "epoch": 0.48, + "grad_norm": 12.248555219593303, + "learning_rate": 5.511136198618718e-06, + "loss": 0.8393, + "step": 5504 + }, + { + "epoch": 0.48, + "grad_norm": 11.341199244697279, + "learning_rate": 5.5097213218921e-06, + "loss": 0.6617, + "step": 5505 + }, + { + "epoch": 0.48, + "grad_norm": 12.196696489479075, + "learning_rate": 5.508306403919679e-06, + "loss": 0.779, + "step": 5506 + }, + { + "epoch": 0.48, + "grad_norm": 6.874913731770091, + "learning_rate": 5.506891444815951e-06, + "loss": 0.7195, + "step": 5507 + }, + { + "epoch": 0.48, + "grad_norm": 2.449740885381673, + "learning_rate": 5.505476444695409e-06, + "loss": 0.5377, + "step": 5508 + }, + { + "epoch": 0.48, + "grad_norm": 9.217803110758842, + "learning_rate": 5.5040614036725546e-06, + "loss": 0.8769, + "step": 5509 + }, + { + "epoch": 0.48, + "grad_norm": 6.053574862449157, + "learning_rate": 5.502646321861891e-06, + "loss": 0.9122, + "step": 5510 + }, + { + "epoch": 0.48, + "grad_norm": 2.286356655594447, + "learning_rate": 5.501231199377921e-06, + "loss": 0.5177, + "step": 5511 + }, + { + "epoch": 0.48, + "grad_norm": 6.6370476152672975, + "learning_rate": 5.499816036335157e-06, + "loss": 0.7252, + "step": 5512 + }, + { + "epoch": 0.48, + "grad_norm": 6.488074663231248, + "learning_rate": 5.498400832848112e-06, + "loss": 0.7363, + "step": 5513 + }, + { + "epoch": 0.48, + "grad_norm": 7.267965318871073, + "learning_rate": 5.496985589031298e-06, + "loss": 0.7871, + "step": 5514 + }, + { + "epoch": 0.48, + "grad_norm": 7.393740395604455, + "learning_rate": 5.495570304999239e-06, + "loss": 0.7659, + "step": 5515 + }, + { + "epoch": 0.48, + "grad_norm": 6.3096079121736075, + "learning_rate": 5.494154980866455e-06, + "loss": 0.6138, + "step": 5516 + }, + { + "epoch": 0.48, + "grad_norm": 8.396396083389313, + "learning_rate": 5.4927396167474715e-06, + "loss": 0.6621, + "step": 5517 + }, + { + "epoch": 0.48, + "grad_norm": 8.500591508208851, + "learning_rate": 5.491324212756818e-06, + "loss": 0.9578, + "step": 5518 + }, + { + "epoch": 0.48, + "grad_norm": 14.037305194755602, + "learning_rate": 5.489908769009026e-06, + "loss": 0.8627, + "step": 5519 + }, + { + "epoch": 0.48, + "grad_norm": 4.819684206923649, + "learning_rate": 5.4884932856186315e-06, + "loss": 0.6633, + "step": 5520 + }, + { + "epoch": 0.48, + "grad_norm": 7.980459503592586, + "learning_rate": 5.487077762700173e-06, + "loss": 0.6436, + "step": 5521 + }, + { + "epoch": 0.48, + "grad_norm": 20.530383780641625, + "learning_rate": 5.4856622003681925e-06, + "loss": 0.9361, + "step": 5522 + }, + { + "epoch": 0.49, + "grad_norm": 6.152680206785996, + "learning_rate": 5.484246598737234e-06, + "loss": 0.7323, + "step": 5523 + }, + { + "epoch": 0.49, + "grad_norm": 8.069216962978633, + "learning_rate": 5.482830957921846e-06, + "loss": 
0.8025, + "step": 5524 + }, + { + "epoch": 0.49, + "grad_norm": 26.468754385459867, + "learning_rate": 5.481415278036581e-06, + "loss": 0.8318, + "step": 5525 + }, + { + "epoch": 0.49, + "grad_norm": 7.077157859392573, + "learning_rate": 5.479999559195992e-06, + "loss": 0.7721, + "step": 5526 + }, + { + "epoch": 0.49, + "grad_norm": 46.42790481283026, + "learning_rate": 5.478583801514637e-06, + "loss": 0.7638, + "step": 5527 + }, + { + "epoch": 0.49, + "grad_norm": 2.758224949016295, + "learning_rate": 5.477168005107078e-06, + "loss": 0.5665, + "step": 5528 + }, + { + "epoch": 0.49, + "grad_norm": 6.359641362372209, + "learning_rate": 5.475752170087877e-06, + "loss": 0.7264, + "step": 5529 + }, + { + "epoch": 0.49, + "grad_norm": 7.183018070122878, + "learning_rate": 5.474336296571602e-06, + "loss": 0.7639, + "step": 5530 + }, + { + "epoch": 0.49, + "grad_norm": 9.930071699125147, + "learning_rate": 5.472920384672823e-06, + "loss": 0.8182, + "step": 5531 + }, + { + "epoch": 0.49, + "grad_norm": 5.268288599554901, + "learning_rate": 5.471504434506112e-06, + "loss": 0.5872, + "step": 5532 + }, + { + "epoch": 0.49, + "grad_norm": 5.8538972408896885, + "learning_rate": 5.470088446186047e-06, + "loss": 0.6065, + "step": 5533 + }, + { + "epoch": 0.49, + "grad_norm": 15.319699828982007, + "learning_rate": 5.468672419827208e-06, + "loss": 0.9148, + "step": 5534 + }, + { + "epoch": 0.49, + "grad_norm": 4.062935657001531, + "learning_rate": 5.467256355544175e-06, + "loss": 0.5781, + "step": 5535 + }, + { + "epoch": 0.49, + "grad_norm": 6.738048696928931, + "learning_rate": 5.465840253451535e-06, + "loss": 0.6891, + "step": 5536 + }, + { + "epoch": 0.49, + "grad_norm": 5.313515619151598, + "learning_rate": 5.464424113663878e-06, + "loss": 0.7801, + "step": 5537 + }, + { + "epoch": 0.49, + "grad_norm": 10.555137822974322, + "learning_rate": 5.463007936295792e-06, + "loss": 0.8077, + "step": 5538 + }, + { + "epoch": 0.49, + "grad_norm": 7.607005727004731, + "learning_rate": 5.4615917214618755e-06, + "loss": 0.8632, + "step": 5539 + }, + { + "epoch": 0.49, + "grad_norm": 5.555956819230804, + "learning_rate": 5.460175469276724e-06, + "loss": 0.7753, + "step": 5540 + }, + { + "epoch": 0.49, + "grad_norm": 5.235882856128093, + "learning_rate": 5.4587591798549396e-06, + "loss": 0.8245, + "step": 5541 + }, + { + "epoch": 0.49, + "grad_norm": 7.036031391739572, + "learning_rate": 5.457342853311126e-06, + "loss": 0.7238, + "step": 5542 + }, + { + "epoch": 0.49, + "grad_norm": 3.7222140732436024, + "learning_rate": 5.455926489759887e-06, + "loss": 0.5614, + "step": 5543 + }, + { + "epoch": 0.49, + "grad_norm": 14.730455438038506, + "learning_rate": 5.454510089315838e-06, + "loss": 0.6451, + "step": 5544 + }, + { + "epoch": 0.49, + "grad_norm": 6.3323332707641455, + "learning_rate": 5.453093652093588e-06, + "loss": 0.6123, + "step": 5545 + }, + { + "epoch": 0.49, + "grad_norm": 7.13138285275086, + "learning_rate": 5.451677178207752e-06, + "loss": 0.6846, + "step": 5546 + }, + { + "epoch": 0.49, + "grad_norm": 5.373774711956501, + "learning_rate": 5.45026066777295e-06, + "loss": 0.7951, + "step": 5547 + }, + { + "epoch": 0.49, + "grad_norm": 5.954264878818602, + "learning_rate": 5.448844120903805e-06, + "loss": 0.6967, + "step": 5548 + }, + { + "epoch": 0.49, + "grad_norm": 7.670729572822089, + "learning_rate": 5.447427537714939e-06, + "loss": 0.6821, + "step": 5549 + }, + { + "epoch": 0.49, + "grad_norm": 9.503270829660165, + "learning_rate": 5.446010918320982e-06, + "loss": 0.6333, + "step": 5550 + }, + { + 
"epoch": 0.49, + "grad_norm": 7.712928911199944, + "learning_rate": 5.444594262836564e-06, + "loss": 0.8066, + "step": 5551 + }, + { + "epoch": 0.49, + "grad_norm": 6.110117578204275, + "learning_rate": 5.4431775713763154e-06, + "loss": 0.8746, + "step": 5552 + }, + { + "epoch": 0.49, + "grad_norm": 10.586840099240343, + "learning_rate": 5.441760844054877e-06, + "loss": 0.8959, + "step": 5553 + }, + { + "epoch": 0.49, + "grad_norm": 7.272446048284882, + "learning_rate": 5.440344080986886e-06, + "loss": 0.661, + "step": 5554 + }, + { + "epoch": 0.49, + "grad_norm": 10.129032994882005, + "learning_rate": 5.438927282286982e-06, + "loss": 0.8151, + "step": 5555 + }, + { + "epoch": 0.49, + "grad_norm": 8.511059694639243, + "learning_rate": 5.437510448069815e-06, + "loss": 0.5846, + "step": 5556 + }, + { + "epoch": 0.49, + "grad_norm": 23.306493167799417, + "learning_rate": 5.436093578450029e-06, + "loss": 0.8327, + "step": 5557 + }, + { + "epoch": 0.49, + "grad_norm": 6.431307773294482, + "learning_rate": 5.434676673542277e-06, + "loss": 0.8594, + "step": 5558 + }, + { + "epoch": 0.49, + "grad_norm": 7.9536313150292965, + "learning_rate": 5.43325973346121e-06, + "loss": 0.736, + "step": 5559 + }, + { + "epoch": 0.49, + "grad_norm": 6.498442720991051, + "learning_rate": 5.431842758321487e-06, + "loss": 0.8024, + "step": 5560 + }, + { + "epoch": 0.49, + "grad_norm": 5.010260884648815, + "learning_rate": 5.430425748237767e-06, + "loss": 0.8742, + "step": 5561 + }, + { + "epoch": 0.49, + "grad_norm": 29.97294755708582, + "learning_rate": 5.429008703324711e-06, + "loss": 0.8503, + "step": 5562 + }, + { + "epoch": 0.49, + "grad_norm": 10.411718501917248, + "learning_rate": 5.427591623696984e-06, + "loss": 0.7716, + "step": 5563 + }, + { + "epoch": 0.49, + "grad_norm": 8.374892333127411, + "learning_rate": 5.426174509469254e-06, + "loss": 0.8566, + "step": 5564 + }, + { + "epoch": 0.49, + "grad_norm": 6.716126077975041, + "learning_rate": 5.424757360756192e-06, + "loss": 0.7109, + "step": 5565 + }, + { + "epoch": 0.49, + "grad_norm": 17.04811199797502, + "learning_rate": 5.42334017767247e-06, + "loss": 0.8469, + "step": 5566 + }, + { + "epoch": 0.49, + "grad_norm": 9.455786016041179, + "learning_rate": 5.421922960332767e-06, + "loss": 0.9961, + "step": 5567 + }, + { + "epoch": 0.49, + "grad_norm": 10.236091893775777, + "learning_rate": 5.4205057088517585e-06, + "loss": 0.7945, + "step": 5568 + }, + { + "epoch": 0.49, + "grad_norm": 6.010959879727815, + "learning_rate": 5.419088423344128e-06, + "loss": 0.6201, + "step": 5569 + }, + { + "epoch": 0.49, + "grad_norm": 7.103183401243793, + "learning_rate": 5.417671103924559e-06, + "loss": 0.7618, + "step": 5570 + }, + { + "epoch": 0.49, + "grad_norm": 9.25298878329844, + "learning_rate": 5.4162537507077395e-06, + "loss": 0.8737, + "step": 5571 + }, + { + "epoch": 0.49, + "grad_norm": 4.987367652571113, + "learning_rate": 5.4148363638083575e-06, + "loss": 0.6277, + "step": 5572 + }, + { + "epoch": 0.49, + "grad_norm": 15.106839220090533, + "learning_rate": 5.413418943341109e-06, + "loss": 0.8293, + "step": 5573 + }, + { + "epoch": 0.49, + "grad_norm": 9.038856139446136, + "learning_rate": 5.412001489420686e-06, + "loss": 0.762, + "step": 5574 + }, + { + "epoch": 0.49, + "grad_norm": 6.348904513820352, + "learning_rate": 5.410584002161788e-06, + "loss": 0.7253, + "step": 5575 + }, + { + "epoch": 0.49, + "grad_norm": 7.736162819810234, + "learning_rate": 5.409166481679116e-06, + "loss": 0.6615, + "step": 5576 + }, + { + "epoch": 0.49, + "grad_norm": 
13.715973620560153, + "learning_rate": 5.407748928087373e-06, + "loss": 0.8844, + "step": 5577 + }, + { + "epoch": 0.49, + "grad_norm": 3.0729439489264143, + "learning_rate": 5.406331341501264e-06, + "loss": 0.5991, + "step": 5578 + }, + { + "epoch": 0.49, + "grad_norm": 5.352497798904061, + "learning_rate": 5.404913722035499e-06, + "loss": 0.7504, + "step": 5579 + }, + { + "epoch": 0.49, + "grad_norm": 8.220449975985227, + "learning_rate": 5.403496069804789e-06, + "loss": 0.756, + "step": 5580 + }, + { + "epoch": 0.49, + "grad_norm": 8.59062865308194, + "learning_rate": 5.402078384923849e-06, + "loss": 0.7916, + "step": 5581 + }, + { + "epoch": 0.49, + "grad_norm": 13.202414907571885, + "learning_rate": 5.400660667507396e-06, + "loss": 0.946, + "step": 5582 + }, + { + "epoch": 0.49, + "grad_norm": 5.822555722107722, + "learning_rate": 5.399242917670146e-06, + "loss": 0.6399, + "step": 5583 + }, + { + "epoch": 0.49, + "grad_norm": 5.0471620741644685, + "learning_rate": 5.3978251355268245e-06, + "loss": 0.6421, + "step": 5584 + }, + { + "epoch": 0.49, + "grad_norm": 2.0962465810954574, + "learning_rate": 5.396407321192154e-06, + "loss": 0.4765, + "step": 5585 + }, + { + "epoch": 0.49, + "grad_norm": 11.044274478863377, + "learning_rate": 5.394989474780862e-06, + "loss": 0.7682, + "step": 5586 + }, + { + "epoch": 0.49, + "grad_norm": 5.557140952197606, + "learning_rate": 5.3935715964076806e-06, + "loss": 0.7723, + "step": 5587 + }, + { + "epoch": 0.49, + "grad_norm": 7.236729611203902, + "learning_rate": 5.392153686187339e-06, + "loss": 0.7124, + "step": 5588 + }, + { + "epoch": 0.49, + "grad_norm": 12.901780605739415, + "learning_rate": 5.390735744234573e-06, + "loss": 0.724, + "step": 5589 + }, + { + "epoch": 0.49, + "grad_norm": 7.552522483676034, + "learning_rate": 5.389317770664122e-06, + "loss": 0.7271, + "step": 5590 + }, + { + "epoch": 0.49, + "grad_norm": 7.849806415026917, + "learning_rate": 5.387899765590724e-06, + "loss": 0.7481, + "step": 5591 + }, + { + "epoch": 0.49, + "grad_norm": 10.630917060146713, + "learning_rate": 5.38648172912912e-06, + "loss": 0.8244, + "step": 5592 + }, + { + "epoch": 0.49, + "grad_norm": 13.401315363635526, + "learning_rate": 5.385063661394061e-06, + "loss": 0.6711, + "step": 5593 + }, + { + "epoch": 0.49, + "grad_norm": 5.70742708219716, + "learning_rate": 5.383645562500289e-06, + "loss": 0.7416, + "step": 5594 + }, + { + "epoch": 0.49, + "grad_norm": 7.21263856098155, + "learning_rate": 5.382227432562556e-06, + "loss": 0.8155, + "step": 5595 + }, + { + "epoch": 0.49, + "grad_norm": 5.765595005681509, + "learning_rate": 5.380809271695616e-06, + "loss": 0.792, + "step": 5596 + }, + { + "epoch": 0.49, + "grad_norm": 21.395818495100947, + "learning_rate": 5.379391080014222e-06, + "loss": 0.882, + "step": 5597 + }, + { + "epoch": 0.49, + "grad_norm": 7.772390718456046, + "learning_rate": 5.377972857633134e-06, + "loss": 0.78, + "step": 5598 + }, + { + "epoch": 0.49, + "grad_norm": 8.32108615382391, + "learning_rate": 5.376554604667112e-06, + "loss": 0.7388, + "step": 5599 + }, + { + "epoch": 0.49, + "grad_norm": 2.444318428689282, + "learning_rate": 5.375136321230915e-06, + "loss": 0.5265, + "step": 5600 + }, + { + "epoch": 0.49, + "grad_norm": 6.878800840381979, + "learning_rate": 5.373718007439313e-06, + "loss": 0.8666, + "step": 5601 + }, + { + "epoch": 0.49, + "grad_norm": 6.460527547636283, + "learning_rate": 5.372299663407071e-06, + "loss": 0.7645, + "step": 5602 + }, + { + "epoch": 0.49, + "grad_norm": 2.7889812338389737, + "learning_rate": 
5.370881289248959e-06, + "loss": 0.5683, + "step": 5603 + }, + { + "epoch": 0.49, + "grad_norm": 6.889007828949089, + "learning_rate": 5.369462885079751e-06, + "loss": 0.7846, + "step": 5604 + }, + { + "epoch": 0.49, + "grad_norm": 4.957654569205178, + "learning_rate": 5.3680444510142205e-06, + "loss": 0.7287, + "step": 5605 + }, + { + "epoch": 0.49, + "grad_norm": 10.831307323124665, + "learning_rate": 5.366625987167144e-06, + "loss": 0.8018, + "step": 5606 + }, + { + "epoch": 0.49, + "grad_norm": 8.498570915694659, + "learning_rate": 5.365207493653304e-06, + "loss": 0.9368, + "step": 5607 + }, + { + "epoch": 0.49, + "grad_norm": 11.379413648219092, + "learning_rate": 5.3637889705874815e-06, + "loss": 0.8739, + "step": 5608 + }, + { + "epoch": 0.49, + "grad_norm": 16.66351283754721, + "learning_rate": 5.362370418084459e-06, + "loss": 0.7999, + "step": 5609 + }, + { + "epoch": 0.49, + "grad_norm": 6.895467973481263, + "learning_rate": 5.360951836259026e-06, + "loss": 0.8497, + "step": 5610 + }, + { + "epoch": 0.49, + "grad_norm": 10.592755745265674, + "learning_rate": 5.359533225225971e-06, + "loss": 0.9835, + "step": 5611 + }, + { + "epoch": 0.49, + "grad_norm": 9.780252121336812, + "learning_rate": 5.358114585100085e-06, + "loss": 0.6861, + "step": 5612 + }, + { + "epoch": 0.49, + "grad_norm": 32.632118753139444, + "learning_rate": 5.356695915996162e-06, + "loss": 0.8246, + "step": 5613 + }, + { + "epoch": 0.49, + "grad_norm": 17.837211328651172, + "learning_rate": 5.355277218029e-06, + "loss": 0.7163, + "step": 5614 + }, + { + "epoch": 0.49, + "grad_norm": 7.102502199389485, + "learning_rate": 5.353858491313395e-06, + "loss": 0.7993, + "step": 5615 + }, + { + "epoch": 0.49, + "grad_norm": 8.594186721175138, + "learning_rate": 5.352439735964151e-06, + "loss": 0.6448, + "step": 5616 + }, + { + "epoch": 0.49, + "grad_norm": 6.895735426601374, + "learning_rate": 5.351020952096069e-06, + "loss": 0.6881, + "step": 5617 + }, + { + "epoch": 0.49, + "grad_norm": 6.769993230641324, + "learning_rate": 5.349602139823955e-06, + "loss": 0.6747, + "step": 5618 + }, + { + "epoch": 0.49, + "grad_norm": 3.7442585218439737, + "learning_rate": 5.348183299262618e-06, + "loss": 0.5818, + "step": 5619 + }, + { + "epoch": 0.49, + "grad_norm": 5.761761119972367, + "learning_rate": 5.346764430526867e-06, + "loss": 0.7381, + "step": 5620 + }, + { + "epoch": 0.49, + "grad_norm": 14.154406868406863, + "learning_rate": 5.3453455337315165e-06, + "loss": 0.7818, + "step": 5621 + }, + { + "epoch": 0.49, + "grad_norm": 6.306436070961199, + "learning_rate": 5.34392660899138e-06, + "loss": 0.776, + "step": 5622 + }, + { + "epoch": 0.49, + "grad_norm": 4.8011714831057475, + "learning_rate": 5.342507656421272e-06, + "loss": 0.6662, + "step": 5623 + }, + { + "epoch": 0.49, + "grad_norm": 11.314326871915148, + "learning_rate": 5.341088676136016e-06, + "loss": 0.8158, + "step": 5624 + }, + { + "epoch": 0.49, + "grad_norm": 7.39327782239765, + "learning_rate": 5.3396696682504325e-06, + "loss": 0.7841, + "step": 5625 + }, + { + "epoch": 0.49, + "grad_norm": 10.300408363635764, + "learning_rate": 5.338250632879341e-06, + "loss": 0.7174, + "step": 5626 + }, + { + "epoch": 0.49, + "grad_norm": 18.08441058295389, + "learning_rate": 5.336831570137575e-06, + "loss": 0.7269, + "step": 5627 + }, + { + "epoch": 0.49, + "grad_norm": 9.123154378157041, + "learning_rate": 5.335412480139956e-06, + "loss": 0.8001, + "step": 5628 + }, + { + "epoch": 0.49, + "grad_norm": 9.764237178568214, + "learning_rate": 5.333993363001318e-06, + "loss": 
0.8113, + "step": 5629 + }, + { + "epoch": 0.49, + "grad_norm": 12.153793242570197, + "learning_rate": 5.332574218836492e-06, + "loss": 0.8371, + "step": 5630 + }, + { + "epoch": 0.49, + "grad_norm": 9.620995375691352, + "learning_rate": 5.3311550477603145e-06, + "loss": 0.7621, + "step": 5631 + }, + { + "epoch": 0.49, + "grad_norm": 8.01916763567087, + "learning_rate": 5.329735849887621e-06, + "loss": 0.7414, + "step": 5632 + }, + { + "epoch": 0.49, + "grad_norm": 11.614441785062041, + "learning_rate": 5.328316625333251e-06, + "loss": 0.7946, + "step": 5633 + }, + { + "epoch": 0.49, + "grad_norm": 2.8698868444536365, + "learning_rate": 5.326897374212045e-06, + "loss": 0.5301, + "step": 5634 + }, + { + "epoch": 0.49, + "grad_norm": 6.75101450128695, + "learning_rate": 5.325478096638849e-06, + "loss": 0.706, + "step": 5635 + }, + { + "epoch": 0.49, + "grad_norm": 11.638180324912453, + "learning_rate": 5.324058792728506e-06, + "loss": 0.7325, + "step": 5636 + }, + { + "epoch": 0.5, + "grad_norm": 7.058104663143945, + "learning_rate": 5.322639462595863e-06, + "loss": 0.8334, + "step": 5637 + }, + { + "epoch": 0.5, + "grad_norm": 8.196843351364631, + "learning_rate": 5.321220106355773e-06, + "loss": 0.7023, + "step": 5638 + }, + { + "epoch": 0.5, + "grad_norm": 9.580632600113752, + "learning_rate": 5.3198007241230865e-06, + "loss": 0.6901, + "step": 5639 + }, + { + "epoch": 0.5, + "grad_norm": 7.693690739700727, + "learning_rate": 5.318381316012655e-06, + "loss": 0.7565, + "step": 5640 + }, + { + "epoch": 0.5, + "grad_norm": 3.2973359466586025, + "learning_rate": 5.31696188213934e-06, + "loss": 0.5459, + "step": 5641 + }, + { + "epoch": 0.5, + "grad_norm": 23.858123861336686, + "learning_rate": 5.315542422617996e-06, + "loss": 0.8212, + "step": 5642 + }, + { + "epoch": 0.5, + "grad_norm": 6.802286335458395, + "learning_rate": 5.314122937563483e-06, + "loss": 0.6168, + "step": 5643 + }, + { + "epoch": 0.5, + "grad_norm": 8.977407911413621, + "learning_rate": 5.312703427090665e-06, + "loss": 0.8631, + "step": 5644 + }, + { + "epoch": 0.5, + "grad_norm": 8.511772095170237, + "learning_rate": 5.311283891314406e-06, + "loss": 0.8237, + "step": 5645 + }, + { + "epoch": 0.5, + "grad_norm": 9.823658032433846, + "learning_rate": 5.309864330349572e-06, + "loss": 0.674, + "step": 5646 + }, + { + "epoch": 0.5, + "grad_norm": 13.519685011849209, + "learning_rate": 5.308444744311032e-06, + "loss": 0.6472, + "step": 5647 + }, + { + "epoch": 0.5, + "grad_norm": 8.887083083249003, + "learning_rate": 5.307025133313657e-06, + "loss": 0.7895, + "step": 5648 + }, + { + "epoch": 0.5, + "grad_norm": 6.87320763442151, + "learning_rate": 5.305605497472317e-06, + "loss": 0.7077, + "step": 5649 + }, + { + "epoch": 0.5, + "grad_norm": 2.9191338609171504, + "learning_rate": 5.304185836901891e-06, + "loss": 0.5742, + "step": 5650 + }, + { + "epoch": 0.5, + "grad_norm": 3.12775198412541, + "learning_rate": 5.302766151717251e-06, + "loss": 0.6052, + "step": 5651 + }, + { + "epoch": 0.5, + "grad_norm": 9.370485759822161, + "learning_rate": 5.301346442033278e-06, + "loss": 0.7791, + "step": 5652 + }, + { + "epoch": 0.5, + "grad_norm": 7.120018729355025, + "learning_rate": 5.299926707964853e-06, + "loss": 0.7023, + "step": 5653 + }, + { + "epoch": 0.5, + "grad_norm": 7.6736389497530935, + "learning_rate": 5.2985069496268585e-06, + "loss": 0.665, + "step": 5654 + }, + { + "epoch": 0.5, + "grad_norm": 7.943899115112489, + "learning_rate": 5.297087167134176e-06, + "loss": 0.7128, + "step": 5655 + }, + { + "epoch": 0.5, + 
"grad_norm": 6.471614442789855, + "learning_rate": 5.295667360601695e-06, + "loss": 0.7943, + "step": 5656 + }, + { + "epoch": 0.5, + "grad_norm": 13.649399331795163, + "learning_rate": 5.294247530144303e-06, + "loss": 0.8059, + "step": 5657 + }, + { + "epoch": 0.5, + "grad_norm": 17.55455142861343, + "learning_rate": 5.29282767587689e-06, + "loss": 0.7569, + "step": 5658 + }, + { + "epoch": 0.5, + "grad_norm": 11.188506287399896, + "learning_rate": 5.291407797914349e-06, + "loss": 0.6784, + "step": 5659 + }, + { + "epoch": 0.5, + "grad_norm": 13.625455998403229, + "learning_rate": 5.289987896371573e-06, + "loss": 0.8436, + "step": 5660 + }, + { + "epoch": 0.5, + "grad_norm": 9.749250880497744, + "learning_rate": 5.288567971363461e-06, + "loss": 0.7148, + "step": 5661 + }, + { + "epoch": 0.5, + "grad_norm": 2.9593337581166383, + "learning_rate": 5.287148023004907e-06, + "loss": 0.5744, + "step": 5662 + }, + { + "epoch": 0.5, + "grad_norm": 7.232852683572149, + "learning_rate": 5.285728051410812e-06, + "loss": 0.77, + "step": 5663 + }, + { + "epoch": 0.5, + "grad_norm": 5.862012678377767, + "learning_rate": 5.284308056696081e-06, + "loss": 0.7891, + "step": 5664 + }, + { + "epoch": 0.5, + "grad_norm": 7.641715460617979, + "learning_rate": 5.282888038975612e-06, + "loss": 0.6425, + "step": 5665 + }, + { + "epoch": 0.5, + "grad_norm": 10.69971756846906, + "learning_rate": 5.281467998364314e-06, + "loss": 0.7674, + "step": 5666 + }, + { + "epoch": 0.5, + "grad_norm": 9.080011052656337, + "learning_rate": 5.280047934977095e-06, + "loss": 0.7298, + "step": 5667 + }, + { + "epoch": 0.5, + "grad_norm": 9.181417567431229, + "learning_rate": 5.278627848928863e-06, + "loss": 0.7875, + "step": 5668 + }, + { + "epoch": 0.5, + "grad_norm": 11.819074349812128, + "learning_rate": 5.277207740334528e-06, + "loss": 0.8169, + "step": 5669 + }, + { + "epoch": 0.5, + "grad_norm": 41.7122592606509, + "learning_rate": 5.275787609309004e-06, + "loss": 0.8814, + "step": 5670 + }, + { + "epoch": 0.5, + "grad_norm": 2.262632746311432, + "learning_rate": 5.274367455967207e-06, + "loss": 0.5252, + "step": 5671 + }, + { + "epoch": 0.5, + "grad_norm": 8.04268599796862, + "learning_rate": 5.272947280424049e-06, + "loss": 0.7285, + "step": 5672 + }, + { + "epoch": 0.5, + "grad_norm": 14.741980005115176, + "learning_rate": 5.271527082794452e-06, + "loss": 0.848, + "step": 5673 + }, + { + "epoch": 0.5, + "grad_norm": 9.577275034883852, + "learning_rate": 5.270106863193336e-06, + "loss": 0.7132, + "step": 5674 + }, + { + "epoch": 0.5, + "grad_norm": 2.8335488258882675, + "learning_rate": 5.2686866217356216e-06, + "loss": 0.5164, + "step": 5675 + }, + { + "epoch": 0.5, + "grad_norm": 7.124671276479627, + "learning_rate": 5.2672663585362325e-06, + "loss": 0.8304, + "step": 5676 + }, + { + "epoch": 0.5, + "grad_norm": 7.525193649890002, + "learning_rate": 5.265846073710093e-06, + "loss": 0.6443, + "step": 5677 + }, + { + "epoch": 0.5, + "grad_norm": 13.524335382304033, + "learning_rate": 5.264425767372134e-06, + "loss": 0.8001, + "step": 5678 + }, + { + "epoch": 0.5, + "grad_norm": 8.656978252119956, + "learning_rate": 5.263005439637281e-06, + "loss": 0.8923, + "step": 5679 + }, + { + "epoch": 0.5, + "grad_norm": 8.406879753834508, + "learning_rate": 5.261585090620463e-06, + "loss": 0.8266, + "step": 5680 + }, + { + "epoch": 0.5, + "grad_norm": 7.586480981176536, + "learning_rate": 5.260164720436617e-06, + "loss": 0.7601, + "step": 5681 + }, + { + "epoch": 0.5, + "grad_norm": 2.5498039493151015, + "learning_rate": 
5.258744329200674e-06, + "loss": 0.5127, + "step": 5682 + }, + { + "epoch": 0.5, + "grad_norm": 14.229403751393985, + "learning_rate": 5.25732391702757e-06, + "loss": 0.78, + "step": 5683 + }, + { + "epoch": 0.5, + "grad_norm": 7.778745156729457, + "learning_rate": 5.2559034840322435e-06, + "loss": 0.7332, + "step": 5684 + }, + { + "epoch": 0.5, + "grad_norm": 6.416008643612649, + "learning_rate": 5.254483030329632e-06, + "loss": 0.8543, + "step": 5685 + }, + { + "epoch": 0.5, + "grad_norm": 9.944545758720922, + "learning_rate": 5.253062556034677e-06, + "loss": 0.7327, + "step": 5686 + }, + { + "epoch": 0.5, + "grad_norm": 6.84954611378298, + "learning_rate": 5.251642061262322e-06, + "loss": 0.8068, + "step": 5687 + }, + { + "epoch": 0.5, + "grad_norm": 9.652785867707056, + "learning_rate": 5.250221546127508e-06, + "loss": 0.6873, + "step": 5688 + }, + { + "epoch": 0.5, + "grad_norm": 16.875046323960085, + "learning_rate": 5.248801010745184e-06, + "loss": 0.7612, + "step": 5689 + }, + { + "epoch": 0.5, + "grad_norm": 6.7825339508546065, + "learning_rate": 5.247380455230296e-06, + "loss": 0.7538, + "step": 5690 + }, + { + "epoch": 0.5, + "grad_norm": 14.264217028792773, + "learning_rate": 5.245959879697794e-06, + "loss": 0.7617, + "step": 5691 + }, + { + "epoch": 0.5, + "grad_norm": 7.293238726903303, + "learning_rate": 5.244539284262625e-06, + "loss": 0.6875, + "step": 5692 + }, + { + "epoch": 0.5, + "grad_norm": 7.311527360786906, + "learning_rate": 5.243118669039746e-06, + "loss": 0.7005, + "step": 5693 + }, + { + "epoch": 0.5, + "grad_norm": 9.53919750500987, + "learning_rate": 5.2416980341441095e-06, + "loss": 0.8899, + "step": 5694 + }, + { + "epoch": 0.5, + "grad_norm": 11.956189749148626, + "learning_rate": 5.240277379690668e-06, + "loss": 0.7401, + "step": 5695 + }, + { + "epoch": 0.5, + "grad_norm": 10.731199193844095, + "learning_rate": 5.238856705794382e-06, + "loss": 0.827, + "step": 5696 + }, + { + "epoch": 0.5, + "grad_norm": 7.737134121967019, + "learning_rate": 5.237436012570207e-06, + "loss": 0.7599, + "step": 5697 + }, + { + "epoch": 0.5, + "grad_norm": 10.678843587130629, + "learning_rate": 5.236015300133106e-06, + "loss": 0.5891, + "step": 5698 + }, + { + "epoch": 0.5, + "grad_norm": 7.634598559591674, + "learning_rate": 5.2345945685980404e-06, + "loss": 0.909, + "step": 5699 + }, + { + "epoch": 0.5, + "grad_norm": 11.360586222981938, + "learning_rate": 5.233173818079971e-06, + "loss": 0.8357, + "step": 5700 + }, + { + "epoch": 0.5, + "grad_norm": 8.616467595725798, + "learning_rate": 5.2317530486938635e-06, + "loss": 0.7363, + "step": 5701 + }, + { + "epoch": 0.5, + "grad_norm": 11.207709253044836, + "learning_rate": 5.2303322605546866e-06, + "loss": 0.6646, + "step": 5702 + }, + { + "epoch": 0.5, + "grad_norm": 8.83938995093395, + "learning_rate": 5.228911453777405e-06, + "loss": 0.7314, + "step": 5703 + }, + { + "epoch": 0.5, + "grad_norm": 9.821097462354714, + "learning_rate": 5.227490628476991e-06, + "loss": 0.7726, + "step": 5704 + }, + { + "epoch": 0.5, + "grad_norm": 31.061375374891405, + "learning_rate": 5.2260697847684125e-06, + "loss": 0.7484, + "step": 5705 + }, + { + "epoch": 0.5, + "grad_norm": 9.747499484042462, + "learning_rate": 5.224648922766643e-06, + "loss": 0.8622, + "step": 5706 + }, + { + "epoch": 0.5, + "grad_norm": 18.850167405761617, + "learning_rate": 5.223228042586658e-06, + "loss": 0.8902, + "step": 5707 + }, + { + "epoch": 0.5, + "grad_norm": 9.223635462842736, + "learning_rate": 5.221807144343429e-06, + "loss": 0.6811, + "step": 5708 + }, 
+ { + "epoch": 0.5, + "grad_norm": 6.848815791942964, + "learning_rate": 5.220386228151936e-06, + "loss": 0.6943, + "step": 5709 + }, + { + "epoch": 0.5, + "grad_norm": 7.601359308715685, + "learning_rate": 5.218965294127155e-06, + "loss": 0.807, + "step": 5710 + }, + { + "epoch": 0.5, + "grad_norm": 25.211792758263343, + "learning_rate": 5.217544342384067e-06, + "loss": 0.9178, + "step": 5711 + }, + { + "epoch": 0.5, + "grad_norm": 6.389538625952023, + "learning_rate": 5.216123373037654e-06, + "loss": 0.8391, + "step": 5712 + }, + { + "epoch": 0.5, + "grad_norm": 30.152189151814504, + "learning_rate": 5.214702386202896e-06, + "loss": 0.7747, + "step": 5713 + }, + { + "epoch": 0.5, + "grad_norm": 6.960445996069964, + "learning_rate": 5.213281381994778e-06, + "loss": 0.7172, + "step": 5714 + }, + { + "epoch": 0.5, + "grad_norm": 8.687481734873824, + "learning_rate": 5.2118603605282845e-06, + "loss": 0.7697, + "step": 5715 + }, + { + "epoch": 0.5, + "grad_norm": 10.354071548298682, + "learning_rate": 5.2104393219184056e-06, + "loss": 0.642, + "step": 5716 + }, + { + "epoch": 0.5, + "grad_norm": 11.5397557964436, + "learning_rate": 5.2090182662801235e-06, + "loss": 0.8849, + "step": 5717 + }, + { + "epoch": 0.5, + "grad_norm": 10.014374271415512, + "learning_rate": 5.207597193728434e-06, + "loss": 0.8661, + "step": 5718 + }, + { + "epoch": 0.5, + "grad_norm": 6.1004115316868965, + "learning_rate": 5.206176104378325e-06, + "loss": 0.8741, + "step": 5719 + }, + { + "epoch": 0.5, + "grad_norm": 12.439537798909267, + "learning_rate": 5.204754998344786e-06, + "loss": 0.7128, + "step": 5720 + }, + { + "epoch": 0.5, + "grad_norm": 14.176149508516716, + "learning_rate": 5.203333875742814e-06, + "loss": 0.8159, + "step": 5721 + }, + { + "epoch": 0.5, + "grad_norm": 11.65061548710457, + "learning_rate": 5.201912736687403e-06, + "loss": 0.82, + "step": 5722 + }, + { + "epoch": 0.5, + "grad_norm": 10.323884633942397, + "learning_rate": 5.20049158129355e-06, + "loss": 0.767, + "step": 5723 + }, + { + "epoch": 0.5, + "grad_norm": 10.669828168570502, + "learning_rate": 5.199070409676251e-06, + "loss": 0.6954, + "step": 5724 + }, + { + "epoch": 0.5, + "grad_norm": 6.216301451060616, + "learning_rate": 5.197649221950507e-06, + "loss": 0.7587, + "step": 5725 + }, + { + "epoch": 0.5, + "grad_norm": 10.378292786770256, + "learning_rate": 5.1962280182313155e-06, + "loss": 0.8309, + "step": 5726 + }, + { + "epoch": 0.5, + "grad_norm": 6.308116522634765, + "learning_rate": 5.194806798633679e-06, + "loss": 0.8913, + "step": 5727 + }, + { + "epoch": 0.5, + "grad_norm": 8.764533667542146, + "learning_rate": 5.1933855632726e-06, + "loss": 0.7194, + "step": 5728 + }, + { + "epoch": 0.5, + "grad_norm": 15.118547481617346, + "learning_rate": 5.191964312263084e-06, + "loss": 1.0159, + "step": 5729 + }, + { + "epoch": 0.5, + "grad_norm": 8.785027919034137, + "learning_rate": 5.1905430457201354e-06, + "loss": 0.8259, + "step": 5730 + }, + { + "epoch": 0.5, + "grad_norm": 5.470338251302581, + "learning_rate": 5.18912176375876e-06, + "loss": 0.6116, + "step": 5731 + }, + { + "epoch": 0.5, + "grad_norm": 5.397987491302774, + "learning_rate": 5.187700466493966e-06, + "loss": 0.5949, + "step": 5732 + }, + { + "epoch": 0.5, + "grad_norm": 12.913358081235746, + "learning_rate": 5.186279154040762e-06, + "loss": 0.7728, + "step": 5733 + }, + { + "epoch": 0.5, + "grad_norm": 10.626692872545622, + "learning_rate": 5.184857826514159e-06, + "loss": 0.9019, + "step": 5734 + }, + { + "epoch": 0.5, + "grad_norm": 6.549809875449674, + 
"learning_rate": 5.183436484029168e-06, + "loss": 0.8006, + "step": 5735 + }, + { + "epoch": 0.5, + "grad_norm": 10.336821564208744, + "learning_rate": 5.182015126700803e-06, + "loss": 0.652, + "step": 5736 + }, + { + "epoch": 0.5, + "grad_norm": 7.173970928265324, + "learning_rate": 5.1805937546440745e-06, + "loss": 0.7228, + "step": 5737 + }, + { + "epoch": 0.5, + "grad_norm": 6.0405313662990645, + "learning_rate": 5.179172367974001e-06, + "loss": 0.7241, + "step": 5738 + }, + { + "epoch": 0.5, + "grad_norm": 8.075753061478414, + "learning_rate": 5.1777509668055995e-06, + "loss": 0.9416, + "step": 5739 + }, + { + "epoch": 0.5, + "grad_norm": 6.2205221506436725, + "learning_rate": 5.176329551253882e-06, + "loss": 0.8389, + "step": 5740 + }, + { + "epoch": 0.5, + "grad_norm": 12.243141944975147, + "learning_rate": 5.174908121433873e-06, + "loss": 0.9546, + "step": 5741 + }, + { + "epoch": 0.5, + "grad_norm": 2.9389063616550226, + "learning_rate": 5.173486677460589e-06, + "loss": 0.5253, + "step": 5742 + }, + { + "epoch": 0.5, + "grad_norm": 3.5065191338924406, + "learning_rate": 5.1720652194490504e-06, + "loss": 0.645, + "step": 5743 + }, + { + "epoch": 0.5, + "grad_norm": 10.54093558350713, + "learning_rate": 5.1706437475142825e-06, + "loss": 0.8407, + "step": 5744 + }, + { + "epoch": 0.5, + "grad_norm": 9.235702717011437, + "learning_rate": 5.169222261771306e-06, + "loss": 0.7446, + "step": 5745 + }, + { + "epoch": 0.5, + "grad_norm": 11.390384570940961, + "learning_rate": 5.167800762335145e-06, + "loss": 0.7635, + "step": 5746 + }, + { + "epoch": 0.5, + "grad_norm": 8.850748890030742, + "learning_rate": 5.166379249320825e-06, + "loss": 0.6662, + "step": 5747 + }, + { + "epoch": 0.5, + "grad_norm": 8.542823806270347, + "learning_rate": 5.164957722843373e-06, + "loss": 0.7464, + "step": 5748 + }, + { + "epoch": 0.5, + "grad_norm": 7.178543258024006, + "learning_rate": 5.163536183017817e-06, + "loss": 0.7327, + "step": 5749 + }, + { + "epoch": 0.5, + "grad_norm": 5.511310611900542, + "learning_rate": 5.162114629959185e-06, + "loss": 0.8181, + "step": 5750 + }, + { + "epoch": 0.51, + "grad_norm": 2.775290801461886, + "learning_rate": 5.160693063782507e-06, + "loss": 0.5323, + "step": 5751 + }, + { + "epoch": 0.51, + "grad_norm": 6.065799686978724, + "learning_rate": 5.159271484602812e-06, + "loss": 0.8411, + "step": 5752 + }, + { + "epoch": 0.51, + "grad_norm": 6.747858797915304, + "learning_rate": 5.157849892535136e-06, + "loss": 0.7518, + "step": 5753 + }, + { + "epoch": 0.51, + "grad_norm": 18.84866340233331, + "learning_rate": 5.156428287694508e-06, + "loss": 0.7852, + "step": 5754 + }, + { + "epoch": 0.51, + "grad_norm": 2.76224788766832, + "learning_rate": 5.155006670195962e-06, + "loss": 0.4108, + "step": 5755 + }, + { + "epoch": 0.51, + "grad_norm": 9.743018922234743, + "learning_rate": 5.153585040154536e-06, + "loss": 0.7177, + "step": 5756 + }, + { + "epoch": 0.51, + "grad_norm": 2.635355843844033, + "learning_rate": 5.152163397685263e-06, + "loss": 0.4604, + "step": 5757 + }, + { + "epoch": 0.51, + "grad_norm": 7.199530360494463, + "learning_rate": 5.150741742903182e-06, + "loss": 0.7335, + "step": 5758 + }, + { + "epoch": 0.51, + "grad_norm": 26.509578894332044, + "learning_rate": 5.149320075923329e-06, + "loss": 0.7457, + "step": 5759 + }, + { + "epoch": 0.51, + "grad_norm": 8.709796583719568, + "learning_rate": 5.147898396860743e-06, + "loss": 0.716, + "step": 5760 + }, + { + "epoch": 0.51, + "grad_norm": 5.996879926455844, + "learning_rate": 5.146476705830467e-06, + "loss": 
0.5757, + "step": 5761 + }, + { + "epoch": 0.51, + "grad_norm": 13.621922616914706, + "learning_rate": 5.145055002947538e-06, + "loss": 0.7943, + "step": 5762 + }, + { + "epoch": 0.51, + "grad_norm": 15.860968760660693, + "learning_rate": 5.143633288326999e-06, + "loss": 0.754, + "step": 5763 + }, + { + "epoch": 0.51, + "grad_norm": 5.26091706001963, + "learning_rate": 5.142211562083894e-06, + "loss": 0.7782, + "step": 5764 + }, + { + "epoch": 0.51, + "grad_norm": 6.9376799696694205, + "learning_rate": 5.140789824333266e-06, + "loss": 0.6655, + "step": 5765 + }, + { + "epoch": 0.51, + "grad_norm": 2.6047482556148496, + "learning_rate": 5.1393680751901595e-06, + "loss": 0.55, + "step": 5766 + }, + { + "epoch": 0.51, + "grad_norm": 6.056885928849883, + "learning_rate": 5.1379463147696205e-06, + "loss": 0.6236, + "step": 5767 + }, + { + "epoch": 0.51, + "grad_norm": 6.58835971104602, + "learning_rate": 5.136524543186696e-06, + "loss": 0.8437, + "step": 5768 + }, + { + "epoch": 0.51, + "grad_norm": 12.372297526827829, + "learning_rate": 5.13510276055643e-06, + "loss": 0.6026, + "step": 5769 + }, + { + "epoch": 0.51, + "grad_norm": 5.461317567574696, + "learning_rate": 5.133680966993876e-06, + "loss": 0.7726, + "step": 5770 + }, + { + "epoch": 0.51, + "grad_norm": 6.477547253951548, + "learning_rate": 5.132259162614079e-06, + "loss": 0.6446, + "step": 5771 + }, + { + "epoch": 0.51, + "grad_norm": 8.204682614051444, + "learning_rate": 5.13083734753209e-06, + "loss": 0.8339, + "step": 5772 + }, + { + "epoch": 0.51, + "grad_norm": 9.180040865128262, + "learning_rate": 5.1294155218629625e-06, + "loss": 0.712, + "step": 5773 + }, + { + "epoch": 0.51, + "grad_norm": 6.441850767205609, + "learning_rate": 5.127993685721746e-06, + "loss": 0.8559, + "step": 5774 + }, + { + "epoch": 0.51, + "grad_norm": 4.44427549645921, + "learning_rate": 5.126571839223491e-06, + "loss": 0.6234, + "step": 5775 + }, + { + "epoch": 0.51, + "grad_norm": 9.090860803166064, + "learning_rate": 5.125149982483255e-06, + "loss": 0.9044, + "step": 5776 + }, + { + "epoch": 0.51, + "grad_norm": 6.44553853193292, + "learning_rate": 5.1237281156160905e-06, + "loss": 0.6792, + "step": 5777 + }, + { + "epoch": 0.51, + "grad_norm": 12.853487890627504, + "learning_rate": 5.1223062387370525e-06, + "loss": 0.7522, + "step": 5778 + }, + { + "epoch": 0.51, + "grad_norm": 6.491541151394758, + "learning_rate": 5.120884351961199e-06, + "loss": 0.6905, + "step": 5779 + }, + { + "epoch": 0.51, + "grad_norm": 6.420119710164076, + "learning_rate": 5.119462455403581e-06, + "loss": 0.6804, + "step": 5780 + }, + { + "epoch": 0.51, + "grad_norm": 6.372706504501275, + "learning_rate": 5.118040549179263e-06, + "loss": 0.6915, + "step": 5781 + }, + { + "epoch": 0.51, + "grad_norm": 6.352928076047926, + "learning_rate": 5.116618633403298e-06, + "loss": 0.7435, + "step": 5782 + }, + { + "epoch": 0.51, + "grad_norm": 7.37370748390702, + "learning_rate": 5.1151967081907485e-06, + "loss": 0.817, + "step": 5783 + }, + { + "epoch": 0.51, + "grad_norm": 8.207790793796727, + "learning_rate": 5.113774773656672e-06, + "loss": 0.8465, + "step": 5784 + }, + { + "epoch": 0.51, + "grad_norm": 8.44227326202058, + "learning_rate": 5.112352829916132e-06, + "loss": 0.8918, + "step": 5785 + }, + { + "epoch": 0.51, + "grad_norm": 5.998129124812597, + "learning_rate": 5.1109308770841855e-06, + "loss": 0.6993, + "step": 5786 + }, + { + "epoch": 0.51, + "grad_norm": 13.167596973562844, + "learning_rate": 5.109508915275898e-06, + "loss": 0.8222, + "step": 5787 + }, + { + "epoch": 
0.51, + "grad_norm": 11.189549794119017, + "learning_rate": 5.108086944606332e-06, + "loss": 0.6761, + "step": 5788 + }, + { + "epoch": 0.51, + "grad_norm": 8.284327723796714, + "learning_rate": 5.106664965190549e-06, + "loss": 0.6783, + "step": 5789 + }, + { + "epoch": 0.51, + "grad_norm": 8.378168087012858, + "learning_rate": 5.1052429771436155e-06, + "loss": 0.7746, + "step": 5790 + }, + { + "epoch": 0.51, + "grad_norm": 5.689915538470285, + "learning_rate": 5.103820980580595e-06, + "loss": 0.7333, + "step": 5791 + }, + { + "epoch": 0.51, + "grad_norm": 9.056772195596217, + "learning_rate": 5.1023989756165526e-06, + "loss": 0.6237, + "step": 5792 + }, + { + "epoch": 0.51, + "grad_norm": 15.818836908454802, + "learning_rate": 5.100976962366556e-06, + "loss": 0.7795, + "step": 5793 + }, + { + "epoch": 0.51, + "grad_norm": 6.222287968208102, + "learning_rate": 5.099554940945672e-06, + "loss": 0.685, + "step": 5794 + }, + { + "epoch": 0.51, + "grad_norm": 9.600059161387522, + "learning_rate": 5.098132911468967e-06, + "loss": 0.671, + "step": 5795 + }, + { + "epoch": 0.51, + "grad_norm": 3.3686428458141946, + "learning_rate": 5.0967108740515116e-06, + "loss": 0.5565, + "step": 5796 + }, + { + "epoch": 0.51, + "grad_norm": 5.8451408459803345, + "learning_rate": 5.095288828808372e-06, + "loss": 0.6006, + "step": 5797 + }, + { + "epoch": 0.51, + "grad_norm": 9.730266433059429, + "learning_rate": 5.093866775854618e-06, + "loss": 0.8068, + "step": 5798 + }, + { + "epoch": 0.51, + "grad_norm": 13.33143913108808, + "learning_rate": 5.092444715305323e-06, + "loss": 0.6898, + "step": 5799 + }, + { + "epoch": 0.51, + "grad_norm": 3.109179700687264, + "learning_rate": 5.091022647275552e-06, + "loss": 0.5868, + "step": 5800 + }, + { + "epoch": 0.51, + "grad_norm": 7.357238484907103, + "learning_rate": 5.089600571880382e-06, + "loss": 0.8533, + "step": 5801 + }, + { + "epoch": 0.51, + "grad_norm": 5.761458810232443, + "learning_rate": 5.088178489234883e-06, + "loss": 0.806, + "step": 5802 + }, + { + "epoch": 0.51, + "grad_norm": 6.302601269183379, + "learning_rate": 5.086756399454127e-06, + "loss": 0.7512, + "step": 5803 + }, + { + "epoch": 0.51, + "grad_norm": 17.867509368715794, + "learning_rate": 5.085334302653187e-06, + "loss": 0.6114, + "step": 5804 + }, + { + "epoch": 0.51, + "grad_norm": 10.640660440703103, + "learning_rate": 5.083912198947137e-06, + "loss": 0.7154, + "step": 5805 + }, + { + "epoch": 0.51, + "grad_norm": 8.071817304979948, + "learning_rate": 5.082490088451052e-06, + "loss": 0.8325, + "step": 5806 + }, + { + "epoch": 0.51, + "grad_norm": 5.13957003915384, + "learning_rate": 5.081067971280007e-06, + "loss": 0.6141, + "step": 5807 + }, + { + "epoch": 0.51, + "grad_norm": 11.773698098443484, + "learning_rate": 5.079645847549076e-06, + "loss": 0.7093, + "step": 5808 + }, + { + "epoch": 0.51, + "grad_norm": 6.884710585626823, + "learning_rate": 5.078223717373334e-06, + "loss": 0.8379, + "step": 5809 + }, + { + "epoch": 0.51, + "grad_norm": 3.2100078985789624, + "learning_rate": 5.07680158086786e-06, + "loss": 0.5146, + "step": 5810 + }, + { + "epoch": 0.51, + "grad_norm": 9.99618934584595, + "learning_rate": 5.075379438147732e-06, + "loss": 0.6451, + "step": 5811 + }, + { + "epoch": 0.51, + "grad_norm": 7.96408507416145, + "learning_rate": 5.073957289328022e-06, + "loss": 0.7546, + "step": 5812 + }, + { + "epoch": 0.51, + "grad_norm": 7.055611352164288, + "learning_rate": 5.072535134523813e-06, + "loss": 0.7864, + "step": 5813 + }, + { + "epoch": 0.51, + "grad_norm": 8.221235645368372, 
+ "learning_rate": 5.0711129738501805e-06, + "loss": 0.8335, + "step": 5814 + }, + { + "epoch": 0.51, + "grad_norm": 9.570034186601584, + "learning_rate": 5.069690807422205e-06, + "loss": 0.7749, + "step": 5815 + }, + { + "epoch": 0.51, + "grad_norm": 5.861215851359586, + "learning_rate": 5.068268635354966e-06, + "loss": 0.764, + "step": 5816 + }, + { + "epoch": 0.51, + "grad_norm": 15.830695294020371, + "learning_rate": 5.0668464577635415e-06, + "loss": 0.9607, + "step": 5817 + }, + { + "epoch": 0.51, + "grad_norm": 8.442217917346262, + "learning_rate": 5.065424274763012e-06, + "loss": 0.796, + "step": 5818 + }, + { + "epoch": 0.51, + "grad_norm": 2.4894281909555573, + "learning_rate": 5.064002086468461e-06, + "loss": 0.5189, + "step": 5819 + }, + { + "epoch": 0.51, + "grad_norm": 9.589124335478346, + "learning_rate": 5.062579892994966e-06, + "loss": 0.7576, + "step": 5820 + }, + { + "epoch": 0.51, + "grad_norm": 10.416090995214171, + "learning_rate": 5.061157694457611e-06, + "loss": 0.6808, + "step": 5821 + }, + { + "epoch": 0.51, + "grad_norm": 5.862445317913591, + "learning_rate": 5.059735490971478e-06, + "loss": 0.5701, + "step": 5822 + }, + { + "epoch": 0.51, + "grad_norm": 6.507955973796929, + "learning_rate": 5.058313282651647e-06, + "loss": 0.7703, + "step": 5823 + }, + { + "epoch": 0.51, + "grad_norm": 10.601995209832381, + "learning_rate": 5.056891069613203e-06, + "loss": 0.8702, + "step": 5824 + }, + { + "epoch": 0.51, + "grad_norm": 3.5030549068402865, + "learning_rate": 5.055468851971228e-06, + "loss": 0.61, + "step": 5825 + }, + { + "epoch": 0.51, + "grad_norm": 7.128247435397542, + "learning_rate": 5.0540466298408054e-06, + "loss": 0.6713, + "step": 5826 + }, + { + "epoch": 0.51, + "grad_norm": 12.211459702473288, + "learning_rate": 5.052624403337019e-06, + "loss": 0.6802, + "step": 5827 + }, + { + "epoch": 0.51, + "grad_norm": 8.52816483129982, + "learning_rate": 5.051202172574956e-06, + "loss": 0.7021, + "step": 5828 + }, + { + "epoch": 0.51, + "grad_norm": 6.375166185029113, + "learning_rate": 5.0497799376696955e-06, + "loss": 0.8416, + "step": 5829 + }, + { + "epoch": 0.51, + "grad_norm": 11.123066167599665, + "learning_rate": 5.048357698736327e-06, + "loss": 0.7311, + "step": 5830 + }, + { + "epoch": 0.51, + "grad_norm": 10.733379459304555, + "learning_rate": 5.046935455889933e-06, + "loss": 0.9002, + "step": 5831 + }, + { + "epoch": 0.51, + "grad_norm": 5.803901637660462, + "learning_rate": 5.045513209245599e-06, + "loss": 0.8072, + "step": 5832 + }, + { + "epoch": 0.51, + "grad_norm": 6.700398573464469, + "learning_rate": 5.044090958918414e-06, + "loss": 0.6945, + "step": 5833 + }, + { + "epoch": 0.51, + "grad_norm": 7.316731151997332, + "learning_rate": 5.0426687050234614e-06, + "loss": 0.556, + "step": 5834 + }, + { + "epoch": 0.51, + "grad_norm": 2.3111719770018264, + "learning_rate": 5.041246447675827e-06, + "loss": 0.5359, + "step": 5835 + }, + { + "epoch": 0.51, + "grad_norm": 13.16866607155494, + "learning_rate": 5.0398241869906e-06, + "loss": 0.7779, + "step": 5836 + }, + { + "epoch": 0.51, + "grad_norm": 6.657256125332028, + "learning_rate": 5.038401923082864e-06, + "loss": 0.8542, + "step": 5837 + }, + { + "epoch": 0.51, + "grad_norm": 17.47281088214784, + "learning_rate": 5.036979656067711e-06, + "loss": 0.7898, + "step": 5838 + }, + { + "epoch": 0.51, + "grad_norm": 7.124654062476184, + "learning_rate": 5.035557386060225e-06, + "loss": 0.8297, + "step": 5839 + }, + { + "epoch": 0.51, + "grad_norm": 2.3095658631913776, + "learning_rate": 
5.034135113175492e-06, + "loss": 0.4892, + "step": 5840 + }, + { + "epoch": 0.51, + "grad_norm": 6.965763890997004, + "learning_rate": 5.032712837528605e-06, + "loss": 0.8528, + "step": 5841 + }, + { + "epoch": 0.51, + "grad_norm": 9.921842455275737, + "learning_rate": 5.03129055923465e-06, + "loss": 0.8698, + "step": 5842 + }, + { + "epoch": 0.51, + "grad_norm": 7.450532796777731, + "learning_rate": 5.029868278408713e-06, + "loss": 0.7233, + "step": 5843 + }, + { + "epoch": 0.51, + "grad_norm": 10.807889267144724, + "learning_rate": 5.0284459951658856e-06, + "loss": 0.7528, + "step": 5844 + }, + { + "epoch": 0.51, + "grad_norm": 8.038695647140774, + "learning_rate": 5.027023709621256e-06, + "loss": 0.6671, + "step": 5845 + }, + { + "epoch": 0.51, + "grad_norm": 7.597668556429813, + "learning_rate": 5.025601421889912e-06, + "loss": 0.7334, + "step": 5846 + }, + { + "epoch": 0.51, + "grad_norm": 8.636660793609186, + "learning_rate": 5.024179132086944e-06, + "loss": 0.9074, + "step": 5847 + }, + { + "epoch": 0.51, + "grad_norm": 7.169605456765505, + "learning_rate": 5.022756840327441e-06, + "loss": 0.8328, + "step": 5848 + }, + { + "epoch": 0.51, + "grad_norm": 5.509036222885624, + "learning_rate": 5.02133454672649e-06, + "loss": 0.8984, + "step": 5849 + }, + { + "epoch": 0.51, + "grad_norm": 20.336847460155656, + "learning_rate": 5.019912251399186e-06, + "loss": 0.8809, + "step": 5850 + }, + { + "epoch": 0.51, + "grad_norm": 9.244446590935697, + "learning_rate": 5.018489954460615e-06, + "loss": 0.6872, + "step": 5851 + }, + { + "epoch": 0.51, + "grad_norm": 7.2054302483241095, + "learning_rate": 5.017067656025867e-06, + "loss": 0.6377, + "step": 5852 + }, + { + "epoch": 0.51, + "grad_norm": 11.375499741441969, + "learning_rate": 5.0156453562100325e-06, + "loss": 0.8391, + "step": 5853 + }, + { + "epoch": 0.51, + "grad_norm": 2.6780042246838267, + "learning_rate": 5.014223055128203e-06, + "loss": 0.5528, + "step": 5854 + }, + { + "epoch": 0.51, + "grad_norm": 6.32558163570038, + "learning_rate": 5.012800752895467e-06, + "loss": 0.8094, + "step": 5855 + }, + { + "epoch": 0.51, + "grad_norm": 8.123589097579142, + "learning_rate": 5.0113784496269146e-06, + "loss": 0.7883, + "step": 5856 + }, + { + "epoch": 0.51, + "grad_norm": 2.729470167939291, + "learning_rate": 5.009956145437638e-06, + "loss": 0.5284, + "step": 5857 + }, + { + "epoch": 0.51, + "grad_norm": 9.033893410073226, + "learning_rate": 5.008533840442727e-06, + "loss": 0.6566, + "step": 5858 + }, + { + "epoch": 0.51, + "grad_norm": 5.951635726287182, + "learning_rate": 5.007111534757271e-06, + "loss": 0.7085, + "step": 5859 + }, + { + "epoch": 0.51, + "grad_norm": 7.761456286154134, + "learning_rate": 5.00568922849636e-06, + "loss": 0.727, + "step": 5860 + }, + { + "epoch": 0.51, + "grad_norm": 4.481446875410592, + "learning_rate": 5.004266921775088e-06, + "loss": 0.8387, + "step": 5861 + }, + { + "epoch": 0.51, + "grad_norm": 6.382763376625518, + "learning_rate": 5.002844614708544e-06, + "loss": 0.9123, + "step": 5862 + }, + { + "epoch": 0.51, + "grad_norm": 6.450279688602565, + "learning_rate": 5.001422307411817e-06, + "loss": 0.6913, + "step": 5863 + }, + { + "epoch": 0.51, + "grad_norm": 7.813882556676017, + "learning_rate": 5e-06, + "loss": 0.694, + "step": 5864 + }, + { + "epoch": 0.52, + "grad_norm": 16.35105773097358, + "learning_rate": 4.998577692588184e-06, + "loss": 0.7179, + "step": 5865 + }, + { + "epoch": 0.52, + "grad_norm": 7.816993711347936, + "learning_rate": 4.997155385291457e-06, + "loss": 0.8216, + "step": 5866 + 
}, + { + "epoch": 0.52, + "grad_norm": 11.336216570225638, + "learning_rate": 4.995733078224913e-06, + "loss": 0.8089, + "step": 5867 + }, + { + "epoch": 0.52, + "grad_norm": 11.883447340037396, + "learning_rate": 4.994310771503641e-06, + "loss": 0.8204, + "step": 5868 + }, + { + "epoch": 0.52, + "grad_norm": 11.32035433284272, + "learning_rate": 4.992888465242731e-06, + "loss": 0.6658, + "step": 5869 + }, + { + "epoch": 0.52, + "grad_norm": 10.063065960024678, + "learning_rate": 4.991466159557276e-06, + "loss": 0.8424, + "step": 5870 + }, + { + "epoch": 0.52, + "grad_norm": 5.837255459069054, + "learning_rate": 4.990043854562364e-06, + "loss": 0.7471, + "step": 5871 + }, + { + "epoch": 0.52, + "grad_norm": 10.501497833445733, + "learning_rate": 4.988621550373086e-06, + "loss": 0.7729, + "step": 5872 + }, + { + "epoch": 0.52, + "grad_norm": 7.565660167141623, + "learning_rate": 4.987199247104535e-06, + "loss": 0.6722, + "step": 5873 + }, + { + "epoch": 0.52, + "grad_norm": 2.4784804315819713, + "learning_rate": 4.985776944871798e-06, + "loss": 0.5309, + "step": 5874 + }, + { + "epoch": 0.52, + "grad_norm": 9.942049589284482, + "learning_rate": 4.984354643789968e-06, + "loss": 0.9492, + "step": 5875 + }, + { + "epoch": 0.52, + "grad_norm": 2.924782658572734, + "learning_rate": 4.982932343974135e-06, + "loss": 0.5791, + "step": 5876 + }, + { + "epoch": 0.52, + "grad_norm": 8.14569791248323, + "learning_rate": 4.981510045539386e-06, + "loss": 0.6901, + "step": 5877 + }, + { + "epoch": 0.52, + "grad_norm": 8.010807212301243, + "learning_rate": 4.980087748600816e-06, + "loss": 0.7068, + "step": 5878 + }, + { + "epoch": 0.52, + "grad_norm": 2.7502745949756986, + "learning_rate": 4.9786654532735106e-06, + "loss": 0.5004, + "step": 5879 + }, + { + "epoch": 0.52, + "grad_norm": 6.244428090012358, + "learning_rate": 4.977243159672561e-06, + "loss": 0.7102, + "step": 5880 + }, + { + "epoch": 0.52, + "grad_norm": 11.043740216959483, + "learning_rate": 4.975820867913058e-06, + "loss": 0.6352, + "step": 5881 + }, + { + "epoch": 0.52, + "grad_norm": 11.234902716791105, + "learning_rate": 4.9743985781100905e-06, + "loss": 0.7558, + "step": 5882 + }, + { + "epoch": 0.52, + "grad_norm": 7.785583584485197, + "learning_rate": 4.9729762903787455e-06, + "loss": 0.7804, + "step": 5883 + }, + { + "epoch": 0.52, + "grad_norm": 8.75769270799265, + "learning_rate": 4.971554004834116e-06, + "loss": 0.6541, + "step": 5884 + }, + { + "epoch": 0.52, + "grad_norm": 10.811311821645289, + "learning_rate": 4.9701317215912885e-06, + "loss": 0.8979, + "step": 5885 + }, + { + "epoch": 0.52, + "grad_norm": 10.314596746149569, + "learning_rate": 4.968709440765352e-06, + "loss": 0.7869, + "step": 5886 + }, + { + "epoch": 0.52, + "grad_norm": 16.831026159519013, + "learning_rate": 4.967287162471396e-06, + "loss": 0.8091, + "step": 5887 + }, + { + "epoch": 0.52, + "grad_norm": 7.34598986504919, + "learning_rate": 4.965864886824509e-06, + "loss": 0.5978, + "step": 5888 + }, + { + "epoch": 0.52, + "grad_norm": 2.7922792574187714, + "learning_rate": 4.964442613939777e-06, + "loss": 0.5489, + "step": 5889 + }, + { + "epoch": 0.52, + "grad_norm": 10.51758100364323, + "learning_rate": 4.9630203439322914e-06, + "loss": 0.7433, + "step": 5890 + }, + { + "epoch": 0.52, + "grad_norm": 8.299988434558433, + "learning_rate": 4.961598076917137e-06, + "loss": 0.8934, + "step": 5891 + }, + { + "epoch": 0.52, + "grad_norm": 4.964364755109857, + "learning_rate": 4.960175813009402e-06, + "loss": 0.7015, + "step": 5892 + }, + { + "epoch": 0.52, + 
"grad_norm": 25.471088971078228, + "learning_rate": 4.958753552324175e-06, + "loss": 0.8218, + "step": 5893 + }, + { + "epoch": 0.52, + "grad_norm": 8.666992746871086, + "learning_rate": 4.95733129497654e-06, + "loss": 0.6999, + "step": 5894 + }, + { + "epoch": 0.52, + "grad_norm": 6.727904503643589, + "learning_rate": 4.955909041081587e-06, + "loss": 0.6773, + "step": 5895 + }, + { + "epoch": 0.52, + "grad_norm": 9.951327232753393, + "learning_rate": 4.954486790754402e-06, + "loss": 0.8518, + "step": 5896 + }, + { + "epoch": 0.52, + "grad_norm": 22.129996728346523, + "learning_rate": 4.953064544110069e-06, + "loss": 0.8397, + "step": 5897 + }, + { + "epoch": 0.52, + "grad_norm": 3.334491408113813, + "learning_rate": 4.951642301263675e-06, + "loss": 0.5434, + "step": 5898 + }, + { + "epoch": 0.52, + "grad_norm": 7.961617311147177, + "learning_rate": 4.950220062330306e-06, + "loss": 0.7721, + "step": 5899 + }, + { + "epoch": 0.52, + "grad_norm": 5.1163937655604395, + "learning_rate": 4.948797827425046e-06, + "loss": 0.7795, + "step": 5900 + }, + { + "epoch": 0.52, + "grad_norm": 7.6864848883908286, + "learning_rate": 4.947375596662981e-06, + "loss": 0.7475, + "step": 5901 + }, + { + "epoch": 0.52, + "grad_norm": 9.23723738320249, + "learning_rate": 4.945953370159196e-06, + "loss": 0.7935, + "step": 5902 + }, + { + "epoch": 0.52, + "grad_norm": 7.458126872423043, + "learning_rate": 4.9445311480287735e-06, + "loss": 0.735, + "step": 5903 + }, + { + "epoch": 0.52, + "grad_norm": 6.24787695569207, + "learning_rate": 4.943108930386799e-06, + "loss": 0.7663, + "step": 5904 + }, + { + "epoch": 0.52, + "grad_norm": 8.745145623109414, + "learning_rate": 4.941686717348355e-06, + "loss": 0.7327, + "step": 5905 + }, + { + "epoch": 0.52, + "grad_norm": 8.08107972387006, + "learning_rate": 4.940264509028524e-06, + "loss": 0.6843, + "step": 5906 + }, + { + "epoch": 0.52, + "grad_norm": 6.135437400184241, + "learning_rate": 4.93884230554239e-06, + "loss": 0.6555, + "step": 5907 + }, + { + "epoch": 0.52, + "grad_norm": 6.936198008777543, + "learning_rate": 4.9374201070050345e-06, + "loss": 0.6376, + "step": 5908 + }, + { + "epoch": 0.52, + "grad_norm": 4.152095955036684, + "learning_rate": 4.93599791353154e-06, + "loss": 0.5983, + "step": 5909 + }, + { + "epoch": 0.52, + "grad_norm": 8.927494278308833, + "learning_rate": 4.934575725236989e-06, + "loss": 0.7906, + "step": 5910 + }, + { + "epoch": 0.52, + "grad_norm": 10.884801680447806, + "learning_rate": 4.933153542236461e-06, + "loss": 0.746, + "step": 5911 + }, + { + "epoch": 0.52, + "grad_norm": 8.12817826869104, + "learning_rate": 4.931731364645036e-06, + "loss": 0.6691, + "step": 5912 + }, + { + "epoch": 0.52, + "grad_norm": 2.159217849775556, + "learning_rate": 4.930309192577796e-06, + "loss": 0.5256, + "step": 5913 + }, + { + "epoch": 0.52, + "grad_norm": 9.462974918980569, + "learning_rate": 4.92888702614982e-06, + "loss": 1.021, + "step": 5914 + }, + { + "epoch": 0.52, + "grad_norm": 8.810819043157057, + "learning_rate": 4.927464865476189e-06, + "loss": 0.6638, + "step": 5915 + }, + { + "epoch": 0.52, + "grad_norm": 19.722437845014056, + "learning_rate": 4.926042710671979e-06, + "loss": 0.5647, + "step": 5916 + }, + { + "epoch": 0.52, + "grad_norm": 8.739160461133235, + "learning_rate": 4.92462056185227e-06, + "loss": 0.8583, + "step": 5917 + }, + { + "epoch": 0.52, + "grad_norm": 8.573148450145919, + "learning_rate": 4.9231984191321405e-06, + "loss": 0.8653, + "step": 5918 + }, + { + "epoch": 0.52, + "grad_norm": 9.037365071030804, + 
"learning_rate": 4.9217762826266665e-06, + "loss": 0.59, + "step": 5919 + }, + { + "epoch": 0.52, + "grad_norm": 8.910124018469828, + "learning_rate": 4.920354152450926e-06, + "loss": 0.7426, + "step": 5920 + }, + { + "epoch": 0.52, + "grad_norm": 8.76684464795255, + "learning_rate": 4.918932028719995e-06, + "loss": 0.773, + "step": 5921 + }, + { + "epoch": 0.52, + "grad_norm": 2.04034511495544, + "learning_rate": 4.917509911548949e-06, + "loss": 0.5313, + "step": 5922 + }, + { + "epoch": 0.52, + "grad_norm": 6.271743431703367, + "learning_rate": 4.916087801052864e-06, + "loss": 0.6868, + "step": 5923 + }, + { + "epoch": 0.52, + "grad_norm": 16.49451820219165, + "learning_rate": 4.914665697346814e-06, + "loss": 0.7574, + "step": 5924 + }, + { + "epoch": 0.52, + "grad_norm": 11.736848848205009, + "learning_rate": 4.913243600545875e-06, + "loss": 0.7789, + "step": 5925 + }, + { + "epoch": 0.52, + "grad_norm": 4.849809908820887, + "learning_rate": 4.911821510765118e-06, + "loss": 0.8348, + "step": 5926 + }, + { + "epoch": 0.52, + "grad_norm": 14.534537989511602, + "learning_rate": 4.9103994281196185e-06, + "loss": 0.7215, + "step": 5927 + }, + { + "epoch": 0.52, + "grad_norm": 6.810220013068258, + "learning_rate": 4.9089773527244494e-06, + "loss": 0.799, + "step": 5928 + }, + { + "epoch": 0.52, + "grad_norm": 14.61004505251948, + "learning_rate": 4.907555284694679e-06, + "loss": 0.7546, + "step": 5929 + }, + { + "epoch": 0.52, + "grad_norm": 8.52730617289901, + "learning_rate": 4.906133224145384e-06, + "loss": 0.7572, + "step": 5930 + }, + { + "epoch": 0.52, + "grad_norm": 7.268479392638587, + "learning_rate": 4.904711171191631e-06, + "loss": 0.759, + "step": 5931 + }, + { + "epoch": 0.52, + "grad_norm": 11.275002355569882, + "learning_rate": 4.90328912594849e-06, + "loss": 0.8088, + "step": 5932 + }, + { + "epoch": 0.52, + "grad_norm": 6.168955312271786, + "learning_rate": 4.901867088531034e-06, + "loss": 0.7613, + "step": 5933 + }, + { + "epoch": 0.52, + "grad_norm": 12.606701747858198, + "learning_rate": 4.900445059054329e-06, + "loss": 0.9729, + "step": 5934 + }, + { + "epoch": 0.52, + "grad_norm": 1.9532204600620595, + "learning_rate": 4.899023037633445e-06, + "loss": 0.4666, + "step": 5935 + }, + { + "epoch": 0.52, + "grad_norm": 9.58825773770186, + "learning_rate": 4.897601024383449e-06, + "loss": 0.7838, + "step": 5936 + }, + { + "epoch": 0.52, + "grad_norm": 7.995395008754907, + "learning_rate": 4.896179019419407e-06, + "loss": 0.717, + "step": 5937 + }, + { + "epoch": 0.52, + "grad_norm": 6.981981069990596, + "learning_rate": 4.894757022856386e-06, + "loss": 0.9502, + "step": 5938 + }, + { + "epoch": 0.52, + "grad_norm": 7.178762460504211, + "learning_rate": 4.893335034809452e-06, + "loss": 0.8807, + "step": 5939 + }, + { + "epoch": 0.52, + "grad_norm": 7.525626185682142, + "learning_rate": 4.891913055393669e-06, + "loss": 0.761, + "step": 5940 + }, + { + "epoch": 0.52, + "grad_norm": 6.30135144476534, + "learning_rate": 4.8904910847241025e-06, + "loss": 0.8917, + "step": 5941 + }, + { + "epoch": 0.52, + "grad_norm": 6.039282758768304, + "learning_rate": 4.889069122915816e-06, + "loss": 0.7003, + "step": 5942 + }, + { + "epoch": 0.52, + "grad_norm": 24.370520267657263, + "learning_rate": 4.887647170083869e-06, + "loss": 0.834, + "step": 5943 + }, + { + "epoch": 0.52, + "grad_norm": 2.9256972971575443, + "learning_rate": 4.8862252263433284e-06, + "loss": 0.4537, + "step": 5944 + }, + { + "epoch": 0.52, + "grad_norm": 9.831232122646982, + "learning_rate": 4.884803291809253e-06, + 
"loss": 0.931, + "step": 5945 + }, + { + "epoch": 0.52, + "grad_norm": 11.619034252646204, + "learning_rate": 4.883381366596703e-06, + "loss": 0.7062, + "step": 5946 + }, + { + "epoch": 0.52, + "grad_norm": 10.560959693098265, + "learning_rate": 4.8819594508207394e-06, + "loss": 0.7852, + "step": 5947 + }, + { + "epoch": 0.52, + "grad_norm": 6.598676596057162, + "learning_rate": 4.88053754459642e-06, + "loss": 0.7103, + "step": 5948 + }, + { + "epoch": 0.52, + "grad_norm": 6.571910155410966, + "learning_rate": 4.879115648038804e-06, + "loss": 0.6807, + "step": 5949 + }, + { + "epoch": 0.52, + "grad_norm": 12.0876236889042, + "learning_rate": 4.877693761262949e-06, + "loss": 0.7458, + "step": 5950 + }, + { + "epoch": 0.52, + "grad_norm": 7.556795274615782, + "learning_rate": 4.876271884383911e-06, + "loss": 0.6659, + "step": 5951 + }, + { + "epoch": 0.52, + "grad_norm": 2.88019062947736, + "learning_rate": 4.874850017516746e-06, + "loss": 0.5635, + "step": 5952 + }, + { + "epoch": 0.52, + "grad_norm": 10.08075720401106, + "learning_rate": 4.87342816077651e-06, + "loss": 0.8145, + "step": 5953 + }, + { + "epoch": 0.52, + "grad_norm": 2.204551357922091, + "learning_rate": 4.872006314278256e-06, + "loss": 0.4897, + "step": 5954 + }, + { + "epoch": 0.52, + "grad_norm": 11.78556298609114, + "learning_rate": 4.870584478137038e-06, + "loss": 0.7298, + "step": 5955 + }, + { + "epoch": 0.52, + "grad_norm": 11.87855629462068, + "learning_rate": 4.869162652467911e-06, + "loss": 0.7133, + "step": 5956 + }, + { + "epoch": 0.52, + "grad_norm": 16.365756149420058, + "learning_rate": 4.8677408373859216e-06, + "loss": 0.7599, + "step": 5957 + }, + { + "epoch": 0.52, + "grad_norm": 7.799205374069639, + "learning_rate": 4.866319033006125e-06, + "loss": 0.8458, + "step": 5958 + }, + { + "epoch": 0.52, + "grad_norm": 7.227262390402364, + "learning_rate": 4.8648972394435704e-06, + "loss": 0.6755, + "step": 5959 + }, + { + "epoch": 0.52, + "grad_norm": 11.50832710671476, + "learning_rate": 4.863475456813306e-06, + "loss": 0.7553, + "step": 5960 + }, + { + "epoch": 0.52, + "grad_norm": 7.261179805859437, + "learning_rate": 4.86205368523038e-06, + "loss": 0.8252, + "step": 5961 + }, + { + "epoch": 0.52, + "grad_norm": 11.188387222193677, + "learning_rate": 4.860631924809841e-06, + "loss": 0.8721, + "step": 5962 + }, + { + "epoch": 0.52, + "grad_norm": 7.3284058909512995, + "learning_rate": 4.8592101756667345e-06, + "loss": 0.8944, + "step": 5963 + }, + { + "epoch": 0.52, + "grad_norm": 7.287412849272218, + "learning_rate": 4.8577884379161066e-06, + "loss": 0.8102, + "step": 5964 + }, + { + "epoch": 0.52, + "grad_norm": 10.409952687092717, + "learning_rate": 4.856366711673002e-06, + "loss": 0.7516, + "step": 5965 + }, + { + "epoch": 0.52, + "grad_norm": 14.623198511441762, + "learning_rate": 4.854944997052463e-06, + "loss": 0.7136, + "step": 5966 + }, + { + "epoch": 0.52, + "grad_norm": 11.94742790966387, + "learning_rate": 4.853523294169535e-06, + "loss": 0.8387, + "step": 5967 + }, + { + "epoch": 0.52, + "grad_norm": 15.502256360768726, + "learning_rate": 4.852101603139258e-06, + "loss": 0.7087, + "step": 5968 + }, + { + "epoch": 0.52, + "grad_norm": 7.9257050249879315, + "learning_rate": 4.850679924076672e-06, + "loss": 0.7262, + "step": 5969 + }, + { + "epoch": 0.52, + "grad_norm": 2.31732946377939, + "learning_rate": 4.84925825709682e-06, + "loss": 0.5541, + "step": 5970 + }, + { + "epoch": 0.52, + "grad_norm": 13.271018775847779, + "learning_rate": 4.847836602314739e-06, + "loss": 0.8498, + "step": 5971 + }, + 
{ + "epoch": 0.52, + "grad_norm": 24.201709681770332, + "learning_rate": 4.8464149598454656e-06, + "loss": 0.7707, + "step": 5972 + }, + { + "epoch": 0.52, + "grad_norm": 7.10943119057123, + "learning_rate": 4.8449933298040384e-06, + "loss": 0.6777, + "step": 5973 + }, + { + "epoch": 0.52, + "grad_norm": 2.785950152288687, + "learning_rate": 4.843571712305493e-06, + "loss": 0.584, + "step": 5974 + }, + { + "epoch": 0.52, + "grad_norm": 8.843397962520953, + "learning_rate": 4.842150107464866e-06, + "loss": 0.8828, + "step": 5975 + }, + { + "epoch": 0.52, + "grad_norm": 20.508566752224127, + "learning_rate": 4.840728515397189e-06, + "loss": 0.7463, + "step": 5976 + }, + { + "epoch": 0.52, + "grad_norm": 7.617913948635836, + "learning_rate": 4.8393069362174945e-06, + "loss": 0.6726, + "step": 5977 + }, + { + "epoch": 0.53, + "grad_norm": 9.465818689145316, + "learning_rate": 4.837885370040816e-06, + "loss": 0.6441, + "step": 5978 + }, + { + "epoch": 0.53, + "grad_norm": 10.198095526802517, + "learning_rate": 4.836463816982185e-06, + "loss": 0.7703, + "step": 5979 + }, + { + "epoch": 0.53, + "grad_norm": 13.051156797102395, + "learning_rate": 4.835042277156628e-06, + "loss": 0.777, + "step": 5980 + }, + { + "epoch": 0.53, + "grad_norm": 9.934155337873976, + "learning_rate": 4.833620750679176e-06, + "loss": 0.7371, + "step": 5981 + }, + { + "epoch": 0.53, + "grad_norm": 7.0282316709560835, + "learning_rate": 4.832199237664857e-06, + "loss": 0.6365, + "step": 5982 + }, + { + "epoch": 0.53, + "grad_norm": 11.473000408048913, + "learning_rate": 4.830777738228695e-06, + "loss": 0.9085, + "step": 5983 + }, + { + "epoch": 0.53, + "grad_norm": 8.228995830859496, + "learning_rate": 4.829356252485719e-06, + "loss": 0.8067, + "step": 5984 + }, + { + "epoch": 0.53, + "grad_norm": 8.512361331415946, + "learning_rate": 4.82793478055095e-06, + "loss": 0.7006, + "step": 5985 + }, + { + "epoch": 0.53, + "grad_norm": 8.99923789809434, + "learning_rate": 4.826513322539412e-06, + "loss": 0.7245, + "step": 5986 + }, + { + "epoch": 0.53, + "grad_norm": 11.626012264341993, + "learning_rate": 4.825091878566129e-06, + "loss": 0.7201, + "step": 5987 + }, + { + "epoch": 0.53, + "grad_norm": 9.530987946824256, + "learning_rate": 4.82367044874612e-06, + "loss": 0.6918, + "step": 5988 + }, + { + "epoch": 0.53, + "grad_norm": 8.319280695189834, + "learning_rate": 4.822249033194403e-06, + "loss": 0.8198, + "step": 5989 + }, + { + "epoch": 0.53, + "grad_norm": 3.848840818389828, + "learning_rate": 4.820827632026e-06, + "loss": 0.5627, + "step": 5990 + }, + { + "epoch": 0.53, + "grad_norm": 13.315136565154306, + "learning_rate": 4.819406245355926e-06, + "loss": 0.7551, + "step": 5991 + }, + { + "epoch": 0.53, + "grad_norm": 40.9779439352763, + "learning_rate": 4.817984873299199e-06, + "loss": 0.8055, + "step": 5992 + }, + { + "epoch": 0.53, + "grad_norm": 6.2874622113290295, + "learning_rate": 4.816563515970833e-06, + "loss": 0.6427, + "step": 5993 + }, + { + "epoch": 0.53, + "grad_norm": 2.6949396729760675, + "learning_rate": 4.815142173485842e-06, + "loss": 0.4732, + "step": 5994 + }, + { + "epoch": 0.53, + "grad_norm": 11.985251054136466, + "learning_rate": 4.81372084595924e-06, + "loss": 0.8632, + "step": 5995 + }, + { + "epoch": 0.53, + "grad_norm": 16.53580544701229, + "learning_rate": 4.8122995335060365e-06, + "loss": 0.9076, + "step": 5996 + }, + { + "epoch": 0.53, + "grad_norm": 8.402612072941368, + "learning_rate": 4.810878236241241e-06, + "loss": 0.6275, + "step": 5997 + }, + { + "epoch": 0.53, + "grad_norm": 
11.952158591731784, + "learning_rate": 4.809456954279866e-06, + "loss": 0.7149, + "step": 5998 + }, + { + "epoch": 0.53, + "grad_norm": 13.085307956634376, + "learning_rate": 4.808035687736918e-06, + "loss": 0.9617, + "step": 5999 + }, + { + "epoch": 0.53, + "grad_norm": 10.794155962160273, + "learning_rate": 4.806614436727401e-06, + "loss": 0.7528, + "step": 6000 + }, + { + "epoch": 0.53, + "grad_norm": 14.00751876542753, + "learning_rate": 4.805193201366323e-06, + "loss": 0.7484, + "step": 6001 + }, + { + "epoch": 0.53, + "grad_norm": 7.312174255323384, + "learning_rate": 4.803771981768687e-06, + "loss": 0.8236, + "step": 6002 + }, + { + "epoch": 0.53, + "grad_norm": 9.222841412525257, + "learning_rate": 4.802350778049495e-06, + "loss": 0.7859, + "step": 6003 + }, + { + "epoch": 0.53, + "grad_norm": 25.82819932869598, + "learning_rate": 4.8009295903237504e-06, + "loss": 1.0182, + "step": 6004 + }, + { + "epoch": 0.53, + "grad_norm": 8.940488612529593, + "learning_rate": 4.799508418706452e-06, + "loss": 0.7921, + "step": 6005 + }, + { + "epoch": 0.53, + "grad_norm": 10.425777688343796, + "learning_rate": 4.7980872633125974e-06, + "loss": 0.7412, + "step": 6006 + }, + { + "epoch": 0.53, + "grad_norm": 10.745647088587765, + "learning_rate": 4.796666124257187e-06, + "loss": 0.7396, + "step": 6007 + }, + { + "epoch": 0.53, + "grad_norm": 7.928626787978304, + "learning_rate": 4.795245001655216e-06, + "loss": 0.753, + "step": 6008 + }, + { + "epoch": 0.53, + "grad_norm": 7.385654741516115, + "learning_rate": 4.793823895621678e-06, + "loss": 0.9116, + "step": 6009 + }, + { + "epoch": 0.53, + "grad_norm": 21.40480368519812, + "learning_rate": 4.7924028062715675e-06, + "loss": 0.6345, + "step": 6010 + }, + { + "epoch": 0.53, + "grad_norm": 21.089553237495668, + "learning_rate": 4.790981733719877e-06, + "loss": 0.6586, + "step": 6011 + }, + { + "epoch": 0.53, + "grad_norm": 10.444724233438732, + "learning_rate": 4.789560678081596e-06, + "loss": 0.7651, + "step": 6012 + }, + { + "epoch": 0.53, + "grad_norm": 11.129500416259596, + "learning_rate": 4.788139639471716e-06, + "loss": 0.9742, + "step": 6013 + }, + { + "epoch": 0.53, + "grad_norm": 9.33649670031719, + "learning_rate": 4.786718618005223e-06, + "loss": 0.5113, + "step": 6014 + }, + { + "epoch": 0.53, + "grad_norm": 8.734822658568007, + "learning_rate": 4.785297613797105e-06, + "loss": 0.7832, + "step": 6015 + }, + { + "epoch": 0.53, + "grad_norm": 10.682626651032017, + "learning_rate": 4.783876626962349e-06, + "loss": 0.818, + "step": 6016 + }, + { + "epoch": 0.53, + "grad_norm": 6.056002665780192, + "learning_rate": 4.7824556576159335e-06, + "loss": 0.862, + "step": 6017 + }, + { + "epoch": 0.53, + "grad_norm": 6.449520980431395, + "learning_rate": 4.781034705872846e-06, + "loss": 0.7351, + "step": 6018 + }, + { + "epoch": 0.53, + "grad_norm": 19.616503617081854, + "learning_rate": 4.779613771848066e-06, + "loss": 0.8484, + "step": 6019 + }, + { + "epoch": 0.53, + "grad_norm": 13.518075954476835, + "learning_rate": 4.778192855656572e-06, + "loss": 0.8348, + "step": 6020 + }, + { + "epoch": 0.53, + "grad_norm": 8.364673574886089, + "learning_rate": 4.776771957413344e-06, + "loss": 0.7894, + "step": 6021 + }, + { + "epoch": 0.53, + "grad_norm": 6.365109898449248, + "learning_rate": 4.775351077233358e-06, + "loss": 0.6085, + "step": 6022 + }, + { + "epoch": 0.53, + "grad_norm": 2.598872676656552, + "learning_rate": 4.773930215231588e-06, + "loss": 0.5461, + "step": 6023 + }, + { + "epoch": 0.53, + "grad_norm": 14.65783425142131, + 
"learning_rate": 4.7725093715230105e-06, + "loss": 0.8515, + "step": 6024 + }, + { + "epoch": 0.53, + "grad_norm": 10.55372895476097, + "learning_rate": 4.771088546222596e-06, + "loss": 0.6962, + "step": 6025 + }, + { + "epoch": 0.53, + "grad_norm": 2.5537971347152415, + "learning_rate": 4.769667739445314e-06, + "loss": 0.5069, + "step": 6026 + }, + { + "epoch": 0.53, + "grad_norm": 8.097546393326091, + "learning_rate": 4.768246951306137e-06, + "loss": 0.8461, + "step": 6027 + }, + { + "epoch": 0.53, + "grad_norm": 11.376638907845283, + "learning_rate": 4.766826181920031e-06, + "loss": 0.846, + "step": 6028 + }, + { + "epoch": 0.53, + "grad_norm": 16.967738562874256, + "learning_rate": 4.765405431401961e-06, + "loss": 0.8492, + "step": 6029 + }, + { + "epoch": 0.53, + "grad_norm": 4.851737746455539, + "learning_rate": 4.763984699866895e-06, + "loss": 0.764, + "step": 6030 + }, + { + "epoch": 0.53, + "grad_norm": 7.907895227431327, + "learning_rate": 4.7625639874297945e-06, + "loss": 0.729, + "step": 6031 + }, + { + "epoch": 0.53, + "grad_norm": 3.2228549600932093, + "learning_rate": 4.76114329420562e-06, + "loss": 0.5099, + "step": 6032 + }, + { + "epoch": 0.53, + "grad_norm": 13.348762002453457, + "learning_rate": 4.759722620309334e-06, + "loss": 0.8045, + "step": 6033 + }, + { + "epoch": 0.53, + "grad_norm": 9.815667375564553, + "learning_rate": 4.758301965855893e-06, + "loss": 0.9285, + "step": 6034 + }, + { + "epoch": 0.53, + "grad_norm": 13.086966637681765, + "learning_rate": 4.756881330960255e-06, + "loss": 0.7095, + "step": 6035 + }, + { + "epoch": 0.53, + "grad_norm": 7.970985165707232, + "learning_rate": 4.7554607157373764e-06, + "loss": 0.7829, + "step": 6036 + }, + { + "epoch": 0.53, + "grad_norm": 44.326611793969725, + "learning_rate": 4.754040120302208e-06, + "loss": 0.8792, + "step": 6037 + }, + { + "epoch": 0.53, + "grad_norm": 9.065322623214483, + "learning_rate": 4.7526195447697045e-06, + "loss": 0.9529, + "step": 6038 + }, + { + "epoch": 0.53, + "grad_norm": 5.545871931399285, + "learning_rate": 4.751198989254817e-06, + "loss": 0.8052, + "step": 6039 + }, + { + "epoch": 0.53, + "grad_norm": 5.233127174090253, + "learning_rate": 4.7497784538724925e-06, + "loss": 0.7516, + "step": 6040 + }, + { + "epoch": 0.53, + "grad_norm": 14.647973472248083, + "learning_rate": 4.74835793873768e-06, + "loss": 0.8011, + "step": 6041 + }, + { + "epoch": 0.53, + "grad_norm": 8.027767261748368, + "learning_rate": 4.746937443965324e-06, + "loss": 0.704, + "step": 6042 + }, + { + "epoch": 0.53, + "grad_norm": 5.790619644811541, + "learning_rate": 4.745516969670369e-06, + "loss": 0.7427, + "step": 6043 + }, + { + "epoch": 0.53, + "grad_norm": 13.653600230677998, + "learning_rate": 4.744096515967757e-06, + "loss": 0.7759, + "step": 6044 + }, + { + "epoch": 0.53, + "grad_norm": 9.565122481174237, + "learning_rate": 4.742676082972431e-06, + "loss": 0.8133, + "step": 6045 + }, + { + "epoch": 0.53, + "grad_norm": 10.255279106053006, + "learning_rate": 4.7412556707993265e-06, + "loss": 0.7232, + "step": 6046 + }, + { + "epoch": 0.53, + "grad_norm": 10.508082789496079, + "learning_rate": 4.739835279563384e-06, + "loss": 0.8187, + "step": 6047 + }, + { + "epoch": 0.53, + "grad_norm": 5.79252308328596, + "learning_rate": 4.738414909379538e-06, + "loss": 0.8519, + "step": 6048 + }, + { + "epoch": 0.53, + "grad_norm": 12.685676317895235, + "learning_rate": 4.736994560362721e-06, + "loss": 0.9193, + "step": 6049 + }, + { + "epoch": 0.53, + "grad_norm": 9.199786613024589, + "learning_rate": 
4.735574232627868e-06, + "loss": 0.854, + "step": 6050 + }, + { + "epoch": 0.53, + "grad_norm": 9.195256510281082, + "learning_rate": 4.7341539262899075e-06, + "loss": 0.7632, + "step": 6051 + }, + { + "epoch": 0.53, + "grad_norm": 5.635167807157764, + "learning_rate": 4.732733641463769e-06, + "loss": 0.652, + "step": 6052 + }, + { + "epoch": 0.53, + "grad_norm": 9.857593655828794, + "learning_rate": 4.73131337826438e-06, + "loss": 0.8189, + "step": 6053 + }, + { + "epoch": 0.53, + "grad_norm": 10.533668439713622, + "learning_rate": 4.729893136806665e-06, + "loss": 0.7584, + "step": 6054 + }, + { + "epoch": 0.53, + "grad_norm": 10.947157573692689, + "learning_rate": 4.728472917205549e-06, + "loss": 0.8568, + "step": 6055 + }, + { + "epoch": 0.53, + "grad_norm": 9.412907071997429, + "learning_rate": 4.727052719575953e-06, + "loss": 0.8024, + "step": 6056 + }, + { + "epoch": 0.53, + "grad_norm": 9.166173339298489, + "learning_rate": 4.725632544032796e-06, + "loss": 0.8525, + "step": 6057 + }, + { + "epoch": 0.53, + "grad_norm": 14.94763349389282, + "learning_rate": 4.7242123906909975e-06, + "loss": 0.7297, + "step": 6058 + }, + { + "epoch": 0.53, + "grad_norm": 8.08713251509129, + "learning_rate": 4.722792259665474e-06, + "loss": 0.9352, + "step": 6059 + }, + { + "epoch": 0.53, + "grad_norm": 7.815090874490062, + "learning_rate": 4.721372151071139e-06, + "loss": 0.6628, + "step": 6060 + }, + { + "epoch": 0.53, + "grad_norm": 2.1746813936032345, + "learning_rate": 4.719952065022906e-06, + "loss": 0.4108, + "step": 6061 + }, + { + "epoch": 0.53, + "grad_norm": 9.508885116027558, + "learning_rate": 4.7185320016356865e-06, + "loss": 0.7595, + "step": 6062 + }, + { + "epoch": 0.53, + "grad_norm": 11.818963262168078, + "learning_rate": 4.717111961024388e-06, + "loss": 0.697, + "step": 6063 + }, + { + "epoch": 0.53, + "grad_norm": 8.433236968388803, + "learning_rate": 4.715691943303922e-06, + "loss": 0.7378, + "step": 6064 + }, + { + "epoch": 0.53, + "grad_norm": 9.194187936260821, + "learning_rate": 4.714271948589189e-06, + "loss": 0.8081, + "step": 6065 + }, + { + "epoch": 0.53, + "grad_norm": 6.923858391344621, + "learning_rate": 4.712851976995094e-06, + "loss": 0.7106, + "step": 6066 + }, + { + "epoch": 0.53, + "grad_norm": 7.948523022417157, + "learning_rate": 4.711432028636541e-06, + "loss": 0.7066, + "step": 6067 + }, + { + "epoch": 0.53, + "grad_norm": 11.807390509790958, + "learning_rate": 4.710012103628428e-06, + "loss": 0.8095, + "step": 6068 + }, + { + "epoch": 0.53, + "grad_norm": 11.49398766011647, + "learning_rate": 4.7085922020856515e-06, + "loss": 0.5542, + "step": 6069 + }, + { + "epoch": 0.53, + "grad_norm": 9.808668271750742, + "learning_rate": 4.707172324123111e-06, + "loss": 0.7465, + "step": 6070 + }, + { + "epoch": 0.53, + "grad_norm": 7.133659170064362, + "learning_rate": 4.705752469855699e-06, + "loss": 0.692, + "step": 6071 + }, + { + "epoch": 0.53, + "grad_norm": 9.102527911854255, + "learning_rate": 4.704332639398307e-06, + "loss": 0.7426, + "step": 6072 + }, + { + "epoch": 0.53, + "grad_norm": 3.6060989657386826, + "learning_rate": 4.7029128328658255e-06, + "loss": 0.5863, + "step": 6073 + }, + { + "epoch": 0.53, + "grad_norm": 7.109625910783146, + "learning_rate": 4.701493050373144e-06, + "loss": 0.8469, + "step": 6074 + }, + { + "epoch": 0.53, + "grad_norm": 8.356363529915994, + "learning_rate": 4.700073292035149e-06, + "loss": 0.8101, + "step": 6075 + }, + { + "epoch": 0.53, + "grad_norm": 10.657859044891305, + "learning_rate": 4.698653557966723e-06, + "loss": 
0.6131, + "step": 6076 + }, + { + "epoch": 0.53, + "grad_norm": 9.771949238219415, + "learning_rate": 4.69723384828275e-06, + "loss": 0.941, + "step": 6077 + }, + { + "epoch": 0.53, + "grad_norm": 6.863825677740131, + "learning_rate": 4.695814163098111e-06, + "loss": 0.7927, + "step": 6078 + }, + { + "epoch": 0.53, + "grad_norm": 6.111213133318179, + "learning_rate": 4.694394502527685e-06, + "loss": 0.7468, + "step": 6079 + }, + { + "epoch": 0.53, + "grad_norm": 9.065705402746183, + "learning_rate": 4.692974866686345e-06, + "loss": 0.7726, + "step": 6080 + }, + { + "epoch": 0.53, + "grad_norm": 7.137307555279101, + "learning_rate": 4.691555255688969e-06, + "loss": 0.7426, + "step": 6081 + }, + { + "epoch": 0.53, + "grad_norm": 12.620074509905304, + "learning_rate": 4.69013566965043e-06, + "loss": 0.8646, + "step": 6082 + }, + { + "epoch": 0.53, + "grad_norm": 11.82807383672708, + "learning_rate": 4.688716108685595e-06, + "loss": 0.7675, + "step": 6083 + }, + { + "epoch": 0.53, + "grad_norm": 2.2687428478771783, + "learning_rate": 4.687296572909336e-06, + "loss": 0.4998, + "step": 6084 + }, + { + "epoch": 0.53, + "grad_norm": 2.8882645328668746, + "learning_rate": 4.685877062436519e-06, + "loss": 0.6139, + "step": 6085 + }, + { + "epoch": 0.53, + "grad_norm": 8.684986495694183, + "learning_rate": 4.6844575773820055e-06, + "loss": 0.6759, + "step": 6086 + }, + { + "epoch": 0.53, + "grad_norm": 3.145009652228294, + "learning_rate": 4.683038117860662e-06, + "loss": 0.6505, + "step": 6087 + }, + { + "epoch": 0.53, + "grad_norm": 14.475928533701161, + "learning_rate": 4.681618683987346e-06, + "loss": 0.752, + "step": 6088 + }, + { + "epoch": 0.53, + "grad_norm": 6.331755680307714, + "learning_rate": 4.680199275876915e-06, + "loss": 0.6643, + "step": 6089 + }, + { + "epoch": 0.53, + "grad_norm": 13.440644743972474, + "learning_rate": 4.6787798936442285e-06, + "loss": 0.7988, + "step": 6090 + }, + { + "epoch": 0.53, + "grad_norm": 8.200185672888962, + "learning_rate": 4.677360537404139e-06, + "loss": 0.6094, + "step": 6091 + }, + { + "epoch": 0.54, + "grad_norm": 12.55482723974797, + "learning_rate": 4.675941207271496e-06, + "loss": 0.6912, + "step": 6092 + }, + { + "epoch": 0.54, + "grad_norm": 12.128527034716088, + "learning_rate": 4.674521903361153e-06, + "loss": 0.6686, + "step": 6093 + }, + { + "epoch": 0.54, + "grad_norm": 12.648344527254586, + "learning_rate": 4.673102625787956e-06, + "loss": 0.7818, + "step": 6094 + }, + { + "epoch": 0.54, + "grad_norm": 6.90039636792742, + "learning_rate": 4.671683374666751e-06, + "loss": 0.8048, + "step": 6095 + }, + { + "epoch": 0.54, + "grad_norm": 8.665483072678022, + "learning_rate": 4.670264150112381e-06, + "loss": 0.7435, + "step": 6096 + }, + { + "epoch": 0.54, + "grad_norm": 3.068526123419759, + "learning_rate": 4.668844952239686e-06, + "loss": 0.6187, + "step": 6097 + }, + { + "epoch": 0.54, + "grad_norm": 9.161009042045567, + "learning_rate": 4.6674257811635085e-06, + "loss": 0.8664, + "step": 6098 + }, + { + "epoch": 0.54, + "grad_norm": 11.147502133405363, + "learning_rate": 4.666006636998683e-06, + "loss": 0.6692, + "step": 6099 + }, + { + "epoch": 0.54, + "grad_norm": 11.757409833530275, + "learning_rate": 4.6645875198600446e-06, + "loss": 0.868, + "step": 6100 + }, + { + "epoch": 0.54, + "grad_norm": 19.562506751428828, + "learning_rate": 4.663168429862427e-06, + "loss": 0.7632, + "step": 6101 + }, + { + "epoch": 0.54, + "grad_norm": 10.673321730731232, + "learning_rate": 4.66174936712066e-06, + "loss": 0.815, + "step": 6102 + }, + { + 
"epoch": 0.54, + "grad_norm": 14.50600327081407, + "learning_rate": 4.660330331749569e-06, + "loss": 0.6578, + "step": 6103 + }, + { + "epoch": 0.54, + "grad_norm": 10.868280135702285, + "learning_rate": 4.658911323863985e-06, + "loss": 0.7217, + "step": 6104 + }, + { + "epoch": 0.54, + "grad_norm": 8.668437753993635, + "learning_rate": 4.657492343578729e-06, + "loss": 0.768, + "step": 6105 + }, + { + "epoch": 0.54, + "grad_norm": 8.87456520060904, + "learning_rate": 4.656073391008622e-06, + "loss": 0.7133, + "step": 6106 + }, + { + "epoch": 0.54, + "grad_norm": 9.974482993547488, + "learning_rate": 4.654654466268485e-06, + "loss": 0.8529, + "step": 6107 + }, + { + "epoch": 0.54, + "grad_norm": 9.28445353434963, + "learning_rate": 4.653235569473134e-06, + "loss": 0.8404, + "step": 6108 + }, + { + "epoch": 0.54, + "grad_norm": 7.603366625856516, + "learning_rate": 4.651816700737383e-06, + "loss": 0.7509, + "step": 6109 + }, + { + "epoch": 0.54, + "grad_norm": 8.584942944631203, + "learning_rate": 4.650397860176046e-06, + "loss": 0.7391, + "step": 6110 + }, + { + "epoch": 0.54, + "grad_norm": 18.24412576694174, + "learning_rate": 4.648979047903933e-06, + "loss": 0.8034, + "step": 6111 + }, + { + "epoch": 0.54, + "grad_norm": 13.132451120952677, + "learning_rate": 4.647560264035851e-06, + "loss": 0.8024, + "step": 6112 + }, + { + "epoch": 0.54, + "grad_norm": 9.666550847898131, + "learning_rate": 4.646141508686606e-06, + "loss": 0.6308, + "step": 6113 + }, + { + "epoch": 0.54, + "grad_norm": 12.787612213575086, + "learning_rate": 4.6447227819710014e-06, + "loss": 0.8274, + "step": 6114 + }, + { + "epoch": 0.54, + "grad_norm": 3.707892926908572, + "learning_rate": 4.643304084003839e-06, + "loss": 0.5941, + "step": 6115 + }, + { + "epoch": 0.54, + "grad_norm": 14.223759289049712, + "learning_rate": 4.641885414899917e-06, + "loss": 0.8054, + "step": 6116 + }, + { + "epoch": 0.54, + "grad_norm": 11.35735881814492, + "learning_rate": 4.64046677477403e-06, + "loss": 0.6898, + "step": 6117 + }, + { + "epoch": 0.54, + "grad_norm": 8.255109973425787, + "learning_rate": 4.639048163740975e-06, + "loss": 0.7742, + "step": 6118 + }, + { + "epoch": 0.54, + "grad_norm": 7.9962830490835515, + "learning_rate": 4.637629581915543e-06, + "loss": 0.749, + "step": 6119 + }, + { + "epoch": 0.54, + "grad_norm": 7.068307011970124, + "learning_rate": 4.63621102941252e-06, + "loss": 0.8287, + "step": 6120 + }, + { + "epoch": 0.54, + "grad_norm": 7.139711457793727, + "learning_rate": 4.634792506346697e-06, + "loss": 0.7556, + "step": 6121 + }, + { + "epoch": 0.54, + "grad_norm": 9.239671544991559, + "learning_rate": 4.633374012832857e-06, + "loss": 0.906, + "step": 6122 + }, + { + "epoch": 0.54, + "grad_norm": 2.926210443044263, + "learning_rate": 4.631955548985781e-06, + "loss": 0.5235, + "step": 6123 + }, + { + "epoch": 0.54, + "grad_norm": 8.79811312867659, + "learning_rate": 4.63053711492025e-06, + "loss": 0.6089, + "step": 6124 + }, + { + "epoch": 0.54, + "grad_norm": 9.101861809916462, + "learning_rate": 4.629118710751043e-06, + "loss": 0.6889, + "step": 6125 + }, + { + "epoch": 0.54, + "grad_norm": 11.873424372464774, + "learning_rate": 4.62770033659293e-06, + "loss": 0.6179, + "step": 6126 + }, + { + "epoch": 0.54, + "grad_norm": 3.316327001987218, + "learning_rate": 4.626281992560688e-06, + "loss": 0.5619, + "step": 6127 + }, + { + "epoch": 0.54, + "grad_norm": 12.633456534748749, + "learning_rate": 4.624863678769086e-06, + "loss": 0.6707, + "step": 6128 + }, + { + "epoch": 0.54, + "grad_norm": 
9.620541761815527, + "learning_rate": 4.62344539533289e-06, + "loss": 0.7391, + "step": 6129 + }, + { + "epoch": 0.54, + "grad_norm": 7.9640857350162735, + "learning_rate": 4.622027142366867e-06, + "loss": 0.7732, + "step": 6130 + }, + { + "epoch": 0.54, + "grad_norm": 9.621074874951029, + "learning_rate": 4.620608919985779e-06, + "loss": 0.6569, + "step": 6131 + }, + { + "epoch": 0.54, + "grad_norm": 10.606453132114382, + "learning_rate": 4.6191907283043855e-06, + "loss": 0.7791, + "step": 6132 + }, + { + "epoch": 0.54, + "grad_norm": 9.035093947752385, + "learning_rate": 4.617772567437445e-06, + "loss": 0.928, + "step": 6133 + }, + { + "epoch": 0.54, + "grad_norm": 7.577239592988347, + "learning_rate": 4.616354437499712e-06, + "loss": 0.6429, + "step": 6134 + }, + { + "epoch": 0.54, + "grad_norm": 19.365076915587355, + "learning_rate": 4.614936338605941e-06, + "loss": 0.78, + "step": 6135 + }, + { + "epoch": 0.54, + "grad_norm": 11.004619268093215, + "learning_rate": 4.613518270870881e-06, + "loss": 0.7106, + "step": 6136 + }, + { + "epoch": 0.54, + "grad_norm": 13.477356153910822, + "learning_rate": 4.612100234409277e-06, + "loss": 0.657, + "step": 6137 + }, + { + "epoch": 0.54, + "grad_norm": 17.44248373009694, + "learning_rate": 4.61068222933588e-06, + "loss": 0.8012, + "step": 6138 + }, + { + "epoch": 0.54, + "grad_norm": 10.196444884853957, + "learning_rate": 4.609264255765429e-06, + "loss": 0.5827, + "step": 6139 + }, + { + "epoch": 0.54, + "grad_norm": 19.80970683307842, + "learning_rate": 4.607846313812662e-06, + "loss": 0.8312, + "step": 6140 + }, + { + "epoch": 0.54, + "grad_norm": 15.637376296817743, + "learning_rate": 4.606428403592321e-06, + "loss": 0.756, + "step": 6141 + }, + { + "epoch": 0.54, + "grad_norm": 7.367654660684729, + "learning_rate": 4.605010525219139e-06, + "loss": 0.793, + "step": 6142 + }, + { + "epoch": 0.54, + "grad_norm": 7.632880311764945, + "learning_rate": 4.603592678807847e-06, + "loss": 0.7163, + "step": 6143 + }, + { + "epoch": 0.54, + "grad_norm": 3.1642076186462917, + "learning_rate": 4.602174864473177e-06, + "loss": 0.58, + "step": 6144 + }, + { + "epoch": 0.54, + "grad_norm": 10.185729507722918, + "learning_rate": 4.600757082329856e-06, + "loss": 0.738, + "step": 6145 + }, + { + "epoch": 0.54, + "grad_norm": 11.549162596048431, + "learning_rate": 4.5993393324926065e-06, + "loss": 0.7409, + "step": 6146 + }, + { + "epoch": 0.54, + "grad_norm": 8.051619298483834, + "learning_rate": 4.5979216150761514e-06, + "loss": 0.7817, + "step": 6147 + }, + { + "epoch": 0.54, + "grad_norm": 7.505517475670673, + "learning_rate": 4.596503930195212e-06, + "loss": 0.6846, + "step": 6148 + }, + { + "epoch": 0.54, + "grad_norm": 5.642985465683764, + "learning_rate": 4.5950862779645015e-06, + "loss": 0.6549, + "step": 6149 + }, + { + "epoch": 0.54, + "grad_norm": 11.880773361400696, + "learning_rate": 4.593668658498737e-06, + "loss": 0.8037, + "step": 6150 + }, + { + "epoch": 0.54, + "grad_norm": 9.135515994697606, + "learning_rate": 4.592251071912628e-06, + "loss": 0.7487, + "step": 6151 + }, + { + "epoch": 0.54, + "grad_norm": 8.660336351323272, + "learning_rate": 4.590833518320885e-06, + "loss": 0.6522, + "step": 6152 + }, + { + "epoch": 0.54, + "grad_norm": 6.793336875074735, + "learning_rate": 4.589415997838214e-06, + "loss": 0.6484, + "step": 6153 + }, + { + "epoch": 0.54, + "grad_norm": 16.939669119219683, + "learning_rate": 4.5879985105793145e-06, + "loss": 0.6661, + "step": 6154 + }, + { + "epoch": 0.54, + "grad_norm": 7.048461394157464, + "learning_rate": 
4.586581056658893e-06, + "loss": 0.7526, + "step": 6155 + }, + { + "epoch": 0.54, + "grad_norm": 2.715927839047253, + "learning_rate": 4.585163636191643e-06, + "loss": 0.4503, + "step": 6156 + }, + { + "epoch": 0.54, + "grad_norm": 14.396754035393418, + "learning_rate": 4.583746249292262e-06, + "loss": 0.7092, + "step": 6157 + }, + { + "epoch": 0.54, + "grad_norm": 7.378561940061582, + "learning_rate": 4.582328896075443e-06, + "loss": 0.7329, + "step": 6158 + }, + { + "epoch": 0.54, + "grad_norm": 8.795235575023085, + "learning_rate": 4.580911576655874e-06, + "loss": 0.756, + "step": 6159 + }, + { + "epoch": 0.54, + "grad_norm": 3.2098346674574336, + "learning_rate": 4.579494291148242e-06, + "loss": 0.4808, + "step": 6160 + }, + { + "epoch": 0.54, + "grad_norm": 2.7871229807813673, + "learning_rate": 4.578077039667235e-06, + "loss": 0.5654, + "step": 6161 + }, + { + "epoch": 0.54, + "grad_norm": 10.155408223543343, + "learning_rate": 4.576659822327531e-06, + "loss": 1.0523, + "step": 6162 + }, + { + "epoch": 0.54, + "grad_norm": 22.871977413659952, + "learning_rate": 4.575242639243809e-06, + "loss": 0.8509, + "step": 6163 + }, + { + "epoch": 0.54, + "grad_norm": 10.963917175217613, + "learning_rate": 4.5738254905307475e-06, + "loss": 0.9172, + "step": 6164 + }, + { + "epoch": 0.54, + "grad_norm": 14.160841215087409, + "learning_rate": 4.572408376303018e-06, + "loss": 0.7008, + "step": 6165 + }, + { + "epoch": 0.54, + "grad_norm": 16.20276299935354, + "learning_rate": 4.57099129667529e-06, + "loss": 0.8501, + "step": 6166 + }, + { + "epoch": 0.54, + "grad_norm": 12.10113390749903, + "learning_rate": 4.569574251762234e-06, + "loss": 0.8455, + "step": 6167 + }, + { + "epoch": 0.54, + "grad_norm": 5.846590619313465, + "learning_rate": 4.568157241678515e-06, + "loss": 0.7503, + "step": 6168 + }, + { + "epoch": 0.54, + "grad_norm": 11.188687534121334, + "learning_rate": 4.566740266538791e-06, + "loss": 0.8765, + "step": 6169 + }, + { + "epoch": 0.54, + "grad_norm": 9.042636238901698, + "learning_rate": 4.565323326457725e-06, + "loss": 0.8152, + "step": 6170 + }, + { + "epoch": 0.54, + "grad_norm": 3.461172467505548, + "learning_rate": 4.563906421549972e-06, + "loss": 0.5652, + "step": 6171 + }, + { + "epoch": 0.54, + "grad_norm": 9.35821485195258, + "learning_rate": 4.562489551930187e-06, + "loss": 0.7636, + "step": 6172 + }, + { + "epoch": 0.54, + "grad_norm": 8.546135259233042, + "learning_rate": 4.561072717713019e-06, + "loss": 0.8578, + "step": 6173 + }, + { + "epoch": 0.54, + "grad_norm": 7.625008861892174, + "learning_rate": 4.559655919013116e-06, + "loss": 0.7756, + "step": 6174 + }, + { + "epoch": 0.54, + "grad_norm": 33.6266447697594, + "learning_rate": 4.558239155945125e-06, + "loss": 0.8061, + "step": 6175 + }, + { + "epoch": 0.54, + "grad_norm": 5.836476165318401, + "learning_rate": 4.556822428623686e-06, + "loss": 0.6579, + "step": 6176 + }, + { + "epoch": 0.54, + "grad_norm": 6.11615748619319, + "learning_rate": 4.555405737163438e-06, + "loss": 0.8512, + "step": 6177 + }, + { + "epoch": 0.54, + "grad_norm": 3.1341371787875136, + "learning_rate": 4.553989081679019e-06, + "loss": 0.5445, + "step": 6178 + }, + { + "epoch": 0.54, + "grad_norm": 7.647403072683488, + "learning_rate": 4.552572462285063e-06, + "loss": 0.6474, + "step": 6179 + }, + { + "epoch": 0.54, + "grad_norm": 8.228702392165081, + "learning_rate": 4.551155879096196e-06, + "loss": 0.5768, + "step": 6180 + }, + { + "epoch": 0.54, + "grad_norm": 8.348930035902283, + "learning_rate": 4.549739332227051e-06, + "loss": 
0.6947, + "step": 6181 + }, + { + "epoch": 0.54, + "grad_norm": 3.6864110948997055, + "learning_rate": 4.54832282179225e-06, + "loss": 0.5983, + "step": 6182 + }, + { + "epoch": 0.54, + "grad_norm": 12.89947496389089, + "learning_rate": 4.546906347906414e-06, + "loss": 0.7308, + "step": 6183 + }, + { + "epoch": 0.54, + "grad_norm": 7.278388515996149, + "learning_rate": 4.545489910684164e-06, + "loss": 0.7918, + "step": 6184 + }, + { + "epoch": 0.54, + "grad_norm": 15.984125215959343, + "learning_rate": 4.544073510240114e-06, + "loss": 0.9156, + "step": 6185 + }, + { + "epoch": 0.54, + "grad_norm": 15.552725471854979, + "learning_rate": 4.542657146688876e-06, + "loss": 0.7588, + "step": 6186 + }, + { + "epoch": 0.54, + "grad_norm": 9.848431999604472, + "learning_rate": 4.541240820145062e-06, + "loss": 0.8767, + "step": 6187 + }, + { + "epoch": 0.54, + "grad_norm": 6.425988922964754, + "learning_rate": 4.539824530723277e-06, + "loss": 0.7946, + "step": 6188 + }, + { + "epoch": 0.54, + "grad_norm": 15.557976077597985, + "learning_rate": 4.538408278538125e-06, + "loss": 0.7945, + "step": 6189 + }, + { + "epoch": 0.54, + "grad_norm": 3.203235850251115, + "learning_rate": 4.536992063704209e-06, + "loss": 0.4551, + "step": 6190 + }, + { + "epoch": 0.54, + "grad_norm": 9.79774070661425, + "learning_rate": 4.535575886336124e-06, + "loss": 0.721, + "step": 6191 + }, + { + "epoch": 0.54, + "grad_norm": 10.22185546677122, + "learning_rate": 4.534159746548466e-06, + "loss": 0.6683, + "step": 6192 + }, + { + "epoch": 0.54, + "grad_norm": 5.510520660667884, + "learning_rate": 4.532743644455827e-06, + "loss": 0.6991, + "step": 6193 + }, + { + "epoch": 0.54, + "grad_norm": 5.759954459593295, + "learning_rate": 4.531327580172794e-06, + "loss": 0.6998, + "step": 6194 + }, + { + "epoch": 0.54, + "grad_norm": 10.398887070593343, + "learning_rate": 4.529911553813954e-06, + "loss": 0.7457, + "step": 6195 + }, + { + "epoch": 0.54, + "grad_norm": 9.760610120240207, + "learning_rate": 4.5284955654938895e-06, + "loss": 0.7719, + "step": 6196 + }, + { + "epoch": 0.54, + "grad_norm": 7.06749930189759, + "learning_rate": 4.527079615327179e-06, + "loss": 0.9337, + "step": 6197 + }, + { + "epoch": 0.54, + "grad_norm": 9.468968603295439, + "learning_rate": 4.5256637034283995e-06, + "loss": 0.7292, + "step": 6198 + }, + { + "epoch": 0.54, + "grad_norm": 7.913636961680052, + "learning_rate": 4.5242478299121245e-06, + "loss": 0.7746, + "step": 6199 + }, + { + "epoch": 0.54, + "grad_norm": 5.8711165105282666, + "learning_rate": 4.522831994892923e-06, + "loss": 0.6437, + "step": 6200 + }, + { + "epoch": 0.54, + "grad_norm": 10.693656019233899, + "learning_rate": 4.5214161984853636e-06, + "loss": 0.7342, + "step": 6201 + }, + { + "epoch": 0.54, + "grad_norm": 4.963518367157353, + "learning_rate": 4.520000440804009e-06, + "loss": 0.6539, + "step": 6202 + }, + { + "epoch": 0.54, + "grad_norm": 8.36731920329718, + "learning_rate": 4.51858472196342e-06, + "loss": 0.8394, + "step": 6203 + }, + { + "epoch": 0.54, + "grad_norm": 14.512042957347807, + "learning_rate": 4.517169042078155e-06, + "loss": 0.7238, + "step": 6204 + }, + { + "epoch": 0.54, + "grad_norm": 7.7431700723861745, + "learning_rate": 4.515753401262767e-06, + "loss": 0.6087, + "step": 6205 + }, + { + "epoch": 0.55, + "grad_norm": 2.8760340858042945, + "learning_rate": 4.514337799631809e-06, + "loss": 0.6124, + "step": 6206 + }, + { + "epoch": 0.55, + "grad_norm": 14.48904268983375, + "learning_rate": 4.512922237299828e-06, + "loss": 0.6622, + "step": 6207 + }, + { + 
"epoch": 0.55, + "grad_norm": 11.297055482186675, + "learning_rate": 4.511506714381371e-06, + "loss": 0.6395, + "step": 6208 + }, + { + "epoch": 0.55, + "grad_norm": 8.876117392913372, + "learning_rate": 4.510091230990975e-06, + "loss": 0.8953, + "step": 6209 + }, + { + "epoch": 0.55, + "grad_norm": 12.569747785054256, + "learning_rate": 4.508675787243184e-06, + "loss": 0.7943, + "step": 6210 + }, + { + "epoch": 0.55, + "grad_norm": 10.687579718903255, + "learning_rate": 4.50726038325253e-06, + "loss": 0.6888, + "step": 6211 + }, + { + "epoch": 0.55, + "grad_norm": 2.7731112015459463, + "learning_rate": 4.5058450191335475e-06, + "loss": 0.5679, + "step": 6212 + }, + { + "epoch": 0.55, + "grad_norm": 75.88698611626351, + "learning_rate": 4.5044296950007634e-06, + "loss": 0.6535, + "step": 6213 + }, + { + "epoch": 0.55, + "grad_norm": 3.584045398132698, + "learning_rate": 4.503014410968703e-06, + "loss": 0.6068, + "step": 6214 + }, + { + "epoch": 0.55, + "grad_norm": 10.509289097887812, + "learning_rate": 4.501599167151891e-06, + "loss": 0.9366, + "step": 6215 + }, + { + "epoch": 0.55, + "grad_norm": 10.4871463475363, + "learning_rate": 4.5001839636648456e-06, + "loss": 0.7613, + "step": 6216 + }, + { + "epoch": 0.55, + "grad_norm": 5.484043444070995, + "learning_rate": 4.498768800622081e-06, + "loss": 0.7528, + "step": 6217 + }, + { + "epoch": 0.55, + "grad_norm": 57.93183762340443, + "learning_rate": 4.497353678138112e-06, + "loss": 0.6802, + "step": 6218 + }, + { + "epoch": 0.55, + "grad_norm": 9.249611465723877, + "learning_rate": 4.495938596327448e-06, + "loss": 0.7438, + "step": 6219 + }, + { + "epoch": 0.55, + "grad_norm": 8.576764317603981, + "learning_rate": 4.494523555304593e-06, + "loss": 0.7005, + "step": 6220 + }, + { + "epoch": 0.55, + "grad_norm": 10.387150710624159, + "learning_rate": 4.493108555184052e-06, + "loss": 0.8068, + "step": 6221 + }, + { + "epoch": 0.55, + "grad_norm": 10.986511319070523, + "learning_rate": 4.491693596080324e-06, + "loss": 0.696, + "step": 6222 + }, + { + "epoch": 0.55, + "grad_norm": 8.506683537456944, + "learning_rate": 4.490278678107903e-06, + "loss": 0.7182, + "step": 6223 + }, + { + "epoch": 0.55, + "grad_norm": 10.20931084238644, + "learning_rate": 4.488863801381285e-06, + "loss": 0.6602, + "step": 6224 + }, + { + "epoch": 0.55, + "grad_norm": 15.025179125310997, + "learning_rate": 4.487448966014958e-06, + "loss": 1.082, + "step": 6225 + }, + { + "epoch": 0.55, + "grad_norm": 5.97594199827293, + "learning_rate": 4.4860341721234065e-06, + "loss": 0.7432, + "step": 6226 + }, + { + "epoch": 0.55, + "grad_norm": 8.696288345650947, + "learning_rate": 4.484619419821116e-06, + "loss": 0.9553, + "step": 6227 + }, + { + "epoch": 0.55, + "grad_norm": 10.930823815283153, + "learning_rate": 4.483204709222565e-06, + "loss": 0.9659, + "step": 6228 + }, + { + "epoch": 0.55, + "grad_norm": 8.731199261684223, + "learning_rate": 4.481790040442229e-06, + "loss": 0.7376, + "step": 6229 + }, + { + "epoch": 0.55, + "grad_norm": 12.968644798085363, + "learning_rate": 4.480375413594581e-06, + "loss": 0.7835, + "step": 6230 + }, + { + "epoch": 0.55, + "grad_norm": 8.392155492992558, + "learning_rate": 4.478960828794089e-06, + "loss": 0.8786, + "step": 6231 + }, + { + "epoch": 0.55, + "grad_norm": 8.353173246438464, + "learning_rate": 4.477546286155221e-06, + "loss": 0.8762, + "step": 6232 + }, + { + "epoch": 0.55, + "grad_norm": 2.676990275674176, + "learning_rate": 4.476131785792439e-06, + "loss": 0.5296, + "step": 6233 + }, + { + "epoch": 0.55, + "grad_norm": 
9.870421950702816, + "learning_rate": 4.4747173278202e-06, + "loss": 0.6839, + "step": 6234 + }, + { + "epoch": 0.55, + "grad_norm": 8.819258861666459, + "learning_rate": 4.473302912352963e-06, + "loss": 0.6455, + "step": 6235 + }, + { + "epoch": 0.55, + "grad_norm": 7.63623429684275, + "learning_rate": 4.471888539505178e-06, + "loss": 0.6965, + "step": 6236 + }, + { + "epoch": 0.55, + "grad_norm": 24.890279740879123, + "learning_rate": 4.4704742093912925e-06, + "loss": 0.7741, + "step": 6237 + }, + { + "epoch": 0.55, + "grad_norm": 13.564437372711676, + "learning_rate": 4.469059922125753e-06, + "loss": 0.8189, + "step": 6238 + }, + { + "epoch": 0.55, + "grad_norm": 10.22891714215175, + "learning_rate": 4.467645677823004e-06, + "loss": 0.7162, + "step": 6239 + }, + { + "epoch": 0.55, + "grad_norm": 36.89977726167685, + "learning_rate": 4.466231476597478e-06, + "loss": 0.9386, + "step": 6240 + }, + { + "epoch": 0.55, + "grad_norm": 7.592268058876, + "learning_rate": 4.464817318563615e-06, + "loss": 0.685, + "step": 6241 + }, + { + "epoch": 0.55, + "grad_norm": 8.943701772601507, + "learning_rate": 4.463403203835845e-06, + "loss": 0.7049, + "step": 6242 + }, + { + "epoch": 0.55, + "grad_norm": 8.130411760868714, + "learning_rate": 4.461989132528593e-06, + "loss": 0.8069, + "step": 6243 + }, + { + "epoch": 0.55, + "grad_norm": 5.616601126634015, + "learning_rate": 4.4605751047562865e-06, + "loss": 0.6613, + "step": 6244 + }, + { + "epoch": 0.55, + "grad_norm": 13.75375901925643, + "learning_rate": 4.459161120633346e-06, + "loss": 0.8377, + "step": 6245 + }, + { + "epoch": 0.55, + "grad_norm": 7.522019876575242, + "learning_rate": 4.457747180274186e-06, + "loss": 0.746, + "step": 6246 + }, + { + "epoch": 0.55, + "grad_norm": 13.188790004540357, + "learning_rate": 4.456333283793224e-06, + "loss": 0.7759, + "step": 6247 + }, + { + "epoch": 0.55, + "grad_norm": 8.496146625437204, + "learning_rate": 4.4549194313048685e-06, + "loss": 0.6526, + "step": 6248 + }, + { + "epoch": 0.55, + "grad_norm": 8.233979518863809, + "learning_rate": 4.453505622923524e-06, + "loss": 0.827, + "step": 6249 + }, + { + "epoch": 0.55, + "grad_norm": 15.279800208640362, + "learning_rate": 4.452091858763598e-06, + "loss": 0.869, + "step": 6250 + }, + { + "epoch": 0.55, + "grad_norm": 2.6201169294511084, + "learning_rate": 4.450678138939485e-06, + "loss": 0.5262, + "step": 6251 + }, + { + "epoch": 0.55, + "grad_norm": 9.978995370396744, + "learning_rate": 4.449264463565584e-06, + "loss": 0.8124, + "step": 6252 + }, + { + "epoch": 0.55, + "grad_norm": 7.414284833834504, + "learning_rate": 4.4478508327562875e-06, + "loss": 0.7149, + "step": 6253 + }, + { + "epoch": 0.55, + "grad_norm": 8.073455950820033, + "learning_rate": 4.446437246625981e-06, + "loss": 0.814, + "step": 6254 + }, + { + "epoch": 0.55, + "grad_norm": 7.204497337945001, + "learning_rate": 4.445023705289055e-06, + "loss": 0.8709, + "step": 6255 + }, + { + "epoch": 0.55, + "grad_norm": 5.8545850571777205, + "learning_rate": 4.443610208859886e-06, + "loss": 0.8061, + "step": 6256 + }, + { + "epoch": 0.55, + "grad_norm": 8.00903778603158, + "learning_rate": 4.442196757452851e-06, + "loss": 0.8712, + "step": 6257 + }, + { + "epoch": 0.55, + "grad_norm": 6.39425742066624, + "learning_rate": 4.440783351182329e-06, + "loss": 0.68, + "step": 6258 + }, + { + "epoch": 0.55, + "grad_norm": 10.271607809909868, + "learning_rate": 4.439369990162689e-06, + "loss": 0.7362, + "step": 6259 + }, + { + "epoch": 0.55, + "grad_norm": 6.98604275227853, + "learning_rate": 
4.437956674508295e-06, + "loss": 0.8974, + "step": 6260 + }, + { + "epoch": 0.55, + "grad_norm": 10.726000906911677, + "learning_rate": 4.436543404333512e-06, + "loss": 0.7526, + "step": 6261 + }, + { + "epoch": 0.55, + "grad_norm": 6.469337367817043, + "learning_rate": 4.435130179752701e-06, + "loss": 0.6277, + "step": 6262 + }, + { + "epoch": 0.55, + "grad_norm": 9.358516583328838, + "learning_rate": 4.433717000880214e-06, + "loss": 0.8011, + "step": 6263 + }, + { + "epoch": 0.55, + "grad_norm": 6.937299496879855, + "learning_rate": 4.432303867830407e-06, + "loss": 0.8644, + "step": 6264 + }, + { + "epoch": 0.55, + "grad_norm": 10.833264856984671, + "learning_rate": 4.430890780717627e-06, + "loss": 0.8502, + "step": 6265 + }, + { + "epoch": 0.55, + "grad_norm": 7.633094226698797, + "learning_rate": 4.429477739656216e-06, + "loss": 0.6947, + "step": 6266 + }, + { + "epoch": 0.55, + "grad_norm": 7.3512963999444905, + "learning_rate": 4.4280647447605196e-06, + "loss": 0.7012, + "step": 6267 + }, + { + "epoch": 0.55, + "grad_norm": 10.557599452466564, + "learning_rate": 4.426651796144873e-06, + "loss": 0.7616, + "step": 6268 + }, + { + "epoch": 0.55, + "grad_norm": 9.117131599589431, + "learning_rate": 4.425238893923607e-06, + "loss": 0.7473, + "step": 6269 + }, + { + "epoch": 0.55, + "grad_norm": 7.282392604112603, + "learning_rate": 4.423826038211056e-06, + "loss": 0.7502, + "step": 6270 + }, + { + "epoch": 0.55, + "grad_norm": 6.958729066456025, + "learning_rate": 4.422413229121541e-06, + "loss": 0.6865, + "step": 6271 + }, + { + "epoch": 0.55, + "grad_norm": 5.952895091006039, + "learning_rate": 4.421000466769389e-06, + "loss": 0.7418, + "step": 6272 + }, + { + "epoch": 0.55, + "grad_norm": 7.363289134976322, + "learning_rate": 4.419587751268915e-06, + "loss": 0.7671, + "step": 6273 + }, + { + "epoch": 0.55, + "grad_norm": 10.803661442006122, + "learning_rate": 4.418175082734435e-06, + "loss": 0.7433, + "step": 6274 + }, + { + "epoch": 0.55, + "grad_norm": 2.7070033591364666, + "learning_rate": 4.416762461280259e-06, + "loss": 0.4776, + "step": 6275 + }, + { + "epoch": 0.55, + "grad_norm": 10.273095190735646, + "learning_rate": 4.4153498870206965e-06, + "loss": 0.7439, + "step": 6276 + }, + { + "epoch": 0.55, + "grad_norm": 25.43367490849225, + "learning_rate": 4.4139373600700464e-06, + "loss": 0.6461, + "step": 6277 + }, + { + "epoch": 0.55, + "grad_norm": 6.688497258481266, + "learning_rate": 4.412524880542611e-06, + "loss": 0.7671, + "step": 6278 + }, + { + "epoch": 0.55, + "grad_norm": 3.2579030083956, + "learning_rate": 4.411112448552686e-06, + "loss": 0.5511, + "step": 6279 + }, + { + "epoch": 0.55, + "grad_norm": 2.9804748216463346, + "learning_rate": 4.40970006421456e-06, + "loss": 0.5351, + "step": 6280 + }, + { + "epoch": 0.55, + "grad_norm": 5.959551992895814, + "learning_rate": 4.408287727642524e-06, + "loss": 0.6875, + "step": 6281 + }, + { + "epoch": 0.55, + "grad_norm": 4.53102015953829, + "learning_rate": 4.4068754389508616e-06, + "loss": 0.6597, + "step": 6282 + }, + { + "epoch": 0.55, + "grad_norm": 9.744682904766206, + "learning_rate": 4.405463198253851e-06, + "loss": 0.7231, + "step": 6283 + }, + { + "epoch": 0.55, + "grad_norm": 6.5855131871313395, + "learning_rate": 4.4040510056657695e-06, + "loss": 0.6096, + "step": 6284 + }, + { + "epoch": 0.55, + "grad_norm": 19.290573953004948, + "learning_rate": 4.40263886130089e-06, + "loss": 0.8676, + "step": 6285 + }, + { + "epoch": 0.55, + "grad_norm": 10.778505835723927, + "learning_rate": 4.401226765273479e-06, + "loss": 
0.9102, + "step": 6286 + }, + { + "epoch": 0.55, + "grad_norm": 8.227032753856877, + "learning_rate": 4.399814717697803e-06, + "loss": 0.6233, + "step": 6287 + }, + { + "epoch": 0.55, + "grad_norm": 7.004038425026213, + "learning_rate": 4.398402718688122e-06, + "loss": 0.7284, + "step": 6288 + }, + { + "epoch": 0.55, + "grad_norm": 6.094141228997116, + "learning_rate": 4.396990768358691e-06, + "loss": 0.8031, + "step": 6289 + }, + { + "epoch": 0.55, + "grad_norm": 10.908248125434941, + "learning_rate": 4.395578866823766e-06, + "loss": 0.8321, + "step": 6290 + }, + { + "epoch": 0.55, + "grad_norm": 8.002983633550514, + "learning_rate": 4.394167014197592e-06, + "loss": 0.6701, + "step": 6291 + }, + { + "epoch": 0.55, + "grad_norm": 8.658201376753402, + "learning_rate": 4.392755210594416e-06, + "loss": 0.8385, + "step": 6292 + }, + { + "epoch": 0.55, + "grad_norm": 18.92611025832367, + "learning_rate": 4.391343456128479e-06, + "loss": 0.7436, + "step": 6293 + }, + { + "epoch": 0.55, + "grad_norm": 8.28909775067995, + "learning_rate": 4.389931750914016e-06, + "loss": 0.684, + "step": 6294 + }, + { + "epoch": 0.55, + "grad_norm": 6.927080733186066, + "learning_rate": 4.3885200950652615e-06, + "loss": 0.7055, + "step": 6295 + }, + { + "epoch": 0.55, + "grad_norm": 7.559495803802068, + "learning_rate": 4.387108488696443e-06, + "loss": 0.7825, + "step": 6296 + }, + { + "epoch": 0.55, + "grad_norm": 2.5781345379351635, + "learning_rate": 4.385696931921786e-06, + "loss": 0.5, + "step": 6297 + }, + { + "epoch": 0.55, + "grad_norm": 10.429696521773108, + "learning_rate": 4.384285424855513e-06, + "loss": 0.7321, + "step": 6298 + }, + { + "epoch": 0.55, + "grad_norm": 15.841486465246662, + "learning_rate": 4.382873967611839e-06, + "loss": 0.8593, + "step": 6299 + }, + { + "epoch": 0.55, + "grad_norm": 5.412699756031543, + "learning_rate": 4.381462560304975e-06, + "loss": 0.7373, + "step": 6300 + }, + { + "epoch": 0.55, + "grad_norm": 8.551430024130918, + "learning_rate": 4.380051203049134e-06, + "loss": 0.7873, + "step": 6301 + }, + { + "epoch": 0.55, + "grad_norm": 6.665665966504426, + "learning_rate": 4.378639895958517e-06, + "loss": 0.7674, + "step": 6302 + }, + { + "epoch": 0.55, + "grad_norm": 4.90721052036454, + "learning_rate": 4.3772286391473266e-06, + "loss": 0.7625, + "step": 6303 + }, + { + "epoch": 0.55, + "grad_norm": 5.029433762569209, + "learning_rate": 4.375817432729759e-06, + "loss": 0.7274, + "step": 6304 + }, + { + "epoch": 0.55, + "grad_norm": 8.858965450506151, + "learning_rate": 4.374406276820006e-06, + "loss": 0.6847, + "step": 6305 + }, + { + "epoch": 0.55, + "grad_norm": 15.741925281599562, + "learning_rate": 4.372995171532256e-06, + "loss": 0.7593, + "step": 6306 + }, + { + "epoch": 0.55, + "grad_norm": 9.458499724336832, + "learning_rate": 4.371584116980695e-06, + "loss": 0.7427, + "step": 6307 + }, + { + "epoch": 0.55, + "grad_norm": 7.442293410172695, + "learning_rate": 4.370173113279501e-06, + "loss": 0.8681, + "step": 6308 + }, + { + "epoch": 0.55, + "grad_norm": 4.6785089983304795, + "learning_rate": 4.368762160542851e-06, + "loss": 0.6362, + "step": 6309 + }, + { + "epoch": 0.55, + "grad_norm": 7.8879985407540945, + "learning_rate": 4.367351258884917e-06, + "loss": 0.7333, + "step": 6310 + }, + { + "epoch": 0.55, + "grad_norm": 7.721918249011638, + "learning_rate": 4.365940408419867e-06, + "loss": 0.7271, + "step": 6311 + }, + { + "epoch": 0.55, + "grad_norm": 7.666675958118943, + "learning_rate": 4.364529609261865e-06, + "loss": 0.8294, + "step": 6312 + }, + { + 
"epoch": 0.55, + "grad_norm": 8.766754152462676, + "learning_rate": 4.363118861525069e-06, + "loss": 0.7002, + "step": 6313 + }, + { + "epoch": 0.55, + "grad_norm": 4.90196979424295, + "learning_rate": 4.361708165323636e-06, + "loss": 0.7019, + "step": 6314 + }, + { + "epoch": 0.55, + "grad_norm": 6.972729291282275, + "learning_rate": 4.360297520771716e-06, + "loss": 0.7718, + "step": 6315 + }, + { + "epoch": 0.55, + "grad_norm": 8.84417036156125, + "learning_rate": 4.358886927983458e-06, + "loss": 0.7179, + "step": 6316 + }, + { + "epoch": 0.55, + "grad_norm": 7.125434615080126, + "learning_rate": 4.357476387073001e-06, + "loss": 0.6865, + "step": 6317 + }, + { + "epoch": 0.55, + "grad_norm": 7.0246689860185905, + "learning_rate": 4.356065898154488e-06, + "loss": 0.7601, + "step": 6318 + }, + { + "epoch": 0.55, + "grad_norm": 6.928541425872322, + "learning_rate": 4.354655461342051e-06, + "loss": 0.7323, + "step": 6319 + }, + { + "epoch": 0.56, + "grad_norm": 5.008916456748003, + "learning_rate": 4.3532450767498195e-06, + "loss": 0.7599, + "step": 6320 + }, + { + "epoch": 0.56, + "grad_norm": 6.203361765710977, + "learning_rate": 4.351834744491921e-06, + "loss": 0.637, + "step": 6321 + }, + { + "epoch": 0.56, + "grad_norm": 6.5479447846352965, + "learning_rate": 4.350424464682478e-06, + "loss": 0.7775, + "step": 6322 + }, + { + "epoch": 0.56, + "grad_norm": 10.42874669436348, + "learning_rate": 4.3490142374356046e-06, + "loss": 0.6226, + "step": 6323 + }, + { + "epoch": 0.56, + "grad_norm": 5.713600827815302, + "learning_rate": 4.347604062865418e-06, + "loss": 0.6805, + "step": 6324 + }, + { + "epoch": 0.56, + "grad_norm": 6.475396768975451, + "learning_rate": 4.3461939410860254e-06, + "loss": 0.6974, + "step": 6325 + }, + { + "epoch": 0.56, + "grad_norm": 5.873756399833181, + "learning_rate": 4.34478387221153e-06, + "loss": 0.8286, + "step": 6326 + }, + { + "epoch": 0.56, + "grad_norm": 8.642255760924666, + "learning_rate": 4.343373856356035e-06, + "loss": 0.599, + "step": 6327 + }, + { + "epoch": 0.56, + "grad_norm": 5.454847933159832, + "learning_rate": 4.341963893633635e-06, + "loss": 0.781, + "step": 6328 + }, + { + "epoch": 0.56, + "grad_norm": 5.381241810525174, + "learning_rate": 4.340553984158422e-06, + "loss": 0.5822, + "step": 6329 + }, + { + "epoch": 0.56, + "grad_norm": 11.135079647876612, + "learning_rate": 4.339144128044485e-06, + "loss": 0.8356, + "step": 6330 + }, + { + "epoch": 0.56, + "grad_norm": 11.637052299256585, + "learning_rate": 4.337734325405903e-06, + "loss": 0.8907, + "step": 6331 + }, + { + "epoch": 0.56, + "grad_norm": 4.792424620798543, + "learning_rate": 4.336324576356761e-06, + "loss": 0.7385, + "step": 6332 + }, + { + "epoch": 0.56, + "grad_norm": 6.356744275125185, + "learning_rate": 4.334914881011128e-06, + "loss": 0.6241, + "step": 6333 + }, + { + "epoch": 0.56, + "grad_norm": 16.656513411785824, + "learning_rate": 4.3335052394830774e-06, + "loss": 0.6818, + "step": 6334 + }, + { + "epoch": 0.56, + "grad_norm": 12.604949621976482, + "learning_rate": 4.332095651886675e-06, + "loss": 0.7148, + "step": 6335 + }, + { + "epoch": 0.56, + "grad_norm": 5.147874170788941, + "learning_rate": 4.33068611833598e-06, + "loss": 0.6766, + "step": 6336 + }, + { + "epoch": 0.56, + "grad_norm": 9.316699844624951, + "learning_rate": 4.329276638945051e-06, + "loss": 0.6611, + "step": 6337 + }, + { + "epoch": 0.56, + "grad_norm": 8.919672567366563, + "learning_rate": 4.327867213827942e-06, + "loss": 0.5732, + "step": 6338 + }, + { + "epoch": 0.56, + "grad_norm": 
11.804311032499516, + "learning_rate": 4.3264578430987e-06, + "loss": 0.994, + "step": 6339 + }, + { + "epoch": 0.56, + "grad_norm": 6.611272424887086, + "learning_rate": 4.3250485268713675e-06, + "loss": 0.6862, + "step": 6340 + }, + { + "epoch": 0.56, + "grad_norm": 9.751789384127882, + "learning_rate": 4.323639265259987e-06, + "loss": 0.7879, + "step": 6341 + }, + { + "epoch": 0.56, + "grad_norm": 2.8558144634107125, + "learning_rate": 4.322230058378591e-06, + "loss": 0.5041, + "step": 6342 + }, + { + "epoch": 0.56, + "grad_norm": 10.603959404215828, + "learning_rate": 4.320820906341211e-06, + "loss": 0.6937, + "step": 6343 + }, + { + "epoch": 0.56, + "grad_norm": 2.3279873801999273, + "learning_rate": 4.319411809261874e-06, + "loss": 0.5473, + "step": 6344 + }, + { + "epoch": 0.56, + "grad_norm": 5.615308719525582, + "learning_rate": 4.318002767254602e-06, + "loss": 0.8299, + "step": 6345 + }, + { + "epoch": 0.56, + "grad_norm": 10.386547834481302, + "learning_rate": 4.31659378043341e-06, + "loss": 0.7175, + "step": 6346 + }, + { + "epoch": 0.56, + "grad_norm": 7.755235291263743, + "learning_rate": 4.315184848912314e-06, + "loss": 0.7107, + "step": 6347 + }, + { + "epoch": 0.56, + "grad_norm": 9.943616126355954, + "learning_rate": 4.3137759728053206e-06, + "loss": 0.665, + "step": 6348 + }, + { + "epoch": 0.56, + "grad_norm": 6.079765594139801, + "learning_rate": 4.312367152226433e-06, + "loss": 0.6659, + "step": 6349 + }, + { + "epoch": 0.56, + "grad_norm": 2.8263296023110906, + "learning_rate": 4.310958387289653e-06, + "loss": 0.5382, + "step": 6350 + }, + { + "epoch": 0.56, + "grad_norm": 2.9513253578432814, + "learning_rate": 4.309549678108973e-06, + "loss": 0.5151, + "step": 6351 + }, + { + "epoch": 0.56, + "grad_norm": 2.2855691016913524, + "learning_rate": 4.308141024798385e-06, + "loss": 0.4405, + "step": 6352 + }, + { + "epoch": 0.56, + "grad_norm": 2.957156600787396, + "learning_rate": 4.306732427471875e-06, + "loss": 0.5267, + "step": 6353 + }, + { + "epoch": 0.56, + "grad_norm": 10.411456779700023, + "learning_rate": 4.305323886243423e-06, + "loss": 0.7437, + "step": 6354 + }, + { + "epoch": 0.56, + "grad_norm": 6.74155420305021, + "learning_rate": 4.303915401227006e-06, + "loss": 0.8446, + "step": 6355 + }, + { + "epoch": 0.56, + "grad_norm": 7.357968064671599, + "learning_rate": 4.302506972536599e-06, + "loss": 0.5807, + "step": 6356 + }, + { + "epoch": 0.56, + "grad_norm": 11.206377407041334, + "learning_rate": 4.301098600286165e-06, + "loss": 0.7427, + "step": 6357 + }, + { + "epoch": 0.56, + "grad_norm": 9.716199102319035, + "learning_rate": 4.299690284589672e-06, + "loss": 0.7316, + "step": 6358 + }, + { + "epoch": 0.56, + "grad_norm": 6.845642383487231, + "learning_rate": 4.298282025561076e-06, + "loss": 0.846, + "step": 6359 + }, + { + "epoch": 0.56, + "grad_norm": 5.791431538382081, + "learning_rate": 4.29687382331433e-06, + "loss": 0.6471, + "step": 6360 + }, + { + "epoch": 0.56, + "grad_norm": 8.027111438176808, + "learning_rate": 4.295465677963385e-06, + "loss": 0.6858, + "step": 6361 + }, + { + "epoch": 0.56, + "grad_norm": 8.904753727714104, + "learning_rate": 4.294057589622186e-06, + "loss": 0.6244, + "step": 6362 + }, + { + "epoch": 0.56, + "grad_norm": 5.8487307149766785, + "learning_rate": 4.2926495584046715e-06, + "loss": 0.7736, + "step": 6363 + }, + { + "epoch": 0.56, + "grad_norm": 16.510961010434762, + "learning_rate": 4.291241584424779e-06, + "loss": 0.9684, + "step": 6364 + }, + { + "epoch": 0.56, + "grad_norm": 7.22133717967114, + "learning_rate": 
4.289833667796438e-06, + "loss": 0.637, + "step": 6365 + }, + { + "epoch": 0.56, + "grad_norm": 6.409601621587951, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.6322, + "step": 6366 + }, + { + "epoch": 0.56, + "grad_norm": 8.592474187713089, + "learning_rate": 4.287018007050113e-06, + "loss": 0.7903, + "step": 6367 + }, + { + "epoch": 0.56, + "grad_norm": 5.70544572795986, + "learning_rate": 4.285610263159969e-06, + "loss": 0.7626, + "step": 6368 + }, + { + "epoch": 0.56, + "grad_norm": 2.618717647719805, + "learning_rate": 4.28420257707705e-06, + "loss": 0.5218, + "step": 6369 + }, + { + "epoch": 0.56, + "grad_norm": 7.462027259948279, + "learning_rate": 4.282794948915271e-06, + "loss": 0.6813, + "step": 6370 + }, + { + "epoch": 0.56, + "grad_norm": 7.303675549456738, + "learning_rate": 4.281387378788531e-06, + "loss": 0.769, + "step": 6371 + }, + { + "epoch": 0.56, + "grad_norm": 5.917517621123481, + "learning_rate": 4.279979866810729e-06, + "loss": 0.715, + "step": 6372 + }, + { + "epoch": 0.56, + "grad_norm": 5.380123384937926, + "learning_rate": 4.278572413095759e-06, + "loss": 0.7707, + "step": 6373 + }, + { + "epoch": 0.56, + "grad_norm": 4.170216098485186, + "learning_rate": 4.277165017757508e-06, + "loss": 0.6197, + "step": 6374 + }, + { + "epoch": 0.56, + "grad_norm": 8.398524856523332, + "learning_rate": 4.275757680909863e-06, + "loss": 0.7772, + "step": 6375 + }, + { + "epoch": 0.56, + "grad_norm": 10.036759690996337, + "learning_rate": 4.274350402666701e-06, + "loss": 0.9243, + "step": 6376 + }, + { + "epoch": 0.56, + "grad_norm": 5.40857876579638, + "learning_rate": 4.2729431831418965e-06, + "loss": 0.7492, + "step": 6377 + }, + { + "epoch": 0.56, + "grad_norm": 6.952598183265659, + "learning_rate": 4.2715360224493214e-06, + "loss": 0.7533, + "step": 6378 + }, + { + "epoch": 0.56, + "grad_norm": 11.599290202345392, + "learning_rate": 4.27012892070284e-06, + "loss": 0.9355, + "step": 6379 + }, + { + "epoch": 0.56, + "grad_norm": 7.329516391521652, + "learning_rate": 4.268721878016311e-06, + "loss": 0.7611, + "step": 6380 + }, + { + "epoch": 0.56, + "grad_norm": 3.448082021719624, + "learning_rate": 4.267314894503591e-06, + "loss": 0.6392, + "step": 6381 + }, + { + "epoch": 0.56, + "grad_norm": 5.48681174833948, + "learning_rate": 4.265907970278532e-06, + "loss": 0.656, + "step": 6382 + }, + { + "epoch": 0.56, + "grad_norm": 4.083668747850375, + "learning_rate": 4.264501105454979e-06, + "loss": 0.5958, + "step": 6383 + }, + { + "epoch": 0.56, + "grad_norm": 6.211194464324452, + "learning_rate": 4.263094300146773e-06, + "loss": 0.8007, + "step": 6384 + }, + { + "epoch": 0.56, + "grad_norm": 5.46179352467872, + "learning_rate": 4.261687554467751e-06, + "loss": 0.7472, + "step": 6385 + }, + { + "epoch": 0.56, + "grad_norm": 7.12396411740272, + "learning_rate": 4.260280868531742e-06, + "loss": 0.6256, + "step": 6386 + }, + { + "epoch": 0.56, + "grad_norm": 8.261921871103612, + "learning_rate": 4.258874242452576e-06, + "loss": 0.8929, + "step": 6387 + }, + { + "epoch": 0.56, + "grad_norm": 7.447461768776576, + "learning_rate": 4.257467676344074e-06, + "loss": 0.8052, + "step": 6388 + }, + { + "epoch": 0.56, + "grad_norm": 4.860198861971528, + "learning_rate": 4.256061170320051e-06, + "loss": 0.8592, + "step": 6389 + }, + { + "epoch": 0.56, + "grad_norm": 6.385665862297629, + "learning_rate": 4.254654724494321e-06, + "loss": 0.6374, + "step": 6390 + }, + { + "epoch": 0.56, + "grad_norm": 6.4052224135230755, + "learning_rate": 4.253248338980691e-06, + "loss": 0.6942, + 
"step": 6391 + }, + { + "epoch": 0.56, + "grad_norm": 6.2859665053541, + "learning_rate": 4.2518420138929645e-06, + "loss": 0.6199, + "step": 6392 + }, + { + "epoch": 0.56, + "grad_norm": 12.135614919435966, + "learning_rate": 4.250435749344936e-06, + "loss": 0.9097, + "step": 6393 + }, + { + "epoch": 0.56, + "grad_norm": 44.96410876846427, + "learning_rate": 4.249029545450401e-06, + "loss": 0.7639, + "step": 6394 + }, + { + "epoch": 0.56, + "grad_norm": 7.882212774803252, + "learning_rate": 4.247623402323146e-06, + "loss": 0.6549, + "step": 6395 + }, + { + "epoch": 0.56, + "grad_norm": 8.011477164697205, + "learning_rate": 4.246217320076953e-06, + "loss": 0.8508, + "step": 6396 + }, + { + "epoch": 0.56, + "grad_norm": 5.200486298295638, + "learning_rate": 4.244811298825601e-06, + "loss": 0.7413, + "step": 6397 + }, + { + "epoch": 0.56, + "grad_norm": 13.830177975894902, + "learning_rate": 4.243405338682863e-06, + "loss": 0.6016, + "step": 6398 + }, + { + "epoch": 0.56, + "grad_norm": 6.393893139413456, + "learning_rate": 4.241999439762506e-06, + "loss": 0.6204, + "step": 6399 + }, + { + "epoch": 0.56, + "grad_norm": 2.353008065888321, + "learning_rate": 4.240593602178293e-06, + "loss": 0.5497, + "step": 6400 + }, + { + "epoch": 0.56, + "grad_norm": 3.6618139455588827, + "learning_rate": 4.2391878260439836e-06, + "loss": 0.5329, + "step": 6401 + }, + { + "epoch": 0.56, + "grad_norm": 6.29113569232284, + "learning_rate": 4.237782111473329e-06, + "loss": 0.7446, + "step": 6402 + }, + { + "epoch": 0.56, + "grad_norm": 7.928349571491594, + "learning_rate": 4.2363764585800775e-06, + "loss": 0.7675, + "step": 6403 + }, + { + "epoch": 0.56, + "grad_norm": 9.110439555361367, + "learning_rate": 4.2349708674779735e-06, + "loss": 0.8402, + "step": 6404 + }, + { + "epoch": 0.56, + "grad_norm": 13.960120235706917, + "learning_rate": 4.233565338280755e-06, + "loss": 0.8213, + "step": 6405 + }, + { + "epoch": 0.56, + "grad_norm": 5.4938597454889155, + "learning_rate": 4.2321598711021536e-06, + "loss": 0.7903, + "step": 6406 + }, + { + "epoch": 0.56, + "grad_norm": 9.474066213642569, + "learning_rate": 4.230754466055898e-06, + "loss": 0.6128, + "step": 6407 + }, + { + "epoch": 0.56, + "grad_norm": 2.9201193041884808, + "learning_rate": 4.2293491232557115e-06, + "loss": 0.5413, + "step": 6408 + }, + { + "epoch": 0.56, + "grad_norm": 7.093471036380087, + "learning_rate": 4.227943842815313e-06, + "loss": 0.6968, + "step": 6409 + }, + { + "epoch": 0.56, + "grad_norm": 10.908877597076033, + "learning_rate": 4.226538624848413e-06, + "loss": 0.791, + "step": 6410 + }, + { + "epoch": 0.56, + "grad_norm": 8.966614760388193, + "learning_rate": 4.2251334694687215e-06, + "loss": 0.6804, + "step": 6411 + }, + { + "epoch": 0.56, + "grad_norm": 12.596848560313695, + "learning_rate": 4.2237283767899416e-06, + "loss": 0.7627, + "step": 6412 + }, + { + "epoch": 0.56, + "grad_norm": 7.259617144268273, + "learning_rate": 4.22232334692577e-06, + "loss": 0.7889, + "step": 6413 + }, + { + "epoch": 0.56, + "grad_norm": 5.456284981173055, + "learning_rate": 4.220918379989898e-06, + "loss": 0.829, + "step": 6414 + }, + { + "epoch": 0.56, + "grad_norm": 5.589778111580373, + "learning_rate": 4.219513476096016e-06, + "loss": 0.7787, + "step": 6415 + }, + { + "epoch": 0.56, + "grad_norm": 7.826080634199132, + "learning_rate": 4.218108635357806e-06, + "loss": 0.7896, + "step": 6416 + }, + { + "epoch": 0.56, + "grad_norm": 10.789099103657634, + "learning_rate": 4.216703857888942e-06, + "loss": 0.6906, + "step": 6417 + }, + { + "epoch": 
0.56, + "grad_norm": 7.2406857459487926, + "learning_rate": 4.215299143803101e-06, + "loss": 0.7272, + "step": 6418 + }, + { + "epoch": 0.56, + "grad_norm": 9.1978130662193, + "learning_rate": 4.213894493213948e-06, + "loss": 0.7798, + "step": 6419 + }, + { + "epoch": 0.56, + "grad_norm": 5.89463973858116, + "learning_rate": 4.2124899062351435e-06, + "loss": 0.7314, + "step": 6420 + }, + { + "epoch": 0.56, + "grad_norm": 7.923318459410421, + "learning_rate": 4.211085382980347e-06, + "loss": 0.8695, + "step": 6421 + }, + { + "epoch": 0.56, + "grad_norm": 9.031342726163821, + "learning_rate": 4.20968092356321e-06, + "loss": 0.709, + "step": 6422 + }, + { + "epoch": 0.56, + "grad_norm": 5.415715172669707, + "learning_rate": 4.208276528097376e-06, + "loss": 0.7231, + "step": 6423 + }, + { + "epoch": 0.56, + "grad_norm": 5.758001490650504, + "learning_rate": 4.20687219669649e-06, + "loss": 0.707, + "step": 6424 + }, + { + "epoch": 0.56, + "grad_norm": 8.688805709548701, + "learning_rate": 4.205467929474186e-06, + "loss": 0.7729, + "step": 6425 + }, + { + "epoch": 0.56, + "grad_norm": 8.47778236452753, + "learning_rate": 4.204063726544094e-06, + "loss": 0.7314, + "step": 6426 + }, + { + "epoch": 0.56, + "grad_norm": 6.214500976866368, + "learning_rate": 4.202659588019843e-06, + "loss": 0.8741, + "step": 6427 + }, + { + "epoch": 0.56, + "grad_norm": 6.290202436597755, + "learning_rate": 4.201255514015051e-06, + "loss": 0.7833, + "step": 6428 + }, + { + "epoch": 0.56, + "grad_norm": 9.40191399377486, + "learning_rate": 4.1998515046433356e-06, + "loss": 0.8272, + "step": 6429 + }, + { + "epoch": 0.56, + "grad_norm": 6.902222919966622, + "learning_rate": 4.1984475600183044e-06, + "loss": 0.8574, + "step": 6430 + }, + { + "epoch": 0.56, + "grad_norm": 16.289258429577405, + "learning_rate": 4.1970436802535626e-06, + "loss": 0.9116, + "step": 6431 + }, + { + "epoch": 0.56, + "grad_norm": 6.845528377471543, + "learning_rate": 4.1956398654627114e-06, + "loss": 0.6785, + "step": 6432 + }, + { + "epoch": 0.56, + "grad_norm": 10.362588728204212, + "learning_rate": 4.194236115759345e-06, + "loss": 0.5275, + "step": 6433 + }, + { + "epoch": 0.57, + "grad_norm": 10.997547077395616, + "learning_rate": 4.19283243125705e-06, + "loss": 0.7646, + "step": 6434 + }, + { + "epoch": 0.57, + "grad_norm": 11.576346514460074, + "learning_rate": 4.191428812069414e-06, + "loss": 0.8147, + "step": 6435 + }, + { + "epoch": 0.57, + "grad_norm": 8.337625856445355, + "learning_rate": 4.190025258310013e-06, + "loss": 1.0039, + "step": 6436 + }, + { + "epoch": 0.57, + "grad_norm": 6.092395734991257, + "learning_rate": 4.188621770092419e-06, + "loss": 0.7567, + "step": 6437 + }, + { + "epoch": 0.57, + "grad_norm": 4.894579157725733, + "learning_rate": 4.187218347530204e-06, + "loss": 0.6901, + "step": 6438 + }, + { + "epoch": 0.57, + "grad_norm": 22.92125000927421, + "learning_rate": 4.185814990736927e-06, + "loss": 0.8101, + "step": 6439 + }, + { + "epoch": 0.57, + "grad_norm": 2.0920454784479516, + "learning_rate": 4.184411699826146e-06, + "loss": 0.4341, + "step": 6440 + }, + { + "epoch": 0.57, + "grad_norm": 3.1328319410321557, + "learning_rate": 4.1830084749114155e-06, + "loss": 0.5444, + "step": 6441 + }, + { + "epoch": 0.57, + "grad_norm": 13.260197779504244, + "learning_rate": 4.18160531610628e-06, + "loss": 0.7071, + "step": 6442 + }, + { + "epoch": 0.57, + "grad_norm": 8.201238278160334, + "learning_rate": 4.18020222352428e-06, + "loss": 0.7888, + "step": 6443 + }, + { + "epoch": 0.57, + "grad_norm": 7.29448929204611, + 
"learning_rate": 4.178799197278953e-06, + "loss": 0.7498, + "step": 6444 + }, + { + "epoch": 0.57, + "grad_norm": 4.991654428268279, + "learning_rate": 4.177396237483828e-06, + "loss": 0.7797, + "step": 6445 + }, + { + "epoch": 0.57, + "grad_norm": 6.98126599292528, + "learning_rate": 4.175993344252432e-06, + "loss": 0.8306, + "step": 6446 + }, + { + "epoch": 0.57, + "grad_norm": 6.949302420030483, + "learning_rate": 4.174590517698284e-06, + "loss": 0.6737, + "step": 6447 + }, + { + "epoch": 0.57, + "grad_norm": 6.405998877168834, + "learning_rate": 4.173187757934897e-06, + "loss": 0.6011, + "step": 6448 + }, + { + "epoch": 0.57, + "grad_norm": 16.181290745764354, + "learning_rate": 4.171785065075783e-06, + "loss": 0.8009, + "step": 6449 + }, + { + "epoch": 0.57, + "grad_norm": 7.1776485855916095, + "learning_rate": 4.170382439234443e-06, + "loss": 0.7597, + "step": 6450 + }, + { + "epoch": 0.57, + "grad_norm": 9.012752744856265, + "learning_rate": 4.168979880524376e-06, + "loss": 0.7117, + "step": 6451 + }, + { + "epoch": 0.57, + "grad_norm": 12.769973418410505, + "learning_rate": 4.167577389059075e-06, + "loss": 0.8514, + "step": 6452 + }, + { + "epoch": 0.57, + "grad_norm": 6.580104025329665, + "learning_rate": 4.166174964952027e-06, + "loss": 0.8157, + "step": 6453 + }, + { + "epoch": 0.57, + "grad_norm": 3.1563940319669976, + "learning_rate": 4.164772608316713e-06, + "loss": 0.502, + "step": 6454 + }, + { + "epoch": 0.57, + "grad_norm": 13.776810493170387, + "learning_rate": 4.163370319266612e-06, + "loss": 0.7096, + "step": 6455 + }, + { + "epoch": 0.57, + "grad_norm": 7.076853473637085, + "learning_rate": 4.161968097915192e-06, + "loss": 0.8442, + "step": 6456 + }, + { + "epoch": 0.57, + "grad_norm": 9.196035522905655, + "learning_rate": 4.16056594437592e-06, + "loss": 1.0178, + "step": 6457 + }, + { + "epoch": 0.57, + "grad_norm": 25.916205230136956, + "learning_rate": 4.159163858762255e-06, + "loss": 0.7984, + "step": 6458 + }, + { + "epoch": 0.57, + "grad_norm": 9.201343760221722, + "learning_rate": 4.157761841187652e-06, + "loss": 0.7274, + "step": 6459 + }, + { + "epoch": 0.57, + "grad_norm": 4.682397000284535, + "learning_rate": 4.156359891765559e-06, + "loss": 0.7914, + "step": 6460 + }, + { + "epoch": 0.57, + "grad_norm": 2.4733159499102446, + "learning_rate": 4.1549580106094215e-06, + "loss": 0.4489, + "step": 6461 + }, + { + "epoch": 0.57, + "grad_norm": 5.773679406329175, + "learning_rate": 4.153556197832676e-06, + "loss": 0.7011, + "step": 6462 + }, + { + "epoch": 0.57, + "grad_norm": 2.466528578580158, + "learning_rate": 4.152154453548754e-06, + "loss": 0.4866, + "step": 6463 + }, + { + "epoch": 0.57, + "grad_norm": 25.92563134654225, + "learning_rate": 4.150752777871085e-06, + "loss": 0.7595, + "step": 6464 + }, + { + "epoch": 0.57, + "grad_norm": 9.95718953393872, + "learning_rate": 4.149351170913087e-06, + "loss": 0.7294, + "step": 6465 + }, + { + "epoch": 0.57, + "grad_norm": 6.226191811550551, + "learning_rate": 4.147949632788177e-06, + "loss": 0.7994, + "step": 6466 + }, + { + "epoch": 0.57, + "grad_norm": 8.356292975820429, + "learning_rate": 4.146548163609767e-06, + "loss": 0.6967, + "step": 6467 + }, + { + "epoch": 0.57, + "grad_norm": 6.186430193081568, + "learning_rate": 4.145146763491258e-06, + "loss": 0.7455, + "step": 6468 + }, + { + "epoch": 0.57, + "grad_norm": 8.651372820098421, + "learning_rate": 4.143745432546053e-06, + "loss": 0.7916, + "step": 6469 + }, + { + "epoch": 0.57, + "grad_norm": 6.819412460218001, + "learning_rate": 4.142344170887542e-06, 
+ "loss": 0.8009, + "step": 6470 + }, + { + "epoch": 0.57, + "grad_norm": 5.225577669848662, + "learning_rate": 4.140942978629114e-06, + "loss": 0.646, + "step": 6471 + }, + { + "epoch": 0.57, + "grad_norm": 3.0724142817930193, + "learning_rate": 4.139541855884152e-06, + "loss": 0.5379, + "step": 6472 + }, + { + "epoch": 0.57, + "grad_norm": 6.000176348768616, + "learning_rate": 4.138140802766032e-06, + "loss": 0.7512, + "step": 6473 + }, + { + "epoch": 0.57, + "grad_norm": 7.256437517069027, + "learning_rate": 4.1367398193881235e-06, + "loss": 0.7528, + "step": 6474 + }, + { + "epoch": 0.57, + "grad_norm": 12.726348450775964, + "learning_rate": 4.135338905863792e-06, + "loss": 0.7836, + "step": 6475 + }, + { + "epoch": 0.57, + "grad_norm": 7.037229601576789, + "learning_rate": 4.1339380623064e-06, + "loss": 0.7839, + "step": 6476 + }, + { + "epoch": 0.57, + "grad_norm": 5.5671216941840775, + "learning_rate": 4.132537288829299e-06, + "loss": 0.6879, + "step": 6477 + }, + { + "epoch": 0.57, + "grad_norm": 9.918583201047355, + "learning_rate": 4.1311365855458375e-06, + "loss": 0.8202, + "step": 6478 + }, + { + "epoch": 0.57, + "grad_norm": 4.513830153951276, + "learning_rate": 4.129735952569358e-06, + "loss": 0.7447, + "step": 6479 + }, + { + "epoch": 0.57, + "grad_norm": 5.392854338413789, + "learning_rate": 4.1283353900131965e-06, + "loss": 0.858, + "step": 6480 + }, + { + "epoch": 0.57, + "grad_norm": 3.552109407868691, + "learning_rate": 4.126934897990687e-06, + "loss": 0.8012, + "step": 6481 + }, + { + "epoch": 0.57, + "grad_norm": 8.568831033573673, + "learning_rate": 4.125534476615153e-06, + "loss": 0.6528, + "step": 6482 + }, + { + "epoch": 0.57, + "grad_norm": 5.205384240753338, + "learning_rate": 4.124134125999913e-06, + "loss": 0.7086, + "step": 6483 + }, + { + "epoch": 0.57, + "grad_norm": 5.09588626731463, + "learning_rate": 4.122733846258285e-06, + "loss": 0.7216, + "step": 6484 + }, + { + "epoch": 0.57, + "grad_norm": 6.683914443552322, + "learning_rate": 4.1213336375035736e-06, + "loss": 0.7612, + "step": 6485 + }, + { + "epoch": 0.57, + "grad_norm": 22.382817648116475, + "learning_rate": 4.1199334998490826e-06, + "loss": 0.7654, + "step": 6486 + }, + { + "epoch": 0.57, + "grad_norm": 5.681740719371615, + "learning_rate": 4.11853343340811e-06, + "loss": 0.8214, + "step": 6487 + }, + { + "epoch": 0.57, + "grad_norm": 4.727543556477857, + "learning_rate": 4.117133438293943e-06, + "loss": 0.8043, + "step": 6488 + }, + { + "epoch": 0.57, + "grad_norm": 4.785924385196959, + "learning_rate": 4.115733514619872e-06, + "loss": 0.7302, + "step": 6489 + }, + { + "epoch": 0.57, + "grad_norm": 6.545736825408286, + "learning_rate": 4.1143336624991735e-06, + "loss": 0.788, + "step": 6490 + }, + { + "epoch": 0.57, + "grad_norm": 7.786663456284071, + "learning_rate": 4.112933882045121e-06, + "loss": 0.8691, + "step": 6491 + }, + { + "epoch": 0.57, + "grad_norm": 13.182110094697416, + "learning_rate": 4.111534173370985e-06, + "loss": 0.7598, + "step": 6492 + }, + { + "epoch": 0.57, + "grad_norm": 12.029620629458377, + "learning_rate": 4.110134536590025e-06, + "loss": 0.716, + "step": 6493 + }, + { + "epoch": 0.57, + "grad_norm": 5.145473011751791, + "learning_rate": 4.108734971815497e-06, + "loss": 0.7403, + "step": 6494 + }, + { + "epoch": 0.57, + "grad_norm": 8.97140757142112, + "learning_rate": 4.107335479160654e-06, + "loss": 0.6068, + "step": 6495 + }, + { + "epoch": 0.57, + "grad_norm": 6.888170838222825, + "learning_rate": 4.105936058738739e-06, + "loss": 0.7558, + "step": 6496 + }, + { 
+ "epoch": 0.57, + "grad_norm": 6.048164492588874, + "learning_rate": 4.1045367106629894e-06, + "loss": 0.7393, + "step": 6497 + }, + { + "epoch": 0.57, + "grad_norm": 9.746443322856473, + "learning_rate": 4.103137435046641e-06, + "loss": 0.8141, + "step": 6498 + }, + { + "epoch": 0.57, + "grad_norm": 8.880340650637448, + "learning_rate": 4.1017382320029195e-06, + "loss": 0.7337, + "step": 6499 + }, + { + "epoch": 0.57, + "grad_norm": 10.301651424539122, + "learning_rate": 4.100339101645046e-06, + "loss": 0.5701, + "step": 6500 + }, + { + "epoch": 0.57, + "grad_norm": 9.955269933323844, + "learning_rate": 4.098940044086236e-06, + "loss": 0.7948, + "step": 6501 + }, + { + "epoch": 0.57, + "grad_norm": 5.821754993945669, + "learning_rate": 4.097541059439698e-06, + "loss": 0.746, + "step": 6502 + }, + { + "epoch": 0.57, + "grad_norm": 20.290572860470984, + "learning_rate": 4.096142147818637e-06, + "loss": 0.7716, + "step": 6503 + }, + { + "epoch": 0.57, + "grad_norm": 5.678926841018821, + "learning_rate": 4.0947433093362495e-06, + "loss": 0.7735, + "step": 6504 + }, + { + "epoch": 0.57, + "grad_norm": 5.800542504001237, + "learning_rate": 4.093344544105728e-06, + "loss": 0.748, + "step": 6505 + }, + { + "epoch": 0.57, + "grad_norm": 15.643335370588622, + "learning_rate": 4.091945852240258e-06, + "loss": 0.7729, + "step": 6506 + }, + { + "epoch": 0.57, + "grad_norm": 6.102755037014777, + "learning_rate": 4.090547233853019e-06, + "loss": 0.8353, + "step": 6507 + }, + { + "epoch": 0.57, + "grad_norm": 2.2547441308987226, + "learning_rate": 4.089148689057184e-06, + "loss": 0.5181, + "step": 6508 + }, + { + "epoch": 0.57, + "grad_norm": 8.132293825720367, + "learning_rate": 4.087750217965923e-06, + "loss": 0.633, + "step": 6509 + }, + { + "epoch": 0.57, + "grad_norm": 3.7633014326442984, + "learning_rate": 4.086351820692397e-06, + "loss": 0.484, + "step": 6510 + }, + { + "epoch": 0.57, + "grad_norm": 11.553319820574357, + "learning_rate": 4.08495349734976e-06, + "loss": 0.7349, + "step": 6511 + }, + { + "epoch": 0.57, + "grad_norm": 6.138205196950826, + "learning_rate": 4.083555248051167e-06, + "loss": 0.6991, + "step": 6512 + }, + { + "epoch": 0.57, + "grad_norm": 5.569562644394107, + "learning_rate": 4.082157072909757e-06, + "loss": 0.6095, + "step": 6513 + }, + { + "epoch": 0.57, + "grad_norm": 9.065708502341307, + "learning_rate": 4.08075897203867e-06, + "loss": 0.6197, + "step": 6514 + }, + { + "epoch": 0.57, + "grad_norm": 15.349707324363113, + "learning_rate": 4.0793609455510385e-06, + "loss": 0.7254, + "step": 6515 + }, + { + "epoch": 0.57, + "grad_norm": 10.31648175646792, + "learning_rate": 4.077962993559988e-06, + "loss": 0.886, + "step": 6516 + }, + { + "epoch": 0.57, + "grad_norm": 6.933747526518103, + "learning_rate": 4.076565116178638e-06, + "loss": 0.7678, + "step": 6517 + }, + { + "epoch": 0.57, + "grad_norm": 6.832543202341102, + "learning_rate": 4.075167313520103e-06, + "loss": 0.798, + "step": 6518 + }, + { + "epoch": 0.57, + "grad_norm": 2.2839939968694036, + "learning_rate": 4.07376958569749e-06, + "loss": 0.486, + "step": 6519 + }, + { + "epoch": 0.57, + "grad_norm": 3.4886088838909863, + "learning_rate": 4.072371932823902e-06, + "loss": 0.5879, + "step": 6520 + }, + { + "epoch": 0.57, + "grad_norm": 7.035889417042133, + "learning_rate": 4.0709743550124335e-06, + "loss": 0.7348, + "step": 6521 + }, + { + "epoch": 0.57, + "grad_norm": 7.717729306999016, + "learning_rate": 4.069576852376176e-06, + "loss": 0.8109, + "step": 6522 + }, + { + "epoch": 0.57, + "grad_norm": 
8.05712986294733, + "learning_rate": 4.06817942502821e-06, + "loss": 0.8391, + "step": 6523 + }, + { + "epoch": 0.57, + "grad_norm": 5.234630563835904, + "learning_rate": 4.066782073081616e-06, + "loss": 0.9086, + "step": 6524 + }, + { + "epoch": 0.57, + "grad_norm": 8.687913716868211, + "learning_rate": 4.0653847966494655e-06, + "loss": 0.8742, + "step": 6525 + }, + { + "epoch": 0.57, + "grad_norm": 6.652521657726818, + "learning_rate": 4.063987595844821e-06, + "loss": 0.8144, + "step": 6526 + }, + { + "epoch": 0.57, + "grad_norm": 13.85661291816132, + "learning_rate": 4.062590470780744e-06, + "loss": 0.7932, + "step": 6527 + }, + { + "epoch": 0.57, + "grad_norm": 6.519127283505367, + "learning_rate": 4.061193421570285e-06, + "loss": 0.743, + "step": 6528 + }, + { + "epoch": 0.57, + "grad_norm": 6.653318786022925, + "learning_rate": 4.059796448326495e-06, + "loss": 0.7828, + "step": 6529 + }, + { + "epoch": 0.57, + "grad_norm": 6.24463842458365, + "learning_rate": 4.058399551162412e-06, + "loss": 0.7812, + "step": 6530 + }, + { + "epoch": 0.57, + "grad_norm": 7.724567698853265, + "learning_rate": 4.05700273019107e-06, + "loss": 0.6213, + "step": 6531 + }, + { + "epoch": 0.57, + "grad_norm": 8.999191792069475, + "learning_rate": 4.055605985525501e-06, + "loss": 0.808, + "step": 6532 + }, + { + "epoch": 0.57, + "grad_norm": 5.83322873128453, + "learning_rate": 4.054209317278722e-06, + "loss": 0.7326, + "step": 6533 + }, + { + "epoch": 0.57, + "grad_norm": 9.5014585182517, + "learning_rate": 4.052812725563752e-06, + "loss": 0.7833, + "step": 6534 + }, + { + "epoch": 0.57, + "grad_norm": 5.34197731777428, + "learning_rate": 4.0514162104936025e-06, + "loss": 0.813, + "step": 6535 + }, + { + "epoch": 0.57, + "grad_norm": 6.303553618338464, + "learning_rate": 4.050019772181275e-06, + "loss": 0.8873, + "step": 6536 + }, + { + "epoch": 0.57, + "grad_norm": 9.080978794631665, + "learning_rate": 4.048623410739765e-06, + "loss": 0.7292, + "step": 6537 + }, + { + "epoch": 0.57, + "grad_norm": 8.959946874988104, + "learning_rate": 4.047227126282068e-06, + "loss": 0.7298, + "step": 6538 + }, + { + "epoch": 0.57, + "grad_norm": 6.794086036324654, + "learning_rate": 4.045830918921168e-06, + "loss": 0.7051, + "step": 6539 + }, + { + "epoch": 0.57, + "grad_norm": 5.143491587485554, + "learning_rate": 4.044434788770041e-06, + "loss": 0.737, + "step": 6540 + }, + { + "epoch": 0.57, + "grad_norm": 7.028431367928465, + "learning_rate": 4.043038735941662e-06, + "loss": 0.7756, + "step": 6541 + }, + { + "epoch": 0.57, + "grad_norm": 5.00351977930131, + "learning_rate": 4.041642760548997e-06, + "loss": 0.7377, + "step": 6542 + }, + { + "epoch": 0.57, + "grad_norm": 5.879434342224708, + "learning_rate": 4.040246862705005e-06, + "loss": 0.7582, + "step": 6543 + }, + { + "epoch": 0.57, + "grad_norm": 6.282851189115786, + "learning_rate": 4.038851042522641e-06, + "loss": 0.7148, + "step": 6544 + }, + { + "epoch": 0.57, + "grad_norm": 4.899057286014473, + "learning_rate": 4.037455300114852e-06, + "loss": 0.5919, + "step": 6545 + }, + { + "epoch": 0.57, + "grad_norm": 7.444870196721369, + "learning_rate": 4.036059635594578e-06, + "loss": 0.6962, + "step": 6546 + }, + { + "epoch": 0.57, + "grad_norm": 14.11130144653717, + "learning_rate": 4.034664049074756e-06, + "loss": 0.7687, + "step": 6547 + }, + { + "epoch": 0.58, + "grad_norm": 7.384997949375411, + "learning_rate": 4.033268540668312e-06, + "loss": 0.7644, + "step": 6548 + }, + { + "epoch": 0.58, + "grad_norm": 7.556692420051202, + "learning_rate": 
4.031873110488171e-06, + "loss": 0.7913, + "step": 6549 + }, + { + "epoch": 0.58, + "grad_norm": 4.940283792604664, + "learning_rate": 4.030477758647247e-06, + "loss": 0.7114, + "step": 6550 + }, + { + "epoch": 0.58, + "grad_norm": 12.249097468098853, + "learning_rate": 4.02908248525845e-06, + "loss": 0.7405, + "step": 6551 + }, + { + "epoch": 0.58, + "grad_norm": 8.113495445067056, + "learning_rate": 4.027687290434683e-06, + "loss": 0.7083, + "step": 6552 + }, + { + "epoch": 0.58, + "grad_norm": 5.339181600128008, + "learning_rate": 4.026292174288844e-06, + "loss": 0.6442, + "step": 6553 + }, + { + "epoch": 0.58, + "grad_norm": 6.119632433007391, + "learning_rate": 4.024897136933822e-06, + "loss": 0.8129, + "step": 6554 + }, + { + "epoch": 0.58, + "grad_norm": 5.867542750632004, + "learning_rate": 4.023502178482499e-06, + "loss": 0.9167, + "step": 6555 + }, + { + "epoch": 0.58, + "grad_norm": 5.962567062119209, + "learning_rate": 4.022107299047759e-06, + "loss": 0.6855, + "step": 6556 + }, + { + "epoch": 0.58, + "grad_norm": 7.4394165923351965, + "learning_rate": 4.020712498742469e-06, + "loss": 0.6887, + "step": 6557 + }, + { + "epoch": 0.58, + "grad_norm": 8.662409299736616, + "learning_rate": 4.0193177776794934e-06, + "loss": 0.8526, + "step": 6558 + }, + { + "epoch": 0.58, + "grad_norm": 6.475377589701196, + "learning_rate": 4.017923135971693e-06, + "loss": 0.8968, + "step": 6559 + }, + { + "epoch": 0.58, + "grad_norm": 7.711135680425273, + "learning_rate": 4.016528573731919e-06, + "loss": 0.8281, + "step": 6560 + }, + { + "epoch": 0.58, + "grad_norm": 7.778480210847887, + "learning_rate": 4.015134091073016e-06, + "loss": 0.8388, + "step": 6561 + }, + { + "epoch": 0.58, + "grad_norm": 7.386618709915272, + "learning_rate": 4.013739688107826e-06, + "loss": 0.7199, + "step": 6562 + }, + { + "epoch": 0.58, + "grad_norm": 6.405310261222673, + "learning_rate": 4.012345364949179e-06, + "loss": 0.733, + "step": 6563 + }, + { + "epoch": 0.58, + "grad_norm": 5.784089501877826, + "learning_rate": 4.010951121709901e-06, + "loss": 0.7263, + "step": 6564 + }, + { + "epoch": 0.58, + "grad_norm": 5.582295463244022, + "learning_rate": 4.0095569585028145e-06, + "loss": 0.7978, + "step": 6565 + }, + { + "epoch": 0.58, + "grad_norm": 5.344801085382627, + "learning_rate": 4.008162875440731e-06, + "loss": 0.6922, + "step": 6566 + }, + { + "epoch": 0.58, + "grad_norm": 13.733780570080988, + "learning_rate": 4.006768872636456e-06, + "loss": 0.7304, + "step": 6567 + }, + { + "epoch": 0.58, + "grad_norm": 8.868575147807707, + "learning_rate": 4.005374950202795e-06, + "loss": 0.7117, + "step": 6568 + }, + { + "epoch": 0.58, + "grad_norm": 14.251027732828051, + "learning_rate": 4.003981108252537e-06, + "loss": 0.6554, + "step": 6569 + }, + { + "epoch": 0.58, + "grad_norm": 7.515082019512403, + "learning_rate": 4.00258734689847e-06, + "loss": 0.7767, + "step": 6570 + }, + { + "epoch": 0.58, + "grad_norm": 7.063584157034518, + "learning_rate": 4.001193666253377e-06, + "loss": 0.72, + "step": 6571 + }, + { + "epoch": 0.58, + "grad_norm": 7.097731823134982, + "learning_rate": 3.9998000664300296e-06, + "loss": 0.714, + "step": 6572 + }, + { + "epoch": 0.58, + "grad_norm": 4.763151350450716, + "learning_rate": 3.998406547541198e-06, + "loss": 0.7089, + "step": 6573 + }, + { + "epoch": 0.58, + "grad_norm": 7.319552312688221, + "learning_rate": 3.997013109699643e-06, + "loss": 0.848, + "step": 6574 + }, + { + "epoch": 0.58, + "grad_norm": 11.303689842904248, + "learning_rate": 3.995619753018118e-06, + "loss": 0.7826, + 
"step": 6575 + }, + { + "epoch": 0.58, + "grad_norm": 10.638968949421644, + "learning_rate": 3.994226477609373e-06, + "loss": 0.7198, + "step": 6576 + }, + { + "epoch": 0.58, + "grad_norm": 9.557460556039281, + "learning_rate": 3.9928332835861475e-06, + "loss": 0.5771, + "step": 6577 + }, + { + "epoch": 0.58, + "grad_norm": 2.639841537579286, + "learning_rate": 3.9914401710611775e-06, + "loss": 0.4493, + "step": 6578 + }, + { + "epoch": 0.58, + "grad_norm": 7.376399119589039, + "learning_rate": 3.990047140147192e-06, + "loss": 0.7099, + "step": 6579 + }, + { + "epoch": 0.58, + "grad_norm": 8.049823319134449, + "learning_rate": 3.9886541909569106e-06, + "loss": 0.885, + "step": 6580 + }, + { + "epoch": 0.58, + "grad_norm": 5.734450484097139, + "learning_rate": 3.98726132360305e-06, + "loss": 0.5565, + "step": 6581 + }, + { + "epoch": 0.58, + "grad_norm": 8.423728105960153, + "learning_rate": 3.9858685381983195e-06, + "loss": 0.7455, + "step": 6582 + }, + { + "epoch": 0.58, + "grad_norm": 17.09889831949524, + "learning_rate": 3.984475834855419e-06, + "loss": 0.7845, + "step": 6583 + }, + { + "epoch": 0.58, + "grad_norm": 7.372473703448648, + "learning_rate": 3.9830832136870445e-06, + "loss": 0.7719, + "step": 6584 + }, + { + "epoch": 0.58, + "grad_norm": 9.709966795316504, + "learning_rate": 3.981690674805886e-06, + "loss": 0.701, + "step": 6585 + }, + { + "epoch": 0.58, + "grad_norm": 4.499417895656705, + "learning_rate": 3.9802982183246245e-06, + "loss": 0.6528, + "step": 6586 + }, + { + "epoch": 0.58, + "grad_norm": 7.643301074044195, + "learning_rate": 3.978905844355932e-06, + "loss": 0.7574, + "step": 6587 + }, + { + "epoch": 0.58, + "grad_norm": 6.3322374240600166, + "learning_rate": 3.977513553012483e-06, + "loss": 0.7514, + "step": 6588 + }, + { + "epoch": 0.58, + "grad_norm": 6.7751508760774835, + "learning_rate": 3.976121344406936e-06, + "loss": 0.7538, + "step": 6589 + }, + { + "epoch": 0.58, + "grad_norm": 4.823214267673554, + "learning_rate": 3.974729218651946e-06, + "loss": 0.7366, + "step": 6590 + }, + { + "epoch": 0.58, + "grad_norm": 7.960388752313103, + "learning_rate": 3.973337175860162e-06, + "loss": 0.833, + "step": 6591 + }, + { + "epoch": 0.58, + "grad_norm": 3.434896927078548, + "learning_rate": 3.9719452161442266e-06, + "loss": 0.6086, + "step": 6592 + }, + { + "epoch": 0.58, + "grad_norm": 2.2915581365237956, + "learning_rate": 3.970553339616774e-06, + "loss": 0.5305, + "step": 6593 + }, + { + "epoch": 0.58, + "grad_norm": 7.34943203731213, + "learning_rate": 3.969161546390433e-06, + "loss": 0.7356, + "step": 6594 + }, + { + "epoch": 0.58, + "grad_norm": 8.046073319412448, + "learning_rate": 3.9677698365778245e-06, + "loss": 0.7266, + "step": 6595 + }, + { + "epoch": 0.58, + "grad_norm": 5.108175790398677, + "learning_rate": 3.966378210291564e-06, + "loss": 0.6907, + "step": 6596 + }, + { + "epoch": 0.58, + "grad_norm": 9.528784272110332, + "learning_rate": 3.96498666764426e-06, + "loss": 0.7554, + "step": 6597 + }, + { + "epoch": 0.58, + "grad_norm": 6.314442232402435, + "learning_rate": 3.963595208748512e-06, + "loss": 0.7727, + "step": 6598 + }, + { + "epoch": 0.58, + "grad_norm": 5.888783584501519, + "learning_rate": 3.962203833716917e-06, + "loss": 0.7142, + "step": 6599 + }, + { + "epoch": 0.58, + "grad_norm": 6.330832510126669, + "learning_rate": 3.960812542662061e-06, + "loss": 0.6876, + "step": 6600 + }, + { + "epoch": 0.58, + "grad_norm": 7.6384040199847965, + "learning_rate": 3.959421335696524e-06, + "loss": 0.7847, + "step": 6601 + }, + { + "epoch": 
0.58, + "grad_norm": 5.413744810435086, + "learning_rate": 3.958030212932884e-06, + "loss": 0.8091, + "step": 6602 + }, + { + "epoch": 0.58, + "grad_norm": 3.2591696226015294, + "learning_rate": 3.956639174483705e-06, + "loss": 0.5946, + "step": 6603 + }, + { + "epoch": 0.58, + "grad_norm": 5.9475575829546035, + "learning_rate": 3.955248220461548e-06, + "loss": 0.7707, + "step": 6604 + }, + { + "epoch": 0.58, + "grad_norm": 6.525567954608805, + "learning_rate": 3.953857350978968e-06, + "loss": 0.817, + "step": 6605 + }, + { + "epoch": 0.58, + "grad_norm": 6.652633365639072, + "learning_rate": 3.95246656614851e-06, + "loss": 0.7038, + "step": 6606 + }, + { + "epoch": 0.58, + "grad_norm": 11.009446595726745, + "learning_rate": 3.951075866082715e-06, + "loss": 0.7424, + "step": 6607 + }, + { + "epoch": 0.58, + "grad_norm": 6.519243555249348, + "learning_rate": 3.949685250894117e-06, + "loss": 0.7921, + "step": 6608 + }, + { + "epoch": 0.58, + "grad_norm": 15.444514584792222, + "learning_rate": 3.948294720695241e-06, + "loss": 0.789, + "step": 6609 + }, + { + "epoch": 0.58, + "grad_norm": 6.01065419797833, + "learning_rate": 3.9469042755986065e-06, + "loss": 0.7111, + "step": 6610 + }, + { + "epoch": 0.58, + "grad_norm": 7.3024197455524344, + "learning_rate": 3.945513915716727e-06, + "loss": 0.682, + "step": 6611 + }, + { + "epoch": 0.58, + "grad_norm": 12.220543292603269, + "learning_rate": 3.944123641162106e-06, + "loss": 1.0285, + "step": 6612 + }, + { + "epoch": 0.58, + "grad_norm": 5.459114038015858, + "learning_rate": 3.942733452047245e-06, + "loss": 0.7337, + "step": 6613 + }, + { + "epoch": 0.58, + "grad_norm": 3.680886639960826, + "learning_rate": 3.941343348484633e-06, + "loss": 0.6081, + "step": 6614 + }, + { + "epoch": 0.58, + "grad_norm": 8.530473053605034, + "learning_rate": 3.939953330586757e-06, + "loss": 0.7088, + "step": 6615 + }, + { + "epoch": 0.58, + "grad_norm": 5.871006236582575, + "learning_rate": 3.9385633984660946e-06, + "loss": 0.7911, + "step": 6616 + }, + { + "epoch": 0.58, + "grad_norm": 11.428375729540855, + "learning_rate": 3.937173552235117e-06, + "loss": 0.7218, + "step": 6617 + }, + { + "epoch": 0.58, + "grad_norm": 2.5307810303287854, + "learning_rate": 3.935783792006286e-06, + "loss": 0.5322, + "step": 6618 + }, + { + "epoch": 0.58, + "grad_norm": 7.919521919790608, + "learning_rate": 3.934394117892062e-06, + "loss": 0.7591, + "step": 6619 + }, + { + "epoch": 0.58, + "grad_norm": 2.0059625921940203, + "learning_rate": 3.933004530004893e-06, + "loss": 0.5021, + "step": 6620 + }, + { + "epoch": 0.58, + "grad_norm": 5.801928907097014, + "learning_rate": 3.93161502845722e-06, + "loss": 0.729, + "step": 6621 + }, + { + "epoch": 0.58, + "grad_norm": 8.752091409491294, + "learning_rate": 3.9302256133614846e-06, + "loss": 0.7234, + "step": 6622 + }, + { + "epoch": 0.58, + "grad_norm": 7.018380881665405, + "learning_rate": 3.928836284830113e-06, + "loss": 0.8057, + "step": 6623 + }, + { + "epoch": 0.58, + "grad_norm": 3.578396507173058, + "learning_rate": 3.927447042975525e-06, + "loss": 0.5611, + "step": 6624 + }, + { + "epoch": 0.58, + "grad_norm": 5.5984561385377045, + "learning_rate": 3.926057887910141e-06, + "loss": 0.7582, + "step": 6625 + }, + { + "epoch": 0.58, + "grad_norm": 7.779473333478784, + "learning_rate": 3.924668819746366e-06, + "loss": 0.8814, + "step": 6626 + }, + { + "epoch": 0.58, + "grad_norm": 4.1816596681849765, + "learning_rate": 3.9232798385966e-06, + "loss": 0.7468, + "step": 6627 + }, + { + "epoch": 0.58, + "grad_norm": 8.588512572934013, 
+ "learning_rate": 3.921890944573239e-06, + "loss": 0.7263, + "step": 6628 + }, + { + "epoch": 0.58, + "grad_norm": 6.164220027865922, + "learning_rate": 3.920502137788669e-06, + "loss": 0.7522, + "step": 6629 + }, + { + "epoch": 0.58, + "grad_norm": 8.288054927492343, + "learning_rate": 3.919113418355271e-06, + "loss": 0.904, + "step": 6630 + }, + { + "epoch": 0.58, + "grad_norm": 5.464898968738458, + "learning_rate": 3.9177247863854175e-06, + "loss": 0.7124, + "step": 6631 + }, + { + "epoch": 0.58, + "grad_norm": 9.248651813297702, + "learning_rate": 3.916336241991472e-06, + "loss": 0.7651, + "step": 6632 + }, + { + "epoch": 0.58, + "grad_norm": 3.4121729873656927, + "learning_rate": 3.914947785285798e-06, + "loss": 0.5977, + "step": 6633 + }, + { + "epoch": 0.58, + "grad_norm": 36.7863816570145, + "learning_rate": 3.913559416380743e-06, + "loss": 0.8316, + "step": 6634 + }, + { + "epoch": 0.58, + "grad_norm": 4.998679132258088, + "learning_rate": 3.912171135388653e-06, + "loss": 0.9013, + "step": 6635 + }, + { + "epoch": 0.58, + "grad_norm": 3.1025507288660386, + "learning_rate": 3.910782942421865e-06, + "loss": 0.5228, + "step": 6636 + }, + { + "epoch": 0.58, + "grad_norm": 7.707007749121521, + "learning_rate": 3.90939483759271e-06, + "loss": 0.6927, + "step": 6637 + }, + { + "epoch": 0.58, + "grad_norm": 6.984398849061161, + "learning_rate": 3.908006821013509e-06, + "loss": 0.7289, + "step": 6638 + }, + { + "epoch": 0.58, + "grad_norm": 9.080156930430432, + "learning_rate": 3.9066188927965815e-06, + "loss": 0.6012, + "step": 6639 + }, + { + "epoch": 0.58, + "grad_norm": 11.57102225727343, + "learning_rate": 3.905231053054234e-06, + "loss": 0.8185, + "step": 6640 + }, + { + "epoch": 0.58, + "grad_norm": 2.852988013409753, + "learning_rate": 3.903843301898767e-06, + "loss": 0.5514, + "step": 6641 + }, + { + "epoch": 0.58, + "grad_norm": 11.039774055083054, + "learning_rate": 3.90245563944248e-06, + "loss": 0.6414, + "step": 6642 + }, + { + "epoch": 0.58, + "grad_norm": 10.791433410275715, + "learning_rate": 3.901068065797655e-06, + "loss": 0.7145, + "step": 6643 + }, + { + "epoch": 0.58, + "grad_norm": 5.449630725190626, + "learning_rate": 3.899680581076573e-06, + "loss": 0.6364, + "step": 6644 + }, + { + "epoch": 0.58, + "grad_norm": 41.56888706153647, + "learning_rate": 3.898293185391509e-06, + "loss": 0.7855, + "step": 6645 + }, + { + "epoch": 0.58, + "grad_norm": 6.758123013952301, + "learning_rate": 3.896905878854729e-06, + "loss": 0.8646, + "step": 6646 + }, + { + "epoch": 0.58, + "grad_norm": 5.999238935071564, + "learning_rate": 3.895518661578489e-06, + "loss": 0.6026, + "step": 6647 + }, + { + "epoch": 0.58, + "grad_norm": 6.3420201634362785, + "learning_rate": 3.8941315336750425e-06, + "loss": 0.7706, + "step": 6648 + }, + { + "epoch": 0.58, + "grad_norm": 9.0511899584689, + "learning_rate": 3.8927444952566325e-06, + "loss": 0.6115, + "step": 6649 + }, + { + "epoch": 0.58, + "grad_norm": 9.42335587533938, + "learning_rate": 3.891357546435496e-06, + "loss": 0.7958, + "step": 6650 + }, + { + "epoch": 0.58, + "grad_norm": 7.422084287493198, + "learning_rate": 3.889970687323863e-06, + "loss": 0.7723, + "step": 6651 + }, + { + "epoch": 0.58, + "grad_norm": 14.318255011480929, + "learning_rate": 3.888583918033955e-06, + "loss": 0.7475, + "step": 6652 + }, + { + "epoch": 0.58, + "grad_norm": 5.170946909355128, + "learning_rate": 3.887197238677987e-06, + "loss": 0.7151, + "step": 6653 + }, + { + "epoch": 0.58, + "grad_norm": 8.750549971486123, + "learning_rate": 3.88581064936817e-06, 
+ "loss": 0.5565, + "step": 6654 + }, + { + "epoch": 0.58, + "grad_norm": 9.041328504817152, + "learning_rate": 3.884424150216699e-06, + "loss": 0.8624, + "step": 6655 + }, + { + "epoch": 0.58, + "grad_norm": 8.239373972246577, + "learning_rate": 3.883037741335772e-06, + "loss": 0.8554, + "step": 6656 + }, + { + "epoch": 0.58, + "grad_norm": 19.13411640734076, + "learning_rate": 3.881651422837572e-06, + "loss": 0.7417, + "step": 6657 + }, + { + "epoch": 0.58, + "grad_norm": 6.627505140031056, + "learning_rate": 3.880265194834279e-06, + "loss": 0.7628, + "step": 6658 + }, + { + "epoch": 0.58, + "grad_norm": 6.17205748248122, + "learning_rate": 3.878879057438063e-06, + "loss": 0.7362, + "step": 6659 + }, + { + "epoch": 0.58, + "grad_norm": 5.7839337961466635, + "learning_rate": 3.877493010761091e-06, + "loss": 0.6859, + "step": 6660 + }, + { + "epoch": 0.58, + "grad_norm": 7.975124329896267, + "learning_rate": 3.876107054915515e-06, + "loss": 0.6393, + "step": 6661 + }, + { + "epoch": 0.59, + "grad_norm": 8.368831910976178, + "learning_rate": 3.874721190013488e-06, + "loss": 0.8634, + "step": 6662 + }, + { + "epoch": 0.59, + "grad_norm": 7.3322726840843595, + "learning_rate": 3.87333541616715e-06, + "loss": 0.6643, + "step": 6663 + }, + { + "epoch": 0.59, + "grad_norm": 8.905648528927614, + "learning_rate": 3.8719497334886355e-06, + "loss": 0.7394, + "step": 6664 + }, + { + "epoch": 0.59, + "grad_norm": 13.01946972327841, + "learning_rate": 3.870564142090073e-06, + "loss": 0.8275, + "step": 6665 + }, + { + "epoch": 0.59, + "grad_norm": 8.709033933896759, + "learning_rate": 3.869178642083581e-06, + "loss": 0.6716, + "step": 6666 + }, + { + "epoch": 0.59, + "grad_norm": 6.711767407111096, + "learning_rate": 3.867793233581272e-06, + "loss": 0.7862, + "step": 6667 + }, + { + "epoch": 0.59, + "grad_norm": 7.203034003647584, + "learning_rate": 3.8664079166952526e-06, + "loss": 0.8479, + "step": 6668 + }, + { + "epoch": 0.59, + "grad_norm": 8.7253457829907, + "learning_rate": 3.865022691537617e-06, + "loss": 0.6148, + "step": 6669 + }, + { + "epoch": 0.59, + "grad_norm": 6.928409837047334, + "learning_rate": 3.863637558220458e-06, + "loss": 0.9439, + "step": 6670 + }, + { + "epoch": 0.59, + "grad_norm": 7.433483164458755, + "learning_rate": 3.862252516855859e-06, + "loss": 0.6884, + "step": 6671 + }, + { + "epoch": 0.59, + "grad_norm": 9.663996083846023, + "learning_rate": 3.860867567555891e-06, + "loss": 0.6936, + "step": 6672 + }, + { + "epoch": 0.59, + "grad_norm": 15.06390890096555, + "learning_rate": 3.8594827104326265e-06, + "loss": 0.6814, + "step": 6673 + }, + { + "epoch": 0.59, + "grad_norm": 9.503333285991015, + "learning_rate": 3.858097945598124e-06, + "loss": 0.7523, + "step": 6674 + }, + { + "epoch": 0.59, + "grad_norm": 10.760958220811148, + "learning_rate": 3.856713273164435e-06, + "loss": 0.8068, + "step": 6675 + }, + { + "epoch": 0.59, + "grad_norm": 7.732023367339685, + "learning_rate": 3.8553286932436075e-06, + "loss": 0.7683, + "step": 6676 + }, + { + "epoch": 0.59, + "grad_norm": 2.768428840528839, + "learning_rate": 3.853944205947678e-06, + "loss": 0.5177, + "step": 6677 + }, + { + "epoch": 0.59, + "grad_norm": 9.25465122144362, + "learning_rate": 3.852559811388676e-06, + "loss": 0.8142, + "step": 6678 + }, + { + "epoch": 0.59, + "grad_norm": 13.474556813211612, + "learning_rate": 3.851175509678626e-06, + "loss": 0.7281, + "step": 6679 + }, + { + "epoch": 0.59, + "grad_norm": 9.11615406051281, + "learning_rate": 3.849791300929543e-06, + "loss": 0.6994, + "step": 6680 + }, + { 
+ "epoch": 0.59, + "grad_norm": 12.343457673618222, + "learning_rate": 3.8484071852534335e-06, + "loss": 0.7714, + "step": 6681 + }, + { + "epoch": 0.59, + "grad_norm": 8.791903892909483, + "learning_rate": 3.8470231627623006e-06, + "loss": 0.6821, + "step": 6682 + }, + { + "epoch": 0.59, + "grad_norm": 9.282090926615806, + "learning_rate": 3.845639233568135e-06, + "loss": 0.7883, + "step": 6683 + }, + { + "epoch": 0.59, + "grad_norm": 6.3687692388673, + "learning_rate": 3.8442553977829215e-06, + "loss": 0.6716, + "step": 6684 + }, + { + "epoch": 0.59, + "grad_norm": 7.563355796849528, + "learning_rate": 3.842871655518639e-06, + "loss": 0.7665, + "step": 6685 + }, + { + "epoch": 0.59, + "grad_norm": 5.824303113490084, + "learning_rate": 3.841488006887259e-06, + "loss": 0.7308, + "step": 6686 + }, + { + "epoch": 0.59, + "grad_norm": 5.579000983771308, + "learning_rate": 3.84010445200074e-06, + "loss": 0.898, + "step": 6687 + }, + { + "epoch": 0.59, + "grad_norm": 10.987697972105078, + "learning_rate": 3.8387209909710414e-06, + "loss": 0.871, + "step": 6688 + }, + { + "epoch": 0.59, + "grad_norm": 11.319029401962128, + "learning_rate": 3.8373376239101076e-06, + "loss": 0.6773, + "step": 6689 + }, + { + "epoch": 0.59, + "grad_norm": 8.86833103136118, + "learning_rate": 3.835954350929879e-06, + "loss": 0.8118, + "step": 6690 + }, + { + "epoch": 0.59, + "grad_norm": 5.69649222924922, + "learning_rate": 3.8345711721422885e-06, + "loss": 0.7113, + "step": 6691 + }, + { + "epoch": 0.59, + "grad_norm": 11.216973280760092, + "learning_rate": 3.83318808765926e-06, + "loss": 0.7044, + "step": 6692 + }, + { + "epoch": 0.59, + "grad_norm": 7.235280718457037, + "learning_rate": 3.831805097592711e-06, + "loss": 0.8326, + "step": 6693 + }, + { + "epoch": 0.59, + "grad_norm": 7.509503079305484, + "learning_rate": 3.83042220205455e-06, + "loss": 0.7977, + "step": 6694 + }, + { + "epoch": 0.59, + "grad_norm": 5.83461036007416, + "learning_rate": 3.8290394011566776e-06, + "loss": 0.667, + "step": 6695 + }, + { + "epoch": 0.59, + "grad_norm": 10.619928748498014, + "learning_rate": 3.82765669501099e-06, + "loss": 0.8245, + "step": 6696 + }, + { + "epoch": 0.59, + "grad_norm": 6.14375666246676, + "learning_rate": 3.826274083729374e-06, + "loss": 0.773, + "step": 6697 + }, + { + "epoch": 0.59, + "grad_norm": 6.498892352943082, + "learning_rate": 3.824891567423705e-06, + "loss": 0.9476, + "step": 6698 + }, + { + "epoch": 0.59, + "grad_norm": 9.249972406718339, + "learning_rate": 3.823509146205857e-06, + "loss": 0.8058, + "step": 6699 + }, + { + "epoch": 0.59, + "grad_norm": 7.416074634108132, + "learning_rate": 3.822126820187691e-06, + "loss": 0.6051, + "step": 6700 + }, + { + "epoch": 0.59, + "grad_norm": 9.181254938437378, + "learning_rate": 3.820744589481063e-06, + "loss": 0.6772, + "step": 6701 + }, + { + "epoch": 0.59, + "grad_norm": 5.236263169246369, + "learning_rate": 3.819362454197822e-06, + "loss": 0.7348, + "step": 6702 + }, + { + "epoch": 0.59, + "grad_norm": 6.334927773836604, + "learning_rate": 3.8179804144498075e-06, + "loss": 0.8127, + "step": 6703 + }, + { + "epoch": 0.59, + "grad_norm": 7.357215435376049, + "learning_rate": 3.81659847034885e-06, + "loss": 0.8472, + "step": 6704 + }, + { + "epoch": 0.59, + "grad_norm": 6.7130447632179315, + "learning_rate": 3.815216622006778e-06, + "loss": 0.689, + "step": 6705 + }, + { + "epoch": 0.59, + "grad_norm": 2.582268609598206, + "learning_rate": 3.8138348695354057e-06, + "loss": 0.5259, + "step": 6706 + }, + { + "epoch": 0.59, + "grad_norm": 
7.066378251135938, + "learning_rate": 3.8124532130465414e-06, + "loss": 0.7869, + "step": 6707 + }, + { + "epoch": 0.59, + "grad_norm": 10.518034001305523, + "learning_rate": 3.811071652651989e-06, + "loss": 0.5995, + "step": 6708 + }, + { + "epoch": 0.59, + "grad_norm": 6.526011559922754, + "learning_rate": 3.8096901884635406e-06, + "loss": 0.642, + "step": 6709 + }, + { + "epoch": 0.59, + "grad_norm": 5.595729857925507, + "learning_rate": 3.8083088205929813e-06, + "loss": 0.5847, + "step": 6710 + }, + { + "epoch": 0.59, + "grad_norm": 7.072301650164533, + "learning_rate": 3.806927549152091e-06, + "loss": 0.7651, + "step": 6711 + }, + { + "epoch": 0.59, + "grad_norm": 10.943189594298994, + "learning_rate": 3.805546374252638e-06, + "loss": 0.8604, + "step": 6712 + }, + { + "epoch": 0.59, + "grad_norm": 6.345011271265172, + "learning_rate": 3.8041652960063856e-06, + "loss": 0.7295, + "step": 6713 + }, + { + "epoch": 0.59, + "grad_norm": 9.453473832020514, + "learning_rate": 3.802784314525089e-06, + "loss": 0.8005, + "step": 6714 + }, + { + "epoch": 0.59, + "grad_norm": 3.2462101234526717, + "learning_rate": 3.8014034299204933e-06, + "loss": 0.5277, + "step": 6715 + }, + { + "epoch": 0.59, + "grad_norm": 8.832806175471852, + "learning_rate": 3.800022642304339e-06, + "loss": 0.8789, + "step": 6716 + }, + { + "epoch": 0.59, + "grad_norm": 5.944516719086814, + "learning_rate": 3.7986419517883566e-06, + "loss": 0.7547, + "step": 6717 + }, + { + "epoch": 0.59, + "grad_norm": 6.937890953170971, + "learning_rate": 3.797261358484268e-06, + "loss": 0.7706, + "step": 6718 + }, + { + "epoch": 0.59, + "grad_norm": 6.743220166802001, + "learning_rate": 3.795880862503791e-06, + "loss": 0.7518, + "step": 6719 + }, + { + "epoch": 0.59, + "grad_norm": 7.704753288342478, + "learning_rate": 3.794500463958631e-06, + "loss": 0.8692, + "step": 6720 + }, + { + "epoch": 0.59, + "grad_norm": 7.315293738480773, + "learning_rate": 3.7931201629604873e-06, + "loss": 0.7075, + "step": 6721 + }, + { + "epoch": 0.59, + "grad_norm": 8.392359618594174, + "learning_rate": 3.791739959621054e-06, + "loss": 0.5867, + "step": 6722 + }, + { + "epoch": 0.59, + "grad_norm": 15.53955111670129, + "learning_rate": 3.790359854052013e-06, + "loss": 0.7127, + "step": 6723 + }, + { + "epoch": 0.59, + "grad_norm": 14.27842870007492, + "learning_rate": 3.7889798463650394e-06, + "loss": 0.7599, + "step": 6724 + }, + { + "epoch": 0.59, + "grad_norm": 5.057155299210537, + "learning_rate": 3.7875999366718036e-06, + "loss": 0.8222, + "step": 6725 + }, + { + "epoch": 0.59, + "grad_norm": 5.940696134038738, + "learning_rate": 3.7862201250839637e-06, + "loss": 0.7416, + "step": 6726 + }, + { + "epoch": 0.59, + "grad_norm": 7.075279058243422, + "learning_rate": 3.7848404117131716e-06, + "loss": 0.7199, + "step": 6727 + }, + { + "epoch": 0.59, + "grad_norm": 5.407629826681537, + "learning_rate": 3.783460796671073e-06, + "loss": 0.7146, + "step": 6728 + }, + { + "epoch": 0.59, + "grad_norm": 7.032617377342271, + "learning_rate": 3.782081280069303e-06, + "loss": 0.8574, + "step": 6729 + }, + { + "epoch": 0.59, + "grad_norm": 11.755237588300513, + "learning_rate": 3.7807018620194892e-06, + "loss": 0.7334, + "step": 6730 + }, + { + "epoch": 0.59, + "grad_norm": 9.28783560698202, + "learning_rate": 3.779322542633253e-06, + "loss": 0.8359, + "step": 6731 + }, + { + "epoch": 0.59, + "grad_norm": 6.7095998595989865, + "learning_rate": 3.7779433220222055e-06, + "loss": 0.7893, + "step": 6732 + }, + { + "epoch": 0.59, + "grad_norm": 16.34663557746273, + 
"learning_rate": 3.776564200297953e-06, + "loss": 0.7627, + "step": 6733 + }, + { + "epoch": 0.59, + "grad_norm": 6.131329455771491, + "learning_rate": 3.7751851775720892e-06, + "loss": 0.7399, + "step": 6734 + }, + { + "epoch": 0.59, + "grad_norm": 6.326578266206987, + "learning_rate": 3.7738062539562035e-06, + "loss": 0.692, + "step": 6735 + }, + { + "epoch": 0.59, + "grad_norm": 8.083838371758928, + "learning_rate": 3.7724274295618767e-06, + "loss": 0.725, + "step": 6736 + }, + { + "epoch": 0.59, + "grad_norm": 5.016997403763181, + "learning_rate": 3.7710487045006804e-06, + "loss": 0.7625, + "step": 6737 + }, + { + "epoch": 0.59, + "grad_norm": 4.5986368625596725, + "learning_rate": 3.7696700788841776e-06, + "loss": 0.7246, + "step": 6738 + }, + { + "epoch": 0.59, + "grad_norm": 6.906573204255785, + "learning_rate": 3.7682915528239266e-06, + "loss": 0.8667, + "step": 6739 + }, + { + "epoch": 0.59, + "grad_norm": 11.561154789392788, + "learning_rate": 3.766913126431474e-06, + "loss": 0.915, + "step": 6740 + }, + { + "epoch": 0.59, + "grad_norm": 8.030337918812474, + "learning_rate": 3.7655347998183595e-06, + "loss": 0.7106, + "step": 6741 + }, + { + "epoch": 0.59, + "grad_norm": 6.801343200624807, + "learning_rate": 3.764156573096116e-06, + "loss": 0.6745, + "step": 6742 + }, + { + "epoch": 0.59, + "grad_norm": 7.477861005388648, + "learning_rate": 3.7627784463762674e-06, + "loss": 0.734, + "step": 6743 + }, + { + "epoch": 0.59, + "grad_norm": 4.225806585706259, + "learning_rate": 3.761400419770328e-06, + "loss": 0.833, + "step": 6744 + }, + { + "epoch": 0.59, + "grad_norm": 7.706661963843632, + "learning_rate": 3.7600224933898073e-06, + "loss": 0.8522, + "step": 6745 + }, + { + "epoch": 0.59, + "grad_norm": 7.836754528078992, + "learning_rate": 3.7586446673462034e-06, + "loss": 0.7554, + "step": 6746 + }, + { + "epoch": 0.59, + "grad_norm": 7.87697252297223, + "learning_rate": 3.757266941751007e-06, + "loss": 0.7206, + "step": 6747 + }, + { + "epoch": 0.59, + "grad_norm": 4.955018373248068, + "learning_rate": 3.755889316715704e-06, + "loss": 0.6516, + "step": 6748 + }, + { + "epoch": 0.59, + "grad_norm": 6.660888029696172, + "learning_rate": 3.7545117923517683e-06, + "loss": 0.8009, + "step": 6749 + }, + { + "epoch": 0.59, + "grad_norm": 5.91909197371003, + "learning_rate": 3.7531343687706643e-06, + "loss": 0.7511, + "step": 6750 + }, + { + "epoch": 0.59, + "grad_norm": 7.260461210310112, + "learning_rate": 3.7517570460838555e-06, + "loss": 0.7519, + "step": 6751 + }, + { + "epoch": 0.59, + "grad_norm": 8.491309614205656, + "learning_rate": 3.7503798244027885e-06, + "loss": 0.6467, + "step": 6752 + }, + { + "epoch": 0.59, + "grad_norm": 9.048947864139196, + "learning_rate": 3.749002703838909e-06, + "loss": 0.7661, + "step": 6753 + }, + { + "epoch": 0.59, + "grad_norm": 2.2456186819163895, + "learning_rate": 3.747625684503649e-06, + "loss": 0.4646, + "step": 6754 + }, + { + "epoch": 0.59, + "grad_norm": 9.153860603668962, + "learning_rate": 3.746248766508435e-06, + "loss": 0.8436, + "step": 6755 + }, + { + "epoch": 0.59, + "grad_norm": 4.327843528623552, + "learning_rate": 3.7448719499646855e-06, + "loss": 0.6652, + "step": 6756 + }, + { + "epoch": 0.59, + "grad_norm": 24.038942110971647, + "learning_rate": 3.743495234983811e-06, + "loss": 0.7063, + "step": 6757 + }, + { + "epoch": 0.59, + "grad_norm": 2.336814436523308, + "learning_rate": 3.742118621677211e-06, + "loss": 0.5071, + "step": 6758 + }, + { + "epoch": 0.59, + "grad_norm": 5.5332695961672265, + "learning_rate": 
3.7407421101562802e-06, + "loss": 0.7235, + "step": 6759 + }, + { + "epoch": 0.59, + "grad_norm": 6.462982488478755, + "learning_rate": 3.7393657005324036e-06, + "loss": 0.8665, + "step": 6760 + }, + { + "epoch": 0.59, + "grad_norm": 7.26413822819307, + "learning_rate": 3.737989392916956e-06, + "loss": 0.7907, + "step": 6761 + }, + { + "epoch": 0.59, + "grad_norm": 4.926868734663033, + "learning_rate": 3.7366131874213087e-06, + "loss": 0.702, + "step": 6762 + }, + { + "epoch": 0.59, + "grad_norm": 7.245720799152013, + "learning_rate": 3.735237084156821e-06, + "loss": 0.6347, + "step": 6763 + }, + { + "epoch": 0.59, + "grad_norm": 8.695246494861847, + "learning_rate": 3.733861083234843e-06, + "loss": 0.7586, + "step": 6764 + }, + { + "epoch": 0.59, + "grad_norm": 5.8022772530157685, + "learning_rate": 3.7324851847667222e-06, + "loss": 0.7222, + "step": 6765 + }, + { + "epoch": 0.59, + "grad_norm": 7.534666147580976, + "learning_rate": 3.7311093888637906e-06, + "loss": 0.6477, + "step": 6766 + }, + { + "epoch": 0.59, + "grad_norm": 9.189613830641104, + "learning_rate": 3.7297336956373766e-06, + "loss": 0.6313, + "step": 6767 + }, + { + "epoch": 0.59, + "grad_norm": 7.02372050033795, + "learning_rate": 3.728358105198799e-06, + "loss": 0.8554, + "step": 6768 + }, + { + "epoch": 0.59, + "grad_norm": 8.075337485094927, + "learning_rate": 3.7269826176593686e-06, + "loss": 0.6052, + "step": 6769 + }, + { + "epoch": 0.59, + "grad_norm": 6.437718373842741, + "learning_rate": 3.725607233130386e-06, + "loss": 0.712, + "step": 6770 + }, + { + "epoch": 0.59, + "grad_norm": 6.295559219127202, + "learning_rate": 3.724231951723148e-06, + "loss": 0.6837, + "step": 6771 + }, + { + "epoch": 0.59, + "grad_norm": 2.104895000176603, + "learning_rate": 3.722856773548937e-06, + "loss": 0.532, + "step": 6772 + }, + { + "epoch": 0.59, + "grad_norm": 8.580295618879704, + "learning_rate": 3.721481698719032e-06, + "loss": 0.8858, + "step": 6773 + }, + { + "epoch": 0.59, + "grad_norm": 7.172027059170933, + "learning_rate": 3.7201067273447023e-06, + "loss": 0.7218, + "step": 6774 + }, + { + "epoch": 0.59, + "grad_norm": 5.346000936632567, + "learning_rate": 3.7187318595372054e-06, + "loss": 0.7296, + "step": 6775 + }, + { + "epoch": 0.6, + "grad_norm": 5.798204318883443, + "learning_rate": 3.7173570954077965e-06, + "loss": 0.6854, + "step": 6776 + }, + { + "epoch": 0.6, + "grad_norm": 5.002075916846259, + "learning_rate": 3.7159824350677177e-06, + "loss": 0.6356, + "step": 6777 + }, + { + "epoch": 0.6, + "grad_norm": 7.729645189609478, + "learning_rate": 3.7146078786282024e-06, + "loss": 0.8015, + "step": 6778 + }, + { + "epoch": 0.6, + "grad_norm": 8.757444639657981, + "learning_rate": 3.713233426200482e-06, + "loss": 0.8618, + "step": 6779 + }, + { + "epoch": 0.6, + "grad_norm": 3.5148218905335233, + "learning_rate": 3.7118590778957717e-06, + "loss": 0.5731, + "step": 6780 + }, + { + "epoch": 0.6, + "grad_norm": 6.3169810388301375, + "learning_rate": 3.710484833825281e-06, + "loss": 0.8553, + "step": 6781 + }, + { + "epoch": 0.6, + "grad_norm": 7.0945429782431715, + "learning_rate": 3.7091106941002136e-06, + "loss": 0.8131, + "step": 6782 + }, + { + "epoch": 0.6, + "grad_norm": 7.124289977672291, + "learning_rate": 3.707736658831762e-06, + "loss": 0.7828, + "step": 6783 + }, + { + "epoch": 0.6, + "grad_norm": 6.238634773723898, + "learning_rate": 3.706362728131109e-06, + "loss": 0.6438, + "step": 6784 + }, + { + "epoch": 0.6, + "grad_norm": 6.056892972284894, + "learning_rate": 3.704988902109433e-06, + "loss": 0.8264, 
+ "step": 6785 + }, + { + "epoch": 0.6, + "grad_norm": 6.4980234624491295, + "learning_rate": 3.703615180877901e-06, + "loss": 0.7128, + "step": 6786 + }, + { + "epoch": 0.6, + "grad_norm": 7.310484679364131, + "learning_rate": 3.702241564547671e-06, + "loss": 0.7996, + "step": 6787 + }, + { + "epoch": 0.6, + "grad_norm": 6.5222264672256784, + "learning_rate": 3.7008680532298962e-06, + "loss": 0.7486, + "step": 6788 + }, + { + "epoch": 0.6, + "grad_norm": 9.400332506760801, + "learning_rate": 3.6994946470357173e-06, + "loss": 0.7757, + "step": 6789 + }, + { + "epoch": 0.6, + "grad_norm": 25.922178200646922, + "learning_rate": 3.6981213460762676e-06, + "loss": 0.84, + "step": 6790 + }, + { + "epoch": 0.6, + "grad_norm": 7.682488061139591, + "learning_rate": 3.696748150462674e-06, + "loss": 0.5904, + "step": 6791 + }, + { + "epoch": 0.6, + "grad_norm": 9.2210894951215, + "learning_rate": 3.695375060306051e-06, + "loss": 0.6723, + "step": 6792 + }, + { + "epoch": 0.6, + "grad_norm": 9.216230956774861, + "learning_rate": 3.6940020757175086e-06, + "loss": 0.8441, + "step": 6793 + }, + { + "epoch": 0.6, + "grad_norm": 8.14617139493353, + "learning_rate": 3.692629196808147e-06, + "loss": 0.8169, + "step": 6794 + }, + { + "epoch": 0.6, + "grad_norm": 8.102899707403392, + "learning_rate": 3.6912564236890542e-06, + "loss": 0.6823, + "step": 6795 + }, + { + "epoch": 0.6, + "grad_norm": 6.524098052762599, + "learning_rate": 3.6898837564713158e-06, + "loss": 0.7972, + "step": 6796 + }, + { + "epoch": 0.6, + "grad_norm": 9.698743182456123, + "learning_rate": 3.6885111952660047e-06, + "loss": 0.726, + "step": 6797 + }, + { + "epoch": 0.6, + "grad_norm": 5.312133434732468, + "learning_rate": 3.6871387401841845e-06, + "loss": 0.7549, + "step": 6798 + }, + { + "epoch": 0.6, + "grad_norm": 8.891355804951635, + "learning_rate": 3.685766391336916e-06, + "loss": 0.8133, + "step": 6799 + }, + { + "epoch": 0.6, + "grad_norm": 5.131730050601796, + "learning_rate": 3.684394148835243e-06, + "loss": 0.7651, + "step": 6800 + }, + { + "epoch": 0.6, + "grad_norm": 6.36573996933004, + "learning_rate": 3.683022012790207e-06, + "loss": 0.7974, + "step": 6801 + }, + { + "epoch": 0.6, + "grad_norm": 3.8994406893967195, + "learning_rate": 3.6816499833128404e-06, + "loss": 0.606, + "step": 6802 + }, + { + "epoch": 0.6, + "grad_norm": 5.9603877220878205, + "learning_rate": 3.680278060514164e-06, + "loss": 0.8097, + "step": 6803 + }, + { + "epoch": 0.6, + "grad_norm": 7.999058772735441, + "learning_rate": 3.678906244505191e-06, + "loss": 0.8309, + "step": 6804 + }, + { + "epoch": 0.6, + "grad_norm": 3.50414556389951, + "learning_rate": 3.6775345353969285e-06, + "loss": 0.6211, + "step": 6805 + }, + { + "epoch": 0.6, + "grad_norm": 7.6587995073433355, + "learning_rate": 3.67616293330037e-06, + "loss": 0.6421, + "step": 6806 + }, + { + "epoch": 0.6, + "grad_norm": 7.590414589649342, + "learning_rate": 3.674791438326505e-06, + "loss": 0.8322, + "step": 6807 + }, + { + "epoch": 0.6, + "grad_norm": 7.488788265461552, + "learning_rate": 3.673420050586313e-06, + "loss": 0.6094, + "step": 6808 + }, + { + "epoch": 0.6, + "grad_norm": 5.832449856779511, + "learning_rate": 3.6720487701907648e-06, + "loss": 0.7357, + "step": 6809 + }, + { + "epoch": 0.6, + "grad_norm": 11.062945725373673, + "learning_rate": 3.670677597250819e-06, + "loss": 0.6627, + "step": 6810 + }, + { + "epoch": 0.6, + "grad_norm": 16.04870747273152, + "learning_rate": 3.6693065318774324e-06, + "loss": 0.7793, + "step": 6811 + }, + { + "epoch": 0.6, + "grad_norm": 
8.115819662308388, + "learning_rate": 3.6679355741815464e-06, + "loss": 0.8058, + "step": 6812 + }, + { + "epoch": 0.6, + "grad_norm": 7.027673509549571, + "learning_rate": 3.666564724274099e-06, + "loss": 0.6608, + "step": 6813 + }, + { + "epoch": 0.6, + "grad_norm": 5.855050171697323, + "learning_rate": 3.6651939822660167e-06, + "loss": 0.6504, + "step": 6814 + }, + { + "epoch": 0.6, + "grad_norm": 8.981529549937461, + "learning_rate": 3.663823348268215e-06, + "loss": 0.6694, + "step": 6815 + }, + { + "epoch": 0.6, + "grad_norm": 6.5507838403056216, + "learning_rate": 3.6624528223916073e-06, + "loss": 0.6805, + "step": 6816 + }, + { + "epoch": 0.6, + "grad_norm": 4.90796523448499, + "learning_rate": 3.6610824047470916e-06, + "loss": 0.758, + "step": 6817 + }, + { + "epoch": 0.6, + "grad_norm": 4.155803512664998, + "learning_rate": 3.65971209544556e-06, + "loss": 0.6425, + "step": 6818 + }, + { + "epoch": 0.6, + "grad_norm": 10.512045334526407, + "learning_rate": 3.6583418945978976e-06, + "loss": 0.8081, + "step": 6819 + }, + { + "epoch": 0.6, + "grad_norm": 6.805759132790469, + "learning_rate": 3.6569718023149763e-06, + "loss": 0.7778, + "step": 6820 + }, + { + "epoch": 0.6, + "grad_norm": 7.760563754083494, + "learning_rate": 3.6556018187076624e-06, + "loss": 0.7531, + "step": 6821 + }, + { + "epoch": 0.6, + "grad_norm": 9.56649571619355, + "learning_rate": 3.654231943886814e-06, + "loss": 0.8287, + "step": 6822 + }, + { + "epoch": 0.6, + "grad_norm": 4.467269059220171, + "learning_rate": 3.6528621779632778e-06, + "loss": 0.7761, + "step": 6823 + }, + { + "epoch": 0.6, + "grad_norm": 8.140369081509014, + "learning_rate": 3.6514925210478925e-06, + "loss": 0.733, + "step": 6824 + }, + { + "epoch": 0.6, + "grad_norm": 7.520330920348531, + "learning_rate": 3.6501229732514902e-06, + "loss": 0.8675, + "step": 6825 + }, + { + "epoch": 0.6, + "grad_norm": 7.018376134387375, + "learning_rate": 3.6487535346848917e-06, + "loss": 0.8195, + "step": 6826 + }, + { + "epoch": 0.6, + "grad_norm": 6.713478399529163, + "learning_rate": 3.6473842054589083e-06, + "loss": 0.7251, + "step": 6827 + }, + { + "epoch": 0.6, + "grad_norm": 6.169287586520735, + "learning_rate": 3.6460149856843462e-06, + "loss": 0.7205, + "step": 6828 + }, + { + "epoch": 0.6, + "grad_norm": 14.792687228715039, + "learning_rate": 3.6446458754719992e-06, + "loss": 0.7414, + "step": 6829 + }, + { + "epoch": 0.6, + "grad_norm": 7.152972324072674, + "learning_rate": 3.6432768749326524e-06, + "loss": 0.907, + "step": 6830 + }, + { + "epoch": 0.6, + "grad_norm": 4.604333180944507, + "learning_rate": 3.641907984177085e-06, + "loss": 0.6622, + "step": 6831 + }, + { + "epoch": 0.6, + "grad_norm": 2.2561980902037457, + "learning_rate": 3.6405392033160637e-06, + "loss": 0.6188, + "step": 6832 + }, + { + "epoch": 0.6, + "grad_norm": 6.830726908548838, + "learning_rate": 3.6391705324603494e-06, + "loss": 0.7085, + "step": 6833 + }, + { + "epoch": 0.6, + "grad_norm": 9.071845196995353, + "learning_rate": 3.6378019717206927e-06, + "loss": 0.7565, + "step": 6834 + }, + { + "epoch": 0.6, + "grad_norm": 10.366413662027583, + "learning_rate": 3.6364335212078322e-06, + "loss": 0.8167, + "step": 6835 + }, + { + "epoch": 0.6, + "grad_norm": 9.72655088714386, + "learning_rate": 3.635065181032505e-06, + "loss": 0.7877, + "step": 6836 + }, + { + "epoch": 0.6, + "grad_norm": 8.890048095309078, + "learning_rate": 3.6336969513054326e-06, + "loss": 0.8311, + "step": 6837 + }, + { + "epoch": 0.6, + "grad_norm": 10.269806687086708, + "learning_rate": 
3.632328832137329e-06, + "loss": 0.7023, + "step": 6838 + }, + { + "epoch": 0.6, + "grad_norm": 11.097340836077798, + "learning_rate": 3.6309608236389025e-06, + "loss": 1.0151, + "step": 6839 + }, + { + "epoch": 0.6, + "grad_norm": 5.7694757865016815, + "learning_rate": 3.629592925920849e-06, + "loss": 0.8462, + "step": 6840 + }, + { + "epoch": 0.6, + "grad_norm": 2.499021824964941, + "learning_rate": 3.628225139093855e-06, + "loss": 0.571, + "step": 6841 + }, + { + "epoch": 0.6, + "grad_norm": 20.453873649091467, + "learning_rate": 3.6268574632686025e-06, + "loss": 0.9227, + "step": 6842 + }, + { + "epoch": 0.6, + "grad_norm": 8.541227099089049, + "learning_rate": 3.6254898985557598e-06, + "loss": 0.5847, + "step": 6843 + }, + { + "epoch": 0.6, + "grad_norm": 1.943348658664623, + "learning_rate": 3.6241224450659866e-06, + "loss": 0.4281, + "step": 6844 + }, + { + "epoch": 0.6, + "grad_norm": 6.457269104283671, + "learning_rate": 3.6227551029099385e-06, + "loss": 0.7823, + "step": 6845 + }, + { + "epoch": 0.6, + "grad_norm": 3.2337766745378542, + "learning_rate": 3.6213878721982555e-06, + "loss": 0.5898, + "step": 6846 + }, + { + "epoch": 0.6, + "grad_norm": 10.26466879282682, + "learning_rate": 3.6200207530415722e-06, + "loss": 0.7483, + "step": 6847 + }, + { + "epoch": 0.6, + "grad_norm": 3.6141502914284813, + "learning_rate": 3.618653745550515e-06, + "loss": 0.6041, + "step": 6848 + }, + { + "epoch": 0.6, + "grad_norm": 6.29909799662833, + "learning_rate": 3.6172868498356977e-06, + "loss": 0.6979, + "step": 6849 + }, + { + "epoch": 0.6, + "grad_norm": 7.161599120733528, + "learning_rate": 3.6159200660077297e-06, + "loss": 0.8011, + "step": 6850 + }, + { + "epoch": 0.6, + "grad_norm": 9.230774621755705, + "learning_rate": 3.614553394177208e-06, + "loss": 0.7481, + "step": 6851 + }, + { + "epoch": 0.6, + "grad_norm": 4.990350659295141, + "learning_rate": 3.6131868344547193e-06, + "loss": 0.6711, + "step": 6852 + }, + { + "epoch": 0.6, + "grad_norm": 6.101433639746761, + "learning_rate": 3.6118203869508473e-06, + "loss": 0.7671, + "step": 6853 + }, + { + "epoch": 0.6, + "grad_norm": 6.272361814044108, + "learning_rate": 3.6104540517761594e-06, + "loss": 0.8123, + "step": 6854 + }, + { + "epoch": 0.6, + "grad_norm": 7.490390615663164, + "learning_rate": 3.609087829041217e-06, + "loss": 0.6396, + "step": 6855 + }, + { + "epoch": 0.6, + "grad_norm": 8.61235575935795, + "learning_rate": 3.607721718856576e-06, + "loss": 0.8228, + "step": 6856 + }, + { + "epoch": 0.6, + "grad_norm": 15.56751665973145, + "learning_rate": 3.6063557213327773e-06, + "loss": 0.7584, + "step": 6857 + }, + { + "epoch": 0.6, + "grad_norm": 5.670726238459962, + "learning_rate": 3.6049898365803537e-06, + "loss": 0.5995, + "step": 6858 + }, + { + "epoch": 0.6, + "grad_norm": 5.877364511772006, + "learning_rate": 3.6036240647098332e-06, + "loss": 0.7367, + "step": 6859 + }, + { + "epoch": 0.6, + "grad_norm": 6.583049134553328, + "learning_rate": 3.6022584058317313e-06, + "loss": 0.8553, + "step": 6860 + }, + { + "epoch": 0.6, + "grad_norm": 5.239251089894791, + "learning_rate": 3.600892860056552e-06, + "loss": 0.7555, + "step": 6861 + }, + { + "epoch": 0.6, + "grad_norm": 13.126903343255751, + "learning_rate": 3.5995274274947973e-06, + "loss": 0.7095, + "step": 6862 + }, + { + "epoch": 0.6, + "grad_norm": 4.648276215199217, + "learning_rate": 3.598162108256953e-06, + "loss": 0.7732, + "step": 6863 + }, + { + "epoch": 0.6, + "grad_norm": 8.798828685503912, + "learning_rate": 3.596796902453499e-06, + "loss": 0.9059, + "step": 
6864 + }, + { + "epoch": 0.6, + "grad_norm": 8.436629885243187, + "learning_rate": 3.5954318101949047e-06, + "loss": 0.804, + "step": 6865 + }, + { + "epoch": 0.6, + "grad_norm": 7.962803068832419, + "learning_rate": 3.594066831591634e-06, + "loss": 0.8319, + "step": 6866 + }, + { + "epoch": 0.6, + "grad_norm": 15.171316946907234, + "learning_rate": 3.5927019667541343e-06, + "loss": 0.7771, + "step": 6867 + }, + { + "epoch": 0.6, + "grad_norm": 14.874326920294877, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.8258, + "step": 6868 + }, + { + "epoch": 0.6, + "grad_norm": 18.022600346247067, + "learning_rate": 3.5899725788182175e-06, + "loss": 0.7196, + "step": 6869 + }, + { + "epoch": 0.6, + "grad_norm": 2.354206880945465, + "learning_rate": 3.5886080559406575e-06, + "loss": 0.55, + "step": 6870 + }, + { + "epoch": 0.6, + "grad_norm": 6.003785018567411, + "learning_rate": 3.587243647270587e-06, + "loss": 0.6777, + "step": 6871 + }, + { + "epoch": 0.6, + "grad_norm": 3.73255651000599, + "learning_rate": 3.585879352918409e-06, + "loss": 0.4443, + "step": 6872 + }, + { + "epoch": 0.6, + "grad_norm": 6.973462334383259, + "learning_rate": 3.5845151729945223e-06, + "loss": 0.8121, + "step": 6873 + }, + { + "epoch": 0.6, + "grad_norm": 5.599089231802811, + "learning_rate": 3.5831511076093128e-06, + "loss": 0.6639, + "step": 6874 + }, + { + "epoch": 0.6, + "grad_norm": 7.2283929114988865, + "learning_rate": 3.581787156873159e-06, + "loss": 0.7761, + "step": 6875 + }, + { + "epoch": 0.6, + "grad_norm": 15.206342209445054, + "learning_rate": 3.580423320896429e-06, + "loss": 0.7389, + "step": 6876 + }, + { + "epoch": 0.6, + "grad_norm": 7.468697474231151, + "learning_rate": 3.579059599789484e-06, + "loss": 0.7311, + "step": 6877 + }, + { + "epoch": 0.6, + "grad_norm": 11.527931910997486, + "learning_rate": 3.5776959936626705e-06, + "loss": 0.6343, + "step": 6878 + }, + { + "epoch": 0.6, + "grad_norm": 7.98003550979604, + "learning_rate": 3.576332502626333e-06, + "loss": 0.6264, + "step": 6879 + }, + { + "epoch": 0.6, + "grad_norm": 12.68485737627907, + "learning_rate": 3.5749691267908006e-06, + "loss": 0.6197, + "step": 6880 + }, + { + "epoch": 0.6, + "grad_norm": 7.047934353552728, + "learning_rate": 3.573605866266395e-06, + "loss": 0.8382, + "step": 6881 + }, + { + "epoch": 0.6, + "grad_norm": 5.774968439083673, + "learning_rate": 3.5722427211634313e-06, + "loss": 0.7656, + "step": 6882 + }, + { + "epoch": 0.6, + "grad_norm": 8.611571934942946, + "learning_rate": 3.5708796915922117e-06, + "loss": 0.6891, + "step": 6883 + }, + { + "epoch": 0.6, + "grad_norm": 12.902006778788758, + "learning_rate": 3.569516777663029e-06, + "loss": 0.7076, + "step": 6884 + }, + { + "epoch": 0.6, + "grad_norm": 6.235618794160533, + "learning_rate": 3.5681539794861707e-06, + "loss": 0.6517, + "step": 6885 + }, + { + "epoch": 0.6, + "grad_norm": 7.853262176064914, + "learning_rate": 3.5667912971719104e-06, + "loss": 0.7648, + "step": 6886 + }, + { + "epoch": 0.6, + "grad_norm": 5.4927330341127565, + "learning_rate": 3.5654287308305137e-06, + "loss": 0.5872, + "step": 6887 + }, + { + "epoch": 0.6, + "grad_norm": 9.020780194949232, + "learning_rate": 3.564066280572238e-06, + "loss": 0.6815, + "step": 6888 + }, + { + "epoch": 0.61, + "grad_norm": 6.4500593230851235, + "learning_rate": 3.56270394650733e-06, + "loss": 0.8697, + "step": 6889 + }, + { + "epoch": 0.61, + "grad_norm": 12.630434152256122, + "learning_rate": 3.5613417287460284e-06, + "loss": 0.6925, + "step": 6890 + }, + { + "epoch": 0.61, + "grad_norm": 
5.820493336610734, + "learning_rate": 3.559979627398561e-06, + "loss": 0.6769, + "step": 6891 + }, + { + "epoch": 0.61, + "grad_norm": 8.212233311263692, + "learning_rate": 3.5586176425751463e-06, + "loss": 0.6281, + "step": 6892 + }, + { + "epoch": 0.61, + "grad_norm": 9.251961395831186, + "learning_rate": 3.5572557743859947e-06, + "loss": 0.5863, + "step": 6893 + }, + { + "epoch": 0.61, + "grad_norm": 12.400564497392748, + "learning_rate": 3.5558940229413063e-06, + "loss": 0.7687, + "step": 6894 + }, + { + "epoch": 0.61, + "grad_norm": 7.243325808124822, + "learning_rate": 3.55453238835127e-06, + "loss": 0.7918, + "step": 6895 + }, + { + "epoch": 0.61, + "grad_norm": 8.047006301408697, + "learning_rate": 3.55317087072607e-06, + "loss": 0.7627, + "step": 6896 + }, + { + "epoch": 0.61, + "grad_norm": 8.851253340893738, + "learning_rate": 3.5518094701758765e-06, + "loss": 0.7895, + "step": 6897 + }, + { + "epoch": 0.61, + "grad_norm": 5.73729437608435, + "learning_rate": 3.55044818681085e-06, + "loss": 0.5726, + "step": 6898 + }, + { + "epoch": 0.61, + "grad_norm": 7.7103486040553335, + "learning_rate": 3.5490870207411466e-06, + "loss": 0.8622, + "step": 6899 + }, + { + "epoch": 0.61, + "grad_norm": 2.219413351131888, + "learning_rate": 3.5477259720769073e-06, + "loss": 0.5253, + "step": 6900 + }, + { + "epoch": 0.61, + "grad_norm": 8.528390775900828, + "learning_rate": 3.5463650409282667e-06, + "loss": 0.734, + "step": 6901 + }, + { + "epoch": 0.61, + "grad_norm": 12.549770671332254, + "learning_rate": 3.54500422740535e-06, + "loss": 0.6585, + "step": 6902 + }, + { + "epoch": 0.61, + "grad_norm": 12.202440857596168, + "learning_rate": 3.54364353161827e-06, + "loss": 0.8985, + "step": 6903 + }, + { + "epoch": 0.61, + "grad_norm": 4.973603525856219, + "learning_rate": 3.542282953677132e-06, + "loss": 0.5997, + "step": 6904 + }, + { + "epoch": 0.61, + "grad_norm": 5.752381884850094, + "learning_rate": 3.540922493692034e-06, + "loss": 0.7144, + "step": 6905 + }, + { + "epoch": 0.61, + "grad_norm": 4.767381535182011, + "learning_rate": 3.539562151773061e-06, + "loss": 0.753, + "step": 6906 + }, + { + "epoch": 0.61, + "grad_norm": 9.810195339915923, + "learning_rate": 3.5382019280302883e-06, + "loss": 0.7509, + "step": 6907 + }, + { + "epoch": 0.61, + "grad_norm": 6.39788067561238, + "learning_rate": 3.5368418225737844e-06, + "loss": 0.7714, + "step": 6908 + }, + { + "epoch": 0.61, + "grad_norm": 9.26974670234019, + "learning_rate": 3.5354818355136058e-06, + "loss": 0.7402, + "step": 6909 + }, + { + "epoch": 0.61, + "grad_norm": 8.515941374472625, + "learning_rate": 3.534121966959802e-06, + "loss": 0.7827, + "step": 6910 + }, + { + "epoch": 0.61, + "grad_norm": 6.486175619690894, + "learning_rate": 3.5327622170224103e-06, + "loss": 0.6373, + "step": 6911 + }, + { + "epoch": 0.61, + "grad_norm": 8.785708043979279, + "learning_rate": 3.531402585811458e-06, + "loss": 0.9373, + "step": 6912 + }, + { + "epoch": 0.61, + "grad_norm": 10.612721739636923, + "learning_rate": 3.5300430734369673e-06, + "loss": 0.709, + "step": 6913 + }, + { + "epoch": 0.61, + "grad_norm": 5.184274346963402, + "learning_rate": 3.5286836800089454e-06, + "loss": 0.7219, + "step": 6914 + }, + { + "epoch": 0.61, + "grad_norm": 6.038749726593299, + "learning_rate": 3.527324405637391e-06, + "loss": 0.652, + "step": 6915 + }, + { + "epoch": 0.61, + "grad_norm": 2.283701332933534, + "learning_rate": 3.5259652504322982e-06, + "loss": 0.5294, + "step": 6916 + }, + { + "epoch": 0.61, + "grad_norm": 9.483205241629385, + "learning_rate": 
3.5246062145036454e-06, + "loss": 0.7668, + "step": 6917 + }, + { + "epoch": 0.61, + "grad_norm": 3.979073193329689, + "learning_rate": 3.523247297961402e-06, + "loss": 0.5654, + "step": 6918 + }, + { + "epoch": 0.61, + "grad_norm": 5.473825560142079, + "learning_rate": 3.521888500915532e-06, + "loss": 0.6971, + "step": 6919 + }, + { + "epoch": 0.61, + "grad_norm": 10.318147684566744, + "learning_rate": 3.5205298234759854e-06, + "loss": 0.6665, + "step": 6920 + }, + { + "epoch": 0.61, + "grad_norm": 5.1716317765407895, + "learning_rate": 3.5191712657527044e-06, + "loss": 0.7797, + "step": 6921 + }, + { + "epoch": 0.61, + "grad_norm": 5.734634534625945, + "learning_rate": 3.517812827855621e-06, + "loss": 0.698, + "step": 6922 + }, + { + "epoch": 0.61, + "grad_norm": 28.133600566910523, + "learning_rate": 3.5164545098946597e-06, + "loss": 0.8227, + "step": 6923 + }, + { + "epoch": 0.61, + "grad_norm": 5.993529588668682, + "learning_rate": 3.51509631197973e-06, + "loss": 0.7821, + "step": 6924 + }, + { + "epoch": 0.61, + "grad_norm": 2.563085484873978, + "learning_rate": 3.513738234220737e-06, + "loss": 0.5836, + "step": 6925 + }, + { + "epoch": 0.61, + "grad_norm": 4.974876896735623, + "learning_rate": 3.512380276727575e-06, + "loss": 0.6712, + "step": 6926 + }, + { + "epoch": 0.61, + "grad_norm": 4.932654582039302, + "learning_rate": 3.5110224396101243e-06, + "loss": 0.7968, + "step": 6927 + }, + { + "epoch": 0.61, + "grad_norm": 7.177502937308123, + "learning_rate": 3.509664722978263e-06, + "loss": 0.8065, + "step": 6928 + }, + { + "epoch": 0.61, + "grad_norm": 6.669716326937547, + "learning_rate": 3.5083071269418517e-06, + "loss": 0.7628, + "step": 6929 + }, + { + "epoch": 0.61, + "grad_norm": 7.316109686718726, + "learning_rate": 3.5069496516107477e-06, + "loss": 0.8847, + "step": 6930 + }, + { + "epoch": 0.61, + "grad_norm": 11.35331577318957, + "learning_rate": 3.5055922970947943e-06, + "loss": 0.7662, + "step": 6931 + }, + { + "epoch": 0.61, + "grad_norm": 2.1105392015139217, + "learning_rate": 3.5042350635038246e-06, + "loss": 0.4701, + "step": 6932 + }, + { + "epoch": 0.61, + "grad_norm": 10.264602636446451, + "learning_rate": 3.5028779509476675e-06, + "loss": 0.7181, + "step": 6933 + }, + { + "epoch": 0.61, + "grad_norm": 5.025875833479653, + "learning_rate": 3.501520959536136e-06, + "loss": 0.6181, + "step": 6934 + }, + { + "epoch": 0.61, + "grad_norm": 7.350646722235558, + "learning_rate": 3.5001640893790346e-06, + "loss": 0.7877, + "step": 6935 + }, + { + "epoch": 0.61, + "grad_norm": 6.231353825748236, + "learning_rate": 3.498807340586162e-06, + "loss": 0.7155, + "step": 6936 + }, + { + "epoch": 0.61, + "grad_norm": 5.405511969172568, + "learning_rate": 3.4974507132673014e-06, + "loss": 0.7286, + "step": 6937 + }, + { + "epoch": 0.61, + "grad_norm": 12.197283134664744, + "learning_rate": 3.496094207532229e-06, + "loss": 0.8156, + "step": 6938 + }, + { + "epoch": 0.61, + "grad_norm": 14.674486951909326, + "learning_rate": 3.4947378234907127e-06, + "loss": 0.7841, + "step": 6939 + }, + { + "epoch": 0.61, + "grad_norm": 5.644889950689664, + "learning_rate": 3.493381561252508e-06, + "loss": 0.9586, + "step": 6940 + }, + { + "epoch": 0.61, + "grad_norm": 2.461408019664983, + "learning_rate": 3.49202542092736e-06, + "loss": 0.4892, + "step": 6941 + }, + { + "epoch": 0.61, + "grad_norm": 7.087114400490897, + "learning_rate": 3.4906694026250075e-06, + "loss": 0.7487, + "step": 6942 + }, + { + "epoch": 0.61, + "grad_norm": 5.816701606348029, + "learning_rate": 3.489313506455177e-06, + 
"loss": 0.7554, + "step": 6943 + }, + { + "epoch": 0.61, + "grad_norm": 6.976944071849982, + "learning_rate": 3.487957732527583e-06, + "loss": 0.7413, + "step": 6944 + }, + { + "epoch": 0.61, + "grad_norm": 8.143803410961922, + "learning_rate": 3.486602080951935e-06, + "loss": 0.7478, + "step": 6945 + }, + { + "epoch": 0.61, + "grad_norm": 7.45314585510164, + "learning_rate": 3.4852465518379296e-06, + "loss": 0.5894, + "step": 6946 + }, + { + "epoch": 0.61, + "grad_norm": 10.129528813986852, + "learning_rate": 3.483891145295253e-06, + "loss": 0.6669, + "step": 6947 + }, + { + "epoch": 0.61, + "grad_norm": 10.763842597895767, + "learning_rate": 3.482535861433583e-06, + "loss": 0.7292, + "step": 6948 + }, + { + "epoch": 0.61, + "grad_norm": 10.560656920600785, + "learning_rate": 3.4811807003625865e-06, + "loss": 0.6957, + "step": 6949 + }, + { + "epoch": 0.61, + "grad_norm": 4.729998565395378, + "learning_rate": 3.4798256621919222e-06, + "loss": 0.7632, + "step": 6950 + }, + { + "epoch": 0.61, + "grad_norm": 14.969794184457017, + "learning_rate": 3.478470747031236e-06, + "loss": 0.717, + "step": 6951 + }, + { + "epoch": 0.61, + "grad_norm": 7.562968880921842, + "learning_rate": 3.477115954990166e-06, + "loss": 0.7708, + "step": 6952 + }, + { + "epoch": 0.61, + "grad_norm": 6.195769374764393, + "learning_rate": 3.475761286178341e-06, + "loss": 0.7157, + "step": 6953 + }, + { + "epoch": 0.61, + "grad_norm": 8.305301084964855, + "learning_rate": 3.474406740705376e-06, + "loss": 0.8228, + "step": 6954 + }, + { + "epoch": 0.61, + "grad_norm": 6.906435331129843, + "learning_rate": 3.4730523186808797e-06, + "loss": 0.8262, + "step": 6955 + }, + { + "epoch": 0.61, + "grad_norm": 5.696853174500272, + "learning_rate": 3.4716980202144507e-06, + "loss": 0.7362, + "step": 6956 + }, + { + "epoch": 0.61, + "grad_norm": 9.805805355283448, + "learning_rate": 3.4703438454156757e-06, + "loss": 0.6233, + "step": 6957 + }, + { + "epoch": 0.61, + "grad_norm": 8.408777152058049, + "learning_rate": 3.468989794394131e-06, + "loss": 0.8776, + "step": 6958 + }, + { + "epoch": 0.61, + "grad_norm": 11.214245049364823, + "learning_rate": 3.4676358672593867e-06, + "loss": 0.8213, + "step": 6959 + }, + { + "epoch": 0.61, + "grad_norm": 8.396136393608828, + "learning_rate": 3.4662820641209994e-06, + "loss": 0.6774, + "step": 6960 + }, + { + "epoch": 0.61, + "grad_norm": 11.266633539212638, + "learning_rate": 3.464928385088514e-06, + "loss": 0.6552, + "step": 6961 + }, + { + "epoch": 0.61, + "grad_norm": 5.033146767850508, + "learning_rate": 3.463574830271472e-06, + "loss": 0.8083, + "step": 6962 + }, + { + "epoch": 0.61, + "grad_norm": 8.397715127282625, + "learning_rate": 3.4622213997793998e-06, + "loss": 0.8205, + "step": 6963 + }, + { + "epoch": 0.61, + "grad_norm": 8.104419437369677, + "learning_rate": 3.460868093721812e-06, + "loss": 0.7703, + "step": 6964 + }, + { + "epoch": 0.61, + "grad_norm": 6.8049033814306075, + "learning_rate": 3.4595149122082193e-06, + "loss": 0.731, + "step": 6965 + }, + { + "epoch": 0.61, + "grad_norm": 5.5320132891056035, + "learning_rate": 3.4581618553481174e-06, + "loss": 0.6629, + "step": 6966 + }, + { + "epoch": 0.61, + "grad_norm": 6.192736495863142, + "learning_rate": 3.4568089232509915e-06, + "loss": 0.8152, + "step": 6967 + }, + { + "epoch": 0.61, + "grad_norm": 9.885963875669301, + "learning_rate": 3.4554561160263227e-06, + "loss": 0.7494, + "step": 6968 + }, + { + "epoch": 0.61, + "grad_norm": 3.0184692712917145, + "learning_rate": 3.4541034337835743e-06, + "loss": 0.5925, + 
"step": 6969 + }, + { + "epoch": 0.61, + "grad_norm": 6.3015287311515635, + "learning_rate": 3.452750876632206e-06, + "loss": 0.7187, + "step": 6970 + }, + { + "epoch": 0.61, + "grad_norm": 6.611967057069641, + "learning_rate": 3.451398444681663e-06, + "loss": 0.8728, + "step": 6971 + }, + { + "epoch": 0.61, + "grad_norm": 8.608321431967955, + "learning_rate": 3.4500461380413804e-06, + "loss": 0.6669, + "step": 6972 + }, + { + "epoch": 0.61, + "grad_norm": 8.352856933920156, + "learning_rate": 3.4486939568207874e-06, + "loss": 0.8212, + "step": 6973 + }, + { + "epoch": 0.61, + "grad_norm": 5.689489508810689, + "learning_rate": 3.4473419011292986e-06, + "loss": 0.7894, + "step": 6974 + }, + { + "epoch": 0.61, + "grad_norm": 5.825557990768669, + "learning_rate": 3.44598997107632e-06, + "loss": 0.6358, + "step": 6975 + }, + { + "epoch": 0.61, + "grad_norm": 5.416704564373648, + "learning_rate": 3.4446381667712487e-06, + "loss": 0.7026, + "step": 6976 + }, + { + "epoch": 0.61, + "grad_norm": 7.103829519648496, + "learning_rate": 3.4432864883234694e-06, + "loss": 0.6668, + "step": 6977 + }, + { + "epoch": 0.61, + "grad_norm": 16.238404012883173, + "learning_rate": 3.4419349358423576e-06, + "loss": 0.7723, + "step": 6978 + }, + { + "epoch": 0.61, + "grad_norm": 7.953535369261855, + "learning_rate": 3.44058350943728e-06, + "loss": 0.7569, + "step": 6979 + }, + { + "epoch": 0.61, + "grad_norm": 24.089890748216664, + "learning_rate": 3.4392322092175908e-06, + "loss": 0.7122, + "step": 6980 + }, + { + "epoch": 0.61, + "grad_norm": 9.46684202367794, + "learning_rate": 3.437881035292634e-06, + "loss": 0.5912, + "step": 6981 + }, + { + "epoch": 0.61, + "grad_norm": 2.39031034349769, + "learning_rate": 3.436529987771746e-06, + "loss": 0.5342, + "step": 6982 + }, + { + "epoch": 0.61, + "grad_norm": 7.542875877486775, + "learning_rate": 3.435179066764251e-06, + "loss": 0.6723, + "step": 6983 + }, + { + "epoch": 0.61, + "grad_norm": 2.8340486670219054, + "learning_rate": 3.4338282723794616e-06, + "loss": 0.5106, + "step": 6984 + }, + { + "epoch": 0.61, + "grad_norm": 5.999164642082342, + "learning_rate": 3.432477604726685e-06, + "loss": 0.5979, + "step": 6985 + }, + { + "epoch": 0.61, + "grad_norm": 2.690046383173244, + "learning_rate": 3.431127063915213e-06, + "loss": 0.5032, + "step": 6986 + }, + { + "epoch": 0.61, + "grad_norm": 9.416781658597134, + "learning_rate": 3.4297766500543274e-06, + "loss": 0.6849, + "step": 6987 + }, + { + "epoch": 0.61, + "grad_norm": 7.373926962044891, + "learning_rate": 3.4284263632533053e-06, + "loss": 0.802, + "step": 6988 + }, + { + "epoch": 0.61, + "grad_norm": 2.632585197047198, + "learning_rate": 3.427076203621406e-06, + "loss": 0.5362, + "step": 6989 + }, + { + "epoch": 0.61, + "grad_norm": 10.29260073227554, + "learning_rate": 3.425726171267886e-06, + "loss": 0.8491, + "step": 6990 + }, + { + "epoch": 0.61, + "grad_norm": 6.25622632807648, + "learning_rate": 3.424376266301984e-06, + "loss": 0.7164, + "step": 6991 + }, + { + "epoch": 0.61, + "grad_norm": 6.922828576569069, + "learning_rate": 3.4230264888329333e-06, + "loss": 0.7056, + "step": 6992 + }, + { + "epoch": 0.61, + "grad_norm": 2.6387253057439573, + "learning_rate": 3.421676838969957e-06, + "loss": 0.55, + "step": 6993 + }, + { + "epoch": 0.61, + "grad_norm": 5.336534686200923, + "learning_rate": 3.4203273168222662e-06, + "loss": 0.663, + "step": 6994 + }, + { + "epoch": 0.61, + "grad_norm": 6.40010544317422, + "learning_rate": 3.4189779224990595e-06, + "loss": 0.8498, + "step": 6995 + }, + { + "epoch": 
0.61, + "grad_norm": 6.378045838932968, + "learning_rate": 3.4176286561095306e-06, + "loss": 0.7087, + "step": 6996 + }, + { + "epoch": 0.61, + "grad_norm": 11.885671811712166, + "learning_rate": 3.416279517762858e-06, + "loss": 0.6493, + "step": 6997 + }, + { + "epoch": 0.61, + "grad_norm": 9.944313044948174, + "learning_rate": 3.4149305075682125e-06, + "loss": 0.878, + "step": 6998 + }, + { + "epoch": 0.61, + "grad_norm": 6.585722765276889, + "learning_rate": 3.413581625634754e-06, + "loss": 0.7847, + "step": 6999 + }, + { + "epoch": 0.61, + "grad_norm": 5.442266906233952, + "learning_rate": 3.4122328720716322e-06, + "loss": 0.8241, + "step": 7000 + }, + { + "epoch": 0.61, + "grad_norm": 5.456744221519659, + "learning_rate": 3.4108842469879844e-06, + "loss": 0.6999, + "step": 7001 + }, + { + "epoch": 0.61, + "grad_norm": 38.218106556374345, + "learning_rate": 3.4095357504929406e-06, + "loss": 0.8408, + "step": 7002 + }, + { + "epoch": 0.62, + "grad_norm": 7.505127907778796, + "learning_rate": 3.408187382695618e-06, + "loss": 0.7524, + "step": 7003 + }, + { + "epoch": 0.62, + "grad_norm": 10.616007779498366, + "learning_rate": 3.406839143705123e-06, + "loss": 0.8539, + "step": 7004 + }, + { + "epoch": 0.62, + "grad_norm": 10.374928627558674, + "learning_rate": 3.405491033630556e-06, + "loss": 0.7902, + "step": 7005 + }, + { + "epoch": 0.62, + "grad_norm": 90.97078933003787, + "learning_rate": 3.404143052581003e-06, + "loss": 0.8526, + "step": 7006 + }, + { + "epoch": 0.62, + "grad_norm": 7.08668253611084, + "learning_rate": 3.4027952006655373e-06, + "loss": 0.7295, + "step": 7007 + }, + { + "epoch": 0.62, + "grad_norm": 6.277094697071511, + "learning_rate": 3.4014474779932295e-06, + "loss": 0.6253, + "step": 7008 + }, + { + "epoch": 0.62, + "grad_norm": 6.2395736835760625, + "learning_rate": 3.40009988467313e-06, + "loss": 0.59, + "step": 7009 + }, + { + "epoch": 0.62, + "grad_norm": 10.417981066484533, + "learning_rate": 3.398752420814289e-06, + "loss": 0.7323, + "step": 7010 + }, + { + "epoch": 0.62, + "grad_norm": 2.544306577897271, + "learning_rate": 3.397405086525738e-06, + "loss": 0.5656, + "step": 7011 + }, + { + "epoch": 0.62, + "grad_norm": 9.951066598016645, + "learning_rate": 3.3960578819165006e-06, + "loss": 0.6951, + "step": 7012 + }, + { + "epoch": 0.62, + "grad_norm": 6.740580079125485, + "learning_rate": 3.3947108070955925e-06, + "loss": 0.7106, + "step": 7013 + }, + { + "epoch": 0.62, + "grad_norm": 4.854881306106191, + "learning_rate": 3.3933638621720154e-06, + "loss": 0.7843, + "step": 7014 + }, + { + "epoch": 0.62, + "grad_norm": 12.206022091846055, + "learning_rate": 3.392017047254762e-06, + "loss": 0.6051, + "step": 7015 + }, + { + "epoch": 0.62, + "grad_norm": 7.35829184307401, + "learning_rate": 3.3906703624528148e-06, + "loss": 0.8092, + "step": 7016 + }, + { + "epoch": 0.62, + "grad_norm": 5.723084874957474, + "learning_rate": 3.3893238078751447e-06, + "loss": 0.7526, + "step": 7017 + }, + { + "epoch": 0.62, + "grad_norm": 8.010633111688605, + "learning_rate": 3.3879773836307115e-06, + "loss": 0.8736, + "step": 7018 + }, + { + "epoch": 0.62, + "grad_norm": 3.1064909288073976, + "learning_rate": 3.386631089828468e-06, + "loss": 0.485, + "step": 7019 + }, + { + "epoch": 0.62, + "grad_norm": 14.975415173300192, + "learning_rate": 3.3852849265773535e-06, + "loss": 0.7232, + "step": 7020 + }, + { + "epoch": 0.62, + "grad_norm": 6.223885896658722, + "learning_rate": 3.383938893986296e-06, + "loss": 0.6895, + "step": 7021 + }, + { + "epoch": 0.62, + "grad_norm": 
3.98201474221007, + "learning_rate": 3.382592992164215e-06, + "loss": 0.5066, + "step": 7022 + }, + { + "epoch": 0.62, + "grad_norm": 9.982454949241683, + "learning_rate": 3.3812472212200198e-06, + "loss": 0.7753, + "step": 7023 + }, + { + "epoch": 0.62, + "grad_norm": 8.296915951277175, + "learning_rate": 3.3799015812626058e-06, + "loss": 0.7671, + "step": 7024 + }, + { + "epoch": 0.62, + "grad_norm": 5.535413099682795, + "learning_rate": 3.3785560724008613e-06, + "loss": 0.5263, + "step": 7025 + }, + { + "epoch": 0.62, + "grad_norm": 6.4986985096514545, + "learning_rate": 3.3772106947436624e-06, + "loss": 0.6588, + "step": 7026 + }, + { + "epoch": 0.62, + "grad_norm": 5.42137538484069, + "learning_rate": 3.375865448399874e-06, + "loss": 0.7747, + "step": 7027 + }, + { + "epoch": 0.62, + "grad_norm": 8.449415005620038, + "learning_rate": 3.3745203334783525e-06, + "loss": 0.6704, + "step": 7028 + }, + { + "epoch": 0.62, + "grad_norm": 6.196845143555452, + "learning_rate": 3.373175350087941e-06, + "loss": 0.6759, + "step": 7029 + }, + { + "epoch": 0.62, + "grad_norm": 2.364878766303576, + "learning_rate": 3.371830498337475e-06, + "loss": 0.4402, + "step": 7030 + }, + { + "epoch": 0.62, + "grad_norm": 10.80904330157275, + "learning_rate": 3.370485778335777e-06, + "loss": 0.7831, + "step": 7031 + }, + { + "epoch": 0.62, + "grad_norm": 12.064710049831152, + "learning_rate": 3.3691411901916583e-06, + "loss": 0.791, + "step": 7032 + }, + { + "epoch": 0.62, + "grad_norm": 6.41447287863763, + "learning_rate": 3.367796734013922e-06, + "loss": 0.7988, + "step": 7033 + }, + { + "epoch": 0.62, + "grad_norm": 9.38555898861451, + "learning_rate": 3.3664524099113595e-06, + "loss": 0.7999, + "step": 7034 + }, + { + "epoch": 0.62, + "grad_norm": 11.02497944579797, + "learning_rate": 3.3651082179927495e-06, + "loss": 0.7903, + "step": 7035 + }, + { + "epoch": 0.62, + "grad_norm": 5.776886484785868, + "learning_rate": 3.3637641583668645e-06, + "loss": 0.8186, + "step": 7036 + }, + { + "epoch": 0.62, + "grad_norm": 10.765332077731173, + "learning_rate": 3.362420231142461e-06, + "loss": 0.7031, + "step": 7037 + }, + { + "epoch": 0.62, + "grad_norm": 6.663324705830176, + "learning_rate": 3.361076436428289e-06, + "loss": 0.5794, + "step": 7038 + }, + { + "epoch": 0.62, + "grad_norm": 5.608065425632432, + "learning_rate": 3.3597327743330854e-06, + "loss": 0.757, + "step": 7039 + }, + { + "epoch": 0.62, + "grad_norm": 3.4163640351703597, + "learning_rate": 3.3583892449655776e-06, + "loss": 0.6958, + "step": 7040 + }, + { + "epoch": 0.62, + "grad_norm": 8.365003170593187, + "learning_rate": 3.35704584843448e-06, + "loss": 0.7832, + "step": 7041 + }, + { + "epoch": 0.62, + "grad_norm": 5.549253941214843, + "learning_rate": 3.355702584848501e-06, + "loss": 0.6751, + "step": 7042 + }, + { + "epoch": 0.62, + "grad_norm": 8.304887460857811, + "learning_rate": 3.354359454316334e-06, + "loss": 0.805, + "step": 7043 + }, + { + "epoch": 0.62, + "grad_norm": 5.630449738598472, + "learning_rate": 3.3530164569466604e-06, + "loss": 0.73, + "step": 7044 + }, + { + "epoch": 0.62, + "grad_norm": 5.810666118143353, + "learning_rate": 3.351673592848157e-06, + "loss": 0.6581, + "step": 7045 + }, + { + "epoch": 0.62, + "grad_norm": 7.031371557953944, + "learning_rate": 3.350330862129485e-06, + "loss": 0.7574, + "step": 7046 + }, + { + "epoch": 0.62, + "grad_norm": 7.042793562725828, + "learning_rate": 3.3489882648992943e-06, + "loss": 0.8167, + "step": 7047 + }, + { + "epoch": 0.62, + "grad_norm": 5.6063463509007425, + 
"learning_rate": 3.347645801266228e-06, + "loss": 0.7289, + "step": 7048 + }, + { + "epoch": 0.62, + "grad_norm": 12.459259440431287, + "learning_rate": 3.3463034713389136e-06, + "loss": 0.8652, + "step": 7049 + }, + { + "epoch": 0.62, + "grad_norm": 11.107924976835868, + "learning_rate": 3.344961275225973e-06, + "loss": 0.6729, + "step": 7050 + }, + { + "epoch": 0.62, + "grad_norm": 4.30374479934207, + "learning_rate": 3.3436192130360122e-06, + "loss": 0.7204, + "step": 7051 + }, + { + "epoch": 0.62, + "grad_norm": 6.297716616323988, + "learning_rate": 3.342277284877629e-06, + "loss": 0.7173, + "step": 7052 + }, + { + "epoch": 0.62, + "grad_norm": 7.315318736862504, + "learning_rate": 3.3409354908594118e-06, + "loss": 0.653, + "step": 7053 + }, + { + "epoch": 0.62, + "grad_norm": 5.47663014673547, + "learning_rate": 3.3395938310899345e-06, + "loss": 0.7815, + "step": 7054 + }, + { + "epoch": 0.62, + "grad_norm": 4.365194884010834, + "learning_rate": 3.3382523056777614e-06, + "loss": 0.7282, + "step": 7055 + }, + { + "epoch": 0.62, + "grad_norm": 7.467862324744397, + "learning_rate": 3.3369109147314483e-06, + "loss": 0.8764, + "step": 7056 + }, + { + "epoch": 0.62, + "grad_norm": 7.367938302879014, + "learning_rate": 3.335569658359538e-06, + "loss": 0.8274, + "step": 7057 + }, + { + "epoch": 0.62, + "grad_norm": 2.8884200274120286, + "learning_rate": 3.334228536670561e-06, + "loss": 0.4801, + "step": 7058 + }, + { + "epoch": 0.62, + "grad_norm": 7.211907604069436, + "learning_rate": 3.3328875497730416e-06, + "loss": 0.7543, + "step": 7059 + }, + { + "epoch": 0.62, + "grad_norm": 9.227955385843034, + "learning_rate": 3.331546697775488e-06, + "loss": 0.6949, + "step": 7060 + }, + { + "epoch": 0.62, + "grad_norm": 5.609078968973136, + "learning_rate": 3.3302059807863994e-06, + "loss": 0.7299, + "step": 7061 + }, + { + "epoch": 0.62, + "grad_norm": 5.789709433064772, + "learning_rate": 3.3288653989142663e-06, + "loss": 0.7664, + "step": 7062 + }, + { + "epoch": 0.62, + "grad_norm": 5.49438865174752, + "learning_rate": 3.3275249522675656e-06, + "loss": 0.6145, + "step": 7063 + }, + { + "epoch": 0.62, + "grad_norm": 10.631592726312364, + "learning_rate": 3.326184640954762e-06, + "loss": 0.7458, + "step": 7064 + }, + { + "epoch": 0.62, + "grad_norm": 5.562328880512771, + "learning_rate": 3.3248444650843147e-06, + "loss": 0.7459, + "step": 7065 + }, + { + "epoch": 0.62, + "grad_norm": 7.108855981678885, + "learning_rate": 3.3235044247646666e-06, + "loss": 0.8262, + "step": 7066 + }, + { + "epoch": 0.62, + "grad_norm": 7.495476841831979, + "learning_rate": 3.322164520104251e-06, + "loss": 0.4972, + "step": 7067 + }, + { + "epoch": 0.62, + "grad_norm": 6.3786032403127955, + "learning_rate": 3.320824751211492e-06, + "loss": 0.7461, + "step": 7068 + }, + { + "epoch": 0.62, + "grad_norm": 11.788343988263334, + "learning_rate": 3.3194851181947995e-06, + "loss": 0.7637, + "step": 7069 + }, + { + "epoch": 0.62, + "grad_norm": 2.685415823131022, + "learning_rate": 3.318145621162578e-06, + "loss": 0.4887, + "step": 7070 + }, + { + "epoch": 0.62, + "grad_norm": 2.3490756092602205, + "learning_rate": 3.3168062602232144e-06, + "loss": 0.5318, + "step": 7071 + }, + { + "epoch": 0.62, + "grad_norm": 8.009891612438809, + "learning_rate": 3.3154670354850884e-06, + "loss": 0.8904, + "step": 7072 + }, + { + "epoch": 0.62, + "grad_norm": 7.01381711329564, + "learning_rate": 3.314127947056568e-06, + "loss": 0.9904, + "step": 7073 + }, + { + "epoch": 0.62, + "grad_norm": 10.66633235580974, + "learning_rate": 
3.3127889950460094e-06, + "loss": 0.7712, + "step": 7074 + }, + { + "epoch": 0.62, + "grad_norm": 6.328846917553465, + "learning_rate": 3.3114501795617584e-06, + "loss": 0.8701, + "step": 7075 + }, + { + "epoch": 0.62, + "grad_norm": 10.288041390152017, + "learning_rate": 3.310111500712151e-06, + "loss": 0.8132, + "step": 7076 + }, + { + "epoch": 0.62, + "grad_norm": 10.928131585380086, + "learning_rate": 3.3087729586055108e-06, + "loss": 0.8518, + "step": 7077 + }, + { + "epoch": 0.62, + "grad_norm": 7.76001425592815, + "learning_rate": 3.3074345533501475e-06, + "loss": 0.8648, + "step": 7078 + }, + { + "epoch": 0.62, + "grad_norm": 7.9995274466817605, + "learning_rate": 3.3060962850543655e-06, + "loss": 0.7009, + "step": 7079 + }, + { + "epoch": 0.62, + "grad_norm": 8.378905343214548, + "learning_rate": 3.304758153826455e-06, + "loss": 0.7052, + "step": 7080 + }, + { + "epoch": 0.62, + "grad_norm": 7.198653088737774, + "learning_rate": 3.303420159774694e-06, + "loss": 0.6911, + "step": 7081 + }, + { + "epoch": 0.62, + "grad_norm": 8.492888596689111, + "learning_rate": 3.3020823030073522e-06, + "loss": 0.5578, + "step": 7082 + }, + { + "epoch": 0.62, + "grad_norm": 4.172415534897321, + "learning_rate": 3.3007445836326862e-06, + "loss": 0.5126, + "step": 7083 + }, + { + "epoch": 0.62, + "grad_norm": 10.2471382026657, + "learning_rate": 3.2994070017589407e-06, + "loss": 0.6579, + "step": 7084 + }, + { + "epoch": 0.62, + "grad_norm": 9.008873634872629, + "learning_rate": 3.2980695574943532e-06, + "loss": 0.7621, + "step": 7085 + }, + { + "epoch": 0.62, + "grad_norm": 8.889507155288635, + "learning_rate": 3.296732250947146e-06, + "loss": 0.7276, + "step": 7086 + }, + { + "epoch": 0.62, + "grad_norm": 11.433883615983207, + "learning_rate": 3.295395082225531e-06, + "loss": 0.7226, + "step": 7087 + }, + { + "epoch": 0.62, + "grad_norm": 6.586379043348675, + "learning_rate": 3.2940580514377105e-06, + "loss": 0.7695, + "step": 7088 + }, + { + "epoch": 0.62, + "grad_norm": 8.791528628712516, + "learning_rate": 3.292721158691875e-06, + "loss": 0.7661, + "step": 7089 + }, + { + "epoch": 0.62, + "grad_norm": 8.501733581265901, + "learning_rate": 3.291384404096204e-06, + "loss": 0.8126, + "step": 7090 + }, + { + "epoch": 0.62, + "grad_norm": 11.930683552299618, + "learning_rate": 3.290047787758865e-06, + "loss": 0.8609, + "step": 7091 + }, + { + "epoch": 0.62, + "grad_norm": 6.220048006617326, + "learning_rate": 3.288711309788013e-06, + "loss": 0.7777, + "step": 7092 + }, + { + "epoch": 0.62, + "grad_norm": 7.062052616834003, + "learning_rate": 3.2873749702917966e-06, + "loss": 0.8316, + "step": 7093 + }, + { + "epoch": 0.62, + "grad_norm": 7.692427516936993, + "learning_rate": 3.286038769378349e-06, + "loss": 0.7866, + "step": 7094 + }, + { + "epoch": 0.62, + "grad_norm": 5.96486184203114, + "learning_rate": 3.2847027071557925e-06, + "loss": 0.7737, + "step": 7095 + }, + { + "epoch": 0.62, + "grad_norm": 11.388542074496273, + "learning_rate": 3.28336678373224e-06, + "loss": 0.7582, + "step": 7096 + }, + { + "epoch": 0.62, + "grad_norm": 8.443125121143245, + "learning_rate": 3.2820309992157926e-06, + "loss": 0.6231, + "step": 7097 + }, + { + "epoch": 0.62, + "grad_norm": 6.706001956489304, + "learning_rate": 3.2806953537145377e-06, + "loss": 0.782, + "step": 7098 + }, + { + "epoch": 0.62, + "grad_norm": 9.13704581747823, + "learning_rate": 3.279359847336555e-06, + "loss": 0.6687, + "step": 7099 + }, + { + "epoch": 0.62, + "grad_norm": 6.985063200930438, + "learning_rate": 3.2780244801899125e-06, + 
"loss": 0.6975, + "step": 7100 + }, + { + "epoch": 0.62, + "grad_norm": 5.068318947661814, + "learning_rate": 3.276689252382663e-06, + "loss": 0.6618, + "step": 7101 + }, + { + "epoch": 0.62, + "grad_norm": 6.087358187570023, + "learning_rate": 3.2753541640228544e-06, + "loss": 0.6241, + "step": 7102 + }, + { + "epoch": 0.62, + "grad_norm": 6.483587657588167, + "learning_rate": 3.2740192152185174e-06, + "loss": 0.7262, + "step": 7103 + }, + { + "epoch": 0.62, + "grad_norm": 6.519804280898003, + "learning_rate": 3.2726844060776738e-06, + "loss": 0.6778, + "step": 7104 + }, + { + "epoch": 0.62, + "grad_norm": 11.155226549505722, + "learning_rate": 3.271349736708336e-06, + "loss": 0.7755, + "step": 7105 + }, + { + "epoch": 0.62, + "grad_norm": 2.4865589352997493, + "learning_rate": 3.270015207218501e-06, + "loss": 0.4997, + "step": 7106 + }, + { + "epoch": 0.62, + "grad_norm": 6.714702165477015, + "learning_rate": 3.268680817716158e-06, + "loss": 0.7002, + "step": 7107 + }, + { + "epoch": 0.62, + "grad_norm": 7.887434566154509, + "learning_rate": 3.267346568309283e-06, + "loss": 0.681, + "step": 7108 + }, + { + "epoch": 0.62, + "grad_norm": 8.842339833060022, + "learning_rate": 3.2660124591058416e-06, + "loss": 0.8002, + "step": 7109 + }, + { + "epoch": 0.62, + "grad_norm": 7.4647500546411685, + "learning_rate": 3.2646784902137885e-06, + "loss": 0.7372, + "step": 7110 + }, + { + "epoch": 0.62, + "grad_norm": 9.65624301281317, + "learning_rate": 3.263344661741065e-06, + "loss": 0.576, + "step": 7111 + }, + { + "epoch": 0.62, + "grad_norm": 11.723568264373514, + "learning_rate": 3.2620109737956025e-06, + "loss": 0.7393, + "step": 7112 + }, + { + "epoch": 0.62, + "grad_norm": 9.057349909854063, + "learning_rate": 3.2606774264853215e-06, + "loss": 0.5323, + "step": 7113 + }, + { + "epoch": 0.62, + "grad_norm": 3.2038931860132354, + "learning_rate": 3.25934401991813e-06, + "loss": 0.5339, + "step": 7114 + }, + { + "epoch": 0.62, + "grad_norm": 5.714074500569605, + "learning_rate": 3.2580107542019236e-06, + "loss": 0.6855, + "step": 7115 + }, + { + "epoch": 0.62, + "grad_norm": 8.55883727684055, + "learning_rate": 3.256677629444591e-06, + "loss": 0.8135, + "step": 7116 + }, + { + "epoch": 0.63, + "grad_norm": 9.239694787404588, + "learning_rate": 3.255344645754004e-06, + "loss": 0.6385, + "step": 7117 + }, + { + "epoch": 0.63, + "grad_norm": 2.0793773523170493, + "learning_rate": 3.254011803238026e-06, + "loss": 0.5039, + "step": 7118 + }, + { + "epoch": 0.63, + "grad_norm": 11.022053431968963, + "learning_rate": 3.252679102004509e-06, + "loss": 0.8271, + "step": 7119 + }, + { + "epoch": 0.63, + "grad_norm": 7.308647554810755, + "learning_rate": 3.251346542161292e-06, + "loss": 0.6522, + "step": 7120 + }, + { + "epoch": 0.63, + "grad_norm": 9.591472459106084, + "learning_rate": 3.2500141238162042e-06, + "loss": 0.8024, + "step": 7121 + }, + { + "epoch": 0.63, + "grad_norm": 6.673719272308434, + "learning_rate": 3.2486818470770633e-06, + "loss": 0.6184, + "step": 7122 + }, + { + "epoch": 0.63, + "grad_norm": 8.216115157812586, + "learning_rate": 3.247349712051674e-06, + "loss": 0.7431, + "step": 7123 + }, + { + "epoch": 0.63, + "grad_norm": 4.621202630677304, + "learning_rate": 3.246017718847829e-06, + "loss": 0.8369, + "step": 7124 + }, + { + "epoch": 0.63, + "grad_norm": 12.154518218139557, + "learning_rate": 3.2446858675733152e-06, + "loss": 0.759, + "step": 7125 + }, + { + "epoch": 0.63, + "grad_norm": 2.9569811337656975, + "learning_rate": 3.2433541583358986e-06, + "loss": 0.488, + "step": 7126 
+ }, + { + "epoch": 0.63, + "grad_norm": 5.761402626627364, + "learning_rate": 3.242022591243343e-06, + "loss": 0.6748, + "step": 7127 + }, + { + "epoch": 0.63, + "grad_norm": 7.320269092165697, + "learning_rate": 3.2406911664033953e-06, + "loss": 0.6485, + "step": 7128 + }, + { + "epoch": 0.63, + "grad_norm": 8.083960515721378, + "learning_rate": 3.2393598839237903e-06, + "loss": 0.6583, + "step": 7129 + }, + { + "epoch": 0.63, + "grad_norm": 5.35951017898887, + "learning_rate": 3.238028743912257e-06, + "loss": 0.7017, + "step": 7130 + }, + { + "epoch": 0.63, + "grad_norm": 11.966873043093592, + "learning_rate": 3.236697746476506e-06, + "loss": 0.7471, + "step": 7131 + }, + { + "epoch": 0.63, + "grad_norm": 8.505825492855507, + "learning_rate": 3.2353668917242393e-06, + "loss": 0.6889, + "step": 7132 + }, + { + "epoch": 0.63, + "grad_norm": 7.507338891908518, + "learning_rate": 3.234036179763149e-06, + "loss": 0.7365, + "step": 7133 + }, + { + "epoch": 0.63, + "grad_norm": 7.52858804193205, + "learning_rate": 3.232705610700915e-06, + "loss": 0.7339, + "step": 7134 + }, + { + "epoch": 0.63, + "grad_norm": 9.206615103822317, + "learning_rate": 3.2313751846452005e-06, + "loss": 0.6457, + "step": 7135 + }, + { + "epoch": 0.63, + "grad_norm": 5.018279181446297, + "learning_rate": 3.230044901703666e-06, + "loss": 0.7362, + "step": 7136 + }, + { + "epoch": 0.63, + "grad_norm": 10.089929290451316, + "learning_rate": 3.2287147619839543e-06, + "loss": 0.78, + "step": 7137 + }, + { + "epoch": 0.63, + "grad_norm": 11.103657605210834, + "learning_rate": 3.227384765593696e-06, + "loss": 0.7672, + "step": 7138 + }, + { + "epoch": 0.63, + "grad_norm": 7.738451266892643, + "learning_rate": 3.226054912640515e-06, + "loss": 0.7228, + "step": 7139 + }, + { + "epoch": 0.63, + "grad_norm": 2.968041315900324, + "learning_rate": 3.22472520323202e-06, + "loss": 0.4586, + "step": 7140 + }, + { + "epoch": 0.63, + "grad_norm": 8.12351539115629, + "learning_rate": 3.223395637475807e-06, + "loss": 0.6723, + "step": 7141 + }, + { + "epoch": 0.63, + "grad_norm": 6.307981682838308, + "learning_rate": 3.2220662154794647e-06, + "loss": 0.6089, + "step": 7142 + }, + { + "epoch": 0.63, + "grad_norm": 3.279629387174283, + "learning_rate": 3.220736937350567e-06, + "loss": 0.6432, + "step": 7143 + }, + { + "epoch": 0.63, + "grad_norm": 6.216996398630466, + "learning_rate": 3.2194078031966756e-06, + "loss": 0.7312, + "step": 7144 + }, + { + "epoch": 0.63, + "grad_norm": 7.004456409049854, + "learning_rate": 3.218078813125344e-06, + "loss": 0.732, + "step": 7145 + }, + { + "epoch": 0.63, + "grad_norm": 21.48416647263533, + "learning_rate": 3.21674996724411e-06, + "loss": 0.7633, + "step": 7146 + }, + { + "epoch": 0.63, + "grad_norm": 5.003602918628753, + "learning_rate": 3.2154212656605033e-06, + "loss": 0.7718, + "step": 7147 + }, + { + "epoch": 0.63, + "grad_norm": 7.9445425164825885, + "learning_rate": 3.2140927084820393e-06, + "loss": 0.7718, + "step": 7148 + }, + { + "epoch": 0.63, + "grad_norm": 8.705023759271812, + "learning_rate": 3.2127642958162214e-06, + "loss": 0.7132, + "step": 7149 + }, + { + "epoch": 0.63, + "grad_norm": 2.4059467898014466, + "learning_rate": 3.211436027770545e-06, + "loss": 0.4657, + "step": 7150 + }, + { + "epoch": 0.63, + "grad_norm": 9.075146284151115, + "learning_rate": 3.2101079044524895e-06, + "loss": 0.8431, + "step": 7151 + }, + { + "epoch": 0.63, + "grad_norm": 7.58533987339685, + "learning_rate": 3.2087799259695254e-06, + "loss": 0.6466, + "step": 7152 + }, + { + "epoch": 0.63, + 
"grad_norm": 4.76974080897508, + "learning_rate": 3.20745209242911e-06, + "loss": 0.6866, + "step": 7153 + }, + { + "epoch": 0.63, + "grad_norm": 11.125929931564942, + "learning_rate": 3.2061244039386897e-06, + "loss": 0.7192, + "step": 7154 + }, + { + "epoch": 0.63, + "grad_norm": 12.248225280112846, + "learning_rate": 3.204796860605698e-06, + "loss": 0.6892, + "step": 7155 + }, + { + "epoch": 0.63, + "grad_norm": 2.2101251628428535, + "learning_rate": 3.2034694625375583e-06, + "loss": 0.5295, + "step": 7156 + }, + { + "epoch": 0.63, + "grad_norm": 6.319523650772548, + "learning_rate": 3.202142209841682e-06, + "loss": 0.7985, + "step": 7157 + }, + { + "epoch": 0.63, + "grad_norm": 7.618893641533851, + "learning_rate": 3.2008151026254663e-06, + "loss": 0.8298, + "step": 7158 + }, + { + "epoch": 0.63, + "grad_norm": 6.761209738006153, + "learning_rate": 3.1994881409963015e-06, + "loss": 0.8608, + "step": 7159 + }, + { + "epoch": 0.63, + "grad_norm": 9.61159742876431, + "learning_rate": 3.1981613250615606e-06, + "loss": 0.7967, + "step": 7160 + }, + { + "epoch": 0.63, + "grad_norm": 7.458741960315131, + "learning_rate": 3.1968346549286066e-06, + "loss": 0.7415, + "step": 7161 + }, + { + "epoch": 0.63, + "grad_norm": 6.511613920718984, + "learning_rate": 3.195508130704795e-06, + "loss": 0.6413, + "step": 7162 + }, + { + "epoch": 0.63, + "grad_norm": 5.466709485116815, + "learning_rate": 3.1941817524974634e-06, + "loss": 0.8006, + "step": 7163 + }, + { + "epoch": 0.63, + "grad_norm": 7.287460648745194, + "learning_rate": 3.1928555204139395e-06, + "loss": 0.9571, + "step": 7164 + }, + { + "epoch": 0.63, + "grad_norm": 8.198620571074002, + "learning_rate": 3.1915294345615414e-06, + "loss": 0.6807, + "step": 7165 + }, + { + "epoch": 0.63, + "grad_norm": 7.846792777277793, + "learning_rate": 3.190203495047573e-06, + "loss": 0.7473, + "step": 7166 + }, + { + "epoch": 0.63, + "grad_norm": 6.9052794183614425, + "learning_rate": 3.188877701979328e-06, + "loss": 0.7272, + "step": 7167 + }, + { + "epoch": 0.63, + "grad_norm": 3.111986661210142, + "learning_rate": 3.1875520554640873e-06, + "loss": 0.5179, + "step": 7168 + }, + { + "epoch": 0.63, + "grad_norm": 5.133422836049085, + "learning_rate": 3.186226555609117e-06, + "loss": 0.7218, + "step": 7169 + }, + { + "epoch": 0.63, + "grad_norm": 6.730550214703739, + "learning_rate": 3.1849012025216784e-06, + "loss": 0.6452, + "step": 7170 + }, + { + "epoch": 0.63, + "grad_norm": 6.9373235957682535, + "learning_rate": 3.1835759963090163e-06, + "loss": 0.6242, + "step": 7171 + }, + { + "epoch": 0.63, + "grad_norm": 6.962592401314381, + "learning_rate": 3.1822509370783605e-06, + "loss": 0.6209, + "step": 7172 + }, + { + "epoch": 0.63, + "grad_norm": 9.858816077350738, + "learning_rate": 3.1809260249369373e-06, + "loss": 0.714, + "step": 7173 + }, + { + "epoch": 0.63, + "grad_norm": 7.38791942818976, + "learning_rate": 3.1796012599919535e-06, + "loss": 0.7454, + "step": 7174 + }, + { + "epoch": 0.63, + "grad_norm": 6.168654477052674, + "learning_rate": 3.178276642350607e-06, + "loss": 0.6993, + "step": 7175 + }, + { + "epoch": 0.63, + "grad_norm": 12.215853810871145, + "learning_rate": 3.176952172120086e-06, + "loss": 0.8035, + "step": 7176 + }, + { + "epoch": 0.63, + "grad_norm": 7.787439881674173, + "learning_rate": 3.175627849407562e-06, + "loss": 0.6961, + "step": 7177 + }, + { + "epoch": 0.63, + "grad_norm": 9.040677084402994, + "learning_rate": 3.1743036743201973e-06, + "loss": 0.9013, + "step": 7178 + }, + { + "epoch": 0.63, + "grad_norm": 
8.452715681803038, + "learning_rate": 3.1729796469651424e-06, + "loss": 0.7131, + "step": 7179 + }, + { + "epoch": 0.63, + "grad_norm": 3.5473231797612153, + "learning_rate": 3.1716557674495363e-06, + "loss": 0.5423, + "step": 7180 + }, + { + "epoch": 0.63, + "grad_norm": 14.872001402077457, + "learning_rate": 3.1703320358805024e-06, + "loss": 0.648, + "step": 7181 + }, + { + "epoch": 0.63, + "grad_norm": 17.357327495851106, + "learning_rate": 3.1690084523651587e-06, + "loss": 0.8755, + "step": 7182 + }, + { + "epoch": 0.63, + "grad_norm": 4.57197733268332, + "learning_rate": 3.1676850170106044e-06, + "loss": 0.7607, + "step": 7183 + }, + { + "epoch": 0.63, + "grad_norm": 5.239322722703704, + "learning_rate": 3.1663617299239303e-06, + "loss": 0.7166, + "step": 7184 + }, + { + "epoch": 0.63, + "grad_norm": 6.865356654149148, + "learning_rate": 3.165038591212215e-06, + "loss": 0.5691, + "step": 7185 + }, + { + "epoch": 0.63, + "grad_norm": 13.79481479525837, + "learning_rate": 3.1637156009825245e-06, + "loss": 0.7119, + "step": 7186 + }, + { + "epoch": 0.63, + "grad_norm": 8.521250388624555, + "learning_rate": 3.1623927593419134e-06, + "loss": 0.7917, + "step": 7187 + }, + { + "epoch": 0.63, + "grad_norm": 8.157761104282248, + "learning_rate": 3.161070066397424e-06, + "loss": 0.6926, + "step": 7188 + }, + { + "epoch": 0.63, + "grad_norm": 7.4215101122175025, + "learning_rate": 3.1597475222560845e-06, + "loss": 0.7951, + "step": 7189 + }, + { + "epoch": 0.63, + "grad_norm": 5.559117848152394, + "learning_rate": 3.1584251270249166e-06, + "loss": 0.6635, + "step": 7190 + }, + { + "epoch": 0.63, + "grad_norm": 6.790064567591456, + "learning_rate": 3.1571028808109228e-06, + "loss": 0.6865, + "step": 7191 + }, + { + "epoch": 0.63, + "grad_norm": 7.211340259005349, + "learning_rate": 3.155780783721098e-06, + "loss": 0.7056, + "step": 7192 + }, + { + "epoch": 0.63, + "grad_norm": 9.084792988567857, + "learning_rate": 3.154458835862425e-06, + "loss": 0.7394, + "step": 7193 + }, + { + "epoch": 0.63, + "grad_norm": 7.25310776107596, + "learning_rate": 3.1531370373418734e-06, + "loss": 0.8705, + "step": 7194 + }, + { + "epoch": 0.63, + "grad_norm": 8.930970841372002, + "learning_rate": 3.1518153882663994e-06, + "loss": 0.6658, + "step": 7195 + }, + { + "epoch": 0.63, + "grad_norm": 6.807340496784247, + "learning_rate": 3.1504938887429517e-06, + "loss": 0.7408, + "step": 7196 + }, + { + "epoch": 0.63, + "grad_norm": 16.717853461824234, + "learning_rate": 3.1491725388784612e-06, + "loss": 0.7746, + "step": 7197 + }, + { + "epoch": 0.63, + "grad_norm": 6.882507819813122, + "learning_rate": 3.147851338779849e-06, + "loss": 0.8427, + "step": 7198 + }, + { + "epoch": 0.63, + "grad_norm": 7.687491483157562, + "learning_rate": 3.1465302885540273e-06, + "loss": 0.8248, + "step": 7199 + }, + { + "epoch": 0.63, + "grad_norm": 10.835707058327737, + "learning_rate": 3.1452093883078915e-06, + "loss": 0.8467, + "step": 7200 + }, + { + "epoch": 0.63, + "grad_norm": 3.5363259876511957, + "learning_rate": 3.1438886381483254e-06, + "loss": 0.6828, + "step": 7201 + }, + { + "epoch": 0.63, + "grad_norm": 12.035909679203229, + "learning_rate": 3.1425680381822045e-06, + "loss": 0.7775, + "step": 7202 + }, + { + "epoch": 0.63, + "grad_norm": 3.0269915941779013, + "learning_rate": 3.141247588516387e-06, + "loss": 0.5512, + "step": 7203 + }, + { + "epoch": 0.63, + "grad_norm": 7.922511292009874, + "learning_rate": 3.1399272892577233e-06, + "loss": 0.7263, + "step": 7204 + }, + { + "epoch": 0.63, + "grad_norm": 8.399198914463895, 
+ "learning_rate": 3.1386071405130495e-06, + "loss": 0.721, + "step": 7205 + }, + { + "epoch": 0.63, + "grad_norm": 15.254781905743931, + "learning_rate": 3.1372871423891894e-06, + "loss": 0.6312, + "step": 7206 + }, + { + "epoch": 0.63, + "grad_norm": 2.8087835219456685, + "learning_rate": 3.1359672949929554e-06, + "loss": 0.5449, + "step": 7207 + }, + { + "epoch": 0.63, + "grad_norm": 6.868720664078829, + "learning_rate": 3.1346475984311475e-06, + "loss": 0.5689, + "step": 7208 + }, + { + "epoch": 0.63, + "grad_norm": 6.0442941819987235, + "learning_rate": 3.1333280528105523e-06, + "loss": 0.7819, + "step": 7209 + }, + { + "epoch": 0.63, + "grad_norm": 3.028118336812347, + "learning_rate": 3.1320086582379473e-06, + "loss": 0.5549, + "step": 7210 + }, + { + "epoch": 0.63, + "grad_norm": 7.082527006757609, + "learning_rate": 3.1306894148200938e-06, + "loss": 0.623, + "step": 7211 + }, + { + "epoch": 0.63, + "grad_norm": 21.68801513332356, + "learning_rate": 3.129370322663743e-06, + "loss": 0.59, + "step": 7212 + }, + { + "epoch": 0.63, + "grad_norm": 10.817590831587667, + "learning_rate": 3.128051381875634e-06, + "loss": 0.7246, + "step": 7213 + }, + { + "epoch": 0.63, + "grad_norm": 3.7833706733059667, + "learning_rate": 3.1267325925624944e-06, + "loss": 0.5635, + "step": 7214 + }, + { + "epoch": 0.63, + "grad_norm": 7.06242093391622, + "learning_rate": 3.1254139548310356e-06, + "loss": 0.7293, + "step": 7215 + }, + { + "epoch": 0.63, + "grad_norm": 5.73340082174401, + "learning_rate": 3.1240954687879634e-06, + "loss": 0.7722, + "step": 7216 + }, + { + "epoch": 0.63, + "grad_norm": 7.50020626596262, + "learning_rate": 3.1227771345399647e-06, + "loss": 0.7613, + "step": 7217 + }, + { + "epoch": 0.63, + "grad_norm": 7.0602904938455575, + "learning_rate": 3.1214589521937167e-06, + "loss": 0.7941, + "step": 7218 + }, + { + "epoch": 0.63, + "grad_norm": 5.271051019147781, + "learning_rate": 3.1201409218558867e-06, + "loss": 0.7001, + "step": 7219 + }, + { + "epoch": 0.63, + "grad_norm": 4.949026898423252, + "learning_rate": 3.1188230436331257e-06, + "loss": 0.8109, + "step": 7220 + }, + { + "epoch": 0.63, + "grad_norm": 7.647814994138319, + "learning_rate": 3.1175053176320745e-06, + "loss": 0.8097, + "step": 7221 + }, + { + "epoch": 0.63, + "grad_norm": 5.790165401520655, + "learning_rate": 3.1161877439593622e-06, + "loss": 0.6363, + "step": 7222 + }, + { + "epoch": 0.63, + "grad_norm": 11.183244125207294, + "learning_rate": 3.114870322721605e-06, + "loss": 0.6866, + "step": 7223 + }, + { + "epoch": 0.63, + "grad_norm": 10.014769027465041, + "learning_rate": 3.1135530540254037e-06, + "loss": 0.7589, + "step": 7224 + }, + { + "epoch": 0.63, + "grad_norm": 5.673355846261871, + "learning_rate": 3.1122359379773526e-06, + "loss": 0.927, + "step": 7225 + }, + { + "epoch": 0.63, + "grad_norm": 6.609087447718807, + "learning_rate": 3.1109189746840284e-06, + "loss": 0.7687, + "step": 7226 + }, + { + "epoch": 0.63, + "grad_norm": 6.322724796433201, + "learning_rate": 3.109602164251999e-06, + "loss": 0.8549, + "step": 7227 + }, + { + "epoch": 0.63, + "grad_norm": 7.018572071748756, + "learning_rate": 3.1082855067878182e-06, + "loss": 0.7394, + "step": 7228 + }, + { + "epoch": 0.63, + "grad_norm": 10.840657326966586, + "learning_rate": 3.106969002398027e-06, + "loss": 0.7856, + "step": 7229 + }, + { + "epoch": 0.63, + "grad_norm": 16.237086247552632, + "learning_rate": 3.1056526511891556e-06, + "loss": 0.7959, + "step": 7230 + }, + { + "epoch": 0.64, + "grad_norm": 7.296512200702533, + "learning_rate": 
3.1043364532677205e-06, + "loss": 0.7981, + "step": 7231 + }, + { + "epoch": 0.64, + "grad_norm": 7.7208886410030475, + "learning_rate": 3.1030204087402256e-06, + "loss": 0.7054, + "step": 7232 + }, + { + "epoch": 0.64, + "grad_norm": 6.123101205952753, + "learning_rate": 3.1017045177131645e-06, + "loss": 0.8419, + "step": 7233 + }, + { + "epoch": 0.64, + "grad_norm": 16.71039345927045, + "learning_rate": 3.1003887802930166e-06, + "loss": 0.7521, + "step": 7234 + }, + { + "epoch": 0.64, + "grad_norm": 6.308828964571302, + "learning_rate": 3.0990731965862475e-06, + "loss": 0.9546, + "step": 7235 + }, + { + "epoch": 0.64, + "grad_norm": 7.252503605499903, + "learning_rate": 3.0977577666993143e-06, + "loss": 0.7441, + "step": 7236 + }, + { + "epoch": 0.64, + "grad_norm": 2.4584598881816224, + "learning_rate": 3.096442490738658e-06, + "loss": 0.5688, + "step": 7237 + }, + { + "epoch": 0.64, + "grad_norm": 6.375234756332906, + "learning_rate": 3.0951273688107088e-06, + "loss": 0.6915, + "step": 7238 + }, + { + "epoch": 0.64, + "grad_norm": 12.310044800909393, + "learning_rate": 3.093812401021885e-06, + "loss": 0.7963, + "step": 7239 + }, + { + "epoch": 0.64, + "grad_norm": 9.3609987706574, + "learning_rate": 3.092497587478591e-06, + "loss": 0.7371, + "step": 7240 + }, + { + "epoch": 0.64, + "grad_norm": 6.357683150874012, + "learning_rate": 3.0911829282872175e-06, + "loss": 0.7477, + "step": 7241 + }, + { + "epoch": 0.64, + "grad_norm": 2.2843647694921403, + "learning_rate": 3.089868423554148e-06, + "loss": 0.512, + "step": 7242 + }, + { + "epoch": 0.64, + "grad_norm": 4.829541475579877, + "learning_rate": 3.0885540733857482e-06, + "loss": 0.788, + "step": 7243 + }, + { + "epoch": 0.64, + "grad_norm": 7.609172800929599, + "learning_rate": 3.087239877888373e-06, + "loss": 0.6891, + "step": 7244 + }, + { + "epoch": 0.64, + "grad_norm": 8.043211936735496, + "learning_rate": 3.0859258371683647e-06, + "loss": 0.764, + "step": 7245 + }, + { + "epoch": 0.64, + "grad_norm": 7.120004828893745, + "learning_rate": 3.0846119513320538e-06, + "loss": 0.627, + "step": 7246 + }, + { + "epoch": 0.64, + "grad_norm": 6.377175842824763, + "learning_rate": 3.0832982204857582e-06, + "loss": 0.6595, + "step": 7247 + }, + { + "epoch": 0.64, + "grad_norm": 4.9946025871010855, + "learning_rate": 3.0819846447357825e-06, + "loss": 0.8769, + "step": 7248 + }, + { + "epoch": 0.64, + "grad_norm": 6.32795018856219, + "learning_rate": 3.080671224188418e-06, + "loss": 0.5281, + "step": 7249 + }, + { + "epoch": 0.64, + "grad_norm": 7.496274372218289, + "learning_rate": 3.079357958949946e-06, + "loss": 0.6307, + "step": 7250 + }, + { + "epoch": 0.64, + "grad_norm": 4.506779988077543, + "learning_rate": 3.0780448491266335e-06, + "loss": 0.6609, + "step": 7251 + }, + { + "epoch": 0.64, + "grad_norm": 12.053364668276561, + "learning_rate": 3.076731894824734e-06, + "loss": 0.8527, + "step": 7252 + }, + { + "epoch": 0.64, + "grad_norm": 2.199973387311514, + "learning_rate": 3.075419096150491e-06, + "loss": 0.4974, + "step": 7253 + }, + { + "epoch": 0.64, + "grad_norm": 2.8757621895075776, + "learning_rate": 3.074106453210134e-06, + "loss": 0.5095, + "step": 7254 + }, + { + "epoch": 0.64, + "grad_norm": 2.659289705142732, + "learning_rate": 3.072793966109877e-06, + "loss": 0.52, + "step": 7255 + }, + { + "epoch": 0.64, + "grad_norm": 8.726131400565846, + "learning_rate": 3.0714816349559286e-06, + "loss": 0.7113, + "step": 7256 + }, + { + "epoch": 0.64, + "grad_norm": 10.593503375030831, + "learning_rate": 3.070169459854478e-06, + 
"loss": 0.733, + "step": 7257 + }, + { + "epoch": 0.64, + "grad_norm": 9.094598806120857, + "learning_rate": 3.0688574409117032e-06, + "loss": 0.7689, + "step": 7258 + }, + { + "epoch": 0.64, + "grad_norm": 11.937876737982482, + "learning_rate": 3.0675455782337737e-06, + "loss": 0.641, + "step": 7259 + }, + { + "epoch": 0.64, + "grad_norm": 7.2759912359342325, + "learning_rate": 3.0662338719268414e-06, + "loss": 0.7332, + "step": 7260 + }, + { + "epoch": 0.64, + "grad_norm": 8.097278600020822, + "learning_rate": 3.0649223220970458e-06, + "loss": 0.6524, + "step": 7261 + }, + { + "epoch": 0.64, + "grad_norm": 7.51394530943193, + "learning_rate": 3.063610928850519e-06, + "loss": 0.762, + "step": 7262 + }, + { + "epoch": 0.64, + "grad_norm": 1.9922937262547162, + "learning_rate": 3.0622996922933746e-06, + "loss": 0.5057, + "step": 7263 + }, + { + "epoch": 0.64, + "grad_norm": 7.709417832870503, + "learning_rate": 3.0609886125317145e-06, + "loss": 0.8845, + "step": 7264 + }, + { + "epoch": 0.64, + "grad_norm": 5.053409745196172, + "learning_rate": 3.059677689671632e-06, + "loss": 0.682, + "step": 7265 + }, + { + "epoch": 0.64, + "grad_norm": 7.632605572018003, + "learning_rate": 3.058366923819202e-06, + "loss": 0.7292, + "step": 7266 + }, + { + "epoch": 0.64, + "grad_norm": 10.109746773071238, + "learning_rate": 3.0570563150804922e-06, + "loss": 0.842, + "step": 7267 + }, + { + "epoch": 0.64, + "grad_norm": 6.0774296545085855, + "learning_rate": 3.0557458635615526e-06, + "loss": 0.6633, + "step": 7268 + }, + { + "epoch": 0.64, + "grad_norm": 2.242056211416664, + "learning_rate": 3.0544355693684234e-06, + "loss": 0.5199, + "step": 7269 + }, + { + "epoch": 0.64, + "grad_norm": 10.69305561732588, + "learning_rate": 3.0531254326071324e-06, + "loss": 0.6674, + "step": 7270 + }, + { + "epoch": 0.64, + "grad_norm": 6.747953441565145, + "learning_rate": 3.051815453383693e-06, + "loss": 0.5887, + "step": 7271 + }, + { + "epoch": 0.64, + "grad_norm": 9.152918371367152, + "learning_rate": 3.050505631804105e-06, + "loss": 0.7478, + "step": 7272 + }, + { + "epoch": 0.64, + "grad_norm": 6.307943831661068, + "learning_rate": 3.04919596797436e-06, + "loss": 0.7724, + "step": 7273 + }, + { + "epoch": 0.64, + "grad_norm": 4.065200915328811, + "learning_rate": 3.0478864620004323e-06, + "loss": 0.7439, + "step": 7274 + }, + { + "epoch": 0.64, + "grad_norm": 6.657668602193843, + "learning_rate": 3.046577113988284e-06, + "loss": 0.7659, + "step": 7275 + }, + { + "epoch": 0.64, + "grad_norm": 9.343332956634766, + "learning_rate": 3.0452679240438666e-06, + "loss": 0.8112, + "step": 7276 + }, + { + "epoch": 0.64, + "grad_norm": 6.941729996762271, + "learning_rate": 3.0439588922731177e-06, + "loss": 0.7672, + "step": 7277 + }, + { + "epoch": 0.64, + "grad_norm": 7.251275516594712, + "learning_rate": 3.0426500187819603e-06, + "loss": 0.6246, + "step": 7278 + }, + { + "epoch": 0.64, + "grad_norm": 12.79029637350339, + "learning_rate": 3.0413413036763084e-06, + "loss": 0.8041, + "step": 7279 + }, + { + "epoch": 0.64, + "grad_norm": 6.646265569701132, + "learning_rate": 3.0400327470620604e-06, + "loss": 0.8059, + "step": 7280 + }, + { + "epoch": 0.64, + "grad_norm": 2.0755693944540883, + "learning_rate": 3.0387243490451015e-06, + "loss": 0.4544, + "step": 7281 + }, + { + "epoch": 0.64, + "grad_norm": 19.388349849184614, + "learning_rate": 3.037416109731307e-06, + "loss": 0.6611, + "step": 7282 + }, + { + "epoch": 0.64, + "grad_norm": 6.948177919163165, + "learning_rate": 3.0361080292265354e-06, + "loss": 0.6853, + "step": 
7283 + }, + { + "epoch": 0.64, + "grad_norm": 11.495846556922784, + "learning_rate": 3.0348001076366353e-06, + "loss": 0.674, + "step": 7284 + }, + { + "epoch": 0.64, + "grad_norm": 6.348330038988026, + "learning_rate": 3.033492345067442e-06, + "loss": 0.7213, + "step": 7285 + }, + { + "epoch": 0.64, + "grad_norm": 9.020175084658606, + "learning_rate": 3.032184741624776e-06, + "loss": 0.6789, + "step": 7286 + }, + { + "epoch": 0.64, + "grad_norm": 6.001355844858278, + "learning_rate": 3.0308772974144483e-06, + "loss": 0.784, + "step": 7287 + }, + { + "epoch": 0.64, + "grad_norm": 7.940837631799384, + "learning_rate": 3.029570012542255e-06, + "loss": 0.6479, + "step": 7288 + }, + { + "epoch": 0.64, + "grad_norm": 10.132883053199793, + "learning_rate": 3.0282628871139776e-06, + "loss": 0.7554, + "step": 7289 + }, + { + "epoch": 0.64, + "grad_norm": 4.23032465837541, + "learning_rate": 3.026955921235387e-06, + "loss": 0.6887, + "step": 7290 + }, + { + "epoch": 0.64, + "grad_norm": 4.850876224901327, + "learning_rate": 3.0256491150122424e-06, + "loss": 0.6895, + "step": 7291 + }, + { + "epoch": 0.64, + "grad_norm": 6.889767341799598, + "learning_rate": 3.0243424685502863e-06, + "loss": 0.742, + "step": 7292 + }, + { + "epoch": 0.64, + "grad_norm": 8.31989990585631, + "learning_rate": 3.0230359819552525e-06, + "loss": 0.8124, + "step": 7293 + }, + { + "epoch": 0.64, + "grad_norm": 8.321131617608232, + "learning_rate": 3.021729655332858e-06, + "loss": 0.7335, + "step": 7294 + }, + { + "epoch": 0.64, + "grad_norm": 8.036717262775781, + "learning_rate": 3.0204234887888083e-06, + "loss": 0.7535, + "step": 7295 + }, + { + "epoch": 0.64, + "grad_norm": 8.785953455251894, + "learning_rate": 3.0191174824287983e-06, + "loss": 0.5933, + "step": 7296 + }, + { + "epoch": 0.64, + "grad_norm": 10.00015780464944, + "learning_rate": 3.0178116363585063e-06, + "loss": 0.5127, + "step": 7297 + }, + { + "epoch": 0.64, + "grad_norm": 7.587286227158814, + "learning_rate": 3.016505950683599e-06, + "loss": 0.6961, + "step": 7298 + }, + { + "epoch": 0.64, + "grad_norm": 9.460582828810619, + "learning_rate": 3.015200425509731e-06, + "loss": 0.864, + "step": 7299 + }, + { + "epoch": 0.64, + "grad_norm": 7.579072156076605, + "learning_rate": 3.013895060942543e-06, + "loss": 0.7585, + "step": 7300 + }, + { + "epoch": 0.64, + "grad_norm": 7.68835116748166, + "learning_rate": 3.0125898570876625e-06, + "loss": 0.8386, + "step": 7301 + }, + { + "epoch": 0.64, + "grad_norm": 13.492918425969975, + "learning_rate": 3.0112848140507056e-06, + "loss": 0.6981, + "step": 7302 + }, + { + "epoch": 0.64, + "grad_norm": 9.545514967658084, + "learning_rate": 3.0099799319372734e-06, + "loss": 0.7584, + "step": 7303 + }, + { + "epoch": 0.64, + "grad_norm": 5.964443639645618, + "learning_rate": 3.0086752108529547e-06, + "loss": 0.986, + "step": 7304 + }, + { + "epoch": 0.64, + "grad_norm": 9.862897736841335, + "learning_rate": 3.0073706509033257e-06, + "loss": 0.9429, + "step": 7305 + }, + { + "epoch": 0.64, + "grad_norm": 7.886570765382824, + "learning_rate": 3.006066252193949e-06, + "loss": 0.8268, + "step": 7306 + }, + { + "epoch": 0.64, + "grad_norm": 6.691585415576342, + "learning_rate": 3.004762014830374e-06, + "loss": 0.773, + "step": 7307 + }, + { + "epoch": 0.64, + "grad_norm": 14.193803355323121, + "learning_rate": 3.0034579389181386e-06, + "loss": 0.6045, + "step": 7308 + }, + { + "epoch": 0.64, + "grad_norm": 6.432932857191662, + "learning_rate": 3.0021540245627644e-06, + "loss": 0.7958, + "step": 7309 + }, + { + "epoch": 0.64, + 
"grad_norm": 5.991363247447734, + "learning_rate": 3.0008502718697653e-06, + "loss": 0.7854, + "step": 7310 + }, + { + "epoch": 0.64, + "grad_norm": 12.990691434961649, + "learning_rate": 2.9995466809446355e-06, + "loss": 0.7516, + "step": 7311 + }, + { + "epoch": 0.64, + "grad_norm": 6.992676454016782, + "learning_rate": 2.9982432518928606e-06, + "loss": 0.7961, + "step": 7312 + }, + { + "epoch": 0.64, + "grad_norm": 9.659538100342035, + "learning_rate": 2.996939984819912e-06, + "loss": 0.7841, + "step": 7313 + }, + { + "epoch": 0.64, + "grad_norm": 7.774971274853956, + "learning_rate": 2.9956368798312487e-06, + "loss": 0.6758, + "step": 7314 + }, + { + "epoch": 0.64, + "grad_norm": 6.383159430727946, + "learning_rate": 2.9943339370323143e-06, + "loss": 0.7847, + "step": 7315 + }, + { + "epoch": 0.64, + "grad_norm": 7.754070774740053, + "learning_rate": 2.993031156528542e-06, + "loss": 0.9212, + "step": 7316 + }, + { + "epoch": 0.64, + "grad_norm": 6.447411947486471, + "learning_rate": 2.9917285384253495e-06, + "loss": 0.7339, + "step": 7317 + }, + { + "epoch": 0.64, + "grad_norm": 4.84051353327935, + "learning_rate": 2.990426082828143e-06, + "loss": 0.6986, + "step": 7318 + }, + { + "epoch": 0.64, + "grad_norm": 5.73620143495299, + "learning_rate": 2.9891237898423155e-06, + "loss": 0.6787, + "step": 7319 + }, + { + "epoch": 0.64, + "grad_norm": 13.831863478598825, + "learning_rate": 2.987821659573247e-06, + "loss": 0.7379, + "step": 7320 + }, + { + "epoch": 0.64, + "grad_norm": 9.051026734835808, + "learning_rate": 2.9865196921263006e-06, + "loss": 0.7678, + "step": 7321 + }, + { + "epoch": 0.64, + "grad_norm": 7.692868518039859, + "learning_rate": 2.9852178876068326e-06, + "loss": 0.8934, + "step": 7322 + }, + { + "epoch": 0.64, + "grad_norm": 6.251076612005387, + "learning_rate": 2.983916246120182e-06, + "loss": 0.6841, + "step": 7323 + }, + { + "epoch": 0.64, + "grad_norm": 15.028275112770142, + "learning_rate": 2.9826147677716745e-06, + "loss": 0.7841, + "step": 7324 + }, + { + "epoch": 0.64, + "grad_norm": 11.53567965624408, + "learning_rate": 2.981313452666625e-06, + "loss": 0.787, + "step": 7325 + }, + { + "epoch": 0.64, + "grad_norm": 6.3304159956017205, + "learning_rate": 2.9800123009103317e-06, + "loss": 0.8242, + "step": 7326 + }, + { + "epoch": 0.64, + "grad_norm": 7.8922265072185045, + "learning_rate": 2.978711312608084e-06, + "loss": 0.8015, + "step": 7327 + }, + { + "epoch": 0.64, + "grad_norm": 7.64829984162956, + "learning_rate": 2.977410487865155e-06, + "loss": 0.7333, + "step": 7328 + }, + { + "epoch": 0.64, + "grad_norm": 12.761620085889236, + "learning_rate": 2.976109826786804e-06, + "loss": 0.7247, + "step": 7329 + }, + { + "epoch": 0.64, + "grad_norm": 10.427505112157982, + "learning_rate": 2.97480932947828e-06, + "loss": 0.7863, + "step": 7330 + }, + { + "epoch": 0.64, + "grad_norm": 8.61227010056983, + "learning_rate": 2.973508996044816e-06, + "loss": 0.8015, + "step": 7331 + }, + { + "epoch": 0.64, + "grad_norm": 6.781584315201957, + "learning_rate": 2.9722088265916328e-06, + "loss": 0.7608, + "step": 7332 + }, + { + "epoch": 0.64, + "grad_norm": 7.053404083218058, + "learning_rate": 2.9709088212239385e-06, + "loss": 0.7701, + "step": 7333 + }, + { + "epoch": 0.64, + "grad_norm": 4.384387568138304, + "learning_rate": 2.9696089800469276e-06, + "loss": 0.567, + "step": 7334 + }, + { + "epoch": 0.64, + "grad_norm": 10.277433549845567, + "learning_rate": 2.9683093031657796e-06, + "loss": 0.7456, + "step": 7335 + }, + { + "epoch": 0.64, + "grad_norm": 
2.440445962569539, + "learning_rate": 2.967009790685664e-06, + "loss": 0.4243, + "step": 7336 + }, + { + "epoch": 0.64, + "grad_norm": 11.077394083391546, + "learning_rate": 2.965710442711735e-06, + "loss": 0.7116, + "step": 7337 + }, + { + "epoch": 0.64, + "grad_norm": 2.3234827280257724, + "learning_rate": 2.9644112593491315e-06, + "loss": 0.4953, + "step": 7338 + }, + { + "epoch": 0.64, + "grad_norm": 7.079056998542137, + "learning_rate": 2.9631122407029844e-06, + "loss": 0.5834, + "step": 7339 + }, + { + "epoch": 0.64, + "grad_norm": 18.294019494054425, + "learning_rate": 2.9618133868784064e-06, + "loss": 0.6728, + "step": 7340 + }, + { + "epoch": 0.64, + "grad_norm": 5.945759630690335, + "learning_rate": 2.960514697980498e-06, + "loss": 0.856, + "step": 7341 + }, + { + "epoch": 0.64, + "grad_norm": 6.219410567617363, + "learning_rate": 2.959216174114348e-06, + "loss": 0.7245, + "step": 7342 + }, + { + "epoch": 0.64, + "grad_norm": 5.967970346436276, + "learning_rate": 2.957917815385031e-06, + "loss": 0.809, + "step": 7343 + }, + { + "epoch": 0.64, + "grad_norm": 9.788186549146198, + "learning_rate": 2.9566196218976073e-06, + "loss": 0.6178, + "step": 7344 + }, + { + "epoch": 0.65, + "grad_norm": 2.7023342955858976, + "learning_rate": 2.9553215937571244e-06, + "loss": 0.4661, + "step": 7345 + }, + { + "epoch": 0.65, + "grad_norm": 6.383926479975589, + "learning_rate": 2.954023731068617e-06, + "loss": 0.69, + "step": 7346 + }, + { + "epoch": 0.65, + "grad_norm": 7.963831589018071, + "learning_rate": 2.952726033937107e-06, + "loss": 0.8225, + "step": 7347 + }, + { + "epoch": 0.65, + "grad_norm": 6.112355462525483, + "learning_rate": 2.9514285024676004e-06, + "loss": 0.8151, + "step": 7348 + }, + { + "epoch": 0.65, + "grad_norm": 4.92782524542855, + "learning_rate": 2.9501311367650908e-06, + "loss": 0.5374, + "step": 7349 + }, + { + "epoch": 0.65, + "grad_norm": 8.715216267848785, + "learning_rate": 2.948833936934561e-06, + "loss": 0.7355, + "step": 7350 + }, + { + "epoch": 0.65, + "grad_norm": 11.903013626701203, + "learning_rate": 2.9475369030809762e-06, + "loss": 0.7359, + "step": 7351 + }, + { + "epoch": 0.65, + "grad_norm": 10.59542674438449, + "learning_rate": 2.9462400353092915e-06, + "loss": 0.6143, + "step": 7352 + }, + { + "epoch": 0.65, + "grad_norm": 4.9997228767154684, + "learning_rate": 2.944943333724447e-06, + "loss": 0.6856, + "step": 7353 + }, + { + "epoch": 0.65, + "grad_norm": 5.077393099378951, + "learning_rate": 2.94364679843137e-06, + "loss": 0.6948, + "step": 7354 + }, + { + "epoch": 0.65, + "grad_norm": 12.361650793219223, + "learning_rate": 2.9423504295349725e-06, + "loss": 0.7403, + "step": 7355 + }, + { + "epoch": 0.65, + "grad_norm": 22.627339547066935, + "learning_rate": 2.941054227140156e-06, + "loss": 0.7472, + "step": 7356 + }, + { + "epoch": 0.65, + "grad_norm": 5.133943989281646, + "learning_rate": 2.939758191351807e-06, + "loss": 0.7002, + "step": 7357 + }, + { + "epoch": 0.65, + "grad_norm": 10.675778057769278, + "learning_rate": 2.9384623222747964e-06, + "loss": 0.6997, + "step": 7358 + }, + { + "epoch": 0.65, + "grad_norm": 9.917927597839665, + "learning_rate": 2.9371666200139875e-06, + "loss": 0.7911, + "step": 7359 + }, + { + "epoch": 0.65, + "grad_norm": 9.401623095966231, + "learning_rate": 2.9358710846742237e-06, + "loss": 0.649, + "step": 7360 + }, + { + "epoch": 0.65, + "grad_norm": 12.105910510084989, + "learning_rate": 2.934575716360338e-06, + "loss": 0.7795, + "step": 7361 + }, + { + "epoch": 0.65, + "grad_norm": 14.562247169002342, + 
"learning_rate": 2.93328051517715e-06, + "loss": 0.8596, + "step": 7362 + }, + { + "epoch": 0.65, + "grad_norm": 2.1999614787701165, + "learning_rate": 2.9319854812294644e-06, + "loss": 0.5306, + "step": 7363 + }, + { + "epoch": 0.65, + "grad_norm": 7.644188906345926, + "learning_rate": 2.930690614622074e-06, + "loss": 0.6844, + "step": 7364 + }, + { + "epoch": 0.65, + "grad_norm": 8.891934430405211, + "learning_rate": 2.929395915459757e-06, + "loss": 0.735, + "step": 7365 + }, + { + "epoch": 0.65, + "grad_norm": 7.730902086646575, + "learning_rate": 2.9281013838472777e-06, + "loss": 0.7916, + "step": 7366 + }, + { + "epoch": 0.65, + "grad_norm": 2.3856797433423016, + "learning_rate": 2.926807019889389e-06, + "loss": 0.4636, + "step": 7367 + }, + { + "epoch": 0.65, + "grad_norm": 6.949526301325403, + "learning_rate": 2.925512823690827e-06, + "loss": 0.7147, + "step": 7368 + }, + { + "epoch": 0.65, + "grad_norm": 7.143669833874045, + "learning_rate": 2.9242187953563163e-06, + "loss": 0.7379, + "step": 7369 + }, + { + "epoch": 0.65, + "grad_norm": 6.096750294762567, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.8259, + "step": 7370 + }, + { + "epoch": 0.65, + "grad_norm": 12.434330420299647, + "learning_rate": 2.92163124269828e-06, + "loss": 0.8235, + "step": 7371 + }, + { + "epoch": 0.65, + "grad_norm": 6.146138933676239, + "learning_rate": 2.920337718584133e-06, + "loss": 0.7689, + "step": 7372 + }, + { + "epoch": 0.65, + "grad_norm": 6.221273241751208, + "learning_rate": 2.9190443627528e-06, + "loss": 0.6758, + "step": 7373 + }, + { + "epoch": 0.65, + "grad_norm": 6.923272566305542, + "learning_rate": 2.917751175308935e-06, + "loss": 0.8229, + "step": 7374 + }, + { + "epoch": 0.65, + "grad_norm": 12.303414814145304, + "learning_rate": 2.9164581563571813e-06, + "loss": 0.6776, + "step": 7375 + }, + { + "epoch": 0.65, + "grad_norm": 14.229614012723102, + "learning_rate": 2.915165306002166e-06, + "loss": 0.758, + "step": 7376 + }, + { + "epoch": 0.65, + "grad_norm": 2.417127710229334, + "learning_rate": 2.9138726243485083e-06, + "loss": 0.4531, + "step": 7377 + }, + { + "epoch": 0.65, + "grad_norm": 2.297728398298338, + "learning_rate": 2.912580111500808e-06, + "loss": 0.5165, + "step": 7378 + }, + { + "epoch": 0.65, + "grad_norm": 10.13552425008564, + "learning_rate": 2.9112877675636523e-06, + "loss": 0.8205, + "step": 7379 + }, + { + "epoch": 0.65, + "grad_norm": 6.208622029244748, + "learning_rate": 2.9099955926416162e-06, + "loss": 0.6619, + "step": 7380 + }, + { + "epoch": 0.65, + "grad_norm": 6.1905331711212535, + "learning_rate": 2.9087035868392604e-06, + "loss": 0.6314, + "step": 7381 + }, + { + "epoch": 0.65, + "grad_norm": 7.600156966494344, + "learning_rate": 2.90741175026113e-06, + "loss": 0.7358, + "step": 7382 + }, + { + "epoch": 0.65, + "grad_norm": 5.373150024283153, + "learning_rate": 2.9061200830117608e-06, + "loss": 0.6934, + "step": 7383 + }, + { + "epoch": 0.65, + "grad_norm": 3.3069269249074926, + "learning_rate": 2.9048285851956726e-06, + "loss": 0.5449, + "step": 7384 + }, + { + "epoch": 0.65, + "grad_norm": 2.6218406396524854, + "learning_rate": 2.903537256917369e-06, + "loss": 0.5026, + "step": 7385 + }, + { + "epoch": 0.65, + "grad_norm": 2.23128648506708, + "learning_rate": 2.9022460982813446e-06, + "loss": 0.4411, + "step": 7386 + }, + { + "epoch": 0.65, + "grad_norm": 6.2354822788251205, + "learning_rate": 2.900955109392075e-06, + "loss": 0.6585, + "step": 7387 + }, + { + "epoch": 0.65, + "grad_norm": 7.5321230263357615, + "learning_rate": 
2.8996642903540257e-06, + "loss": 0.9174, + "step": 7388 + }, + { + "epoch": 0.65, + "grad_norm": 5.1940037312003815, + "learning_rate": 2.8983736412716503e-06, + "loss": 0.5683, + "step": 7389 + }, + { + "epoch": 0.65, + "grad_norm": 9.885874792802596, + "learning_rate": 2.8970831622493833e-06, + "loss": 0.5283, + "step": 7390 + }, + { + "epoch": 0.65, + "grad_norm": 6.361149995247099, + "learning_rate": 2.895792853391649e-06, + "loss": 0.6793, + "step": 7391 + }, + { + "epoch": 0.65, + "grad_norm": 5.246794900930961, + "learning_rate": 2.8945027148028567e-06, + "loss": 0.6967, + "step": 7392 + }, + { + "epoch": 0.65, + "grad_norm": 10.565012576170812, + "learning_rate": 2.8932127465874004e-06, + "loss": 0.7238, + "step": 7393 + }, + { + "epoch": 0.65, + "grad_norm": 2.510211521804937, + "learning_rate": 2.891922948849666e-06, + "loss": 0.5308, + "step": 7394 + }, + { + "epoch": 0.65, + "grad_norm": 2.9887842912687854, + "learning_rate": 2.8906333216940206e-06, + "loss": 0.5923, + "step": 7395 + }, + { + "epoch": 0.65, + "grad_norm": 7.982071451441763, + "learning_rate": 2.889343865224817e-06, + "loss": 0.7504, + "step": 7396 + }, + { + "epoch": 0.65, + "grad_norm": 8.656762516531565, + "learning_rate": 2.888054579546398e-06, + "loss": 0.8518, + "step": 7397 + }, + { + "epoch": 0.65, + "grad_norm": 8.781615635402513, + "learning_rate": 2.8867654647630896e-06, + "loss": 0.772, + "step": 7398 + }, + { + "epoch": 0.65, + "grad_norm": 7.258144513933813, + "learning_rate": 2.885476520979202e-06, + "loss": 0.8089, + "step": 7399 + }, + { + "epoch": 0.65, + "grad_norm": 6.205675891783307, + "learning_rate": 2.884187748299039e-06, + "loss": 0.8241, + "step": 7400 + }, + { + "epoch": 0.65, + "grad_norm": 5.68467911764409, + "learning_rate": 2.882899146826884e-06, + "loss": 0.5968, + "step": 7401 + }, + { + "epoch": 0.65, + "grad_norm": 6.447020522980887, + "learning_rate": 2.8816107166670084e-06, + "loss": 0.7151, + "step": 7402 + }, + { + "epoch": 0.65, + "grad_norm": 8.515328738515988, + "learning_rate": 2.8803224579236698e-06, + "loss": 0.7103, + "step": 7403 + }, + { + "epoch": 0.65, + "grad_norm": 8.539532644705835, + "learning_rate": 2.8790343707011114e-06, + "loss": 0.7988, + "step": 7404 + }, + { + "epoch": 0.65, + "grad_norm": 7.942518998735967, + "learning_rate": 2.877746455103563e-06, + "loss": 0.7691, + "step": 7405 + }, + { + "epoch": 0.65, + "grad_norm": 5.8752490235554315, + "learning_rate": 2.876458711235243e-06, + "loss": 0.6608, + "step": 7406 + }, + { + "epoch": 0.65, + "grad_norm": 12.971709226549457, + "learning_rate": 2.8751711392003504e-06, + "loss": 0.8501, + "step": 7407 + }, + { + "epoch": 0.65, + "grad_norm": 7.165044106646558, + "learning_rate": 2.873883739103076e-06, + "loss": 0.6435, + "step": 7408 + }, + { + "epoch": 0.65, + "grad_norm": 7.282647152176684, + "learning_rate": 2.8725965110475922e-06, + "loss": 0.6988, + "step": 7409 + }, + { + "epoch": 0.65, + "grad_norm": 12.185217957591675, + "learning_rate": 2.8713094551380605e-06, + "loss": 0.7943, + "step": 7410 + }, + { + "epoch": 0.65, + "grad_norm": 6.3083345283375625, + "learning_rate": 2.8700225714786243e-06, + "loss": 0.8359, + "step": 7411 + }, + { + "epoch": 0.65, + "grad_norm": 6.862864346116962, + "learning_rate": 2.868735860173421e-06, + "loss": 0.7546, + "step": 7412 + }, + { + "epoch": 0.65, + "grad_norm": 6.538336940174061, + "learning_rate": 2.8674493213265663e-06, + "loss": 0.7998, + "step": 7413 + }, + { + "epoch": 0.65, + "grad_norm": 13.785754004849656, + "learning_rate": 2.8661629550421655e-06, 
+ "loss": 0.8166, + "step": 7414 + }, + { + "epoch": 0.65, + "grad_norm": 11.254091368000182, + "learning_rate": 2.864876761424309e-06, + "loss": 0.6668, + "step": 7415 + }, + { + "epoch": 0.65, + "grad_norm": 9.880016737200231, + "learning_rate": 2.863590740577071e-06, + "loss": 0.7302, + "step": 7416 + }, + { + "epoch": 0.65, + "grad_norm": 8.632767127208085, + "learning_rate": 2.8623048926045194e-06, + "loss": 0.6377, + "step": 7417 + }, + { + "epoch": 0.65, + "grad_norm": 7.609742960040366, + "learning_rate": 2.8610192176106993e-06, + "loss": 0.7667, + "step": 7418 + }, + { + "epoch": 0.65, + "grad_norm": 8.325852744988202, + "learning_rate": 2.859733715699646e-06, + "loss": 0.7062, + "step": 7419 + }, + { + "epoch": 0.65, + "grad_norm": 13.95958657073152, + "learning_rate": 2.858448386975381e-06, + "loss": 0.6403, + "step": 7420 + }, + { + "epoch": 0.65, + "grad_norm": 11.392750824155064, + "learning_rate": 2.8571632315419097e-06, + "loss": 0.6855, + "step": 7421 + }, + { + "epoch": 0.65, + "grad_norm": 8.094990178169773, + "learning_rate": 2.8558782495032234e-06, + "loss": 0.7813, + "step": 7422 + }, + { + "epoch": 0.65, + "grad_norm": 7.885875784292227, + "learning_rate": 2.854593440963304e-06, + "loss": 0.6771, + "step": 7423 + }, + { + "epoch": 0.65, + "grad_norm": 32.008885553185365, + "learning_rate": 2.853308806026115e-06, + "loss": 0.7809, + "step": 7424 + }, + { + "epoch": 0.65, + "grad_norm": 6.537942954262997, + "learning_rate": 2.852024344795607e-06, + "loss": 0.7458, + "step": 7425 + }, + { + "epoch": 0.65, + "grad_norm": 5.9586037857392675, + "learning_rate": 2.850740057375716e-06, + "loss": 0.6948, + "step": 7426 + }, + { + "epoch": 0.65, + "grad_norm": 5.61127564178804, + "learning_rate": 2.849455943870364e-06, + "loss": 0.79, + "step": 7427 + }, + { + "epoch": 0.65, + "grad_norm": 5.940756442777731, + "learning_rate": 2.8481720043834584e-06, + "loss": 0.6766, + "step": 7428 + }, + { + "epoch": 0.65, + "grad_norm": 8.058719086250907, + "learning_rate": 2.8468882390188956e-06, + "loss": 0.7272, + "step": 7429 + }, + { + "epoch": 0.65, + "grad_norm": 7.590403535403262, + "learning_rate": 2.8456046478805568e-06, + "loss": 0.8166, + "step": 7430 + }, + { + "epoch": 0.65, + "grad_norm": 6.221702552876591, + "learning_rate": 2.8443212310723045e-06, + "loss": 0.8844, + "step": 7431 + }, + { + "epoch": 0.65, + "grad_norm": 7.599233854976476, + "learning_rate": 2.843037988697993e-06, + "loss": 0.7507, + "step": 7432 + }, + { + "epoch": 0.65, + "grad_norm": 6.1990267021801255, + "learning_rate": 2.841754920861458e-06, + "loss": 0.7779, + "step": 7433 + }, + { + "epoch": 0.65, + "grad_norm": 7.399249318487045, + "learning_rate": 2.8404720276665264e-06, + "loss": 0.8272, + "step": 7434 + }, + { + "epoch": 0.65, + "grad_norm": 8.124204894369804, + "learning_rate": 2.8391893092170064e-06, + "loss": 0.6376, + "step": 7435 + }, + { + "epoch": 0.65, + "grad_norm": 8.332227487196105, + "learning_rate": 2.8379067656166923e-06, + "loss": 0.8545, + "step": 7436 + }, + { + "epoch": 0.65, + "grad_norm": 7.108957924242657, + "learning_rate": 2.8366243969693674e-06, + "loss": 0.6739, + "step": 7437 + }, + { + "epoch": 0.65, + "grad_norm": 10.716279466250274, + "learning_rate": 2.835342203378797e-06, + "loss": 0.7632, + "step": 7438 + }, + { + "epoch": 0.65, + "grad_norm": 8.606085328531208, + "learning_rate": 2.8340601849487333e-06, + "loss": 0.695, + "step": 7439 + }, + { + "epoch": 0.65, + "grad_norm": 233.15474128974745, + "learning_rate": 2.832778341782918e-06, + "loss": 0.6385, + "step": 
7440 + }, + { + "epoch": 0.65, + "grad_norm": 8.663022055819775, + "learning_rate": 2.8314966739850748e-06, + "loss": 0.817, + "step": 7441 + }, + { + "epoch": 0.65, + "grad_norm": 7.373317836137239, + "learning_rate": 2.830215181658913e-06, + "loss": 0.6713, + "step": 7442 + }, + { + "epoch": 0.65, + "grad_norm": 10.66279778734426, + "learning_rate": 2.82893386490813e-06, + "loss": 0.8901, + "step": 7443 + }, + { + "epoch": 0.65, + "grad_norm": 8.288182527987088, + "learning_rate": 2.827652723836407e-06, + "loss": 0.8475, + "step": 7444 + }, + { + "epoch": 0.65, + "grad_norm": 16.627538838769315, + "learning_rate": 2.8263717585474103e-06, + "loss": 0.7802, + "step": 7445 + }, + { + "epoch": 0.65, + "grad_norm": 6.160101233159402, + "learning_rate": 2.825090969144797e-06, + "loss": 0.5904, + "step": 7446 + }, + { + "epoch": 0.65, + "grad_norm": 6.8514701743917765, + "learning_rate": 2.823810355732205e-06, + "loss": 0.7699, + "step": 7447 + }, + { + "epoch": 0.65, + "grad_norm": 7.1115431375565725, + "learning_rate": 2.822529918413259e-06, + "loss": 0.653, + "step": 7448 + }, + { + "epoch": 0.65, + "grad_norm": 14.776819010666463, + "learning_rate": 2.8212496572915693e-06, + "loss": 0.6424, + "step": 7449 + }, + { + "epoch": 0.65, + "grad_norm": 9.214242255346324, + "learning_rate": 2.819969572470734e-06, + "loss": 0.8878, + "step": 7450 + }, + { + "epoch": 0.65, + "grad_norm": 9.381664081905104, + "learning_rate": 2.8186896640543325e-06, + "loss": 0.765, + "step": 7451 + }, + { + "epoch": 0.65, + "grad_norm": 6.805665211449883, + "learning_rate": 2.817409932145937e-06, + "loss": 0.9608, + "step": 7452 + }, + { + "epoch": 0.65, + "grad_norm": 16.466490038146926, + "learning_rate": 2.816130376849099e-06, + "loss": 0.7355, + "step": 7453 + }, + { + "epoch": 0.65, + "grad_norm": 5.882658575514831, + "learning_rate": 2.8148509982673577e-06, + "loss": 0.7083, + "step": 7454 + }, + { + "epoch": 0.65, + "grad_norm": 2.347277596542968, + "learning_rate": 2.81357179650424e-06, + "loss": 0.4738, + "step": 7455 + }, + { + "epoch": 0.65, + "grad_norm": 6.044426506986635, + "learning_rate": 2.8122927716632525e-06, + "loss": 0.7719, + "step": 7456 + }, + { + "epoch": 0.65, + "grad_norm": 8.39877209909347, + "learning_rate": 2.8110139238478974e-06, + "loss": 0.6325, + "step": 7457 + }, + { + "epoch": 0.65, + "grad_norm": 9.02982381146496, + "learning_rate": 2.8097352531616544e-06, + "loss": 0.7769, + "step": 7458 + }, + { + "epoch": 0.66, + "grad_norm": 7.412494955443026, + "learning_rate": 2.8084567597079915e-06, + "loss": 0.7718, + "step": 7459 + }, + { + "epoch": 0.66, + "grad_norm": 6.02184619440724, + "learning_rate": 2.8071784435903615e-06, + "loss": 0.7157, + "step": 7460 + }, + { + "epoch": 0.66, + "grad_norm": 5.691678192769801, + "learning_rate": 2.805900304912205e-06, + "loss": 0.71, + "step": 7461 + }, + { + "epoch": 0.66, + "grad_norm": 6.182434632983907, + "learning_rate": 2.8046223437769436e-06, + "loss": 0.7227, + "step": 7462 + }, + { + "epoch": 0.66, + "grad_norm": 2.8764466616280875, + "learning_rate": 2.8033445602879927e-06, + "loss": 0.4594, + "step": 7463 + }, + { + "epoch": 0.66, + "grad_norm": 16.64975707295027, + "learning_rate": 2.802066954548746e-06, + "loss": 0.8924, + "step": 7464 + }, + { + "epoch": 0.66, + "grad_norm": 9.384291144030064, + "learning_rate": 2.800789526662585e-06, + "loss": 0.6335, + "step": 7465 + }, + { + "epoch": 0.66, + "grad_norm": 2.4228715349110033, + "learning_rate": 2.7995122767328776e-06, + "loss": 0.4735, + "step": 7466 + }, + { + "epoch": 0.66, + 
"grad_norm": 16.998547387603594, + "learning_rate": 2.798235204862977e-06, + "loss": 0.6283, + "step": 7467 + }, + { + "epoch": 0.66, + "grad_norm": 9.18725297924323, + "learning_rate": 2.796958311156218e-06, + "loss": 0.8617, + "step": 7468 + }, + { + "epoch": 0.66, + "grad_norm": 6.3536728922990635, + "learning_rate": 2.7956815957159305e-06, + "loss": 0.7675, + "step": 7469 + }, + { + "epoch": 0.66, + "grad_norm": 6.239959109288688, + "learning_rate": 2.7944050586454215e-06, + "loss": 0.704, + "step": 7470 + }, + { + "epoch": 0.66, + "grad_norm": 27.435766611752825, + "learning_rate": 2.793128700047986e-06, + "loss": 0.7328, + "step": 7471 + }, + { + "epoch": 0.66, + "grad_norm": 6.625360583357938, + "learning_rate": 2.791852520026906e-06, + "loss": 0.7105, + "step": 7472 + }, + { + "epoch": 0.66, + "grad_norm": 9.84165202448421, + "learning_rate": 2.790576518685444e-06, + "loss": 0.7429, + "step": 7473 + }, + { + "epoch": 0.66, + "grad_norm": 12.585633912713945, + "learning_rate": 2.7893006961268577e-06, + "loss": 0.7928, + "step": 7474 + }, + { + "epoch": 0.66, + "grad_norm": 10.26412810412022, + "learning_rate": 2.7880250524543816e-06, + "loss": 0.7739, + "step": 7475 + }, + { + "epoch": 0.66, + "grad_norm": 8.493600784463363, + "learning_rate": 2.7867495877712387e-06, + "loss": 0.8374, + "step": 7476 + }, + { + "epoch": 0.66, + "grad_norm": 8.656240216674249, + "learning_rate": 2.7854743021806374e-06, + "loss": 0.8217, + "step": 7477 + }, + { + "epoch": 0.66, + "grad_norm": 9.673310677198636, + "learning_rate": 2.784199195785772e-06, + "loss": 0.7052, + "step": 7478 + }, + { + "epoch": 0.66, + "grad_norm": 9.392513704838267, + "learning_rate": 2.78292426868982e-06, + "loss": 0.7704, + "step": 7479 + }, + { + "epoch": 0.66, + "grad_norm": 8.565760507382421, + "learning_rate": 2.7816495209959505e-06, + "loss": 0.7584, + "step": 7480 + }, + { + "epoch": 0.66, + "grad_norm": 8.747614047918633, + "learning_rate": 2.7803749528073108e-06, + "loss": 0.7722, + "step": 7481 + }, + { + "epoch": 0.66, + "grad_norm": 2.5222616007495, + "learning_rate": 2.7791005642270384e-06, + "loss": 0.5358, + "step": 7482 + }, + { + "epoch": 0.66, + "grad_norm": 6.024929021172989, + "learning_rate": 2.777826355358254e-06, + "loss": 0.698, + "step": 7483 + }, + { + "epoch": 0.66, + "grad_norm": 6.0454055081809495, + "learning_rate": 2.7765523263040652e-06, + "loss": 0.5937, + "step": 7484 + }, + { + "epoch": 0.66, + "grad_norm": 8.427978694710264, + "learning_rate": 2.775278477167561e-06, + "loss": 0.8035, + "step": 7485 + }, + { + "epoch": 0.66, + "grad_norm": 10.526800542353042, + "learning_rate": 2.7740048080518233e-06, + "loss": 0.772, + "step": 7486 + }, + { + "epoch": 0.66, + "grad_norm": 15.32874654179955, + "learning_rate": 2.772731319059915e-06, + "loss": 0.6237, + "step": 7487 + }, + { + "epoch": 0.66, + "grad_norm": 7.525989240052338, + "learning_rate": 2.771458010294883e-06, + "loss": 0.7326, + "step": 7488 + }, + { + "epoch": 0.66, + "grad_norm": 6.241100937534386, + "learning_rate": 2.770184881859761e-06, + "loss": 0.7466, + "step": 7489 + }, + { + "epoch": 0.66, + "grad_norm": 23.082556142039373, + "learning_rate": 2.76891193385757e-06, + "loss": 0.6783, + "step": 7490 + }, + { + "epoch": 0.66, + "grad_norm": 10.053830009192485, + "learning_rate": 2.7676391663913122e-06, + "loss": 0.599, + "step": 7491 + }, + { + "epoch": 0.66, + "grad_norm": 6.328868085003097, + "learning_rate": 2.7663665795639815e-06, + "loss": 0.6722, + "step": 7492 + }, + { + "epoch": 0.66, + "grad_norm": 7.893845243508072, + 
"learning_rate": 2.765094173478552e-06, + "loss": 0.7159, + "step": 7493 + }, + { + "epoch": 0.66, + "grad_norm": 5.171135782050655, + "learning_rate": 2.7638219482379838e-06, + "loss": 0.6932, + "step": 7494 + }, + { + "epoch": 0.66, + "grad_norm": 5.402239915488754, + "learning_rate": 2.762549903945224e-06, + "loss": 0.5748, + "step": 7495 + }, + { + "epoch": 0.66, + "grad_norm": 5.676187830443245, + "learning_rate": 2.7612780407032026e-06, + "loss": 0.6884, + "step": 7496 + }, + { + "epoch": 0.66, + "grad_norm": 7.990431457850802, + "learning_rate": 2.7600063586148394e-06, + "loss": 0.6818, + "step": 7497 + }, + { + "epoch": 0.66, + "grad_norm": 2.982890235900606, + "learning_rate": 2.7587348577830363e-06, + "loss": 0.5869, + "step": 7498 + }, + { + "epoch": 0.66, + "grad_norm": 7.809384542013845, + "learning_rate": 2.7574635383106806e-06, + "loss": 0.6176, + "step": 7499 + }, + { + "epoch": 0.66, + "grad_norm": 9.894531063916489, + "learning_rate": 2.7561924003006445e-06, + "loss": 0.5684, + "step": 7500 + }, + { + "epoch": 0.66, + "grad_norm": 10.264358476985608, + "learning_rate": 2.754921443855787e-06, + "loss": 0.7338, + "step": 7501 + }, + { + "epoch": 0.66, + "grad_norm": 5.258739169213021, + "learning_rate": 2.75365066907895e-06, + "loss": 0.6082, + "step": 7502 + }, + { + "epoch": 0.66, + "grad_norm": 11.750253412055745, + "learning_rate": 2.752380076072967e-06, + "loss": 0.7285, + "step": 7503 + }, + { + "epoch": 0.66, + "grad_norm": 7.587943261040613, + "learning_rate": 2.7511096649406484e-06, + "loss": 0.7508, + "step": 7504 + }, + { + "epoch": 0.66, + "grad_norm": 2.179669841404539, + "learning_rate": 2.749839435784795e-06, + "loss": 0.4414, + "step": 7505 + }, + { + "epoch": 0.66, + "grad_norm": 7.205834972764672, + "learning_rate": 2.748569388708192e-06, + "loss": 0.84, + "step": 7506 + }, + { + "epoch": 0.66, + "grad_norm": 10.564768111393384, + "learning_rate": 2.7472995238136086e-06, + "loss": 0.7091, + "step": 7507 + }, + { + "epoch": 0.66, + "grad_norm": 10.242479781264663, + "learning_rate": 2.746029841203799e-06, + "loss": 0.809, + "step": 7508 + }, + { + "epoch": 0.66, + "grad_norm": 13.225963904341324, + "learning_rate": 2.744760340981507e-06, + "loss": 0.7423, + "step": 7509 + }, + { + "epoch": 0.66, + "grad_norm": 8.61417346418182, + "learning_rate": 2.7434910232494565e-06, + "loss": 0.8125, + "step": 7510 + }, + { + "epoch": 0.66, + "grad_norm": 7.188833227017178, + "learning_rate": 2.742221888110359e-06, + "loss": 0.6886, + "step": 7511 + }, + { + "epoch": 0.66, + "grad_norm": 6.479277204068277, + "learning_rate": 2.740952935666912e-06, + "loss": 0.7157, + "step": 7512 + }, + { + "epoch": 0.66, + "grad_norm": 7.75022934163855, + "learning_rate": 2.739684166021792e-06, + "loss": 0.5447, + "step": 7513 + }, + { + "epoch": 0.66, + "grad_norm": 7.7469545609988, + "learning_rate": 2.7384155792776724e-06, + "loss": 0.7088, + "step": 7514 + }, + { + "epoch": 0.66, + "grad_norm": 10.54982953093329, + "learning_rate": 2.7371471755372016e-06, + "loss": 0.8586, + "step": 7515 + }, + { + "epoch": 0.66, + "grad_norm": 6.821088761155818, + "learning_rate": 2.735878954903017e-06, + "loss": 0.7825, + "step": 7516 + }, + { + "epoch": 0.66, + "grad_norm": 5.8450609769695285, + "learning_rate": 2.7346109174777424e-06, + "loss": 0.8267, + "step": 7517 + }, + { + "epoch": 0.66, + "grad_norm": 4.761355522113533, + "learning_rate": 2.7333430633639834e-06, + "loss": 0.6033, + "step": 7518 + }, + { + "epoch": 0.66, + "grad_norm": 4.878028613275361, + "learning_rate": 
2.732075392664332e-06, + "loss": 0.5638, + "step": 7519 + }, + { + "epoch": 0.66, + "grad_norm": 9.32302668021424, + "learning_rate": 2.7308079054813684e-06, + "loss": 0.5584, + "step": 7520 + }, + { + "epoch": 0.66, + "grad_norm": 2.5646359737565367, + "learning_rate": 2.7295406019176544e-06, + "loss": 0.6182, + "step": 7521 + }, + { + "epoch": 0.66, + "grad_norm": 7.422020609683606, + "learning_rate": 2.7282734820757382e-06, + "loss": 0.8121, + "step": 7522 + }, + { + "epoch": 0.66, + "grad_norm": 5.135380721697851, + "learning_rate": 2.727006546058154e-06, + "loss": 0.596, + "step": 7523 + }, + { + "epoch": 0.66, + "grad_norm": 9.24892196964001, + "learning_rate": 2.7257397939674186e-06, + "loss": 0.7735, + "step": 7524 + }, + { + "epoch": 0.66, + "grad_norm": 5.658286721594651, + "learning_rate": 2.7244732259060335e-06, + "loss": 0.675, + "step": 7525 + }, + { + "epoch": 0.66, + "grad_norm": 7.5802990886434145, + "learning_rate": 2.7232068419764924e-06, + "loss": 0.7549, + "step": 7526 + }, + { + "epoch": 0.66, + "grad_norm": 8.434825453707552, + "learning_rate": 2.7219406422812666e-06, + "loss": 0.7828, + "step": 7527 + }, + { + "epoch": 0.66, + "grad_norm": 9.78573953257768, + "learning_rate": 2.720674626922814e-06, + "loss": 0.5292, + "step": 7528 + }, + { + "epoch": 0.66, + "grad_norm": 10.151161963664258, + "learning_rate": 2.71940879600358e-06, + "loss": 0.7764, + "step": 7529 + }, + { + "epoch": 0.66, + "grad_norm": 2.4567118674770483, + "learning_rate": 2.7181431496259912e-06, + "loss": 0.5479, + "step": 7530 + }, + { + "epoch": 0.66, + "grad_norm": 8.520840219963397, + "learning_rate": 2.716877687892463e-06, + "loss": 0.7211, + "step": 7531 + }, + { + "epoch": 0.66, + "grad_norm": 7.506970679023457, + "learning_rate": 2.715612410905395e-06, + "loss": 0.6738, + "step": 7532 + }, + { + "epoch": 0.66, + "grad_norm": 6.070258534000667, + "learning_rate": 2.7143473187671716e-06, + "loss": 0.7319, + "step": 7533 + }, + { + "epoch": 0.66, + "grad_norm": 8.3798083585184, + "learning_rate": 2.7130824115801606e-06, + "loss": 0.7825, + "step": 7534 + }, + { + "epoch": 0.66, + "grad_norm": 7.809305515260231, + "learning_rate": 2.7118176894467173e-06, + "loss": 0.8257, + "step": 7535 + }, + { + "epoch": 0.66, + "grad_norm": 2.910793150531666, + "learning_rate": 2.710553152469178e-06, + "loss": 0.5495, + "step": 7536 + }, + { + "epoch": 0.66, + "grad_norm": 8.693995504680442, + "learning_rate": 2.709288800749872e-06, + "loss": 0.8108, + "step": 7537 + }, + { + "epoch": 0.66, + "grad_norm": 2.8504183515497505, + "learning_rate": 2.7080246343911047e-06, + "loss": 0.5512, + "step": 7538 + }, + { + "epoch": 0.66, + "grad_norm": 2.5297291722091435, + "learning_rate": 2.7067606534951716e-06, + "loss": 0.5066, + "step": 7539 + }, + { + "epoch": 0.66, + "grad_norm": 6.446007561430907, + "learning_rate": 2.7054968581643528e-06, + "loss": 0.7479, + "step": 7540 + }, + { + "epoch": 0.66, + "grad_norm": 5.399975688880717, + "learning_rate": 2.704233248500911e-06, + "loss": 0.6516, + "step": 7541 + }, + { + "epoch": 0.66, + "grad_norm": 9.560361252253307, + "learning_rate": 2.702969824607094e-06, + "loss": 0.7471, + "step": 7542 + }, + { + "epoch": 0.66, + "grad_norm": 7.731344394881103, + "learning_rate": 2.70170658658514e-06, + "loss": 0.6538, + "step": 7543 + }, + { + "epoch": 0.66, + "grad_norm": 6.0747354754791365, + "learning_rate": 2.700443534537266e-06, + "loss": 0.7611, + "step": 7544 + }, + { + "epoch": 0.66, + "grad_norm": 11.575664058754759, + "learning_rate": 2.6991806685656754e-06, + 
"loss": 0.6697, + "step": 7545 + }, + { + "epoch": 0.66, + "grad_norm": 6.613364954563266, + "learning_rate": 2.697917988772558e-06, + "loss": 0.7681, + "step": 7546 + }, + { + "epoch": 0.66, + "grad_norm": 6.01722042607524, + "learning_rate": 2.6966554952600886e-06, + "loss": 0.6797, + "step": 7547 + }, + { + "epoch": 0.66, + "grad_norm": 2.749702015043722, + "learning_rate": 2.695393188130422e-06, + "loss": 0.5531, + "step": 7548 + }, + { + "epoch": 0.66, + "grad_norm": 8.625754175934258, + "learning_rate": 2.694131067485708e-06, + "loss": 0.5957, + "step": 7549 + }, + { + "epoch": 0.66, + "grad_norm": 9.234337207715456, + "learning_rate": 2.692869133428072e-06, + "loss": 0.625, + "step": 7550 + }, + { + "epoch": 0.66, + "grad_norm": 4.800362975648781, + "learning_rate": 2.6916073860596283e-06, + "loss": 0.7156, + "step": 7551 + }, + { + "epoch": 0.66, + "grad_norm": 9.383636829023768, + "learning_rate": 2.690345825482474e-06, + "loss": 0.631, + "step": 7552 + }, + { + "epoch": 0.66, + "grad_norm": 6.104046136098489, + "learning_rate": 2.6890844517986926e-06, + "loss": 0.7663, + "step": 7553 + }, + { + "epoch": 0.66, + "grad_norm": 5.188581682649463, + "learning_rate": 2.687823265110355e-06, + "loss": 0.7944, + "step": 7554 + }, + { + "epoch": 0.66, + "grad_norm": 4.190747387631763, + "learning_rate": 2.6865622655195133e-06, + "loss": 0.7106, + "step": 7555 + }, + { + "epoch": 0.66, + "grad_norm": 5.9065297797451635, + "learning_rate": 2.685301453128204e-06, + "loss": 0.7919, + "step": 7556 + }, + { + "epoch": 0.66, + "grad_norm": 5.779476151049142, + "learning_rate": 2.684040828038451e-06, + "loss": 0.5069, + "step": 7557 + }, + { + "epoch": 0.66, + "grad_norm": 6.354240439203401, + "learning_rate": 2.682780390352262e-06, + "loss": 0.691, + "step": 7558 + }, + { + "epoch": 0.66, + "grad_norm": 2.514206790611191, + "learning_rate": 2.6815201401716274e-06, + "loss": 0.4878, + "step": 7559 + }, + { + "epoch": 0.66, + "grad_norm": 8.255886544692729, + "learning_rate": 2.6802600775985286e-06, + "loss": 0.6375, + "step": 7560 + }, + { + "epoch": 0.66, + "grad_norm": 6.654947710444986, + "learning_rate": 2.6790002027349254e-06, + "loss": 0.9086, + "step": 7561 + }, + { + "epoch": 0.66, + "grad_norm": 5.940274684155334, + "learning_rate": 2.677740515682765e-06, + "loss": 0.7963, + "step": 7562 + }, + { + "epoch": 0.66, + "grad_norm": 20.97878024826662, + "learning_rate": 2.6764810165439793e-06, + "loss": 0.7614, + "step": 7563 + }, + { + "epoch": 0.66, + "grad_norm": 7.443572260129436, + "learning_rate": 2.675221705420484e-06, + "loss": 0.6938, + "step": 7564 + }, + { + "epoch": 0.66, + "grad_norm": 6.175572167750398, + "learning_rate": 2.6739625824141806e-06, + "loss": 0.7696, + "step": 7565 + }, + { + "epoch": 0.66, + "grad_norm": 4.304618619554496, + "learning_rate": 2.6727036476269573e-06, + "loss": 0.7076, + "step": 7566 + }, + { + "epoch": 0.66, + "grad_norm": 8.25029824915597, + "learning_rate": 2.6714449011606835e-06, + "loss": 0.8448, + "step": 7567 + }, + { + "epoch": 0.66, + "grad_norm": 8.594005218014528, + "learning_rate": 2.670186343117215e-06, + "loss": 0.8289, + "step": 7568 + }, + { + "epoch": 0.66, + "grad_norm": 6.4079439865120325, + "learning_rate": 2.668927973598392e-06, + "loss": 0.7158, + "step": 7569 + }, + { + "epoch": 0.66, + "grad_norm": 5.524653015130611, + "learning_rate": 2.6676697927060397e-06, + "loss": 0.6751, + "step": 7570 + }, + { + "epoch": 0.66, + "grad_norm": 6.376995356210869, + "learning_rate": 2.666411800541966e-06, + "loss": 0.8238, + "step": 7571 + }, 
+ { + "epoch": 0.66, + "grad_norm": 8.36694948684319, + "learning_rate": 2.6651539972079705e-06, + "loss": 0.7381, + "step": 7572 + }, + { + "epoch": 0.67, + "grad_norm": 9.071994813302206, + "learning_rate": 2.663896382805829e-06, + "loss": 0.768, + "step": 7573 + }, + { + "epoch": 0.67, + "grad_norm": 11.539735271878788, + "learning_rate": 2.662638957437307e-06, + "loss": 0.6282, + "step": 7574 + }, + { + "epoch": 0.67, + "grad_norm": 9.658893375142414, + "learning_rate": 2.6613817212041516e-06, + "loss": 0.7395, + "step": 7575 + }, + { + "epoch": 0.67, + "grad_norm": 10.342576096914845, + "learning_rate": 2.6601246742080953e-06, + "loss": 0.7179, + "step": 7576 + }, + { + "epoch": 0.67, + "grad_norm": 7.133090239995559, + "learning_rate": 2.658867816550861e-06, + "loss": 0.6588, + "step": 7577 + }, + { + "epoch": 0.67, + "grad_norm": 8.23603012686333, + "learning_rate": 2.657611148334148e-06, + "loss": 0.7322, + "step": 7578 + }, + { + "epoch": 0.67, + "grad_norm": 5.588508316635008, + "learning_rate": 2.6563546696596454e-06, + "loss": 0.6386, + "step": 7579 + }, + { + "epoch": 0.67, + "grad_norm": 9.92477638366923, + "learning_rate": 2.655098380629024e-06, + "loss": 0.7703, + "step": 7580 + }, + { + "epoch": 0.67, + "grad_norm": 5.196105659224212, + "learning_rate": 2.6538422813439405e-06, + "loss": 0.6683, + "step": 7581 + }, + { + "epoch": 0.67, + "grad_norm": 12.587095241666269, + "learning_rate": 2.6525863719060356e-06, + "loss": 0.8338, + "step": 7582 + }, + { + "epoch": 0.67, + "grad_norm": 9.510511774975164, + "learning_rate": 2.651330652416939e-06, + "loss": 0.7122, + "step": 7583 + }, + { + "epoch": 0.67, + "grad_norm": 8.378016630612299, + "learning_rate": 2.6500751229782583e-06, + "loss": 0.807, + "step": 7584 + }, + { + "epoch": 0.67, + "grad_norm": 9.230702013804121, + "learning_rate": 2.6488197836915908e-06, + "loss": 0.7355, + "step": 7585 + }, + { + "epoch": 0.67, + "grad_norm": 8.870642703505013, + "learning_rate": 2.647564634658515e-06, + "loss": 0.8353, + "step": 7586 + }, + { + "epoch": 0.67, + "grad_norm": 6.405214851792323, + "learning_rate": 2.6463096759805952e-06, + "loss": 0.6741, + "step": 7587 + }, + { + "epoch": 0.67, + "grad_norm": 7.504497250151003, + "learning_rate": 2.64505490775938e-06, + "loss": 0.7499, + "step": 7588 + }, + { + "epoch": 0.67, + "grad_norm": 14.033165716348584, + "learning_rate": 2.6438003300964065e-06, + "loss": 0.6876, + "step": 7589 + }, + { + "epoch": 0.67, + "grad_norm": 6.552846117766591, + "learning_rate": 2.6425459430931906e-06, + "loss": 0.7559, + "step": 7590 + }, + { + "epoch": 0.67, + "grad_norm": 8.43388766617968, + "learning_rate": 2.6412917468512354e-06, + "loss": 0.6558, + "step": 7591 + }, + { + "epoch": 0.67, + "grad_norm": 7.416758020018775, + "learning_rate": 2.640037741472029e-06, + "loss": 0.7069, + "step": 7592 + }, + { + "epoch": 0.67, + "grad_norm": 6.9246504789664876, + "learning_rate": 2.6387839270570403e-06, + "loss": 0.7645, + "step": 7593 + }, + { + "epoch": 0.67, + "grad_norm": 7.2794047259414665, + "learning_rate": 2.6375303037077317e-06, + "loss": 0.731, + "step": 7594 + }, + { + "epoch": 0.67, + "grad_norm": 15.03450634188539, + "learning_rate": 2.6362768715255405e-06, + "loss": 0.6497, + "step": 7595 + }, + { + "epoch": 0.67, + "grad_norm": 5.02512186603736, + "learning_rate": 2.6350236306118925e-06, + "loss": 0.6693, + "step": 7596 + }, + { + "epoch": 0.67, + "grad_norm": 6.783615413050197, + "learning_rate": 2.633770581068199e-06, + "loss": 0.6037, + "step": 7597 + }, + { + "epoch": 0.67, + 
"grad_norm": 12.386639003245573, + "learning_rate": 2.6325177229958536e-06, + "loss": 0.8031, + "step": 7598 + }, + { + "epoch": 0.67, + "grad_norm": 7.79644431976303, + "learning_rate": 2.6312650564962342e-06, + "loss": 0.8665, + "step": 7599 + }, + { + "epoch": 0.67, + "grad_norm": 5.500346122053925, + "learning_rate": 2.6300125816707082e-06, + "loss": 0.8492, + "step": 7600 + }, + { + "epoch": 0.67, + "grad_norm": 2.5204636748576266, + "learning_rate": 2.628760298620622e-06, + "loss": 0.4799, + "step": 7601 + }, + { + "epoch": 0.67, + "grad_norm": 8.820822623637047, + "learning_rate": 2.627508207447308e-06, + "loss": 0.7561, + "step": 7602 + }, + { + "epoch": 0.67, + "grad_norm": 11.61362938169615, + "learning_rate": 2.626256308252084e-06, + "loss": 0.768, + "step": 7603 + }, + { + "epoch": 0.67, + "grad_norm": 9.989423773235607, + "learning_rate": 2.6250046011362494e-06, + "loss": 0.8615, + "step": 7604 + }, + { + "epoch": 0.67, + "grad_norm": 6.4423207870967705, + "learning_rate": 2.623753086201092e-06, + "loss": 0.75, + "step": 7605 + }, + { + "epoch": 0.67, + "grad_norm": 8.838130970348239, + "learning_rate": 2.622501763547882e-06, + "loss": 0.7999, + "step": 7606 + }, + { + "epoch": 0.67, + "grad_norm": 13.609486205710851, + "learning_rate": 2.6212506332778765e-06, + "loss": 0.7964, + "step": 7607 + }, + { + "epoch": 0.67, + "grad_norm": 3.1942283208939894, + "learning_rate": 2.6199996954923114e-06, + "loss": 0.5013, + "step": 7608 + }, + { + "epoch": 0.67, + "grad_norm": 6.992045765406232, + "learning_rate": 2.618748950292413e-06, + "loss": 0.8054, + "step": 7609 + }, + { + "epoch": 0.67, + "grad_norm": 5.1748364428636195, + "learning_rate": 2.6174983977793876e-06, + "loss": 0.7718, + "step": 7610 + }, + { + "epoch": 0.67, + "grad_norm": 6.785036531685371, + "learning_rate": 2.6162480380544274e-06, + "loss": 0.8317, + "step": 7611 + }, + { + "epoch": 0.67, + "grad_norm": 11.276800857832763, + "learning_rate": 2.6149978712187128e-06, + "loss": 0.6867, + "step": 7612 + }, + { + "epoch": 0.67, + "grad_norm": 5.757251490926302, + "learning_rate": 2.613747897373403e-06, + "loss": 0.6321, + "step": 7613 + }, + { + "epoch": 0.67, + "grad_norm": 11.79509712713587, + "learning_rate": 2.6124981166196443e-06, + "loss": 0.7522, + "step": 7614 + }, + { + "epoch": 0.67, + "grad_norm": 7.944588024034334, + "learning_rate": 2.6112485290585667e-06, + "loss": 0.6501, + "step": 7615 + }, + { + "epoch": 0.67, + "grad_norm": 7.938280863784059, + "learning_rate": 2.609999134791282e-06, + "loss": 0.7498, + "step": 7616 + }, + { + "epoch": 0.67, + "grad_norm": 7.37570730806983, + "learning_rate": 2.6087499339188942e-06, + "loss": 0.829, + "step": 7617 + }, + { + "epoch": 0.67, + "grad_norm": 3.2661849510783583, + "learning_rate": 2.6075009265424846e-06, + "loss": 0.5498, + "step": 7618 + }, + { + "epoch": 0.67, + "grad_norm": 9.040298120755406, + "learning_rate": 2.606252112763119e-06, + "loss": 0.8522, + "step": 7619 + }, + { + "epoch": 0.67, + "grad_norm": 14.189102083205327, + "learning_rate": 2.605003492681852e-06, + "loss": 0.7133, + "step": 7620 + }, + { + "epoch": 0.67, + "grad_norm": 6.539991944796849, + "learning_rate": 2.603755066399718e-06, + "loss": 0.85, + "step": 7621 + }, + { + "epoch": 0.67, + "grad_norm": 9.880233926693407, + "learning_rate": 2.6025068340177357e-06, + "loss": 0.7049, + "step": 7622 + }, + { + "epoch": 0.67, + "grad_norm": 7.134366141385587, + "learning_rate": 2.6012587956369147e-06, + "loss": 0.764, + "step": 7623 + }, + { + "epoch": 0.67, + "grad_norm": 
5.907713305270398, + "learning_rate": 2.6000109513582417e-06, + "loss": 0.7128, + "step": 7624 + }, + { + "epoch": 0.67, + "grad_norm": 7.939160590238008, + "learning_rate": 2.5987633012826907e-06, + "loss": 0.6416, + "step": 7625 + }, + { + "epoch": 0.67, + "grad_norm": 11.328586642770734, + "learning_rate": 2.597515845511218e-06, + "loss": 0.6519, + "step": 7626 + }, + { + "epoch": 0.67, + "grad_norm": 2.8260586304683457, + "learning_rate": 2.5962685841447677e-06, + "loss": 0.498, + "step": 7627 + }, + { + "epoch": 0.67, + "grad_norm": 19.774539408393203, + "learning_rate": 2.5950215172842636e-06, + "loss": 0.7138, + "step": 7628 + }, + { + "epoch": 0.67, + "grad_norm": 6.489781840455477, + "learning_rate": 2.593774645030619e-06, + "loss": 0.6215, + "step": 7629 + }, + { + "epoch": 0.67, + "grad_norm": 5.959506309640264, + "learning_rate": 2.5925279674847282e-06, + "loss": 0.7709, + "step": 7630 + }, + { + "epoch": 0.67, + "grad_norm": 6.27653503125923, + "learning_rate": 2.5912814847474687e-06, + "loss": 0.7587, + "step": 7631 + }, + { + "epoch": 0.67, + "grad_norm": 3.812934112515167, + "learning_rate": 2.590035196919706e-06, + "loss": 0.5132, + "step": 7632 + }, + { + "epoch": 0.67, + "grad_norm": 5.4160688527167, + "learning_rate": 2.588789104102284e-06, + "loss": 0.8362, + "step": 7633 + }, + { + "epoch": 0.67, + "grad_norm": 11.348267693153888, + "learning_rate": 2.5875432063960383e-06, + "loss": 0.7506, + "step": 7634 + }, + { + "epoch": 0.67, + "grad_norm": 7.070002408157285, + "learning_rate": 2.5862975039017835e-06, + "loss": 0.9244, + "step": 7635 + }, + { + "epoch": 0.67, + "grad_norm": 9.07323638044828, + "learning_rate": 2.58505199672032e-06, + "loss": 0.8416, + "step": 7636 + }, + { + "epoch": 0.67, + "grad_norm": 6.57737828984423, + "learning_rate": 2.5838066849524316e-06, + "loss": 0.8486, + "step": 7637 + }, + { + "epoch": 0.67, + "grad_norm": 8.185353747619097, + "learning_rate": 2.5825615686988877e-06, + "loss": 0.7874, + "step": 7638 + }, + { + "epoch": 0.67, + "grad_norm": 8.363891568062826, + "learning_rate": 2.581316648060438e-06, + "loss": 0.7681, + "step": 7639 + }, + { + "epoch": 0.67, + "grad_norm": 2.653609057923413, + "learning_rate": 2.580071923137824e-06, + "loss": 0.5439, + "step": 7640 + }, + { + "epoch": 0.67, + "grad_norm": 7.640618928734803, + "learning_rate": 2.5788273940317644e-06, + "loss": 0.7945, + "step": 7641 + }, + { + "epoch": 0.67, + "grad_norm": 9.197556847004769, + "learning_rate": 2.577583060842965e-06, + "loss": 0.6962, + "step": 7642 + }, + { + "epoch": 0.67, + "grad_norm": 10.4420720952271, + "learning_rate": 2.5763389236721148e-06, + "loss": 0.8384, + "step": 7643 + }, + { + "epoch": 0.67, + "grad_norm": 5.763453740467003, + "learning_rate": 2.5750949826198866e-06, + "loss": 0.644, + "step": 7644 + }, + { + "epoch": 0.67, + "grad_norm": 2.2975029059478707, + "learning_rate": 2.5738512377869377e-06, + "loss": 0.5035, + "step": 7645 + }, + { + "epoch": 0.67, + "grad_norm": 8.348704348719359, + "learning_rate": 2.5726076892739127e-06, + "loss": 0.7464, + "step": 7646 + }, + { + "epoch": 0.67, + "grad_norm": 7.065934383097176, + "learning_rate": 2.5713643371814355e-06, + "loss": 0.7743, + "step": 7647 + }, + { + "epoch": 0.67, + "grad_norm": 7.5747667476836655, + "learning_rate": 2.570121181610117e-06, + "loss": 0.8163, + "step": 7648 + }, + { + "epoch": 0.67, + "grad_norm": 5.065873091145448, + "learning_rate": 2.5688782226605502e-06, + "loss": 0.6885, + "step": 7649 + }, + { + "epoch": 0.67, + "grad_norm": 3.210073837732233, + 
"learning_rate": 2.5676354604333133e-06, + "loss": 0.6201, + "step": 7650 + }, + { + "epoch": 0.67, + "grad_norm": 5.614772908178064, + "learning_rate": 2.5663928950289675e-06, + "loss": 0.7815, + "step": 7651 + }, + { + "epoch": 0.67, + "grad_norm": 6.724445859815526, + "learning_rate": 2.5651505265480616e-06, + "loss": 0.8748, + "step": 7652 + }, + { + "epoch": 0.67, + "grad_norm": 9.726559757391655, + "learning_rate": 2.563908355091126e-06, + "loss": 0.6111, + "step": 7653 + }, + { + "epoch": 0.67, + "grad_norm": 2.1964340687959942, + "learning_rate": 2.562666380758673e-06, + "loss": 0.4772, + "step": 7654 + }, + { + "epoch": 0.67, + "grad_norm": 8.598952948354064, + "learning_rate": 2.561424603651203e-06, + "loss": 0.6725, + "step": 7655 + }, + { + "epoch": 0.67, + "grad_norm": 6.306899138041751, + "learning_rate": 2.5601830238691956e-06, + "loss": 0.7832, + "step": 7656 + }, + { + "epoch": 0.67, + "grad_norm": 4.764154796996667, + "learning_rate": 2.5589416415131215e-06, + "loss": 0.6848, + "step": 7657 + }, + { + "epoch": 0.67, + "grad_norm": 9.211631588516541, + "learning_rate": 2.5577004566834286e-06, + "loss": 0.6947, + "step": 7658 + }, + { + "epoch": 0.67, + "grad_norm": 15.414772154874901, + "learning_rate": 2.556459469480553e-06, + "loss": 0.7255, + "step": 7659 + }, + { + "epoch": 0.67, + "grad_norm": 6.467898925450451, + "learning_rate": 2.555218680004912e-06, + "loss": 0.632, + "step": 7660 + }, + { + "epoch": 0.67, + "grad_norm": 6.513569424287414, + "learning_rate": 2.553978088356909e-06, + "loss": 0.8142, + "step": 7661 + }, + { + "epoch": 0.67, + "grad_norm": 13.794383955893506, + "learning_rate": 2.552737694636929e-06, + "loss": 0.8862, + "step": 7662 + }, + { + "epoch": 0.67, + "grad_norm": 6.745085680666608, + "learning_rate": 2.5514974989453443e-06, + "loss": 0.8607, + "step": 7663 + }, + { + "epoch": 0.67, + "grad_norm": 12.785585045050855, + "learning_rate": 2.55025750138251e-06, + "loss": 0.7049, + "step": 7664 + }, + { + "epoch": 0.67, + "grad_norm": 2.545483808010917, + "learning_rate": 2.549017702048764e-06, + "loss": 0.4815, + "step": 7665 + }, + { + "epoch": 0.67, + "grad_norm": 9.813296906038996, + "learning_rate": 2.547778101044428e-06, + "loss": 0.7546, + "step": 7666 + }, + { + "epoch": 0.67, + "grad_norm": 19.623828564378773, + "learning_rate": 2.54653869846981e-06, + "loss": 0.6223, + "step": 7667 + }, + { + "epoch": 0.67, + "grad_norm": 5.499443922849205, + "learning_rate": 2.5452994944251962e-06, + "loss": 0.7058, + "step": 7668 + }, + { + "epoch": 0.67, + "grad_norm": 8.001885536145057, + "learning_rate": 2.5440604890108666e-06, + "loss": 0.6252, + "step": 7669 + }, + { + "epoch": 0.67, + "grad_norm": 6.069764837065823, + "learning_rate": 2.5428216823270772e-06, + "loss": 0.8306, + "step": 7670 + }, + { + "epoch": 0.67, + "grad_norm": 9.232958861292, + "learning_rate": 2.54158307447407e-06, + "loss": 0.6712, + "step": 7671 + }, + { + "epoch": 0.67, + "grad_norm": 8.545310501102675, + "learning_rate": 2.540344665552071e-06, + "loss": 0.8666, + "step": 7672 + }, + { + "epoch": 0.67, + "grad_norm": 6.060793041569368, + "learning_rate": 2.5391064556612877e-06, + "loss": 0.7384, + "step": 7673 + }, + { + "epoch": 0.67, + "grad_norm": 6.4981228957540615, + "learning_rate": 2.537868444901919e-06, + "loss": 0.6524, + "step": 7674 + }, + { + "epoch": 0.67, + "grad_norm": 5.51079645142963, + "learning_rate": 2.5366306333741404e-06, + "loss": 0.7533, + "step": 7675 + }, + { + "epoch": 0.67, + "grad_norm": 7.324418095059783, + "learning_rate": 
2.535393021178113e-06, + "loss": 0.814, + "step": 7676 + }, + { + "epoch": 0.67, + "grad_norm": 9.704732269069268, + "learning_rate": 2.534155608413983e-06, + "loss": 0.7541, + "step": 7677 + }, + { + "epoch": 0.67, + "grad_norm": 7.960356373921939, + "learning_rate": 2.5329183951818786e-06, + "loss": 0.7102, + "step": 7678 + }, + { + "epoch": 0.67, + "grad_norm": 4.715281573862161, + "learning_rate": 2.531681381581913e-06, + "loss": 0.901, + "step": 7679 + }, + { + "epoch": 0.67, + "grad_norm": 16.262299842859463, + "learning_rate": 2.5304445677141855e-06, + "loss": 0.8602, + "step": 7680 + }, + { + "epoch": 0.67, + "grad_norm": 5.8012130661752, + "learning_rate": 2.5292079536787756e-06, + "loss": 0.7494, + "step": 7681 + }, + { + "epoch": 0.67, + "grad_norm": 2.496850260054102, + "learning_rate": 2.527971539575748e-06, + "loss": 0.4855, + "step": 7682 + }, + { + "epoch": 0.67, + "grad_norm": 6.388504102267359, + "learning_rate": 2.526735325505152e-06, + "loss": 0.7762, + "step": 7683 + }, + { + "epoch": 0.67, + "grad_norm": 8.417395068032222, + "learning_rate": 2.525499311567019e-06, + "loss": 0.5935, + "step": 7684 + }, + { + "epoch": 0.67, + "grad_norm": 6.352445320073837, + "learning_rate": 2.524263497861364e-06, + "loss": 0.7174, + "step": 7685 + }, + { + "epoch": 0.68, + "grad_norm": 7.2672849931829315, + "learning_rate": 2.52302788448819e-06, + "loss": 0.7464, + "step": 7686 + }, + { + "epoch": 0.68, + "grad_norm": 6.5853204796518225, + "learning_rate": 2.5217924715474794e-06, + "loss": 0.8408, + "step": 7687 + }, + { + "epoch": 0.68, + "grad_norm": 4.817527048158252, + "learning_rate": 2.5205572591392003e-06, + "loss": 0.6632, + "step": 7688 + }, + { + "epoch": 0.68, + "grad_norm": 6.963331697009694, + "learning_rate": 2.5193222473633027e-06, + "loss": 0.7263, + "step": 7689 + }, + { + "epoch": 0.68, + "grad_norm": 25.522138079387812, + "learning_rate": 2.5180874363197217e-06, + "loss": 0.579, + "step": 7690 + }, + { + "epoch": 0.68, + "grad_norm": 5.627346380656049, + "learning_rate": 2.516852826108378e-06, + "loss": 0.662, + "step": 7691 + }, + { + "epoch": 0.68, + "grad_norm": 6.200635591776335, + "learning_rate": 2.5156184168291733e-06, + "loss": 0.7458, + "step": 7692 + }, + { + "epoch": 0.68, + "grad_norm": 5.074584162789168, + "learning_rate": 2.514384208581993e-06, + "loss": 0.5724, + "step": 7693 + }, + { + "epoch": 0.68, + "grad_norm": 4.257249274040635, + "learning_rate": 2.513150201466709e-06, + "loss": 0.6081, + "step": 7694 + }, + { + "epoch": 0.68, + "grad_norm": 10.742705949290317, + "learning_rate": 2.511916395583173e-06, + "loss": 0.6634, + "step": 7695 + }, + { + "epoch": 0.68, + "grad_norm": 8.09660470036199, + "learning_rate": 2.510682791031223e-06, + "loss": 0.8403, + "step": 7696 + }, + { + "epoch": 0.68, + "grad_norm": 4.596875858861272, + "learning_rate": 2.5094493879106813e-06, + "loss": 0.7379, + "step": 7697 + }, + { + "epoch": 0.68, + "grad_norm": 11.631255696411973, + "learning_rate": 2.508216186321353e-06, + "loss": 0.6901, + "step": 7698 + }, + { + "epoch": 0.68, + "grad_norm": 7.626948562438817, + "learning_rate": 2.5069831863630257e-06, + "loss": 0.839, + "step": 7699 + }, + { + "epoch": 0.68, + "grad_norm": 8.051384705325301, + "learning_rate": 2.5057503881354726e-06, + "loss": 0.7907, + "step": 7700 + }, + { + "epoch": 0.68, + "grad_norm": 2.2552246738610537, + "learning_rate": 2.504517791738449e-06, + "loss": 0.4903, + "step": 7701 + }, + { + "epoch": 0.68, + "grad_norm": 5.874859925041625, + "learning_rate": 2.503285397271692e-06, + "loss": 
0.7802, + "step": 7702 + }, + { + "epoch": 0.68, + "grad_norm": 6.330489294052568, + "learning_rate": 2.50205320483493e-06, + "loss": 0.7233, + "step": 7703 + }, + { + "epoch": 0.68, + "grad_norm": 9.026774393790737, + "learning_rate": 2.5008212145278675e-06, + "loss": 0.8203, + "step": 7704 + }, + { + "epoch": 0.68, + "grad_norm": 21.543112865720126, + "learning_rate": 2.4995894264501956e-06, + "loss": 0.7794, + "step": 7705 + }, + { + "epoch": 0.68, + "grad_norm": 7.927498599950003, + "learning_rate": 2.498357840701588e-06, + "loss": 0.8402, + "step": 7706 + }, + { + "epoch": 0.68, + "grad_norm": 16.116045753806315, + "learning_rate": 2.497126457381702e-06, + "loss": 0.9079, + "step": 7707 + }, + { + "epoch": 0.68, + "grad_norm": 7.791924756347548, + "learning_rate": 2.4958952765901786e-06, + "loss": 0.8094, + "step": 7708 + }, + { + "epoch": 0.68, + "grad_norm": 5.484562697800526, + "learning_rate": 2.4946642984266457e-06, + "loss": 0.768, + "step": 7709 + }, + { + "epoch": 0.68, + "grad_norm": 7.342641898743457, + "learning_rate": 2.4934335229907105e-06, + "loss": 0.6919, + "step": 7710 + }, + { + "epoch": 0.68, + "grad_norm": 7.024632174536307, + "learning_rate": 2.4922029503819644e-06, + "loss": 0.496, + "step": 7711 + }, + { + "epoch": 0.68, + "grad_norm": 5.018477400820564, + "learning_rate": 2.4909725806999847e-06, + "loss": 0.6539, + "step": 7712 + }, + { + "epoch": 0.68, + "grad_norm": 7.644246180284332, + "learning_rate": 2.489742414044328e-06, + "loss": 0.8196, + "step": 7713 + }, + { + "epoch": 0.68, + "grad_norm": 7.426505397100763, + "learning_rate": 2.488512450514542e-06, + "loss": 0.5986, + "step": 7714 + }, + { + "epoch": 0.68, + "grad_norm": 5.169883513062484, + "learning_rate": 2.4872826902101506e-06, + "loss": 0.8097, + "step": 7715 + }, + { + "epoch": 0.68, + "grad_norm": 7.149500335752097, + "learning_rate": 2.4860531332306644e-06, + "loss": 0.6794, + "step": 7716 + }, + { + "epoch": 0.68, + "grad_norm": 7.399636851634183, + "learning_rate": 2.4848237796755764e-06, + "loss": 0.8102, + "step": 7717 + }, + { + "epoch": 0.68, + "grad_norm": 9.54085116677983, + "learning_rate": 2.4835946296443648e-06, + "loss": 0.8096, + "step": 7718 + }, + { + "epoch": 0.68, + "grad_norm": 7.258912783128812, + "learning_rate": 2.4823656832364872e-06, + "loss": 0.8297, + "step": 7719 + }, + { + "epoch": 0.68, + "grad_norm": 2.300283723659086, + "learning_rate": 2.4811369405513935e-06, + "loss": 0.5006, + "step": 7720 + }, + { + "epoch": 0.68, + "grad_norm": 5.05124595042686, + "learning_rate": 2.4799084016885082e-06, + "loss": 0.6522, + "step": 7721 + }, + { + "epoch": 0.68, + "grad_norm": 4.453164854975357, + "learning_rate": 2.478680066747243e-06, + "loss": 0.6915, + "step": 7722 + }, + { + "epoch": 0.68, + "grad_norm": 9.972336006904873, + "learning_rate": 2.4774519358269932e-06, + "loss": 0.7233, + "step": 7723 + }, + { + "epoch": 0.68, + "grad_norm": 8.295572299742146, + "learning_rate": 2.4762240090271367e-06, + "loss": 0.68, + "step": 7724 + }, + { + "epoch": 0.68, + "grad_norm": 7.619927643992105, + "learning_rate": 2.4749962864470334e-06, + "loss": 0.699, + "step": 7725 + }, + { + "epoch": 0.68, + "grad_norm": 7.754591210688418, + "learning_rate": 2.4737687681860323e-06, + "loss": 0.759, + "step": 7726 + }, + { + "epoch": 0.68, + "grad_norm": 24.35528399626327, + "learning_rate": 2.47254145434346e-06, + "loss": 0.8575, + "step": 7727 + }, + { + "epoch": 0.68, + "grad_norm": 4.881187432370092, + "learning_rate": 2.4713143450186294e-06, + "loss": 0.7466, + "step": 7728 + }, + { + 
"epoch": 0.68, + "grad_norm": 8.693719539109757, + "learning_rate": 2.4700874403108353e-06, + "loss": 0.6848, + "step": 7729 + }, + { + "epoch": 0.68, + "grad_norm": 6.768852867102027, + "learning_rate": 2.468860740319356e-06, + "loss": 0.6991, + "step": 7730 + }, + { + "epoch": 0.68, + "grad_norm": 8.930823036517838, + "learning_rate": 2.4676342451434564e-06, + "loss": 0.6889, + "step": 7731 + }, + { + "epoch": 0.68, + "grad_norm": 2.075281570592039, + "learning_rate": 2.4664079548823822e-06, + "loss": 0.5207, + "step": 7732 + }, + { + "epoch": 0.68, + "grad_norm": 9.18026200613167, + "learning_rate": 2.4651818696353614e-06, + "loss": 0.7681, + "step": 7733 + }, + { + "epoch": 0.68, + "grad_norm": 7.171426403969815, + "learning_rate": 2.463955989501607e-06, + "loss": 0.7403, + "step": 7734 + }, + { + "epoch": 0.68, + "grad_norm": 9.590809462002637, + "learning_rate": 2.4627303145803156e-06, + "loss": 0.7059, + "step": 7735 + }, + { + "epoch": 0.68, + "grad_norm": 6.0832899662123, + "learning_rate": 2.4615048449706645e-06, + "loss": 0.9148, + "step": 7736 + }, + { + "epoch": 0.68, + "grad_norm": 7.251641014525764, + "learning_rate": 2.460279580771821e-06, + "loss": 0.8604, + "step": 7737 + }, + { + "epoch": 0.68, + "grad_norm": 6.785862073145582, + "learning_rate": 2.4590545220829295e-06, + "loss": 0.7734, + "step": 7738 + }, + { + "epoch": 0.68, + "grad_norm": 7.293790164506161, + "learning_rate": 2.4578296690031187e-06, + "loss": 0.6526, + "step": 7739 + }, + { + "epoch": 0.68, + "grad_norm": 6.0799231334941926, + "learning_rate": 2.456605021631503e-06, + "loss": 0.6435, + "step": 7740 + }, + { + "epoch": 0.68, + "grad_norm": 6.924213387876127, + "learning_rate": 2.4553805800671783e-06, + "loss": 0.7089, + "step": 7741 + }, + { + "epoch": 0.68, + "grad_norm": 9.7187544530175, + "learning_rate": 2.454156344409222e-06, + "loss": 0.6378, + "step": 7742 + }, + { + "epoch": 0.68, + "grad_norm": 4.8159577323029525, + "learning_rate": 2.4529323147567015e-06, + "loss": 0.8743, + "step": 7743 + }, + { + "epoch": 0.68, + "grad_norm": 6.572530647271361, + "learning_rate": 2.4517084912086612e-06, + "loss": 0.703, + "step": 7744 + }, + { + "epoch": 0.68, + "grad_norm": 8.965237272797841, + "learning_rate": 2.4504848738641313e-06, + "loss": 0.5817, + "step": 7745 + }, + { + "epoch": 0.68, + "grad_norm": 6.22302676445679, + "learning_rate": 2.449261462822124e-06, + "loss": 0.6257, + "step": 7746 + }, + { + "epoch": 0.68, + "grad_norm": 8.215735485331722, + "learning_rate": 2.4480382581816362e-06, + "loss": 0.8005, + "step": 7747 + }, + { + "epoch": 0.68, + "grad_norm": 7.6906508127709845, + "learning_rate": 2.446815260041646e-06, + "loss": 0.7171, + "step": 7748 + }, + { + "epoch": 0.68, + "grad_norm": 9.260592896864324, + "learning_rate": 2.44559246850112e-06, + "loss": 0.7759, + "step": 7749 + }, + { + "epoch": 0.68, + "grad_norm": 10.589763693488274, + "learning_rate": 2.4443698836590026e-06, + "loss": 0.7348, + "step": 7750 + }, + { + "epoch": 0.68, + "grad_norm": 10.621397710601906, + "learning_rate": 2.4431475056142224e-06, + "loss": 0.656, + "step": 7751 + }, + { + "epoch": 0.68, + "grad_norm": 6.601998440872901, + "learning_rate": 2.441925334465693e-06, + "loss": 0.7763, + "step": 7752 + }, + { + "epoch": 0.68, + "grad_norm": 2.946986807054121, + "learning_rate": 2.440703370312309e-06, + "loss": 0.5289, + "step": 7753 + }, + { + "epoch": 0.68, + "grad_norm": 2.515197426102164, + "learning_rate": 2.4394816132529526e-06, + "loss": 0.5395, + "step": 7754 + }, + { + "epoch": 0.68, + "grad_norm": 
11.545806397062318, + "learning_rate": 2.438260063386485e-06, + "loss": 0.8386, + "step": 7755 + }, + { + "epoch": 0.68, + "grad_norm": 7.721002271549318, + "learning_rate": 2.437038720811752e-06, + "loss": 0.8011, + "step": 7756 + }, + { + "epoch": 0.68, + "grad_norm": 12.049717467977267, + "learning_rate": 2.435817585627582e-06, + "loss": 0.828, + "step": 7757 + }, + { + "epoch": 0.68, + "grad_norm": 10.51997445467103, + "learning_rate": 2.434596657932788e-06, + "loss": 0.728, + "step": 7758 + }, + { + "epoch": 0.68, + "grad_norm": 11.001654897709798, + "learning_rate": 2.4333759378261635e-06, + "loss": 0.7844, + "step": 7759 + }, + { + "epoch": 0.68, + "grad_norm": 9.759965168749902, + "learning_rate": 2.43215542540649e-06, + "loss": 0.7573, + "step": 7760 + }, + { + "epoch": 0.68, + "grad_norm": 9.78653045207583, + "learning_rate": 2.4309351207725286e-06, + "loss": 0.8888, + "step": 7761 + }, + { + "epoch": 0.68, + "grad_norm": 8.156722705001592, + "learning_rate": 2.4297150240230233e-06, + "loss": 0.8153, + "step": 7762 + }, + { + "epoch": 0.68, + "grad_norm": 2.4529839336331203, + "learning_rate": 2.428495135256703e-06, + "loss": 0.5537, + "step": 7763 + }, + { + "epoch": 0.68, + "grad_norm": 7.32557439911466, + "learning_rate": 2.4272754545722793e-06, + "loss": 0.6996, + "step": 7764 + }, + { + "epoch": 0.68, + "grad_norm": 6.0914102555389515, + "learning_rate": 2.4260559820684442e-06, + "loss": 0.7021, + "step": 7765 + }, + { + "epoch": 0.68, + "grad_norm": 6.0028003050378365, + "learning_rate": 2.424836717843879e-06, + "loss": 0.7654, + "step": 7766 + }, + { + "epoch": 0.68, + "grad_norm": 7.252256551462772, + "learning_rate": 2.4236176619972436e-06, + "loss": 0.6821, + "step": 7767 + }, + { + "epoch": 0.68, + "grad_norm": 6.086045193844204, + "learning_rate": 2.422398814627181e-06, + "loss": 0.7397, + "step": 7768 + }, + { + "epoch": 0.68, + "grad_norm": 7.373786734686854, + "learning_rate": 2.4211801758323187e-06, + "loss": 0.5214, + "step": 7769 + }, + { + "epoch": 0.68, + "grad_norm": 7.029439357613932, + "learning_rate": 2.419961745711265e-06, + "loss": 0.6647, + "step": 7770 + }, + { + "epoch": 0.68, + "grad_norm": 3.7218140657647076, + "learning_rate": 2.418743524362617e-06, + "loss": 0.4877, + "step": 7771 + }, + { + "epoch": 0.68, + "grad_norm": 7.060000344207337, + "learning_rate": 2.41752551188495e-06, + "loss": 0.6682, + "step": 7772 + }, + { + "epoch": 0.68, + "grad_norm": 5.793864052111303, + "learning_rate": 2.416307708376822e-06, + "loss": 0.7918, + "step": 7773 + }, + { + "epoch": 0.68, + "grad_norm": 7.940948367339142, + "learning_rate": 2.4150901139367774e-06, + "loss": 0.768, + "step": 7774 + }, + { + "epoch": 0.68, + "grad_norm": 5.792361320405898, + "learning_rate": 2.41387272866334e-06, + "loss": 0.7493, + "step": 7775 + }, + { + "epoch": 0.68, + "grad_norm": 7.775088197023681, + "learning_rate": 2.412655552655018e-06, + "loss": 0.8152, + "step": 7776 + }, + { + "epoch": 0.68, + "grad_norm": 9.784689318553035, + "learning_rate": 2.4114385860103074e-06, + "loss": 0.8237, + "step": 7777 + }, + { + "epoch": 0.68, + "grad_norm": 9.417762456489891, + "learning_rate": 2.41022182882768e-06, + "loss": 0.6935, + "step": 7778 + }, + { + "epoch": 0.68, + "grad_norm": 12.464824118241012, + "learning_rate": 2.409005281205594e-06, + "loss": 0.7151, + "step": 7779 + }, + { + "epoch": 0.68, + "grad_norm": 5.553862010350217, + "learning_rate": 2.407788943242492e-06, + "loss": 0.8023, + "step": 7780 + }, + { + "epoch": 0.68, + "grad_norm": 6.605325185987221, + 
"learning_rate": 2.4065728150367957e-06, + "loss": 0.755, + "step": 7781 + }, + { + "epoch": 0.68, + "grad_norm": 11.351466275596406, + "learning_rate": 2.4053568966869113e-06, + "loss": 0.7346, + "step": 7782 + }, + { + "epoch": 0.68, + "grad_norm": 5.753171585175407, + "learning_rate": 2.404141188291233e-06, + "loss": 0.8334, + "step": 7783 + }, + { + "epoch": 0.68, + "grad_norm": 11.869809469606993, + "learning_rate": 2.4029256899481316e-06, + "loss": 0.6507, + "step": 7784 + }, + { + "epoch": 0.68, + "grad_norm": 6.566049598926788, + "learning_rate": 2.401710401755964e-06, + "loss": 0.7528, + "step": 7785 + }, + { + "epoch": 0.68, + "grad_norm": 13.266237985923782, + "learning_rate": 2.400495323813068e-06, + "loss": 0.7724, + "step": 7786 + }, + { + "epoch": 0.68, + "grad_norm": 5.713040633823841, + "learning_rate": 2.399280456217767e-06, + "loss": 0.6608, + "step": 7787 + }, + { + "epoch": 0.68, + "grad_norm": 16.607425313238572, + "learning_rate": 2.3980657990683624e-06, + "loss": 0.6269, + "step": 7788 + }, + { + "epoch": 0.68, + "grad_norm": 14.978776885449602, + "learning_rate": 2.3968513524631483e-06, + "loss": 0.7249, + "step": 7789 + }, + { + "epoch": 0.68, + "grad_norm": 7.753811633918352, + "learning_rate": 2.3956371165003923e-06, + "loss": 0.6388, + "step": 7790 + }, + { + "epoch": 0.68, + "grad_norm": 7.763340954069615, + "learning_rate": 2.3944230912783485e-06, + "loss": 0.7, + "step": 7791 + }, + { + "epoch": 0.68, + "grad_norm": 10.240578404032142, + "learning_rate": 2.3932092768952537e-06, + "loss": 0.7209, + "step": 7792 + }, + { + "epoch": 0.68, + "grad_norm": 30.585869591229507, + "learning_rate": 2.3919956734493264e-06, + "loss": 0.7591, + "step": 7793 + }, + { + "epoch": 0.68, + "grad_norm": 5.05903554510123, + "learning_rate": 2.3907822810387734e-06, + "loss": 0.546, + "step": 7794 + }, + { + "epoch": 0.68, + "grad_norm": 9.280916113852031, + "learning_rate": 2.3895690997617776e-06, + "loss": 0.9142, + "step": 7795 + }, + { + "epoch": 0.68, + "grad_norm": 7.038767602912184, + "learning_rate": 2.388356129716508e-06, + "loss": 0.7555, + "step": 7796 + }, + { + "epoch": 0.68, + "grad_norm": 11.935369397183564, + "learning_rate": 2.387143371001116e-06, + "loss": 0.6738, + "step": 7797 + }, + { + "epoch": 0.68, + "grad_norm": 2.602541454042691, + "learning_rate": 2.3859308237137362e-06, + "loss": 0.5057, + "step": 7798 + }, + { + "epoch": 0.68, + "grad_norm": 9.25312204504065, + "learning_rate": 2.3847184879524844e-06, + "loss": 0.7748, + "step": 7799 + }, + { + "epoch": 0.69, + "grad_norm": 14.710401538547467, + "learning_rate": 2.3835063638154636e-06, + "loss": 0.8278, + "step": 7800 + }, + { + "epoch": 0.69, + "grad_norm": 6.916711562305464, + "learning_rate": 2.382294451400756e-06, + "loss": 0.6943, + "step": 7801 + }, + { + "epoch": 0.69, + "grad_norm": 10.20989193771405, + "learning_rate": 2.3810827508064265e-06, + "loss": 0.7314, + "step": 7802 + }, + { + "epoch": 0.69, + "grad_norm": 6.07265595380452, + "learning_rate": 2.3798712621305252e-06, + "loss": 0.7276, + "step": 7803 + }, + { + "epoch": 0.69, + "grad_norm": 3.140220953538023, + "learning_rate": 2.3786599854710822e-06, + "loss": 0.5719, + "step": 7804 + }, + { + "epoch": 0.69, + "grad_norm": 11.77276407726749, + "learning_rate": 2.3774489209261114e-06, + "loss": 0.7319, + "step": 7805 + }, + { + "epoch": 0.69, + "grad_norm": 10.4275320325617, + "learning_rate": 2.3762380685936136e-06, + "loss": 0.7921, + "step": 7806 + }, + { + "epoch": 0.69, + "grad_norm": 7.005486209289046, + "learning_rate": 
2.3750274285715665e-06, + "loss": 0.7032, + "step": 7807 + }, + { + "epoch": 0.69, + "grad_norm": 10.589434130249517, + "learning_rate": 2.3738170009579336e-06, + "loss": 0.7032, + "step": 7808 + }, + { + "epoch": 0.69, + "grad_norm": 3.57379778954685, + "learning_rate": 2.3726067858506607e-06, + "loss": 0.4818, + "step": 7809 + }, + { + "epoch": 0.69, + "grad_norm": 6.239387678514647, + "learning_rate": 2.371396783347674e-06, + "loss": 0.7717, + "step": 7810 + }, + { + "epoch": 0.69, + "grad_norm": 3.3871436948870426, + "learning_rate": 2.3701869935468893e-06, + "loss": 0.5962, + "step": 7811 + }, + { + "epoch": 0.69, + "grad_norm": 7.8386344867508155, + "learning_rate": 2.3689774165461985e-06, + "loss": 0.6283, + "step": 7812 + }, + { + "epoch": 0.69, + "grad_norm": 12.598648011361073, + "learning_rate": 2.3677680524434787e-06, + "loss": 0.7632, + "step": 7813 + }, + { + "epoch": 0.69, + "grad_norm": 2.58625714662605, + "learning_rate": 2.3665589013365895e-06, + "loss": 0.4718, + "step": 7814 + }, + { + "epoch": 0.69, + "grad_norm": 6.182840684024028, + "learning_rate": 2.3653499633233736e-06, + "loss": 0.707, + "step": 7815 + }, + { + "epoch": 0.69, + "grad_norm": 7.297194517490899, + "learning_rate": 2.3641412385016542e-06, + "loss": 0.7737, + "step": 7816 + }, + { + "epoch": 0.69, + "grad_norm": 6.825411932619147, + "learning_rate": 2.362932726969243e-06, + "loss": 0.7416, + "step": 7817 + }, + { + "epoch": 0.69, + "grad_norm": 6.289228078871548, + "learning_rate": 2.3617244288239296e-06, + "loss": 0.7668, + "step": 7818 + }, + { + "epoch": 0.69, + "grad_norm": 7.541699684829566, + "learning_rate": 2.3605163441634863e-06, + "loss": 0.9041, + "step": 7819 + }, + { + "epoch": 0.69, + "grad_norm": 8.208376180639952, + "learning_rate": 2.3593084730856703e-06, + "loss": 0.7817, + "step": 7820 + }, + { + "epoch": 0.69, + "grad_norm": 8.279209006339338, + "learning_rate": 2.3581008156882194e-06, + "loss": 0.6878, + "step": 7821 + }, + { + "epoch": 0.69, + "grad_norm": 6.014234145089906, + "learning_rate": 2.356893372068855e-06, + "loss": 0.8445, + "step": 7822 + }, + { + "epoch": 0.69, + "grad_norm": 8.2083752164252, + "learning_rate": 2.3556861423252836e-06, + "loss": 0.7001, + "step": 7823 + }, + { + "epoch": 0.69, + "grad_norm": 8.768370593897663, + "learning_rate": 2.3544791265551907e-06, + "loss": 0.8165, + "step": 7824 + }, + { + "epoch": 0.69, + "grad_norm": 9.26017277187458, + "learning_rate": 2.3532723248562465e-06, + "loss": 0.7241, + "step": 7825 + }, + { + "epoch": 0.69, + "grad_norm": 5.0844794355076095, + "learning_rate": 2.3520657373261035e-06, + "loss": 0.6737, + "step": 7826 + }, + { + "epoch": 0.69, + "grad_norm": 6.854355715018314, + "learning_rate": 2.350859364062397e-06, + "loss": 0.7789, + "step": 7827 + }, + { + "epoch": 0.69, + "grad_norm": 7.415271437907167, + "learning_rate": 2.3496532051627406e-06, + "loss": 0.7687, + "step": 7828 + }, + { + "epoch": 0.69, + "grad_norm": 9.579497705287, + "learning_rate": 2.3484472607247415e-06, + "loss": 0.6535, + "step": 7829 + }, + { + "epoch": 0.69, + "grad_norm": 5.931621600033334, + "learning_rate": 2.347241530845979e-06, + "loss": 0.828, + "step": 7830 + }, + { + "epoch": 0.69, + "grad_norm": 6.4923676612041135, + "learning_rate": 2.3460360156240193e-06, + "loss": 0.8144, + "step": 7831 + }, + { + "epoch": 0.69, + "grad_norm": 14.949402161515637, + "learning_rate": 2.3448307151564103e-06, + "loss": 0.8173, + "step": 7832 + }, + { + "epoch": 0.69, + "grad_norm": 13.874831480097297, + "learning_rate": 2.343625629540681e-06, + 
"loss": 0.6611, + "step": 7833 + }, + { + "epoch": 0.69, + "grad_norm": 9.760696633669335, + "learning_rate": 2.3424207588743497e-06, + "loss": 0.7623, + "step": 7834 + }, + { + "epoch": 0.69, + "grad_norm": 2.456018448812985, + "learning_rate": 2.3412161032549097e-06, + "loss": 0.5881, + "step": 7835 + }, + { + "epoch": 0.69, + "grad_norm": 9.369877413234441, + "learning_rate": 2.3400116627798397e-06, + "loss": 0.7307, + "step": 7836 + }, + { + "epoch": 0.69, + "grad_norm": 2.284921922125305, + "learning_rate": 2.3388074375466015e-06, + "loss": 0.5169, + "step": 7837 + }, + { + "epoch": 0.69, + "grad_norm": 25.419267325165357, + "learning_rate": 2.337603427652639e-06, + "loss": 0.6943, + "step": 7838 + }, + { + "epoch": 0.69, + "grad_norm": 9.335712151719486, + "learning_rate": 2.336399633195376e-06, + "loss": 0.7325, + "step": 7839 + }, + { + "epoch": 0.69, + "grad_norm": 7.422526339835715, + "learning_rate": 2.335196054272226e-06, + "loss": 0.8364, + "step": 7840 + }, + { + "epoch": 0.69, + "grad_norm": 9.13807024818529, + "learning_rate": 2.3339926909805795e-06, + "loss": 0.619, + "step": 7841 + }, + { + "epoch": 0.69, + "grad_norm": 6.442532727647234, + "learning_rate": 2.332789543417809e-06, + "loss": 0.6787, + "step": 7842 + }, + { + "epoch": 0.69, + "grad_norm": 8.965751813421619, + "learning_rate": 2.331586611681272e-06, + "loss": 0.7538, + "step": 7843 + }, + { + "epoch": 0.69, + "grad_norm": 5.617646216398942, + "learning_rate": 2.3303838958683077e-06, + "loss": 0.8185, + "step": 7844 + }, + { + "epoch": 0.69, + "grad_norm": 24.150515163091885, + "learning_rate": 2.329181396076236e-06, + "loss": 0.6631, + "step": 7845 + }, + { + "epoch": 0.69, + "grad_norm": 10.778503876865301, + "learning_rate": 2.327979112402365e-06, + "loss": 0.7405, + "step": 7846 + }, + { + "epoch": 0.69, + "grad_norm": 24.315265569344604, + "learning_rate": 2.3267770449439797e-06, + "loss": 0.6249, + "step": 7847 + }, + { + "epoch": 0.69, + "grad_norm": 12.112368976021179, + "learning_rate": 2.325575193798349e-06, + "loss": 0.828, + "step": 7848 + }, + { + "epoch": 0.69, + "grad_norm": 9.17302604659974, + "learning_rate": 2.324373559062725e-06, + "loss": 0.7772, + "step": 7849 + }, + { + "epoch": 0.69, + "grad_norm": 4.899328359555276, + "learning_rate": 2.32317214083434e-06, + "loss": 0.7022, + "step": 7850 + }, + { + "epoch": 0.69, + "grad_norm": 5.701303437152671, + "learning_rate": 2.3219709392104145e-06, + "loss": 0.6276, + "step": 7851 + }, + { + "epoch": 0.69, + "grad_norm": 10.614837430751757, + "learning_rate": 2.320769954288146e-06, + "loss": 0.698, + "step": 7852 + }, + { + "epoch": 0.69, + "grad_norm": 13.644654692993269, + "learning_rate": 2.319569186164716e-06, + "loss": 0.6114, + "step": 7853 + }, + { + "epoch": 0.69, + "grad_norm": 2.3919281698667105, + "learning_rate": 2.31836863493729e-06, + "loss": 0.4641, + "step": 7854 + }, + { + "epoch": 0.69, + "grad_norm": 10.416795647701719, + "learning_rate": 2.3171683007030117e-06, + "loss": 0.7293, + "step": 7855 + }, + { + "epoch": 0.69, + "grad_norm": 6.823407483307708, + "learning_rate": 2.315968183559011e-06, + "loss": 0.6425, + "step": 7856 + }, + { + "epoch": 0.69, + "grad_norm": 6.71346353760784, + "learning_rate": 2.3147682836024015e-06, + "loss": 0.6941, + "step": 7857 + }, + { + "epoch": 0.69, + "grad_norm": 11.030908187199675, + "learning_rate": 2.313568600930276e-06, + "loss": 0.6917, + "step": 7858 + }, + { + "epoch": 0.69, + "grad_norm": 2.7235242212900297, + "learning_rate": 2.312369135639711e-06, + "loss": 0.5167, + "step": 7859 
+ }, + { + "epoch": 0.69, + "grad_norm": 6.1967964610890895, + "learning_rate": 2.3111698878277644e-06, + "loss": 0.7618, + "step": 7860 + }, + { + "epoch": 0.69, + "grad_norm": 2.5519694053977764, + "learning_rate": 2.309970857591478e-06, + "loss": 0.4744, + "step": 7861 + }, + { + "epoch": 0.69, + "grad_norm": 6.300011806106735, + "learning_rate": 2.3087720450278737e-06, + "loss": 0.8463, + "step": 7862 + }, + { + "epoch": 0.69, + "grad_norm": 7.950165985288835, + "learning_rate": 2.3075734502339604e-06, + "loss": 0.7484, + "step": 7863 + }, + { + "epoch": 0.69, + "grad_norm": 9.807181154660682, + "learning_rate": 2.3063750733067248e-06, + "loss": 0.8159, + "step": 7864 + }, + { + "epoch": 0.69, + "grad_norm": 7.881994928522505, + "learning_rate": 2.305176914343138e-06, + "loss": 0.7098, + "step": 7865 + }, + { + "epoch": 0.69, + "grad_norm": 3.003041735275444, + "learning_rate": 2.3039789734401524e-06, + "loss": 0.516, + "step": 7866 + }, + { + "epoch": 0.69, + "grad_norm": 7.071332481831787, + "learning_rate": 2.302781250694704e-06, + "loss": 0.6964, + "step": 7867 + }, + { + "epoch": 0.69, + "grad_norm": 6.260568581214155, + "learning_rate": 2.301583746203708e-06, + "loss": 0.7293, + "step": 7868 + }, + { + "epoch": 0.69, + "grad_norm": 2.719708130468721, + "learning_rate": 2.3003864600640683e-06, + "loss": 0.5101, + "step": 7869 + }, + { + "epoch": 0.69, + "grad_norm": 12.589149990096576, + "learning_rate": 2.299189392372666e-06, + "loss": 0.8933, + "step": 7870 + }, + { + "epoch": 0.69, + "grad_norm": 5.173484321362837, + "learning_rate": 2.2979925432263658e-06, + "loss": 0.7731, + "step": 7871 + }, + { + "epoch": 0.69, + "grad_norm": 7.746829688208416, + "learning_rate": 2.296795912722014e-06, + "loss": 0.677, + "step": 7872 + }, + { + "epoch": 0.69, + "grad_norm": 2.2986147053565125, + "learning_rate": 2.2955995009564387e-06, + "loss": 0.487, + "step": 7873 + }, + { + "epoch": 0.69, + "grad_norm": 4.809688832042555, + "learning_rate": 2.294403308026455e-06, + "loss": 0.6148, + "step": 7874 + }, + { + "epoch": 0.69, + "grad_norm": 12.88747309736029, + "learning_rate": 2.293207334028855e-06, + "loss": 0.5804, + "step": 7875 + }, + { + "epoch": 0.69, + "grad_norm": 6.452208553622876, + "learning_rate": 2.2920115790604155e-06, + "loss": 0.7633, + "step": 7876 + }, + { + "epoch": 0.69, + "grad_norm": 8.31451364946783, + "learning_rate": 2.2908160432178937e-06, + "loss": 0.653, + "step": 7877 + }, + { + "epoch": 0.69, + "grad_norm": 8.984825793268502, + "learning_rate": 2.289620726598032e-06, + "loss": 0.7978, + "step": 7878 + }, + { + "epoch": 0.69, + "grad_norm": 5.617348781777318, + "learning_rate": 2.2884256292975505e-06, + "loss": 0.6443, + "step": 7879 + }, + { + "epoch": 0.69, + "grad_norm": 7.044211553347918, + "learning_rate": 2.2872307514131583e-06, + "loss": 0.7471, + "step": 7880 + }, + { + "epoch": 0.69, + "grad_norm": 3.111206104908175, + "learning_rate": 2.2860360930415416e-06, + "loss": 0.482, + "step": 7881 + }, + { + "epoch": 0.69, + "grad_norm": 7.485154524575754, + "learning_rate": 2.2848416542793695e-06, + "loss": 0.7986, + "step": 7882 + }, + { + "epoch": 0.69, + "grad_norm": 13.770896368875093, + "learning_rate": 2.283647435223294e-06, + "loss": 0.7771, + "step": 7883 + }, + { + "epoch": 0.69, + "grad_norm": 3.259954871740184, + "learning_rate": 2.28245343596995e-06, + "loss": 0.5586, + "step": 7884 + }, + { + "epoch": 0.69, + "grad_norm": 2.799379627422887, + "learning_rate": 2.2812596566159516e-06, + "loss": 0.5452, + "step": 7885 + }, + { + "epoch": 0.69, + 
"grad_norm": 9.23589583945397, + "learning_rate": 2.2800660972579013e-06, + "loss": 0.6954, + "step": 7886 + }, + { + "epoch": 0.69, + "grad_norm": 6.290837250259278, + "learning_rate": 2.278872757992378e-06, + "loss": 0.6674, + "step": 7887 + }, + { + "epoch": 0.69, + "grad_norm": 2.496166636201292, + "learning_rate": 2.277679638915945e-06, + "loss": 0.5749, + "step": 7888 + }, + { + "epoch": 0.69, + "grad_norm": 6.456202549363419, + "learning_rate": 2.2764867401251473e-06, + "loss": 0.5495, + "step": 7889 + }, + { + "epoch": 0.69, + "grad_norm": 5.178728089920499, + "learning_rate": 2.27529406171651e-06, + "loss": 0.8163, + "step": 7890 + }, + { + "epoch": 0.69, + "grad_norm": 14.137335882526179, + "learning_rate": 2.2741016037865467e-06, + "loss": 0.5539, + "step": 7891 + }, + { + "epoch": 0.69, + "grad_norm": 2.6730272971303246, + "learning_rate": 2.2729093664317477e-06, + "loss": 0.4996, + "step": 7892 + }, + { + "epoch": 0.69, + "grad_norm": 5.703911578258446, + "learning_rate": 2.271717349748586e-06, + "loss": 0.5881, + "step": 7893 + }, + { + "epoch": 0.69, + "grad_norm": 7.491650091763726, + "learning_rate": 2.2705255538335185e-06, + "loss": 0.7094, + "step": 7894 + }, + { + "epoch": 0.69, + "grad_norm": 9.559359811354653, + "learning_rate": 2.269333978782983e-06, + "loss": 0.756, + "step": 7895 + }, + { + "epoch": 0.69, + "grad_norm": 5.461947556989039, + "learning_rate": 2.268142624693398e-06, + "loss": 0.7566, + "step": 7896 + }, + { + "epoch": 0.69, + "grad_norm": 5.394845121472298, + "learning_rate": 2.266951491661169e-06, + "loss": 0.7372, + "step": 7897 + }, + { + "epoch": 0.69, + "grad_norm": 8.580153483476103, + "learning_rate": 2.2657605797826794e-06, + "loss": 0.727, + "step": 7898 + }, + { + "epoch": 0.69, + "grad_norm": 5.676353766095071, + "learning_rate": 2.264569889154295e-06, + "loss": 0.6298, + "step": 7899 + }, + { + "epoch": 0.69, + "grad_norm": 3.6919190962170685, + "learning_rate": 2.263379419872366e-06, + "loss": 0.5643, + "step": 7900 + }, + { + "epoch": 0.69, + "grad_norm": 5.526359849655089, + "learning_rate": 2.2621891720332213e-06, + "loss": 0.6873, + "step": 7901 + }, + { + "epoch": 0.69, + "grad_norm": 9.663077499359533, + "learning_rate": 2.2609991457331733e-06, + "loss": 0.769, + "step": 7902 + }, + { + "epoch": 0.69, + "grad_norm": 6.310863036839872, + "learning_rate": 2.2598093410685197e-06, + "loss": 0.6936, + "step": 7903 + }, + { + "epoch": 0.69, + "grad_norm": 6.531752614038012, + "learning_rate": 2.258619758135537e-06, + "loss": 0.7316, + "step": 7904 + }, + { + "epoch": 0.69, + "grad_norm": 9.166302019474122, + "learning_rate": 2.2574303970304824e-06, + "loss": 0.8001, + "step": 7905 + }, + { + "epoch": 0.69, + "grad_norm": 7.304617403845519, + "learning_rate": 2.2562412578495983e-06, + "loss": 0.7084, + "step": 7906 + }, + { + "epoch": 0.69, + "grad_norm": 7.317609792865763, + "learning_rate": 2.2550523406891084e-06, + "loss": 0.7457, + "step": 7907 + }, + { + "epoch": 0.69, + "grad_norm": 11.238978743859551, + "learning_rate": 2.2538636456452145e-06, + "loss": 0.7217, + "step": 7908 + }, + { + "epoch": 0.69, + "grad_norm": 6.401975403667777, + "learning_rate": 2.252675172814108e-06, + "loss": 0.7788, + "step": 7909 + }, + { + "epoch": 0.69, + "grad_norm": 4.427019615361958, + "learning_rate": 2.251486922291957e-06, + "loss": 0.6185, + "step": 7910 + }, + { + "epoch": 0.69, + "grad_norm": 5.587000948333879, + "learning_rate": 2.2502988941749126e-06, + "loss": 0.6884, + "step": 7911 + }, + { + "epoch": 0.69, + "grad_norm": 
10.668526694959796, + "learning_rate": 2.2491110885591076e-06, + "loss": 0.6285, + "step": 7912 + }, + { + "epoch": 0.69, + "grad_norm": 9.303177418731236, + "learning_rate": 2.2479235055406558e-06, + "loss": 0.7575, + "step": 7913 + }, + { + "epoch": 0.7, + "grad_norm": 7.536012059669229, + "learning_rate": 2.2467361452156577e-06, + "loss": 0.6876, + "step": 7914 + }, + { + "epoch": 0.7, + "grad_norm": 6.3887121163728064, + "learning_rate": 2.245549007680191e-06, + "loss": 0.7685, + "step": 7915 + }, + { + "epoch": 0.7, + "grad_norm": 5.5707819886662735, + "learning_rate": 2.2443620930303167e-06, + "loss": 0.7534, + "step": 7916 + }, + { + "epoch": 0.7, + "grad_norm": 7.231667004780043, + "learning_rate": 2.243175401362078e-06, + "loss": 0.6281, + "step": 7917 + }, + { + "epoch": 0.7, + "grad_norm": 7.037004822618641, + "learning_rate": 2.2419889327715e-06, + "loss": 0.8752, + "step": 7918 + }, + { + "epoch": 0.7, + "grad_norm": 7.023422804144087, + "learning_rate": 2.2408026873545886e-06, + "loss": 0.7254, + "step": 7919 + }, + { + "epoch": 0.7, + "grad_norm": 5.776741737232062, + "learning_rate": 2.2396166652073354e-06, + "loss": 0.8232, + "step": 7920 + }, + { + "epoch": 0.7, + "grad_norm": 10.185661029946605, + "learning_rate": 2.23843086642571e-06, + "loss": 0.7864, + "step": 7921 + }, + { + "epoch": 0.7, + "grad_norm": 12.188351722181007, + "learning_rate": 2.2372452911056653e-06, + "loss": 0.9333, + "step": 7922 + }, + { + "epoch": 0.7, + "grad_norm": 8.495850816047783, + "learning_rate": 2.2360599393431357e-06, + "loss": 0.6373, + "step": 7923 + }, + { + "epoch": 0.7, + "grad_norm": 5.257095836054187, + "learning_rate": 2.2348748112340384e-06, + "loss": 0.719, + "step": 7924 + }, + { + "epoch": 0.7, + "grad_norm": 8.704098513627546, + "learning_rate": 2.2336899068742705e-06, + "loss": 0.6122, + "step": 7925 + }, + { + "epoch": 0.7, + "grad_norm": 11.077528893557473, + "learning_rate": 2.2325052263597153e-06, + "loss": 0.733, + "step": 7926 + }, + { + "epoch": 0.7, + "grad_norm": 6.742119179907231, + "learning_rate": 2.2313207697862338e-06, + "loss": 0.8165, + "step": 7927 + }, + { + "epoch": 0.7, + "grad_norm": 10.041035198829627, + "learning_rate": 2.2301365372496697e-06, + "loss": 0.7068, + "step": 7928 + }, + { + "epoch": 0.7, + "grad_norm": 9.66565196392636, + "learning_rate": 2.2289525288458504e-06, + "loss": 0.6396, + "step": 7929 + }, + { + "epoch": 0.7, + "grad_norm": 5.466429836295668, + "learning_rate": 2.2277687446705805e-06, + "loss": 0.6349, + "step": 7930 + }, + { + "epoch": 0.7, + "grad_norm": 15.819349823989281, + "learning_rate": 2.2265851848196548e-06, + "loss": 0.8365, + "step": 7931 + }, + { + "epoch": 0.7, + "grad_norm": 2.8077385893642943, + "learning_rate": 2.225401849388842e-06, + "loss": 0.4994, + "step": 7932 + }, + { + "epoch": 0.7, + "grad_norm": 11.193932405522059, + "learning_rate": 2.2242187384738965e-06, + "loss": 0.6979, + "step": 7933 + }, + { + "epoch": 0.7, + "grad_norm": 6.582306718633237, + "learning_rate": 2.2230358521705537e-06, + "loss": 0.6234, + "step": 7934 + }, + { + "epoch": 0.7, + "grad_norm": 1.9376875816506074, + "learning_rate": 2.221853190574528e-06, + "loss": 0.4453, + "step": 7935 + }, + { + "epoch": 0.7, + "grad_norm": 23.489960144596893, + "learning_rate": 2.2206707537815233e-06, + "loss": 0.5942, + "step": 7936 + }, + { + "epoch": 0.7, + "grad_norm": 2.920480994458309, + "learning_rate": 2.2194885418872157e-06, + "loss": 0.5046, + "step": 7937 + }, + { + "epoch": 0.7, + "grad_norm": 3.1387143920800518, + "learning_rate": 
2.218306554987273e-06, + "loss": 0.5, + "step": 7938 + }, + { + "epoch": 0.7, + "grad_norm": 7.532980697733367, + "learning_rate": 2.217124793177336e-06, + "loss": 0.759, + "step": 7939 + }, + { + "epoch": 0.7, + "grad_norm": 7.868368870424443, + "learning_rate": 2.2159432565530312e-06, + "loss": 0.778, + "step": 7940 + }, + { + "epoch": 0.7, + "grad_norm": 8.377386596825225, + "learning_rate": 2.214761945209968e-06, + "loss": 0.7672, + "step": 7941 + }, + { + "epoch": 0.7, + "grad_norm": 2.735098017631767, + "learning_rate": 2.2135808592437346e-06, + "loss": 0.5355, + "step": 7942 + }, + { + "epoch": 0.7, + "grad_norm": 9.677655830796036, + "learning_rate": 2.2123999987499015e-06, + "loss": 0.6745, + "step": 7943 + }, + { + "epoch": 0.7, + "grad_norm": 8.43287096684975, + "learning_rate": 2.2112193638240256e-06, + "loss": 0.7211, + "step": 7944 + }, + { + "epoch": 0.7, + "grad_norm": 6.026296197998259, + "learning_rate": 2.2100389545616397e-06, + "loss": 0.5979, + "step": 7945 + }, + { + "epoch": 0.7, + "grad_norm": 3.7428390241686857, + "learning_rate": 2.2088587710582604e-06, + "loss": 0.5619, + "step": 7946 + }, + { + "epoch": 0.7, + "grad_norm": 5.185416879638191, + "learning_rate": 2.207678813409387e-06, + "loss": 0.7984, + "step": 7947 + }, + { + "epoch": 0.7, + "grad_norm": 7.453943347054683, + "learning_rate": 2.2064990817104988e-06, + "loss": 0.7922, + "step": 7948 + }, + { + "epoch": 0.7, + "grad_norm": 7.309012799035033, + "learning_rate": 2.2053195760570563e-06, + "loss": 0.76, + "step": 7949 + }, + { + "epoch": 0.7, + "grad_norm": 3.3331573864730486, + "learning_rate": 2.2041402965445074e-06, + "loss": 0.5244, + "step": 7950 + }, + { + "epoch": 0.7, + "grad_norm": 2.5178509460679686, + "learning_rate": 2.2029612432682744e-06, + "loss": 0.4605, + "step": 7951 + }, + { + "epoch": 0.7, + "grad_norm": 7.44167637863019, + "learning_rate": 2.2017824163237656e-06, + "loss": 0.7003, + "step": 7952 + }, + { + "epoch": 0.7, + "grad_norm": 13.478860578333526, + "learning_rate": 2.2006038158063687e-06, + "loss": 0.7536, + "step": 7953 + }, + { + "epoch": 0.7, + "grad_norm": 9.974035410796242, + "learning_rate": 2.1994254418114524e-06, + "loss": 0.6231, + "step": 7954 + }, + { + "epoch": 0.7, + "grad_norm": 7.374931132581373, + "learning_rate": 2.198247294434373e-06, + "loss": 0.507, + "step": 7955 + }, + { + "epoch": 0.7, + "grad_norm": 13.39551928427904, + "learning_rate": 2.197069373770462e-06, + "loss": 0.8818, + "step": 7956 + }, + { + "epoch": 0.7, + "grad_norm": 5.964011991545457, + "learning_rate": 2.195891679915035e-06, + "loss": 0.8066, + "step": 7957 + }, + { + "epoch": 0.7, + "grad_norm": 11.709466716423625, + "learning_rate": 2.1947142129633884e-06, + "loss": 0.644, + "step": 7958 + }, + { + "epoch": 0.7, + "grad_norm": 7.345429900764151, + "learning_rate": 2.1935369730108014e-06, + "loss": 0.7898, + "step": 7959 + }, + { + "epoch": 0.7, + "grad_norm": 8.865776411684113, + "learning_rate": 2.1923599601525333e-06, + "loss": 0.736, + "step": 7960 + }, + { + "epoch": 0.7, + "grad_norm": 6.011696172522522, + "learning_rate": 2.1911831744838274e-06, + "loss": 0.6742, + "step": 7961 + }, + { + "epoch": 0.7, + "grad_norm": 7.254254246725763, + "learning_rate": 2.190006616099908e-06, + "loss": 0.7771, + "step": 7962 + }, + { + "epoch": 0.7, + "grad_norm": 6.987118784950883, + "learning_rate": 2.188830285095978e-06, + "loss": 0.7805, + "step": 7963 + }, + { + "epoch": 0.7, + "grad_norm": 2.714735669785119, + "learning_rate": 2.1876541815672257e-06, + "loss": 0.4662, + "step": 7964 + 
}, + { + "epoch": 0.7, + "grad_norm": 12.01587145125135, + "learning_rate": 2.186478305608819e-06, + "loss": 0.7403, + "step": 7965 + }, + { + "epoch": 0.7, + "grad_norm": 9.649577462080549, + "learning_rate": 2.1853026573159063e-06, + "loss": 0.7166, + "step": 7966 + }, + { + "epoch": 0.7, + "grad_norm": 10.432268345516595, + "learning_rate": 2.184127236783622e-06, + "loss": 0.8626, + "step": 7967 + }, + { + "epoch": 0.7, + "grad_norm": 8.785103165688549, + "learning_rate": 2.1829520441070775e-06, + "loss": 0.7062, + "step": 7968 + }, + { + "epoch": 0.7, + "grad_norm": 6.422180406755793, + "learning_rate": 2.181777079381368e-06, + "loss": 0.6343, + "step": 7969 + }, + { + "epoch": 0.7, + "grad_norm": 7.277681200081509, + "learning_rate": 2.180602342701569e-06, + "loss": 0.68, + "step": 7970 + }, + { + "epoch": 0.7, + "grad_norm": 19.930404817407236, + "learning_rate": 2.179427834162739e-06, + "loss": 0.7271, + "step": 7971 + }, + { + "epoch": 0.7, + "grad_norm": 13.082025171373175, + "learning_rate": 2.178253553859915e-06, + "loss": 0.7469, + "step": 7972 + }, + { + "epoch": 0.7, + "grad_norm": 6.047849782151473, + "learning_rate": 2.177079501888121e-06, + "loss": 0.7178, + "step": 7973 + }, + { + "epoch": 0.7, + "grad_norm": 12.983679886659305, + "learning_rate": 2.1759056783423587e-06, + "loss": 0.6684, + "step": 7974 + }, + { + "epoch": 0.7, + "grad_norm": 4.243752158382541, + "learning_rate": 2.1747320833176112e-06, + "loss": 0.8089, + "step": 7975 + }, + { + "epoch": 0.7, + "grad_norm": 8.664703932192092, + "learning_rate": 2.1735587169088435e-06, + "loss": 0.8623, + "step": 7976 + }, + { + "epoch": 0.7, + "grad_norm": 5.117878427217061, + "learning_rate": 2.1723855792110015e-06, + "loss": 0.6633, + "step": 7977 + }, + { + "epoch": 0.7, + "grad_norm": 10.159228949428282, + "learning_rate": 2.171212670319016e-06, + "loss": 0.6609, + "step": 7978 + }, + { + "epoch": 0.7, + "grad_norm": 2.512932969100755, + "learning_rate": 2.170039990327797e-06, + "loss": 0.4877, + "step": 7979 + }, + { + "epoch": 0.7, + "grad_norm": 7.60911005878027, + "learning_rate": 2.168867539332233e-06, + "loss": 0.7018, + "step": 7980 + }, + { + "epoch": 0.7, + "grad_norm": 9.070180525852711, + "learning_rate": 2.1676953174271997e-06, + "loss": 0.7635, + "step": 7981 + }, + { + "epoch": 0.7, + "grad_norm": 2.9241717484784986, + "learning_rate": 2.1665233247075497e-06, + "loss": 0.5125, + "step": 7982 + }, + { + "epoch": 0.7, + "grad_norm": 2.8934136474423346, + "learning_rate": 2.165351561268117e-06, + "loss": 0.4204, + "step": 7983 + }, + { + "epoch": 0.7, + "grad_norm": 8.113216312404866, + "learning_rate": 2.164180027203723e-06, + "loss": 0.6443, + "step": 7984 + }, + { + "epoch": 0.7, + "grad_norm": 11.65677724548709, + "learning_rate": 2.163008722609163e-06, + "loss": 0.7632, + "step": 7985 + }, + { + "epoch": 0.7, + "grad_norm": 9.784895830673378, + "learning_rate": 2.161837647579219e-06, + "loss": 0.7472, + "step": 7986 + }, + { + "epoch": 0.7, + "grad_norm": 10.801368716574885, + "learning_rate": 2.1606668022086517e-06, + "loss": 0.8094, + "step": 7987 + }, + { + "epoch": 0.7, + "grad_norm": 6.49863259137217, + "learning_rate": 2.1594961865922033e-06, + "loss": 0.7807, + "step": 7988 + }, + { + "epoch": 0.7, + "grad_norm": 6.6865974978387985, + "learning_rate": 2.1583258008245965e-06, + "loss": 0.6643, + "step": 7989 + }, + { + "epoch": 0.7, + "grad_norm": 4.8720807591502275, + "learning_rate": 2.1571556450005415e-06, + "loss": 0.6659, + "step": 7990 + }, + { + "epoch": 0.7, + "grad_norm": 
9.173209973454064, + "learning_rate": 2.1559857192147226e-06, + "loss": 0.7697, + "step": 7991 + }, + { + "epoch": 0.7, + "grad_norm": 9.557403170403102, + "learning_rate": 2.154816023561808e-06, + "loss": 0.8299, + "step": 7992 + }, + { + "epoch": 0.7, + "grad_norm": 8.25994245287881, + "learning_rate": 2.153646558136449e-06, + "loss": 0.7521, + "step": 7993 + }, + { + "epoch": 0.7, + "grad_norm": 20.119541767508696, + "learning_rate": 2.1524773230332733e-06, + "loss": 0.657, + "step": 7994 + }, + { + "epoch": 0.7, + "grad_norm": 7.456976845728732, + "learning_rate": 2.151308318346898e-06, + "loss": 0.9123, + "step": 7995 + }, + { + "epoch": 0.7, + "grad_norm": 3.7901423672081127, + "learning_rate": 2.1501395441719153e-06, + "loss": 0.5364, + "step": 7996 + }, + { + "epoch": 0.7, + "grad_norm": 11.222287821238934, + "learning_rate": 2.1489710006029e-06, + "loss": 0.6783, + "step": 7997 + }, + { + "epoch": 0.7, + "grad_norm": 12.73356537996995, + "learning_rate": 2.147802687734409e-06, + "loss": 0.7671, + "step": 7998 + }, + { + "epoch": 0.7, + "grad_norm": 5.020757284165625, + "learning_rate": 2.14663460566098e-06, + "loss": 0.849, + "step": 7999 + }, + { + "epoch": 0.7, + "grad_norm": 9.322477841890144, + "learning_rate": 2.14546675447713e-06, + "loss": 0.6366, + "step": 8000 + }, + { + "epoch": 0.7, + "grad_norm": 6.276944509466434, + "learning_rate": 2.1442991342773643e-06, + "loss": 0.7829, + "step": 8001 + }, + { + "epoch": 0.7, + "grad_norm": 11.762111108448847, + "learning_rate": 2.143131745156163e-06, + "loss": 0.7062, + "step": 8002 + }, + { + "epoch": 0.7, + "grad_norm": 5.946757071740263, + "learning_rate": 2.141964587207988e-06, + "loss": 0.6766, + "step": 8003 + }, + { + "epoch": 0.7, + "grad_norm": 7.201597392253819, + "learning_rate": 2.1407976605272847e-06, + "loss": 0.7161, + "step": 8004 + }, + { + "epoch": 0.7, + "grad_norm": 9.13301158812962, + "learning_rate": 2.1396309652084783e-06, + "loss": 0.6696, + "step": 8005 + }, + { + "epoch": 0.7, + "grad_norm": 9.421198920338806, + "learning_rate": 2.1384645013459755e-06, + "loss": 0.7463, + "step": 8006 + }, + { + "epoch": 0.7, + "grad_norm": 5.273834381226778, + "learning_rate": 2.1372982690341657e-06, + "loss": 0.6856, + "step": 8007 + }, + { + "epoch": 0.7, + "grad_norm": 6.9138216319741, + "learning_rate": 2.136132268367419e-06, + "loss": 0.6341, + "step": 8008 + }, + { + "epoch": 0.7, + "grad_norm": 2.931497120341922, + "learning_rate": 2.1349664994400853e-06, + "loss": 0.5387, + "step": 8009 + }, + { + "epoch": 0.7, + "grad_norm": 7.250181928413038, + "learning_rate": 2.1338009623464967e-06, + "loss": 0.6123, + "step": 8010 + }, + { + "epoch": 0.7, + "grad_norm": 43.417551091225526, + "learning_rate": 2.1326356571809662e-06, + "loss": 0.7612, + "step": 8011 + }, + { + "epoch": 0.7, + "grad_norm": 7.7190999998589, + "learning_rate": 2.1314705840377877e-06, + "loss": 0.6373, + "step": 8012 + }, + { + "epoch": 0.7, + "grad_norm": 5.846441443520751, + "learning_rate": 2.130305743011239e-06, + "loss": 0.6129, + "step": 8013 + }, + { + "epoch": 0.7, + "grad_norm": 2.686607628371759, + "learning_rate": 2.1291411341955757e-06, + "loss": 0.4762, + "step": 8014 + }, + { + "epoch": 0.7, + "grad_norm": 6.906053876386427, + "learning_rate": 2.1279767576850375e-06, + "loss": 0.661, + "step": 8015 + }, + { + "epoch": 0.7, + "grad_norm": 5.469312284320135, + "learning_rate": 2.126812613573842e-06, + "loss": 0.5681, + "step": 8016 + }, + { + "epoch": 0.7, + "grad_norm": 22.919225693806812, + "learning_rate": 2.125648701956189e-06, + 
"loss": 0.6269, + "step": 8017 + }, + { + "epoch": 0.7, + "grad_norm": 8.356637657040766, + "learning_rate": 2.124485022926264e-06, + "loss": 0.7625, + "step": 8018 + }, + { + "epoch": 0.7, + "grad_norm": 7.877954262222203, + "learning_rate": 2.1233215765782266e-06, + "loss": 0.6347, + "step": 8019 + }, + { + "epoch": 0.7, + "grad_norm": 6.91384865471579, + "learning_rate": 2.122158363006223e-06, + "loss": 0.7263, + "step": 8020 + }, + { + "epoch": 0.7, + "grad_norm": 6.733670863566629, + "learning_rate": 2.120995382304377e-06, + "loss": 0.6748, + "step": 8021 + }, + { + "epoch": 0.7, + "grad_norm": 4.748344809618262, + "learning_rate": 2.1198326345667964e-06, + "loss": 0.7069, + "step": 8022 + }, + { + "epoch": 0.7, + "grad_norm": 7.410083913789318, + "learning_rate": 2.118670119887566e-06, + "loss": 0.8674, + "step": 8023 + }, + { + "epoch": 0.7, + "grad_norm": 2.5016852586272096, + "learning_rate": 2.1175078383607584e-06, + "loss": 0.4772, + "step": 8024 + }, + { + "epoch": 0.7, + "grad_norm": 6.425976847896706, + "learning_rate": 2.116345790080422e-06, + "loss": 0.8055, + "step": 8025 + }, + { + "epoch": 0.7, + "grad_norm": 10.543858126285194, + "learning_rate": 2.1151839751405872e-06, + "loss": 0.8546, + "step": 8026 + }, + { + "epoch": 0.7, + "grad_norm": 3.0415048145904025, + "learning_rate": 2.1140223936352665e-06, + "loss": 0.5715, + "step": 8027 + }, + { + "epoch": 0.71, + "grad_norm": 21.120111163624653, + "learning_rate": 2.1128610456584535e-06, + "loss": 0.7499, + "step": 8028 + }, + { + "epoch": 0.71, + "grad_norm": 6.687240380259934, + "learning_rate": 2.1116999313041203e-06, + "loss": 0.6168, + "step": 8029 + }, + { + "epoch": 0.71, + "grad_norm": 6.431580581348549, + "learning_rate": 2.1105390506662252e-06, + "loss": 0.8059, + "step": 8030 + }, + { + "epoch": 0.71, + "grad_norm": 6.455349446672567, + "learning_rate": 2.109378403838705e-06, + "loss": 0.7423, + "step": 8031 + }, + { + "epoch": 0.71, + "grad_norm": 12.039265703860517, + "learning_rate": 2.1082179909154756e-06, + "loss": 0.6714, + "step": 8032 + }, + { + "epoch": 0.71, + "grad_norm": 8.949696578007327, + "learning_rate": 2.1070578119904357e-06, + "loss": 0.6994, + "step": 8033 + }, + { + "epoch": 0.71, + "grad_norm": 6.297983669794781, + "learning_rate": 2.1058978671574643e-06, + "loss": 0.5984, + "step": 8034 + }, + { + "epoch": 0.71, + "grad_norm": 9.674827545288283, + "learning_rate": 2.104738156510425e-06, + "loss": 0.7694, + "step": 8035 + }, + { + "epoch": 0.71, + "grad_norm": 7.417638687852908, + "learning_rate": 2.1035786801431573e-06, + "loss": 0.8065, + "step": 8036 + }, + { + "epoch": 0.71, + "grad_norm": 5.892705029999771, + "learning_rate": 2.1024194381494856e-06, + "loss": 0.7458, + "step": 8037 + }, + { + "epoch": 0.71, + "grad_norm": 5.794743364532036, + "learning_rate": 2.1012604306232133e-06, + "loss": 0.5806, + "step": 8038 + }, + { + "epoch": 0.71, + "grad_norm": 17.1984016825747, + "learning_rate": 2.1001016576581247e-06, + "loss": 0.7149, + "step": 8039 + }, + { + "epoch": 0.71, + "grad_norm": 11.759861821522579, + "learning_rate": 2.0989431193479846e-06, + "loss": 0.8164, + "step": 8040 + }, + { + "epoch": 0.71, + "grad_norm": 6.616398577582335, + "learning_rate": 2.097784815786543e-06, + "loss": 0.6444, + "step": 8041 + }, + { + "epoch": 0.71, + "grad_norm": 6.405738104039414, + "learning_rate": 2.0966267470675273e-06, + "loss": 0.6902, + "step": 8042 + }, + { + "epoch": 0.71, + "grad_norm": 8.021023430878493, + "learning_rate": 2.095468913284645e-06, + "loss": 0.7868, + "step": 8043 + 
}, + { + "epoch": 0.71, + "grad_norm": 14.925329440233513, + "learning_rate": 2.0943113145315875e-06, + "loss": 0.6108, + "step": 8044 + }, + { + "epoch": 0.71, + "grad_norm": 14.512462670010724, + "learning_rate": 2.093153950902024e-06, + "loss": 0.7167, + "step": 8045 + }, + { + "epoch": 0.71, + "grad_norm": 9.710170134962299, + "learning_rate": 2.091996822489607e-06, + "loss": 0.755, + "step": 8046 + }, + { + "epoch": 0.71, + "grad_norm": 4.163137197916116, + "learning_rate": 2.090839929387971e-06, + "loss": 0.6982, + "step": 8047 + }, + { + "epoch": 0.71, + "grad_norm": 9.841704271654239, + "learning_rate": 2.0896832716907288e-06, + "loss": 0.7488, + "step": 8048 + }, + { + "epoch": 0.71, + "grad_norm": 8.567589902429782, + "learning_rate": 2.0885268494914755e-06, + "loss": 0.742, + "step": 8049 + }, + { + "epoch": 0.71, + "grad_norm": 5.912484071748822, + "learning_rate": 2.0873706628837863e-06, + "loss": 0.744, + "step": 8050 + }, + { + "epoch": 0.71, + "grad_norm": 6.458107932799931, + "learning_rate": 2.086214711961218e-06, + "loss": 0.806, + "step": 8051 + }, + { + "epoch": 0.71, + "grad_norm": 2.312633003675402, + "learning_rate": 2.085058996817307e-06, + "loss": 0.5227, + "step": 8052 + }, + { + "epoch": 0.71, + "grad_norm": 8.498588790344442, + "learning_rate": 2.0839035175455748e-06, + "loss": 0.8339, + "step": 8053 + }, + { + "epoch": 0.71, + "grad_norm": 9.015494265986657, + "learning_rate": 2.082748274239519e-06, + "loss": 0.6681, + "step": 8054 + }, + { + "epoch": 0.71, + "grad_norm": 8.179024960455537, + "learning_rate": 2.08159326699262e-06, + "loss": 0.6366, + "step": 8055 + }, + { + "epoch": 0.71, + "grad_norm": 7.301934748022621, + "learning_rate": 2.0804384958983403e-06, + "loss": 0.6497, + "step": 8056 + }, + { + "epoch": 0.71, + "grad_norm": 7.698281178034605, + "learning_rate": 2.079283961050119e-06, + "loss": 0.7156, + "step": 8057 + }, + { + "epoch": 0.71, + "grad_norm": 8.042078518704914, + "learning_rate": 2.0781296625413827e-06, + "loss": 0.6526, + "step": 8058 + }, + { + "epoch": 0.71, + "grad_norm": 2.935476583187813, + "learning_rate": 2.0769756004655337e-06, + "loss": 0.5444, + "step": 8059 + }, + { + "epoch": 0.71, + "grad_norm": 7.153277899924052, + "learning_rate": 2.075821774915957e-06, + "loss": 0.6926, + "step": 8060 + }, + { + "epoch": 0.71, + "grad_norm": 2.8527578229345085, + "learning_rate": 2.0746681859860187e-06, + "loss": 0.4866, + "step": 8061 + }, + { + "epoch": 0.71, + "grad_norm": 8.195110212528183, + "learning_rate": 2.073514833769064e-06, + "loss": 0.7083, + "step": 8062 + }, + { + "epoch": 0.71, + "grad_norm": 7.40755843094126, + "learning_rate": 2.0723617183584196e-06, + "loss": 0.6778, + "step": 8063 + }, + { + "epoch": 0.71, + "grad_norm": 6.376141760522707, + "learning_rate": 2.071208839847397e-06, + "loss": 0.5042, + "step": 8064 + }, + { + "epoch": 0.71, + "grad_norm": 8.133711327069642, + "learning_rate": 2.070056198329283e-06, + "loss": 0.7934, + "step": 8065 + }, + { + "epoch": 0.71, + "grad_norm": 10.246078660243008, + "learning_rate": 2.068903793897348e-06, + "loss": 0.7931, + "step": 8066 + }, + { + "epoch": 0.71, + "grad_norm": 9.074455214688745, + "learning_rate": 2.0677516266448415e-06, + "loss": 0.8101, + "step": 8067 + }, + { + "epoch": 0.71, + "grad_norm": 9.8758030616572, + "learning_rate": 2.0665996966649967e-06, + "loss": 0.5445, + "step": 8068 + }, + { + "epoch": 0.71, + "grad_norm": 7.89528942085552, + "learning_rate": 2.065448004051023e-06, + "loss": 0.8103, + "step": 8069 + }, + { + "epoch": 0.71, + 
"grad_norm": 9.33479984113416, + "learning_rate": 2.064296548896117e-06, + "loss": 0.7312, + "step": 8070 + }, + { + "epoch": 0.71, + "grad_norm": 8.725929848058746, + "learning_rate": 2.063145331293451e-06, + "loss": 0.7031, + "step": 8071 + }, + { + "epoch": 0.71, + "grad_norm": 8.106005821754604, + "learning_rate": 2.06199435133618e-06, + "loss": 0.6301, + "step": 8072 + }, + { + "epoch": 0.71, + "grad_norm": 15.425589396828245, + "learning_rate": 2.060843609117439e-06, + "loss": 0.7419, + "step": 8073 + }, + { + "epoch": 0.71, + "grad_norm": 6.501259645337178, + "learning_rate": 2.059693104730342e-06, + "loss": 0.7281, + "step": 8074 + }, + { + "epoch": 0.71, + "grad_norm": 7.044760160346764, + "learning_rate": 2.0585428382679894e-06, + "loss": 0.6735, + "step": 8075 + }, + { + "epoch": 0.71, + "grad_norm": 9.766520269120527, + "learning_rate": 2.0573928098234584e-06, + "loss": 0.7337, + "step": 8076 + }, + { + "epoch": 0.71, + "grad_norm": 12.315377142810569, + "learning_rate": 2.056243019489806e-06, + "loss": 0.6972, + "step": 8077 + }, + { + "epoch": 0.71, + "grad_norm": 5.1293642230267045, + "learning_rate": 2.0550934673600713e-06, + "loss": 0.7393, + "step": 8078 + }, + { + "epoch": 0.71, + "grad_norm": 6.983788021147583, + "learning_rate": 2.0539441535272754e-06, + "loss": 0.8388, + "step": 8079 + }, + { + "epoch": 0.71, + "grad_norm": 6.018893158154406, + "learning_rate": 2.052795078084416e-06, + "loss": 0.6944, + "step": 8080 + }, + { + "epoch": 0.71, + "grad_norm": 3.554452800714091, + "learning_rate": 2.051646241124478e-06, + "loss": 0.4853, + "step": 8081 + }, + { + "epoch": 0.71, + "grad_norm": 8.637370213948673, + "learning_rate": 2.0504976427404222e-06, + "loss": 0.7786, + "step": 8082 + }, + { + "epoch": 0.71, + "grad_norm": 2.646231943687767, + "learning_rate": 2.049349283025191e-06, + "loss": 0.438, + "step": 8083 + }, + { + "epoch": 0.71, + "grad_norm": 6.605888388444852, + "learning_rate": 2.048201162071707e-06, + "loss": 0.7841, + "step": 8084 + }, + { + "epoch": 0.71, + "grad_norm": 2.6762849295015885, + "learning_rate": 2.0470532799728752e-06, + "loss": 0.511, + "step": 8085 + }, + { + "epoch": 0.71, + "grad_norm": 7.247340221546179, + "learning_rate": 2.0459056368215786e-06, + "loss": 0.7677, + "step": 8086 + }, + { + "epoch": 0.71, + "grad_norm": 2.8790874061456844, + "learning_rate": 2.0447582327106856e-06, + "loss": 0.4959, + "step": 8087 + }, + { + "epoch": 0.71, + "grad_norm": 10.603239337133246, + "learning_rate": 2.0436110677330408e-06, + "loss": 0.9156, + "step": 8088 + }, + { + "epoch": 0.71, + "grad_norm": 2.5489326235904897, + "learning_rate": 2.04246414198147e-06, + "loss": 0.5593, + "step": 8089 + }, + { + "epoch": 0.71, + "grad_norm": 8.696115851153431, + "learning_rate": 2.041317455548782e-06, + "loss": 0.782, + "step": 8090 + }, + { + "epoch": 0.71, + "grad_norm": 2.414043107716104, + "learning_rate": 2.0401710085277615e-06, + "loss": 0.4727, + "step": 8091 + }, + { + "epoch": 0.71, + "grad_norm": 8.517701661801413, + "learning_rate": 2.0390248010111816e-06, + "loss": 0.5463, + "step": 8092 + }, + { + "epoch": 0.71, + "grad_norm": 7.011695198878993, + "learning_rate": 2.0378788330917898e-06, + "loss": 0.6729, + "step": 8093 + }, + { + "epoch": 0.71, + "grad_norm": 10.564823281040526, + "learning_rate": 2.0367331048623153e-06, + "loss": 0.6914, + "step": 8094 + }, + { + "epoch": 0.71, + "grad_norm": 6.068994214270463, + "learning_rate": 2.035587616415469e-06, + "loss": 0.7648, + "step": 8095 + }, + { + "epoch": 0.71, + "grad_norm": 
5.929377652079401, + "learning_rate": 2.0344423678439413e-06, + "loss": 0.66, + "step": 8096 + }, + { + "epoch": 0.71, + "grad_norm": 7.058026527686949, + "learning_rate": 2.0332973592404027e-06, + "loss": 0.6842, + "step": 8097 + }, + { + "epoch": 0.71, + "grad_norm": 2.354827692024245, + "learning_rate": 2.0321525906975087e-06, + "loss": 0.5051, + "step": 8098 + }, + { + "epoch": 0.71, + "grad_norm": 9.559470618358944, + "learning_rate": 2.0310080623078905e-06, + "loss": 0.8006, + "step": 8099 + }, + { + "epoch": 0.71, + "grad_norm": 7.210749298810933, + "learning_rate": 2.0298637741641613e-06, + "loss": 0.6474, + "step": 8100 + }, + { + "epoch": 0.71, + "grad_norm": 6.761918240482704, + "learning_rate": 2.028719726358915e-06, + "loss": 0.7618, + "step": 8101 + }, + { + "epoch": 0.71, + "grad_norm": 26.512198554561085, + "learning_rate": 2.0275759189847253e-06, + "loss": 0.8854, + "step": 8102 + }, + { + "epoch": 0.71, + "grad_norm": 9.981820440644075, + "learning_rate": 2.0264323521341465e-06, + "loss": 0.6704, + "step": 8103 + }, + { + "epoch": 0.71, + "grad_norm": 6.369878762141758, + "learning_rate": 2.025289025899717e-06, + "loss": 0.7353, + "step": 8104 + }, + { + "epoch": 0.71, + "grad_norm": 5.089248667377884, + "learning_rate": 2.0241459403739515e-06, + "loss": 0.6643, + "step": 8105 + }, + { + "epoch": 0.71, + "grad_norm": 9.74436715263599, + "learning_rate": 2.0230030956493467e-06, + "loss": 0.6773, + "step": 8106 + }, + { + "epoch": 0.71, + "grad_norm": 7.293296066471462, + "learning_rate": 2.021860491818379e-06, + "loss": 0.6975, + "step": 8107 + }, + { + "epoch": 0.71, + "grad_norm": 8.783880947621007, + "learning_rate": 2.0207181289735073e-06, + "loss": 0.6374, + "step": 8108 + }, + { + "epoch": 0.71, + "grad_norm": 9.551091775768874, + "learning_rate": 2.0195760072071665e-06, + "loss": 0.8134, + "step": 8109 + }, + { + "epoch": 0.71, + "grad_norm": 6.949340568958892, + "learning_rate": 2.0184341266117792e-06, + "loss": 0.6619, + "step": 8110 + }, + { + "epoch": 0.71, + "grad_norm": 5.48322730983666, + "learning_rate": 2.017292487279742e-06, + "loss": 0.6901, + "step": 8111 + }, + { + "epoch": 0.71, + "grad_norm": 5.661027891596334, + "learning_rate": 2.016151089303436e-06, + "loss": 0.7025, + "step": 8112 + }, + { + "epoch": 0.71, + "grad_norm": 8.155669809325099, + "learning_rate": 2.015009932775221e-06, + "loss": 0.6133, + "step": 8113 + }, + { + "epoch": 0.71, + "grad_norm": 6.7967596685714025, + "learning_rate": 2.013869017787435e-06, + "loss": 0.6239, + "step": 8114 + }, + { + "epoch": 0.71, + "grad_norm": 8.624945830881659, + "learning_rate": 2.012728344432402e-06, + "loss": 0.7455, + "step": 8115 + }, + { + "epoch": 0.71, + "grad_norm": 12.56593749384099, + "learning_rate": 2.011587912802423e-06, + "loss": 0.5892, + "step": 8116 + }, + { + "epoch": 0.71, + "grad_norm": 6.501163306591654, + "learning_rate": 2.0104477229897783e-06, + "loss": 0.6837, + "step": 8117 + }, + { + "epoch": 0.71, + "grad_norm": 4.910192538384062, + "learning_rate": 2.009307775086732e-06, + "loss": 0.6646, + "step": 8118 + }, + { + "epoch": 0.71, + "grad_norm": 3.5620029740702117, + "learning_rate": 2.008168069185525e-06, + "loss": 0.5715, + "step": 8119 + }, + { + "epoch": 0.71, + "grad_norm": 5.218783924356844, + "learning_rate": 2.00702860537838e-06, + "loss": 0.6921, + "step": 8120 + }, + { + "epoch": 0.71, + "grad_norm": 2.857797082229984, + "learning_rate": 2.0058893837575027e-06, + "loss": 0.4687, + "step": 8121 + }, + { + "epoch": 0.71, + "grad_norm": 5.554542627889928, + 
"learning_rate": 2.0047504044150766e-06, + "loss": 0.4882, + "step": 8122 + }, + { + "epoch": 0.71, + "grad_norm": 7.854796820114138, + "learning_rate": 2.0036116674432653e-06, + "loss": 0.6435, + "step": 8123 + }, + { + "epoch": 0.71, + "grad_norm": 6.49030090323636, + "learning_rate": 2.002473172934214e-06, + "loss": 0.5751, + "step": 8124 + }, + { + "epoch": 0.71, + "grad_norm": 8.422899893632406, + "learning_rate": 2.001334920980047e-06, + "loss": 0.7768, + "step": 8125 + }, + { + "epoch": 0.71, + "grad_norm": 9.663966129882795, + "learning_rate": 2.000196911672869e-06, + "loss": 0.811, + "step": 8126 + }, + { + "epoch": 0.71, + "grad_norm": 4.920566503743946, + "learning_rate": 1.9990591451047674e-06, + "loss": 0.6763, + "step": 8127 + }, + { + "epoch": 0.71, + "grad_norm": 5.749719659593924, + "learning_rate": 1.997921621367809e-06, + "loss": 0.7098, + "step": 8128 + }, + { + "epoch": 0.71, + "grad_norm": 7.728763452612927, + "learning_rate": 1.996784340554039e-06, + "loss": 0.906, + "step": 8129 + }, + { + "epoch": 0.71, + "grad_norm": 8.276303634908773, + "learning_rate": 1.9956473027554846e-06, + "loss": 0.7269, + "step": 8130 + }, + { + "epoch": 0.71, + "grad_norm": 7.348624527471915, + "learning_rate": 1.9945105080641514e-06, + "loss": 0.5529, + "step": 8131 + }, + { + "epoch": 0.71, + "grad_norm": 13.515624140233756, + "learning_rate": 1.99337395657203e-06, + "loss": 0.8739, + "step": 8132 + }, + { + "epoch": 0.71, + "grad_norm": 16.624254625525015, + "learning_rate": 1.992237648371087e-06, + "loss": 0.9383, + "step": 8133 + }, + { + "epoch": 0.71, + "grad_norm": 6.378164132157378, + "learning_rate": 1.9911015835532703e-06, + "loss": 0.6716, + "step": 8134 + }, + { + "epoch": 0.71, + "grad_norm": 6.428236556539073, + "learning_rate": 1.9899657622105085e-06, + "loss": 0.6941, + "step": 8135 + }, + { + "epoch": 0.71, + "grad_norm": 11.523302727970185, + "learning_rate": 1.98883018443471e-06, + "loss": 0.7439, + "step": 8136 + }, + { + "epoch": 0.71, + "grad_norm": 7.930882859201439, + "learning_rate": 1.987694850317763e-06, + "loss": 0.6708, + "step": 8137 + }, + { + "epoch": 0.71, + "grad_norm": 9.305625978208765, + "learning_rate": 1.986559759951539e-06, + "loss": 0.6781, + "step": 8138 + }, + { + "epoch": 0.71, + "grad_norm": 14.881792660930556, + "learning_rate": 1.985424913427887e-06, + "loss": 0.7103, + "step": 8139 + }, + { + "epoch": 0.71, + "grad_norm": 10.504806864028147, + "learning_rate": 1.9842903108386363e-06, + "loss": 0.7472, + "step": 8140 + }, + { + "epoch": 0.71, + "grad_norm": 3.1412947156985327, + "learning_rate": 1.9831559522755976e-06, + "loss": 0.4659, + "step": 8141 + }, + { + "epoch": 0.72, + "grad_norm": 10.384654739959009, + "learning_rate": 1.9820218378305604e-06, + "loss": 0.6584, + "step": 8142 + }, + { + "epoch": 0.72, + "grad_norm": 5.981829959937305, + "learning_rate": 1.980887967595295e-06, + "loss": 0.6337, + "step": 8143 + }, + { + "epoch": 0.72, + "grad_norm": 7.39561661245173, + "learning_rate": 1.9797543416615543e-06, + "loss": 0.6807, + "step": 8144 + }, + { + "epoch": 0.72, + "grad_norm": 16.083641512571187, + "learning_rate": 1.9786209601210688e-06, + "loss": 0.6704, + "step": 8145 + }, + { + "epoch": 0.72, + "grad_norm": 9.745409763984542, + "learning_rate": 1.977487823065549e-06, + "loss": 0.6436, + "step": 8146 + }, + { + "epoch": 0.72, + "grad_norm": 8.358985214113172, + "learning_rate": 1.9763549305866877e-06, + "loss": 0.7587, + "step": 8147 + }, + { + "epoch": 0.72, + "grad_norm": 7.427833408695279, + "learning_rate": 
1.975222282776155e-06, + "loss": 0.687, + "step": 8148 + }, + { + "epoch": 0.72, + "grad_norm": 8.607491699806245, + "learning_rate": 1.974089879725602e-06, + "loss": 0.708, + "step": 8149 + }, + { + "epoch": 0.72, + "grad_norm": 10.391345410601696, + "learning_rate": 1.972957721526664e-06, + "loss": 0.7281, + "step": 8150 + }, + { + "epoch": 0.72, + "grad_norm": 5.743069062650508, + "learning_rate": 1.971825808270953e-06, + "loss": 0.6416, + "step": 8151 + }, + { + "epoch": 0.72, + "grad_norm": 10.544398309418394, + "learning_rate": 1.97069414005006e-06, + "loss": 0.6138, + "step": 8152 + }, + { + "epoch": 0.72, + "grad_norm": 2.432533640938879, + "learning_rate": 1.9695627169555582e-06, + "loss": 0.4817, + "step": 8153 + }, + { + "epoch": 0.72, + "grad_norm": 8.371078230466182, + "learning_rate": 1.9684315390789988e-06, + "loss": 0.7164, + "step": 8154 + }, + { + "epoch": 0.72, + "grad_norm": 8.164687455165216, + "learning_rate": 1.9673006065119176e-06, + "loss": 0.6836, + "step": 8155 + }, + { + "epoch": 0.72, + "grad_norm": 7.223822405113708, + "learning_rate": 1.9661699193458276e-06, + "loss": 0.6974, + "step": 8156 + }, + { + "epoch": 0.72, + "grad_norm": 11.33209539879255, + "learning_rate": 1.9650394776722208e-06, + "loss": 0.8794, + "step": 8157 + }, + { + "epoch": 0.72, + "grad_norm": 2.118233979919337, + "learning_rate": 1.963909281582571e-06, + "loss": 0.5711, + "step": 8158 + }, + { + "epoch": 0.72, + "grad_norm": 6.81094275513176, + "learning_rate": 1.962779331168332e-06, + "loss": 0.7786, + "step": 8159 + }, + { + "epoch": 0.72, + "grad_norm": 6.935670548743002, + "learning_rate": 1.9616496265209357e-06, + "loss": 0.6753, + "step": 8160 + }, + { + "epoch": 0.72, + "grad_norm": 11.471629956103097, + "learning_rate": 1.9605201677317993e-06, + "loss": 0.7405, + "step": 8161 + }, + { + "epoch": 0.72, + "grad_norm": 8.473733121735078, + "learning_rate": 1.9593909548923147e-06, + "loss": 0.6754, + "step": 8162 + }, + { + "epoch": 0.72, + "grad_norm": 2.6454441868960914, + "learning_rate": 1.9582619880938565e-06, + "loss": 0.4546, + "step": 8163 + }, + { + "epoch": 0.72, + "grad_norm": 7.270964283133673, + "learning_rate": 1.9571332674277783e-06, + "loss": 0.7269, + "step": 8164 + }, + { + "epoch": 0.72, + "grad_norm": 9.88848875432669, + "learning_rate": 1.956004792985415e-06, + "loss": 0.6877, + "step": 8165 + }, + { + "epoch": 0.72, + "grad_norm": 7.338865158467419, + "learning_rate": 1.954876564858078e-06, + "loss": 0.6804, + "step": 8166 + }, + { + "epoch": 0.72, + "grad_norm": 10.032246589736248, + "learning_rate": 1.9537485831370666e-06, + "loss": 0.7222, + "step": 8167 + }, + { + "epoch": 0.72, + "grad_norm": 8.37054914165336, + "learning_rate": 1.952620847913652e-06, + "loss": 0.8411, + "step": 8168 + }, + { + "epoch": 0.72, + "grad_norm": 9.065245724910357, + "learning_rate": 1.9514933592790898e-06, + "loss": 0.7722, + "step": 8169 + }, + { + "epoch": 0.72, + "grad_norm": 7.982301619942317, + "learning_rate": 1.950366117324614e-06, + "loss": 0.6024, + "step": 8170 + }, + { + "epoch": 0.72, + "grad_norm": 54.267102314011055, + "learning_rate": 1.9492391221414377e-06, + "loss": 0.977, + "step": 8171 + }, + { + "epoch": 0.72, + "grad_norm": 6.260924042325864, + "learning_rate": 1.948112373820758e-06, + "loss": 0.6795, + "step": 8172 + }, + { + "epoch": 0.72, + "grad_norm": 2.651594289398862, + "learning_rate": 1.946985872453749e-06, + "loss": 0.4455, + "step": 8173 + }, + { + "epoch": 0.72, + "grad_norm": 7.731082634098112, + "learning_rate": 1.9458596181315643e-06, + 
"loss": 0.6981, + "step": 8174 + }, + { + "epoch": 0.72, + "grad_norm": 5.877218884608836, + "learning_rate": 1.944733610945339e-06, + "loss": 0.6617, + "step": 8175 + }, + { + "epoch": 0.72, + "grad_norm": 9.08397611071007, + "learning_rate": 1.9436078509861884e-06, + "loss": 0.6924, + "step": 8176 + }, + { + "epoch": 0.72, + "grad_norm": 13.60616398889948, + "learning_rate": 1.942482338345204e-06, + "loss": 0.7939, + "step": 8177 + }, + { + "epoch": 0.72, + "grad_norm": 7.137240994678861, + "learning_rate": 1.941357073113465e-06, + "loss": 0.6826, + "step": 8178 + }, + { + "epoch": 0.72, + "grad_norm": 2.9794916587107427, + "learning_rate": 1.9402320553820237e-06, + "loss": 0.4983, + "step": 8179 + }, + { + "epoch": 0.72, + "grad_norm": 6.402113017121095, + "learning_rate": 1.9391072852419146e-06, + "loss": 0.6934, + "step": 8180 + }, + { + "epoch": 0.72, + "grad_norm": 2.8310388570792004, + "learning_rate": 1.937982762784153e-06, + "loss": 0.4918, + "step": 8181 + }, + { + "epoch": 0.72, + "grad_norm": 9.513410176755327, + "learning_rate": 1.936858488099733e-06, + "loss": 0.8034, + "step": 8182 + }, + { + "epoch": 0.72, + "grad_norm": 5.998860355811258, + "learning_rate": 1.935734461279626e-06, + "loss": 0.622, + "step": 8183 + }, + { + "epoch": 0.72, + "grad_norm": 11.976099057830668, + "learning_rate": 1.9346106824147913e-06, + "loss": 0.7842, + "step": 8184 + }, + { + "epoch": 0.72, + "grad_norm": 9.109304941404247, + "learning_rate": 1.9334871515961616e-06, + "loss": 0.8157, + "step": 8185 + }, + { + "epoch": 0.72, + "grad_norm": 5.885322992533147, + "learning_rate": 1.9323638689146513e-06, + "loss": 0.5936, + "step": 8186 + }, + { + "epoch": 0.72, + "grad_norm": 8.832693361251666, + "learning_rate": 1.9312408344611526e-06, + "loss": 0.6081, + "step": 8187 + }, + { + "epoch": 0.72, + "grad_norm": 8.147785876813119, + "learning_rate": 1.930118048326542e-06, + "loss": 0.7089, + "step": 8188 + }, + { + "epoch": 0.72, + "grad_norm": 10.656494880267548, + "learning_rate": 1.9289955106016705e-06, + "loss": 0.805, + "step": 8189 + }, + { + "epoch": 0.72, + "grad_norm": 15.937230730836749, + "learning_rate": 1.9278732213773756e-06, + "loss": 0.9058, + "step": 8190 + }, + { + "epoch": 0.72, + "grad_norm": 7.811782372167603, + "learning_rate": 1.926751180744469e-06, + "loss": 0.7486, + "step": 8191 + }, + { + "epoch": 0.72, + "grad_norm": 6.4185070374510955, + "learning_rate": 1.925629388793745e-06, + "loss": 0.6049, + "step": 8192 + }, + { + "epoch": 0.72, + "grad_norm": 10.189730783405523, + "learning_rate": 1.924507845615977e-06, + "loss": 0.7302, + "step": 8193 + }, + { + "epoch": 0.72, + "grad_norm": 7.773679767497121, + "learning_rate": 1.923386551301917e-06, + "loss": 0.6614, + "step": 8194 + }, + { + "epoch": 0.72, + "grad_norm": 8.588458227560094, + "learning_rate": 1.9222655059423008e-06, + "loss": 0.7415, + "step": 8195 + }, + { + "epoch": 0.72, + "grad_norm": 8.580589946070765, + "learning_rate": 1.9211447096278403e-06, + "loss": 0.9238, + "step": 8196 + }, + { + "epoch": 0.72, + "grad_norm": 8.85747401908089, + "learning_rate": 1.920024162449229e-06, + "loss": 0.7014, + "step": 8197 + }, + { + "epoch": 0.72, + "grad_norm": 11.19129661548461, + "learning_rate": 1.9189038644971385e-06, + "loss": 0.7752, + "step": 8198 + }, + { + "epoch": 0.72, + "grad_norm": 5.315410178054438, + "learning_rate": 1.9177838158622227e-06, + "loss": 0.6765, + "step": 8199 + }, + { + "epoch": 0.72, + "grad_norm": 14.074744908201392, + "learning_rate": 1.9166640166351114e-06, + "loss": 0.8563, + "step": 
8200 + }, + { + "epoch": 0.72, + "grad_norm": 8.087986885343554, + "learning_rate": 1.9155444669064204e-06, + "loss": 0.7398, + "step": 8201 + }, + { + "epoch": 0.72, + "grad_norm": 5.234020653626053, + "learning_rate": 1.9144251667667413e-06, + "loss": 0.5177, + "step": 8202 + }, + { + "epoch": 0.72, + "grad_norm": 13.35876732352042, + "learning_rate": 1.9133061163066447e-06, + "loss": 0.8245, + "step": 8203 + }, + { + "epoch": 0.72, + "grad_norm": 10.052784281650515, + "learning_rate": 1.9121873156166827e-06, + "loss": 0.6346, + "step": 8204 + }, + { + "epoch": 0.72, + "grad_norm": 9.078668096492377, + "learning_rate": 1.9110687647873866e-06, + "loss": 0.7593, + "step": 8205 + }, + { + "epoch": 0.72, + "grad_norm": 9.562704810309713, + "learning_rate": 1.9099504639092665e-06, + "loss": 0.8328, + "step": 8206 + }, + { + "epoch": 0.72, + "grad_norm": 10.544156537518456, + "learning_rate": 1.9088324130728164e-06, + "loss": 0.836, + "step": 8207 + }, + { + "epoch": 0.72, + "grad_norm": 8.030559675290094, + "learning_rate": 1.907714612368505e-06, + "loss": 0.7635, + "step": 8208 + }, + { + "epoch": 0.72, + "grad_norm": 7.1593475308012335, + "learning_rate": 1.9065970618867841e-06, + "loss": 0.6823, + "step": 8209 + }, + { + "epoch": 0.72, + "grad_norm": 7.712562946359963, + "learning_rate": 1.9054797617180836e-06, + "loss": 0.6647, + "step": 8210 + }, + { + "epoch": 0.72, + "grad_norm": 8.777304463296963, + "learning_rate": 1.9043627119528107e-06, + "loss": 0.6154, + "step": 8211 + }, + { + "epoch": 0.72, + "grad_norm": 11.421876946859774, + "learning_rate": 1.9032459126813602e-06, + "loss": 0.7169, + "step": 8212 + }, + { + "epoch": 0.72, + "grad_norm": 7.256629387132332, + "learning_rate": 1.9021293639940991e-06, + "loss": 0.6335, + "step": 8213 + }, + { + "epoch": 0.72, + "grad_norm": 10.126802815901005, + "learning_rate": 1.901013065981377e-06, + "loss": 0.8192, + "step": 8214 + }, + { + "epoch": 0.72, + "grad_norm": 17.215765502180858, + "learning_rate": 1.8998970187335225e-06, + "loss": 0.7141, + "step": 8215 + }, + { + "epoch": 0.72, + "grad_norm": 8.542083796112776, + "learning_rate": 1.8987812223408448e-06, + "loss": 0.7333, + "step": 8216 + }, + { + "epoch": 0.72, + "grad_norm": 8.898514327938202, + "learning_rate": 1.8976656768936298e-06, + "loss": 0.7301, + "step": 8217 + }, + { + "epoch": 0.72, + "grad_norm": 7.605311657593351, + "learning_rate": 1.8965503824821496e-06, + "loss": 0.8662, + "step": 8218 + }, + { + "epoch": 0.72, + "grad_norm": 13.048112288880667, + "learning_rate": 1.8954353391966502e-06, + "loss": 0.9007, + "step": 8219 + }, + { + "epoch": 0.72, + "grad_norm": 8.76107126177317, + "learning_rate": 1.894320547127359e-06, + "loss": 0.6567, + "step": 8220 + }, + { + "epoch": 0.72, + "grad_norm": 6.159546990591983, + "learning_rate": 1.893206006364483e-06, + "loss": 0.7415, + "step": 8221 + }, + { + "epoch": 0.72, + "grad_norm": 8.056551349532207, + "learning_rate": 1.892091716998209e-06, + "loss": 0.6873, + "step": 8222 + }, + { + "epoch": 0.72, + "grad_norm": 2.3075946311885764, + "learning_rate": 1.8909776791187017e-06, + "loss": 0.4854, + "step": 8223 + }, + { + "epoch": 0.72, + "grad_norm": 14.8086189880713, + "learning_rate": 1.8898638928161112e-06, + "loss": 0.6213, + "step": 8224 + }, + { + "epoch": 0.72, + "grad_norm": 28.534195480675695, + "learning_rate": 1.8887503581805605e-06, + "loss": 0.8559, + "step": 8225 + }, + { + "epoch": 0.72, + "grad_norm": 9.66428150461691, + "learning_rate": 1.8876370753021562e-06, + "loss": 0.7847, + "step": 8226 + }, + { + 
"epoch": 0.72, + "grad_norm": 6.07005988251029, + "learning_rate": 1.886524044270982e-06, + "loss": 0.6638, + "step": 8227 + }, + { + "epoch": 0.72, + "grad_norm": 7.27492232065414, + "learning_rate": 1.8854112651771034e-06, + "loss": 0.7648, + "step": 8228 + }, + { + "epoch": 0.72, + "grad_norm": 22.717380073732485, + "learning_rate": 1.8842987381105626e-06, + "loss": 0.8964, + "step": 8229 + }, + { + "epoch": 0.72, + "grad_norm": 7.130869579113625, + "learning_rate": 1.8831864631613872e-06, + "loss": 0.81, + "step": 8230 + }, + { + "epoch": 0.72, + "grad_norm": 7.28619044709792, + "learning_rate": 1.882074440419578e-06, + "loss": 0.7075, + "step": 8231 + }, + { + "epoch": 0.72, + "grad_norm": 7.336736882042022, + "learning_rate": 1.8809626699751194e-06, + "loss": 0.6608, + "step": 8232 + }, + { + "epoch": 0.72, + "grad_norm": 9.461703676459383, + "learning_rate": 1.879851151917973e-06, + "loss": 0.7137, + "step": 8233 + }, + { + "epoch": 0.72, + "grad_norm": 2.348688547237567, + "learning_rate": 1.878739886338079e-06, + "loss": 0.4927, + "step": 8234 + }, + { + "epoch": 0.72, + "grad_norm": 7.143027534109274, + "learning_rate": 1.8776288733253638e-06, + "loss": 0.6795, + "step": 8235 + }, + { + "epoch": 0.72, + "grad_norm": 12.251295512707705, + "learning_rate": 1.8765181129697263e-06, + "loss": 0.7052, + "step": 8236 + }, + { + "epoch": 0.72, + "grad_norm": 11.605980039208784, + "learning_rate": 1.8754076053610476e-06, + "loss": 0.6174, + "step": 8237 + }, + { + "epoch": 0.72, + "grad_norm": 6.422355315704915, + "learning_rate": 1.8742973505891876e-06, + "loss": 0.6674, + "step": 8238 + }, + { + "epoch": 0.72, + "grad_norm": 10.845335718437251, + "learning_rate": 1.8731873487439871e-06, + "loss": 0.7244, + "step": 8239 + }, + { + "epoch": 0.72, + "grad_norm": 5.789027661482534, + "learning_rate": 1.872077599915263e-06, + "loss": 0.6632, + "step": 8240 + }, + { + "epoch": 0.72, + "grad_norm": 10.157578321958288, + "learning_rate": 1.8709681041928185e-06, + "loss": 0.6689, + "step": 8241 + }, + { + "epoch": 0.72, + "grad_norm": 6.76574440367555, + "learning_rate": 1.8698588616664299e-06, + "loss": 0.6863, + "step": 8242 + }, + { + "epoch": 0.72, + "grad_norm": 6.619096998141231, + "learning_rate": 1.8687498724258556e-06, + "loss": 0.7506, + "step": 8243 + }, + { + "epoch": 0.72, + "grad_norm": 18.982373659487184, + "learning_rate": 1.8676411365608333e-06, + "loss": 0.5901, + "step": 8244 + }, + { + "epoch": 0.72, + "grad_norm": 9.957246750997736, + "learning_rate": 1.8665326541610795e-06, + "loss": 0.6701, + "step": 8245 + }, + { + "epoch": 0.72, + "grad_norm": 13.414607724586835, + "learning_rate": 1.8654244253162895e-06, + "loss": 0.6251, + "step": 8246 + }, + { + "epoch": 0.72, + "grad_norm": 7.324904998941813, + "learning_rate": 1.8643164501161427e-06, + "loss": 0.7422, + "step": 8247 + }, + { + "epoch": 0.72, + "grad_norm": 7.602882918111543, + "learning_rate": 1.8632087286502925e-06, + "loss": 0.763, + "step": 8248 + }, + { + "epoch": 0.72, + "grad_norm": 8.830668554616958, + "learning_rate": 1.8621012610083743e-06, + "loss": 0.7902, + "step": 8249 + }, + { + "epoch": 0.72, + "grad_norm": 3.0618685290896566, + "learning_rate": 1.8609940472800026e-06, + "loss": 0.4837, + "step": 8250 + }, + { + "epoch": 0.72, + "grad_norm": 3.1577623745141112, + "learning_rate": 1.8598870875547691e-06, + "loss": 0.6205, + "step": 8251 + }, + { + "epoch": 0.72, + "grad_norm": 2.6049459645352946, + "learning_rate": 1.8587803819222511e-06, + "loss": 0.5227, + "step": 8252 + }, + { + "epoch": 0.72, + 
"grad_norm": 5.483523708175944, + "learning_rate": 1.8576739304719993e-06, + "loss": 0.6958, + "step": 8253 + }, + { + "epoch": 0.72, + "grad_norm": 10.736680682135566, + "learning_rate": 1.8565677332935461e-06, + "loss": 0.7676, + "step": 8254 + }, + { + "epoch": 0.72, + "grad_norm": 10.703418428733663, + "learning_rate": 1.8554617904764033e-06, + "loss": 0.6256, + "step": 8255 + }, + { + "epoch": 0.73, + "grad_norm": 6.321818963406208, + "learning_rate": 1.8543561021100615e-06, + "loss": 0.7357, + "step": 8256 + }, + { + "epoch": 0.73, + "grad_norm": 9.081398214385747, + "learning_rate": 1.8532506682839901e-06, + "loss": 0.7795, + "step": 8257 + }, + { + "epoch": 0.73, + "grad_norm": 6.001463054326604, + "learning_rate": 1.8521454890876416e-06, + "loss": 0.6731, + "step": 8258 + }, + { + "epoch": 0.73, + "grad_norm": 9.698933462331565, + "learning_rate": 1.851040564610444e-06, + "loss": 0.6917, + "step": 8259 + }, + { + "epoch": 0.73, + "grad_norm": 5.886383190230203, + "learning_rate": 1.8499358949418062e-06, + "loss": 0.6054, + "step": 8260 + }, + { + "epoch": 0.73, + "grad_norm": 15.859143383078179, + "learning_rate": 1.8488314801711155e-06, + "loss": 0.801, + "step": 8261 + }, + { + "epoch": 0.73, + "grad_norm": 11.099737040486309, + "learning_rate": 1.84772732038774e-06, + "loss": 0.7702, + "step": 8262 + }, + { + "epoch": 0.73, + "grad_norm": 11.269959688932493, + "learning_rate": 1.8466234156810242e-06, + "loss": 0.7362, + "step": 8263 + }, + { + "epoch": 0.73, + "grad_norm": 2.9934011628941652, + "learning_rate": 1.845519766140298e-06, + "loss": 0.5182, + "step": 8264 + }, + { + "epoch": 0.73, + "grad_norm": 8.705763845181417, + "learning_rate": 1.8444163718548653e-06, + "loss": 0.8161, + "step": 8265 + }, + { + "epoch": 0.73, + "grad_norm": 7.626358170680674, + "learning_rate": 1.8433132329140107e-06, + "loss": 0.6797, + "step": 8266 + }, + { + "epoch": 0.73, + "grad_norm": 7.440832563916868, + "learning_rate": 1.8422103494069981e-06, + "loss": 0.7937, + "step": 8267 + }, + { + "epoch": 0.73, + "grad_norm": 7.060875740540013, + "learning_rate": 1.8411077214230716e-06, + "loss": 0.7484, + "step": 8268 + }, + { + "epoch": 0.73, + "grad_norm": 2.8013491308789633, + "learning_rate": 1.840005349051452e-06, + "loss": 0.5416, + "step": 8269 + }, + { + "epoch": 0.73, + "grad_norm": 5.708681879375964, + "learning_rate": 1.838903232381345e-06, + "loss": 0.7053, + "step": 8270 + }, + { + "epoch": 0.73, + "grad_norm": 8.650006607413458, + "learning_rate": 1.83780137150193e-06, + "loss": 0.716, + "step": 8271 + }, + { + "epoch": 0.73, + "grad_norm": 6.674119296560576, + "learning_rate": 1.8366997665023684e-06, + "loss": 0.8399, + "step": 8272 + }, + { + "epoch": 0.73, + "grad_norm": 7.302827115559472, + "learning_rate": 1.8355984174717994e-06, + "loss": 0.7957, + "step": 8273 + }, + { + "epoch": 0.73, + "grad_norm": 8.279371216585162, + "learning_rate": 1.8344973244993407e-06, + "loss": 0.7878, + "step": 8274 + }, + { + "epoch": 0.73, + "grad_norm": 7.840508053533849, + "learning_rate": 1.833396487674095e-06, + "loss": 0.7018, + "step": 8275 + }, + { + "epoch": 0.73, + "grad_norm": 2.824407255236933, + "learning_rate": 1.8322959070851388e-06, + "loss": 0.507, + "step": 8276 + }, + { + "epoch": 0.73, + "grad_norm": 6.687548925824558, + "learning_rate": 1.8311955828215277e-06, + "loss": 0.7802, + "step": 8277 + }, + { + "epoch": 0.73, + "grad_norm": 8.850961068500633, + "learning_rate": 1.8300955149722993e-06, + "loss": 0.8022, + "step": 8278 + }, + { + "epoch": 0.73, + "grad_norm": 
7.007938434712425, + "learning_rate": 1.8289957036264693e-06, + "loss": 0.6501, + "step": 8279 + }, + { + "epoch": 0.73, + "grad_norm": 5.165550142476091, + "learning_rate": 1.8278961488730302e-06, + "loss": 0.6252, + "step": 8280 + }, + { + "epoch": 0.73, + "grad_norm": 10.620168603067007, + "learning_rate": 1.82679685080096e-06, + "loss": 0.7244, + "step": 8281 + }, + { + "epoch": 0.73, + "grad_norm": 11.020102932679213, + "learning_rate": 1.82569780949921e-06, + "loss": 0.6096, + "step": 8282 + }, + { + "epoch": 0.73, + "grad_norm": 6.952643578365161, + "learning_rate": 1.8245990250567135e-06, + "loss": 0.7859, + "step": 8283 + }, + { + "epoch": 0.73, + "grad_norm": 6.7005745054380625, + "learning_rate": 1.8235004975623816e-06, + "loss": 0.7172, + "step": 8284 + }, + { + "epoch": 0.73, + "grad_norm": 7.117059104274886, + "learning_rate": 1.8224022271051056e-06, + "loss": 0.8233, + "step": 8285 + }, + { + "epoch": 0.73, + "grad_norm": 19.343735332793095, + "learning_rate": 1.8213042137737542e-06, + "loss": 0.7429, + "step": 8286 + }, + { + "epoch": 0.73, + "grad_norm": 13.547047014755432, + "learning_rate": 1.8202064576571798e-06, + "loss": 0.5863, + "step": 8287 + }, + { + "epoch": 0.73, + "grad_norm": 7.179591803023243, + "learning_rate": 1.8191089588442102e-06, + "loss": 0.7448, + "step": 8288 + }, + { + "epoch": 0.73, + "grad_norm": 8.129676380549812, + "learning_rate": 1.8180117174236516e-06, + "loss": 0.7066, + "step": 8289 + }, + { + "epoch": 0.73, + "grad_norm": 7.026452766159264, + "learning_rate": 1.8169147334842925e-06, + "loss": 0.7049, + "step": 8290 + }, + { + "epoch": 0.73, + "grad_norm": 3.7126633939169356, + "learning_rate": 1.8158180071148962e-06, + "loss": 0.4919, + "step": 8291 + }, + { + "epoch": 0.73, + "grad_norm": 7.967747500345474, + "learning_rate": 1.8147215384042121e-06, + "loss": 0.7837, + "step": 8292 + }, + { + "epoch": 0.73, + "grad_norm": 2.4578359469048636, + "learning_rate": 1.8136253274409626e-06, + "loss": 0.524, + "step": 8293 + }, + { + "epoch": 0.73, + "grad_norm": 5.5732119159740865, + "learning_rate": 1.8125293743138516e-06, + "loss": 0.5952, + "step": 8294 + }, + { + "epoch": 0.73, + "grad_norm": 11.954911298615745, + "learning_rate": 1.811433679111561e-06, + "loss": 0.7212, + "step": 8295 + }, + { + "epoch": 0.73, + "grad_norm": 6.587901813872472, + "learning_rate": 1.8103382419227538e-06, + "loss": 0.593, + "step": 8296 + }, + { + "epoch": 0.73, + "grad_norm": 8.9787218812222, + "learning_rate": 1.8092430628360686e-06, + "loss": 0.8155, + "step": 8297 + }, + { + "epoch": 0.73, + "grad_norm": 4.883242805654862, + "learning_rate": 1.8081481419401286e-06, + "loss": 0.5549, + "step": 8298 + }, + { + "epoch": 0.73, + "grad_norm": 5.635412923415014, + "learning_rate": 1.8070534793235318e-06, + "loss": 0.699, + "step": 8299 + }, + { + "epoch": 0.73, + "grad_norm": 8.183901561980452, + "learning_rate": 1.8059590750748557e-06, + "loss": 0.8167, + "step": 8300 + }, + { + "epoch": 0.73, + "grad_norm": 5.73257260680639, + "learning_rate": 1.8048649292826587e-06, + "loss": 0.6941, + "step": 8301 + }, + { + "epoch": 0.73, + "grad_norm": 12.778879841630578, + "learning_rate": 1.8037710420354765e-06, + "loss": 0.7303, + "step": 8302 + }, + { + "epoch": 0.73, + "grad_norm": 6.342553276337215, + "learning_rate": 1.802677413421824e-06, + "loss": 0.6747, + "step": 8303 + }, + { + "epoch": 0.73, + "grad_norm": 8.436782642611888, + "learning_rate": 1.8015840435301974e-06, + "loss": 0.721, + "step": 8304 + }, + { + "epoch": 0.73, + "grad_norm": 11.042426434612802, + 
"learning_rate": 1.8004909324490705e-06, + "loss": 0.7132, + "step": 8305 + }, + { + "epoch": 0.73, + "grad_norm": 8.95694719631494, + "learning_rate": 1.7993980802668947e-06, + "loss": 0.7778, + "step": 8306 + }, + { + "epoch": 0.73, + "grad_norm": 16.34393687718759, + "learning_rate": 1.7983054870721023e-06, + "loss": 0.6367, + "step": 8307 + }, + { + "epoch": 0.73, + "grad_norm": 4.6667617201479095, + "learning_rate": 1.7972131529531045e-06, + "loss": 0.5833, + "step": 8308 + }, + { + "epoch": 0.73, + "grad_norm": 9.48038682200917, + "learning_rate": 1.796121077998289e-06, + "loss": 0.7993, + "step": 8309 + }, + { + "epoch": 0.73, + "grad_norm": 20.752122669611026, + "learning_rate": 1.795029262296028e-06, + "loss": 0.8705, + "step": 8310 + }, + { + "epoch": 0.73, + "grad_norm": 25.716118050423677, + "learning_rate": 1.7939377059346686e-06, + "loss": 0.6589, + "step": 8311 + }, + { + "epoch": 0.73, + "grad_norm": 12.625094880477144, + "learning_rate": 1.7928464090025361e-06, + "loss": 0.9279, + "step": 8312 + }, + { + "epoch": 0.73, + "grad_norm": 9.4598588943753, + "learning_rate": 1.7917553715879376e-06, + "loss": 0.7235, + "step": 8313 + }, + { + "epoch": 0.73, + "grad_norm": 6.5714357700481285, + "learning_rate": 1.7906645937791567e-06, + "loss": 0.7322, + "step": 8314 + }, + { + "epoch": 0.73, + "grad_norm": 11.64254652224199, + "learning_rate": 1.789574075664459e-06, + "loss": 0.8106, + "step": 8315 + }, + { + "epoch": 0.73, + "grad_norm": 5.547001312348773, + "learning_rate": 1.788483817332088e-06, + "loss": 0.688, + "step": 8316 + }, + { + "epoch": 0.73, + "grad_norm": 8.161363385222172, + "learning_rate": 1.787393818870264e-06, + "loss": 0.6584, + "step": 8317 + }, + { + "epoch": 0.73, + "grad_norm": 9.479386153329227, + "learning_rate": 1.7863040803671876e-06, + "loss": 0.6292, + "step": 8318 + }, + { + "epoch": 0.73, + "grad_norm": 3.112716020122786, + "learning_rate": 1.78521460191104e-06, + "loss": 0.4567, + "step": 8319 + }, + { + "epoch": 0.73, + "grad_norm": 16.501082885021404, + "learning_rate": 1.784125383589978e-06, + "loss": 0.7057, + "step": 8320 + }, + { + "epoch": 0.73, + "grad_norm": 1.9417759993614574, + "learning_rate": 1.7830364254921418e-06, + "loss": 0.5186, + "step": 8321 + }, + { + "epoch": 0.73, + "grad_norm": 11.156783018853192, + "learning_rate": 1.781947727705647e-06, + "loss": 0.8414, + "step": 8322 + }, + { + "epoch": 0.73, + "grad_norm": 8.04977409519038, + "learning_rate": 1.7808592903185885e-06, + "loss": 0.7699, + "step": 8323 + }, + { + "epoch": 0.73, + "grad_norm": 8.848908783583036, + "learning_rate": 1.779771113419042e-06, + "loss": 0.7827, + "step": 8324 + }, + { + "epoch": 0.73, + "grad_norm": 10.663884777505299, + "learning_rate": 1.7786831970950608e-06, + "loss": 0.8173, + "step": 8325 + }, + { + "epoch": 0.73, + "grad_norm": 3.712354537545698, + "learning_rate": 1.7775955414346746e-06, + "loss": 0.6829, + "step": 8326 + }, + { + "epoch": 0.73, + "grad_norm": 10.560900046469957, + "learning_rate": 1.7765081465258988e-06, + "loss": 0.8142, + "step": 8327 + }, + { + "epoch": 0.73, + "grad_norm": 11.749481238324982, + "learning_rate": 1.7754210124567216e-06, + "loss": 0.7746, + "step": 8328 + }, + { + "epoch": 0.73, + "grad_norm": 6.1662176539944085, + "learning_rate": 1.7743341393151132e-06, + "loss": 0.7746, + "step": 8329 + }, + { + "epoch": 0.73, + "grad_norm": 8.039826661460681, + "learning_rate": 1.7732475271890198e-06, + "loss": 0.6935, + "step": 8330 + }, + { + "epoch": 0.73, + "grad_norm": 6.007438242991735, + "learning_rate": 
1.7721611761663676e-06, + "loss": 0.668, + "step": 8331 + }, + { + "epoch": 0.73, + "grad_norm": 10.823440635514938, + "learning_rate": 1.7710750863350652e-06, + "loss": 0.6921, + "step": 8332 + }, + { + "epoch": 0.73, + "grad_norm": 15.425732563488683, + "learning_rate": 1.7699892577829963e-06, + "loss": 0.9564, + "step": 8333 + }, + { + "epoch": 0.73, + "grad_norm": 11.106896209814549, + "learning_rate": 1.7689036905980234e-06, + "loss": 0.6376, + "step": 8334 + }, + { + "epoch": 0.73, + "grad_norm": 6.618719491276665, + "learning_rate": 1.7678183848679892e-06, + "loss": 0.658, + "step": 8335 + }, + { + "epoch": 0.73, + "grad_norm": 6.038214526727057, + "learning_rate": 1.7667333406807153e-06, + "loss": 0.7161, + "step": 8336 + }, + { + "epoch": 0.73, + "grad_norm": 8.33691428528028, + "learning_rate": 1.7656485581239991e-06, + "loss": 0.6427, + "step": 8337 + }, + { + "epoch": 0.73, + "grad_norm": 8.353517635431393, + "learning_rate": 1.7645640372856232e-06, + "loss": 0.6449, + "step": 8338 + }, + { + "epoch": 0.73, + "grad_norm": 8.020436483922643, + "learning_rate": 1.7634797782533436e-06, + "loss": 0.7365, + "step": 8339 + }, + { + "epoch": 0.73, + "grad_norm": 9.106022542256731, + "learning_rate": 1.7623957811148967e-06, + "loss": 0.7165, + "step": 8340 + }, + { + "epoch": 0.73, + "grad_norm": 5.7225629367585364, + "learning_rate": 1.7613120459579974e-06, + "loss": 0.6297, + "step": 8341 + }, + { + "epoch": 0.73, + "grad_norm": 7.099446207270211, + "learning_rate": 1.7602285728703405e-06, + "loss": 0.5903, + "step": 8342 + }, + { + "epoch": 0.73, + "grad_norm": 12.039021039768235, + "learning_rate": 1.7591453619395964e-06, + "loss": 0.6711, + "step": 8343 + }, + { + "epoch": 0.73, + "grad_norm": 3.680958410122508, + "learning_rate": 1.7580624132534207e-06, + "loss": 0.4765, + "step": 8344 + }, + { + "epoch": 0.73, + "grad_norm": 7.283566731966688, + "learning_rate": 1.756979726899442e-06, + "loss": 0.8039, + "step": 8345 + }, + { + "epoch": 0.73, + "grad_norm": 2.6875031292313887, + "learning_rate": 1.7558973029652687e-06, + "loss": 0.4511, + "step": 8346 + }, + { + "epoch": 0.73, + "grad_norm": 7.308616539428295, + "learning_rate": 1.7548151415384894e-06, + "loss": 0.5645, + "step": 8347 + }, + { + "epoch": 0.73, + "grad_norm": 2.465574125183435, + "learning_rate": 1.7537332427066717e-06, + "loss": 0.5713, + "step": 8348 + }, + { + "epoch": 0.73, + "grad_norm": 6.380566860472206, + "learning_rate": 1.7526516065573574e-06, + "loss": 0.7402, + "step": 8349 + }, + { + "epoch": 0.73, + "grad_norm": 8.641935553982922, + "learning_rate": 1.7515702331780753e-06, + "loss": 0.7647, + "step": 8350 + }, + { + "epoch": 0.73, + "grad_norm": 10.988144178664621, + "learning_rate": 1.750489122656327e-06, + "loss": 0.779, + "step": 8351 + }, + { + "epoch": 0.73, + "grad_norm": 10.057313666561024, + "learning_rate": 1.7494082750795931e-06, + "loss": 0.7828, + "step": 8352 + }, + { + "epoch": 0.73, + "grad_norm": 5.291375827013235, + "learning_rate": 1.7483276905353347e-06, + "loss": 0.685, + "step": 8353 + }, + { + "epoch": 0.73, + "grad_norm": 10.29624076670107, + "learning_rate": 1.7472473691109886e-06, + "loss": 0.8162, + "step": 8354 + }, + { + "epoch": 0.73, + "grad_norm": 9.119256262056807, + "learning_rate": 1.746167310893977e-06, + "loss": 0.6641, + "step": 8355 + }, + { + "epoch": 0.73, + "grad_norm": 32.599742963650364, + "learning_rate": 1.745087515971694e-06, + "loss": 0.6343, + "step": 8356 + }, + { + "epoch": 0.73, + "grad_norm": 12.154894207953458, + "learning_rate": 
1.7440079844315145e-06, + "loss": 0.7878, + "step": 8357 + }, + { + "epoch": 0.73, + "grad_norm": 7.2203227298337564, + "learning_rate": 1.7429287163607933e-06, + "loss": 0.6387, + "step": 8358 + }, + { + "epoch": 0.73, + "grad_norm": 6.3816525856312465, + "learning_rate": 1.7418497118468625e-06, + "loss": 0.6222, + "step": 8359 + }, + { + "epoch": 0.73, + "grad_norm": 3.3870858443054854, + "learning_rate": 1.7407709709770316e-06, + "loss": 0.4515, + "step": 8360 + }, + { + "epoch": 0.73, + "grad_norm": 9.394835427237556, + "learning_rate": 1.7396924938385933e-06, + "loss": 0.6299, + "step": 8361 + }, + { + "epoch": 0.73, + "grad_norm": 5.63105224740166, + "learning_rate": 1.738614280518816e-06, + "loss": 0.5081, + "step": 8362 + }, + { + "epoch": 0.73, + "grad_norm": 14.00919324386235, + "learning_rate": 1.737536331104946e-06, + "loss": 0.8331, + "step": 8363 + }, + { + "epoch": 0.73, + "grad_norm": 7.2286034298426, + "learning_rate": 1.7364586456842091e-06, + "loss": 0.6944, + "step": 8364 + }, + { + "epoch": 0.73, + "grad_norm": 6.740646822288422, + "learning_rate": 1.73538122434381e-06, + "loss": 0.6938, + "step": 8365 + }, + { + "epoch": 0.73, + "grad_norm": 2.44733187910827, + "learning_rate": 1.7343040671709299e-06, + "loss": 0.5215, + "step": 8366 + }, + { + "epoch": 0.73, + "grad_norm": 14.340033297071459, + "learning_rate": 1.7332271742527346e-06, + "loss": 0.6218, + "step": 8367 + }, + { + "epoch": 0.73, + "grad_norm": 13.870753144053085, + "learning_rate": 1.7321505456763621e-06, + "loss": 0.6786, + "step": 8368 + }, + { + "epoch": 0.73, + "grad_norm": 5.532164209742208, + "learning_rate": 1.7310741815289317e-06, + "loss": 0.7389, + "step": 8369 + }, + { + "epoch": 0.74, + "grad_norm": 10.128190154318629, + "learning_rate": 1.729998081897541e-06, + "loss": 0.656, + "step": 8370 + }, + { + "epoch": 0.74, + "grad_norm": 7.864504216789701, + "learning_rate": 1.728922246869264e-06, + "loss": 0.7254, + "step": 8371 + }, + { + "epoch": 0.74, + "grad_norm": 12.88292133707278, + "learning_rate": 1.7278466765311597e-06, + "loss": 0.7096, + "step": 8372 + }, + { + "epoch": 0.74, + "grad_norm": 10.568279176732416, + "learning_rate": 1.7267713709702593e-06, + "loss": 0.8145, + "step": 8373 + }, + { + "epoch": 0.74, + "grad_norm": 10.983276617089702, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.7406, + "step": 8374 + }, + { + "epoch": 0.74, + "grad_norm": 9.707455754882762, + "learning_rate": 1.724621554528097e-06, + "loss": 0.8272, + "step": 8375 + }, + { + "epoch": 0.74, + "grad_norm": 5.97667240878277, + "learning_rate": 1.7235470438207946e-06, + "loss": 0.6721, + "step": 8376 + }, + { + "epoch": 0.74, + "grad_norm": 7.52569568956844, + "learning_rate": 1.7224727982386137e-06, + "loss": 0.7351, + "step": 8377 + }, + { + "epoch": 0.74, + "grad_norm": 7.141787888162531, + "learning_rate": 1.7213988178684832e-06, + "loss": 0.6704, + "step": 8378 + }, + { + "epoch": 0.74, + "grad_norm": 12.134448490241484, + "learning_rate": 1.7203251027973073e-06, + "loss": 0.7411, + "step": 8379 + }, + { + "epoch": 0.74, + "grad_norm": 11.527851150590596, + "learning_rate": 1.7192516531119685e-06, + "loss": 0.7107, + "step": 8380 + }, + { + "epoch": 0.74, + "grad_norm": 8.395824172221035, + "learning_rate": 1.7181784688993285e-06, + "loss": 0.5885, + "step": 8381 + }, + { + "epoch": 0.74, + "grad_norm": 2.5013408327976063, + "learning_rate": 1.717105550246228e-06, + "loss": 0.495, + "step": 8382 + }, + { + "epoch": 0.74, + "grad_norm": 8.438973947784797, + "learning_rate": 1.7160328972394835e-06, 
+ "loss": 0.7144, + "step": 8383 + }, + { + "epoch": 0.74, + "grad_norm": 6.42990845753634, + "learning_rate": 1.7149605099658968e-06, + "loss": 0.6583, + "step": 8384 + }, + { + "epoch": 0.74, + "grad_norm": 15.526520550587366, + "learning_rate": 1.7138883885122405e-06, + "loss": 0.7782, + "step": 8385 + }, + { + "epoch": 0.74, + "grad_norm": 6.8792435149450535, + "learning_rate": 1.7128165329652696e-06, + "loss": 0.7968, + "step": 8386 + }, + { + "epoch": 0.74, + "grad_norm": 7.978528720042928, + "learning_rate": 1.7117449434117172e-06, + "loss": 0.621, + "step": 8387 + }, + { + "epoch": 0.74, + "grad_norm": 8.939038424568208, + "learning_rate": 1.7106736199382928e-06, + "loss": 0.6704, + "step": 8388 + }, + { + "epoch": 0.74, + "grad_norm": 10.505255218415154, + "learning_rate": 1.7096025626316887e-06, + "loss": 0.7015, + "step": 8389 + }, + { + "epoch": 0.74, + "grad_norm": 3.2841394954814924, + "learning_rate": 1.7085317715785722e-06, + "loss": 0.5461, + "step": 8390 + }, + { + "epoch": 0.74, + "grad_norm": 102.54124235664405, + "learning_rate": 1.70746124686559e-06, + "loss": 0.7273, + "step": 8391 + }, + { + "epoch": 0.74, + "grad_norm": 15.152776603187558, + "learning_rate": 1.706390988579366e-06, + "loss": 0.6293, + "step": 8392 + }, + { + "epoch": 0.74, + "grad_norm": 8.622066809928754, + "learning_rate": 1.7053209968065053e-06, + "loss": 0.7339, + "step": 8393 + }, + { + "epoch": 0.74, + "grad_norm": 8.521664075480006, + "learning_rate": 1.7042512716335873e-06, + "loss": 0.689, + "step": 8394 + }, + { + "epoch": 0.74, + "grad_norm": 9.483589597862613, + "learning_rate": 1.703181813147176e-06, + "loss": 0.6793, + "step": 8395 + }, + { + "epoch": 0.74, + "grad_norm": 12.844805095829942, + "learning_rate": 1.702112621433808e-06, + "loss": 0.7231, + "step": 8396 + }, + { + "epoch": 0.74, + "grad_norm": 3.7054524378731153, + "learning_rate": 1.7010436965800014e-06, + "loss": 0.4662, + "step": 8397 + }, + { + "epoch": 0.74, + "grad_norm": 5.272371252054157, + "learning_rate": 1.6999750386722508e-06, + "loss": 0.789, + "step": 8398 + }, + { + "epoch": 0.74, + "grad_norm": 15.372010415289932, + "learning_rate": 1.6989066477970311e-06, + "loss": 0.6013, + "step": 8399 + }, + { + "epoch": 0.74, + "grad_norm": 8.696531632921108, + "learning_rate": 1.697838524040793e-06, + "loss": 0.7604, + "step": 8400 + }, + { + "epoch": 0.74, + "grad_norm": 9.844964052060604, + "learning_rate": 1.6967706674899698e-06, + "loss": 0.6411, + "step": 8401 + }, + { + "epoch": 0.74, + "grad_norm": 6.674566282308897, + "learning_rate": 1.6957030782309697e-06, + "loss": 0.7188, + "step": 8402 + }, + { + "epoch": 0.74, + "grad_norm": 3.24355008013504, + "learning_rate": 1.6946357563501803e-06, + "loss": 0.5229, + "step": 8403 + }, + { + "epoch": 0.74, + "grad_norm": 4.607811346630351, + "learning_rate": 1.693568701933967e-06, + "loss": 0.7295, + "step": 8404 + }, + { + "epoch": 0.74, + "grad_norm": 10.734069219977972, + "learning_rate": 1.6925019150686744e-06, + "loss": 0.7948, + "step": 8405 + }, + { + "epoch": 0.74, + "grad_norm": 9.766753006856899, + "learning_rate": 1.6914353958406239e-06, + "loss": 0.8585, + "step": 8406 + }, + { + "epoch": 0.74, + "grad_norm": 3.1635784417728807, + "learning_rate": 1.6903691443361186e-06, + "loss": 0.4825, + "step": 8407 + }, + { + "epoch": 0.74, + "grad_norm": 8.356331713339515, + "learning_rate": 1.6893031606414372e-06, + "loss": 0.7265, + "step": 8408 + }, + { + "epoch": 0.74, + "grad_norm": 2.685735905082464, + "learning_rate": 1.6882374448428368e-06, + "loss": 0.5156, + 
"step": 8409 + }, + { + "epoch": 0.74, + "grad_norm": 5.902183120397703, + "learning_rate": 1.6871719970265537e-06, + "loss": 0.6924, + "step": 8410 + }, + { + "epoch": 0.74, + "grad_norm": 13.636513227314694, + "learning_rate": 1.6861068172788004e-06, + "loss": 0.8625, + "step": 8411 + }, + { + "epoch": 0.74, + "grad_norm": 7.550669564532233, + "learning_rate": 1.6850419056857732e-06, + "loss": 0.668, + "step": 8412 + }, + { + "epoch": 0.74, + "grad_norm": 11.580216107389832, + "learning_rate": 1.6839772623336397e-06, + "loss": 0.7467, + "step": 8413 + }, + { + "epoch": 0.74, + "grad_norm": 8.594250628568377, + "learning_rate": 1.6829128873085509e-06, + "loss": 0.5686, + "step": 8414 + }, + { + "epoch": 0.74, + "grad_norm": 6.736053649095858, + "learning_rate": 1.6818487806966338e-06, + "loss": 0.6646, + "step": 8415 + }, + { + "epoch": 0.74, + "grad_norm": 10.115211156775985, + "learning_rate": 1.6807849425839933e-06, + "loss": 0.6952, + "step": 8416 + }, + { + "epoch": 0.74, + "grad_norm": 17.74926001792039, + "learning_rate": 1.679721373056713e-06, + "loss": 0.8544, + "step": 8417 + }, + { + "epoch": 0.74, + "grad_norm": 16.871612226757442, + "learning_rate": 1.6786580722008572e-06, + "loss": 0.8561, + "step": 8418 + }, + { + "epoch": 0.74, + "grad_norm": 10.540533935027424, + "learning_rate": 1.677595040102466e-06, + "loss": 0.68, + "step": 8419 + }, + { + "epoch": 0.74, + "grad_norm": 7.692333898875996, + "learning_rate": 1.6765322768475572e-06, + "loss": 0.6917, + "step": 8420 + }, + { + "epoch": 0.74, + "grad_norm": 6.608838322484807, + "learning_rate": 1.6754697825221284e-06, + "loss": 0.7742, + "step": 8421 + }, + { + "epoch": 0.74, + "grad_norm": 7.496857784054018, + "learning_rate": 1.6744075572121549e-06, + "loss": 0.6921, + "step": 8422 + }, + { + "epoch": 0.74, + "grad_norm": 7.531659009905924, + "learning_rate": 1.673345601003588e-06, + "loss": 0.6176, + "step": 8423 + }, + { + "epoch": 0.74, + "grad_norm": 8.34646863935619, + "learning_rate": 1.6722839139823632e-06, + "loss": 0.6704, + "step": 8424 + }, + { + "epoch": 0.74, + "grad_norm": 11.722683965090887, + "learning_rate": 1.6712224962343887e-06, + "loss": 0.8667, + "step": 8425 + }, + { + "epoch": 0.74, + "grad_norm": 9.812886363594258, + "learning_rate": 1.6701613478455532e-06, + "loss": 0.7113, + "step": 8426 + }, + { + "epoch": 0.74, + "grad_norm": 9.741806167767784, + "learning_rate": 1.669100468901722e-06, + "loss": 0.6719, + "step": 8427 + }, + { + "epoch": 0.74, + "grad_norm": 11.563413470616592, + "learning_rate": 1.6680398594887387e-06, + "loss": 0.5685, + "step": 8428 + }, + { + "epoch": 0.74, + "grad_norm": 7.819948285433203, + "learning_rate": 1.6669795196924293e-06, + "loss": 0.7239, + "step": 8429 + }, + { + "epoch": 0.74, + "grad_norm": 11.784795833409106, + "learning_rate": 1.6659194495985926e-06, + "loss": 0.8922, + "step": 8430 + }, + { + "epoch": 0.74, + "grad_norm": 9.628885380277868, + "learning_rate": 1.664859649293008e-06, + "loss": 0.7051, + "step": 8431 + }, + { + "epoch": 0.74, + "grad_norm": 8.98336029403342, + "learning_rate": 1.6638001188614334e-06, + "loss": 0.6586, + "step": 8432 + }, + { + "epoch": 0.74, + "grad_norm": 6.564631148334392, + "learning_rate": 1.6627408583896038e-06, + "loss": 0.6754, + "step": 8433 + }, + { + "epoch": 0.74, + "grad_norm": 6.5979749713626195, + "learning_rate": 1.66168186796323e-06, + "loss": 0.6157, + "step": 8434 + }, + { + "epoch": 0.74, + "grad_norm": 19.975552531196282, + "learning_rate": 1.660623147668009e-06, + "loss": 0.8078, + "step": 8435 + }, + { 
+ "epoch": 0.74, + "grad_norm": 9.390903377712078, + "learning_rate": 1.6595646975896079e-06, + "loss": 0.6823, + "step": 8436 + }, + { + "epoch": 0.74, + "grad_norm": 7.583344041700277, + "learning_rate": 1.658506517813675e-06, + "loss": 0.8455, + "step": 8437 + }, + { + "epoch": 0.74, + "grad_norm": 9.645377687445125, + "learning_rate": 1.6574486084258369e-06, + "loss": 0.68, + "step": 8438 + }, + { + "epoch": 0.74, + "grad_norm": 12.13298727191256, + "learning_rate": 1.6563909695116965e-06, + "loss": 0.5736, + "step": 8439 + }, + { + "epoch": 0.74, + "grad_norm": 2.271266655951732, + "learning_rate": 1.6553336011568354e-06, + "loss": 0.4778, + "step": 8440 + }, + { + "epoch": 0.74, + "grad_norm": 6.363872778372598, + "learning_rate": 1.6542765034468177e-06, + "loss": 0.7419, + "step": 8441 + }, + { + "epoch": 0.74, + "grad_norm": 6.490319739157949, + "learning_rate": 1.6532196764671803e-06, + "loss": 0.6022, + "step": 8442 + }, + { + "epoch": 0.74, + "grad_norm": 8.655533027069136, + "learning_rate": 1.6521631203034388e-06, + "loss": 0.6979, + "step": 8443 + }, + { + "epoch": 0.74, + "grad_norm": 5.961636162141338, + "learning_rate": 1.6511068350410896e-06, + "loss": 0.6911, + "step": 8444 + }, + { + "epoch": 0.74, + "grad_norm": 3.716410944792572, + "learning_rate": 1.650050820765604e-06, + "loss": 0.5528, + "step": 8445 + }, + { + "epoch": 0.74, + "grad_norm": 6.157892462617744, + "learning_rate": 1.6489950775624325e-06, + "loss": 0.73, + "step": 8446 + }, + { + "epoch": 0.74, + "grad_norm": 6.5727629221477475, + "learning_rate": 1.6479396055170066e-06, + "loss": 0.7694, + "step": 8447 + }, + { + "epoch": 0.74, + "grad_norm": 13.88966615344621, + "learning_rate": 1.646884404714732e-06, + "loss": 0.7347, + "step": 8448 + }, + { + "epoch": 0.74, + "grad_norm": 7.231696261527453, + "learning_rate": 1.6458294752409943e-06, + "loss": 0.5285, + "step": 8449 + }, + { + "epoch": 0.74, + "grad_norm": 9.990641367101626, + "learning_rate": 1.6447748171811557e-06, + "loss": 0.7515, + "step": 8450 + }, + { + "epoch": 0.74, + "grad_norm": 7.526329778810997, + "learning_rate": 1.6437204306205556e-06, + "loss": 0.7395, + "step": 8451 + }, + { + "epoch": 0.74, + "grad_norm": 19.96218420629482, + "learning_rate": 1.6426663156445178e-06, + "loss": 0.7278, + "step": 8452 + }, + { + "epoch": 0.74, + "grad_norm": 3.652578398297678, + "learning_rate": 1.6416124723383365e-06, + "loss": 0.5533, + "step": 8453 + }, + { + "epoch": 0.74, + "grad_norm": 2.765740119635543, + "learning_rate": 1.6405589007872875e-06, + "loss": 0.532, + "step": 8454 + }, + { + "epoch": 0.74, + "grad_norm": 8.696382714499057, + "learning_rate": 1.6395056010766247e-06, + "loss": 0.6254, + "step": 8455 + }, + { + "epoch": 0.74, + "grad_norm": 8.088371114898175, + "learning_rate": 1.6384525732915785e-06, + "loss": 0.7305, + "step": 8456 + }, + { + "epoch": 0.74, + "grad_norm": 3.0900718413939225, + "learning_rate": 1.6373998175173566e-06, + "loss": 0.4664, + "step": 8457 + }, + { + "epoch": 0.74, + "grad_norm": 5.007941835223616, + "learning_rate": 1.6363473338391494e-06, + "loss": 0.691, + "step": 8458 + }, + { + "epoch": 0.74, + "grad_norm": 7.824836587208542, + "learning_rate": 1.6352951223421216e-06, + "loss": 0.6206, + "step": 8459 + }, + { + "epoch": 0.74, + "grad_norm": 7.6778695812738045, + "learning_rate": 1.6342431831114153e-06, + "loss": 0.7258, + "step": 8460 + }, + { + "epoch": 0.74, + "grad_norm": 9.911737553874243, + "learning_rate": 1.6331915162321516e-06, + "loss": 0.7212, + "step": 8461 + }, + { + "epoch": 0.74, + 
"grad_norm": 19.30342537588659, + "learning_rate": 1.6321401217894306e-06, + "loss": 0.704, + "step": 8462 + }, + { + "epoch": 0.74, + "grad_norm": 2.544859162193193, + "learning_rate": 1.6310889998683267e-06, + "loss": 0.5119, + "step": 8463 + }, + { + "epoch": 0.74, + "grad_norm": 5.923759961912442, + "learning_rate": 1.6300381505538987e-06, + "loss": 0.7365, + "step": 8464 + }, + { + "epoch": 0.74, + "grad_norm": 9.043281829094074, + "learning_rate": 1.6289875739311784e-06, + "loss": 0.8094, + "step": 8465 + }, + { + "epoch": 0.74, + "grad_norm": 7.776244730256433, + "learning_rate": 1.6279372700851753e-06, + "loss": 0.7052, + "step": 8466 + }, + { + "epoch": 0.74, + "grad_norm": 8.332278224432661, + "learning_rate": 1.62688723910088e-06, + "loss": 0.648, + "step": 8467 + }, + { + "epoch": 0.74, + "grad_norm": 6.220943046365553, + "learning_rate": 1.6258374810632565e-06, + "loss": 0.829, + "step": 8468 + }, + { + "epoch": 0.74, + "grad_norm": 8.432893418735883, + "learning_rate": 1.6247879960572532e-06, + "loss": 0.7899, + "step": 8469 + }, + { + "epoch": 0.74, + "grad_norm": 9.31431411838204, + "learning_rate": 1.6237387841677903e-06, + "loss": 0.6876, + "step": 8470 + }, + { + "epoch": 0.74, + "grad_norm": 7.7712551585180485, + "learning_rate": 1.6226898454797697e-06, + "loss": 0.742, + "step": 8471 + }, + { + "epoch": 0.74, + "grad_norm": 7.631636079756886, + "learning_rate": 1.6216411800780684e-06, + "loss": 0.6476, + "step": 8472 + }, + { + "epoch": 0.74, + "grad_norm": 7.803851633511972, + "learning_rate": 1.6205927880475436e-06, + "loss": 0.6679, + "step": 8473 + }, + { + "epoch": 0.74, + "grad_norm": 9.175774211669747, + "learning_rate": 1.619544669473027e-06, + "loss": 0.7014, + "step": 8474 + }, + { + "epoch": 0.74, + "grad_norm": 8.937206365435655, + "learning_rate": 1.6184968244393346e-06, + "loss": 0.6692, + "step": 8475 + }, + { + "epoch": 0.74, + "grad_norm": 2.835753516073856, + "learning_rate": 1.617449253031254e-06, + "loss": 0.5624, + "step": 8476 + }, + { + "epoch": 0.74, + "grad_norm": 11.388461307459853, + "learning_rate": 1.6164019553335536e-06, + "loss": 0.8155, + "step": 8477 + }, + { + "epoch": 0.74, + "grad_norm": 3.4710811168129467, + "learning_rate": 1.615354931430978e-06, + "loss": 0.5641, + "step": 8478 + }, + { + "epoch": 0.74, + "grad_norm": 17.070774370551128, + "learning_rate": 1.6143081814082517e-06, + "loss": 0.7172, + "step": 8479 + }, + { + "epoch": 0.74, + "grad_norm": 7.3778466420404065, + "learning_rate": 1.6132617053500742e-06, + "loss": 0.662, + "step": 8480 + }, + { + "epoch": 0.74, + "grad_norm": 13.592538431854827, + "learning_rate": 1.6122155033411275e-06, + "loss": 0.8337, + "step": 8481 + }, + { + "epoch": 0.74, + "grad_norm": 11.191840760603018, + "learning_rate": 1.6111695754660667e-06, + "loss": 0.7729, + "step": 8482 + }, + { + "epoch": 0.74, + "grad_norm": 11.01705694457065, + "learning_rate": 1.610123921809527e-06, + "loss": 0.7129, + "step": 8483 + }, + { + "epoch": 0.75, + "grad_norm": 10.883050453179976, + "learning_rate": 1.6090785424561205e-06, + "loss": 0.8079, + "step": 8484 + }, + { + "epoch": 0.75, + "grad_norm": 9.332615430122809, + "learning_rate": 1.6080334374904382e-06, + "loss": 0.6889, + "step": 8485 + }, + { + "epoch": 0.75, + "grad_norm": 7.491804221906894, + "learning_rate": 1.6069886069970459e-06, + "loss": 0.6936, + "step": 8486 + }, + { + "epoch": 0.75, + "grad_norm": 7.037287181853367, + "learning_rate": 1.6059440510604934e-06, + "loss": 0.7108, + "step": 8487 + }, + { + "epoch": 0.75, + "grad_norm": 
6.494380521277996, + "learning_rate": 1.604899769765303e-06, + "loss": 0.7425, + "step": 8488 + }, + { + "epoch": 0.75, + "grad_norm": 12.397374914793565, + "learning_rate": 1.603855763195975e-06, + "loss": 0.7701, + "step": 8489 + }, + { + "epoch": 0.75, + "grad_norm": 9.080608031131403, + "learning_rate": 1.6028120314369899e-06, + "loss": 0.5783, + "step": 8490 + }, + { + "epoch": 0.75, + "grad_norm": 9.485295910746032, + "learning_rate": 1.6017685745728024e-06, + "loss": 0.6153, + "step": 8491 + }, + { + "epoch": 0.75, + "grad_norm": 5.638652586740039, + "learning_rate": 1.6007253926878509e-06, + "loss": 0.6237, + "step": 8492 + }, + { + "epoch": 0.75, + "grad_norm": 9.906578388931013, + "learning_rate": 1.599682485866546e-06, + "loss": 0.5531, + "step": 8493 + }, + { + "epoch": 0.75, + "grad_norm": 5.761081297030989, + "learning_rate": 1.5986398541932785e-06, + "loss": 0.7157, + "step": 8494 + }, + { + "epoch": 0.75, + "grad_norm": 10.48370913742756, + "learning_rate": 1.597597497752416e-06, + "loss": 0.6729, + "step": 8495 + }, + { + "epoch": 0.75, + "grad_norm": 2.879034914018316, + "learning_rate": 1.5965554166283048e-06, + "loss": 0.5343, + "step": 8496 + }, + { + "epoch": 0.75, + "grad_norm": 9.392964961843791, + "learning_rate": 1.595513610905266e-06, + "loss": 0.7506, + "step": 8497 + }, + { + "epoch": 0.75, + "grad_norm": 6.83513484761346, + "learning_rate": 1.5944720806676046e-06, + "loss": 0.8197, + "step": 8498 + }, + { + "epoch": 0.75, + "grad_norm": 2.511963281657001, + "learning_rate": 1.5934308259995967e-06, + "loss": 0.4673, + "step": 8499 + }, + { + "epoch": 0.75, + "grad_norm": 2.3576064410826003, + "learning_rate": 1.5923898469855008e-06, + "loss": 0.433, + "step": 8500 + }, + { + "epoch": 0.75, + "grad_norm": 11.146095915902418, + "learning_rate": 1.5913491437095495e-06, + "loss": 0.7252, + "step": 8501 + }, + { + "epoch": 0.75, + "grad_norm": 2.8395661483103765, + "learning_rate": 1.590308716255956e-06, + "loss": 0.4901, + "step": 8502 + }, + { + "epoch": 0.75, + "grad_norm": 11.092905480705472, + "learning_rate": 1.589268564708908e-06, + "loss": 0.7658, + "step": 8503 + }, + { + "epoch": 0.75, + "grad_norm": 12.720851322713989, + "learning_rate": 1.5882286891525755e-06, + "loss": 0.6985, + "step": 8504 + }, + { + "epoch": 0.75, + "grad_norm": 18.199052602344004, + "learning_rate": 1.5871890896711022e-06, + "loss": 0.6408, + "step": 8505 + }, + { + "epoch": 0.75, + "grad_norm": 20.056156961691137, + "learning_rate": 1.5861497663486115e-06, + "loss": 0.6804, + "step": 8506 + }, + { + "epoch": 0.75, + "grad_norm": 11.285820645292498, + "learning_rate": 1.5851107192692022e-06, + "loss": 0.8953, + "step": 8507 + }, + { + "epoch": 0.75, + "grad_norm": 6.459634051552324, + "learning_rate": 1.584071948516952e-06, + "loss": 0.6678, + "step": 8508 + }, + { + "epoch": 0.75, + "grad_norm": 8.355571334643207, + "learning_rate": 1.583033454175919e-06, + "loss": 0.7943, + "step": 8509 + }, + { + "epoch": 0.75, + "grad_norm": 9.576877636104202, + "learning_rate": 1.5819952363301355e-06, + "loss": 0.7769, + "step": 8510 + }, + { + "epoch": 0.75, + "grad_norm": 13.269156431710888, + "learning_rate": 1.5809572950636115e-06, + "loss": 0.6767, + "step": 8511 + }, + { + "epoch": 0.75, + "grad_norm": 9.771812330695433, + "learning_rate": 1.5799196304603359e-06, + "loss": 0.7226, + "step": 8512 + }, + { + "epoch": 0.75, + "grad_norm": 6.78705668132965, + "learning_rate": 1.578882242604275e-06, + "loss": 0.6827, + "step": 8513 + }, + { + "epoch": 0.75, + "grad_norm": 12.345500123681868, + 
"learning_rate": 1.5778451315793708e-06, + "loss": 0.7318, + "step": 8514 + }, + { + "epoch": 0.75, + "grad_norm": 5.884934936892067, + "learning_rate": 1.5768082974695476e-06, + "loss": 0.7115, + "step": 8515 + }, + { + "epoch": 0.75, + "grad_norm": 6.447194239445622, + "learning_rate": 1.5757717403587026e-06, + "loss": 0.7869, + "step": 8516 + }, + { + "epoch": 0.75, + "grad_norm": 8.55197594239687, + "learning_rate": 1.5747354603307124e-06, + "loss": 0.7658, + "step": 8517 + }, + { + "epoch": 0.75, + "grad_norm": 9.433694525641064, + "learning_rate": 1.5736994574694315e-06, + "loss": 0.7573, + "step": 8518 + }, + { + "epoch": 0.75, + "grad_norm": 3.5647607198266296, + "learning_rate": 1.5726637318586907e-06, + "loss": 0.5118, + "step": 8519 + }, + { + "epoch": 0.75, + "grad_norm": 3.0082495824633373, + "learning_rate": 1.5716282835822982e-06, + "loss": 0.5242, + "step": 8520 + }, + { + "epoch": 0.75, + "grad_norm": 5.578120095228559, + "learning_rate": 1.5705931127240436e-06, + "loss": 0.7267, + "step": 8521 + }, + { + "epoch": 0.75, + "grad_norm": 8.442413953175512, + "learning_rate": 1.56955821936769e-06, + "loss": 0.8138, + "step": 8522 + }, + { + "epoch": 0.75, + "grad_norm": 2.9591862998038696, + "learning_rate": 1.5685236035969791e-06, + "loss": 0.5597, + "step": 8523 + }, + { + "epoch": 0.75, + "grad_norm": 6.568500651142722, + "learning_rate": 1.5674892654956303e-06, + "loss": 0.6916, + "step": 8524 + }, + { + "epoch": 0.75, + "grad_norm": 8.648688667622237, + "learning_rate": 1.5664552051473403e-06, + "loss": 0.751, + "step": 8525 + }, + { + "epoch": 0.75, + "grad_norm": 9.543951874064566, + "learning_rate": 1.5654214226357822e-06, + "loss": 0.4947, + "step": 8526 + }, + { + "epoch": 0.75, + "grad_norm": 3.0288846654846413, + "learning_rate": 1.564387918044611e-06, + "loss": 0.5007, + "step": 8527 + }, + { + "epoch": 0.75, + "grad_norm": 11.262710322424455, + "learning_rate": 1.5633546914574543e-06, + "loss": 0.8162, + "step": 8528 + }, + { + "epoch": 0.75, + "grad_norm": 6.970984624722944, + "learning_rate": 1.5623217429579197e-06, + "loss": 0.6712, + "step": 8529 + }, + { + "epoch": 0.75, + "grad_norm": 8.679992438930341, + "learning_rate": 1.5612890726295915e-06, + "loss": 0.7324, + "step": 8530 + }, + { + "epoch": 0.75, + "grad_norm": 6.5086154856096465, + "learning_rate": 1.5602566805560298e-06, + "loss": 0.6092, + "step": 8531 + }, + { + "epoch": 0.75, + "grad_norm": 8.209281215792087, + "learning_rate": 1.5592245668207773e-06, + "loss": 0.7059, + "step": 8532 + }, + { + "epoch": 0.75, + "grad_norm": 7.740260863698005, + "learning_rate": 1.5581927315073498e-06, + "loss": 0.7532, + "step": 8533 + }, + { + "epoch": 0.75, + "grad_norm": 6.72373249539814, + "learning_rate": 1.557161174699241e-06, + "loss": 0.6891, + "step": 8534 + }, + { + "epoch": 0.75, + "grad_norm": 6.210648791840469, + "learning_rate": 1.5561298964799226e-06, + "loss": 0.6401, + "step": 8535 + }, + { + "epoch": 0.75, + "grad_norm": 10.09838326409548, + "learning_rate": 1.5550988969328452e-06, + "loss": 0.6849, + "step": 8536 + }, + { + "epoch": 0.75, + "grad_norm": 11.557115486940821, + "learning_rate": 1.5540681761414327e-06, + "loss": 0.8161, + "step": 8537 + }, + { + "epoch": 0.75, + "grad_norm": 8.523392495402678, + "learning_rate": 1.5530377341890934e-06, + "loss": 0.7377, + "step": 8538 + }, + { + "epoch": 0.75, + "grad_norm": 6.511233572399954, + "learning_rate": 1.5520075711592064e-06, + "loss": 0.7642, + "step": 8539 + }, + { + "epoch": 0.75, + "grad_norm": 5.782598787834428, + "learning_rate": 
1.5509776871351317e-06, + "loss": 0.8053, + "step": 8540 + }, + { + "epoch": 0.75, + "grad_norm": 6.392696691940456, + "learning_rate": 1.549948082200205e-06, + "loss": 0.7857, + "step": 8541 + }, + { + "epoch": 0.75, + "grad_norm": 7.9847050181726456, + "learning_rate": 1.548918756437741e-06, + "loss": 0.672, + "step": 8542 + }, + { + "epoch": 0.75, + "grad_norm": 8.520517472260943, + "learning_rate": 1.5478897099310293e-06, + "loss": 0.6617, + "step": 8543 + }, + { + "epoch": 0.75, + "grad_norm": 6.848700305922219, + "learning_rate": 1.5468609427633407e-06, + "loss": 0.66, + "step": 8544 + }, + { + "epoch": 0.75, + "grad_norm": 8.40687256998086, + "learning_rate": 1.545832455017921e-06, + "loss": 0.7704, + "step": 8545 + }, + { + "epoch": 0.75, + "grad_norm": 6.677262364833497, + "learning_rate": 1.544804246777994e-06, + "loss": 0.7226, + "step": 8546 + }, + { + "epoch": 0.75, + "grad_norm": 6.846057551573491, + "learning_rate": 1.543776318126759e-06, + "loss": 0.7682, + "step": 8547 + }, + { + "epoch": 0.75, + "grad_norm": 20.217751257287517, + "learning_rate": 1.5427486691473942e-06, + "loss": 0.6934, + "step": 8548 + }, + { + "epoch": 0.75, + "grad_norm": 24.329532775862706, + "learning_rate": 1.5417212999230574e-06, + "loss": 0.9943, + "step": 8549 + }, + { + "epoch": 0.75, + "grad_norm": 10.600214956976119, + "learning_rate": 1.5406942105368805e-06, + "loss": 0.6416, + "step": 8550 + }, + { + "epoch": 0.75, + "grad_norm": 6.767233291598702, + "learning_rate": 1.5396674010719742e-06, + "loss": 0.7983, + "step": 8551 + }, + { + "epoch": 0.75, + "grad_norm": 5.564063306240261, + "learning_rate": 1.5386408716114253e-06, + "loss": 0.5261, + "step": 8552 + }, + { + "epoch": 0.75, + "grad_norm": 2.1109232451952775, + "learning_rate": 1.5376146222382998e-06, + "loss": 0.4578, + "step": 8553 + }, + { + "epoch": 0.75, + "grad_norm": 5.166591892554484, + "learning_rate": 1.5365886530356383e-06, + "loss": 0.6679, + "step": 8554 + }, + { + "epoch": 0.75, + "grad_norm": 8.776802264264738, + "learning_rate": 1.5355629640864633e-06, + "loss": 0.7266, + "step": 8555 + }, + { + "epoch": 0.75, + "grad_norm": 9.107638075798425, + "learning_rate": 1.5345375554737701e-06, + "loss": 0.6477, + "step": 8556 + }, + { + "epoch": 0.75, + "grad_norm": 5.443133946308248, + "learning_rate": 1.533512427280534e-06, + "loss": 0.6437, + "step": 8557 + }, + { + "epoch": 0.75, + "grad_norm": 10.290922298592482, + "learning_rate": 1.5324875795897055e-06, + "loss": 0.7368, + "step": 8558 + }, + { + "epoch": 0.75, + "grad_norm": 5.660450604183244, + "learning_rate": 1.5314630124842144e-06, + "loss": 0.7069, + "step": 8559 + }, + { + "epoch": 0.75, + "grad_norm": 10.8764427330057, + "learning_rate": 1.5304387260469649e-06, + "loss": 0.8502, + "step": 8560 + }, + { + "epoch": 0.75, + "grad_norm": 6.720069378406839, + "learning_rate": 1.5294147203608445e-06, + "loss": 0.7641, + "step": 8561 + }, + { + "epoch": 0.75, + "grad_norm": 10.464921323192327, + "learning_rate": 1.5283909955087118e-06, + "loss": 0.6295, + "step": 8562 + }, + { + "epoch": 0.75, + "grad_norm": 13.904380142162156, + "learning_rate": 1.5273675515734048e-06, + "loss": 0.7967, + "step": 8563 + }, + { + "epoch": 0.75, + "grad_norm": 10.27111462947087, + "learning_rate": 1.526344388637739e-06, + "loss": 0.8464, + "step": 8564 + }, + { + "epoch": 0.75, + "grad_norm": 14.725634055098789, + "learning_rate": 1.5253215067845072e-06, + "loss": 0.692, + "step": 8565 + }, + { + "epoch": 0.75, + "grad_norm": 7.5156833817861735, + "learning_rate": 
1.5242989060964775e-06, + "loss": 0.6554, + "step": 8566 + }, + { + "epoch": 0.75, + "grad_norm": 6.711847873801069, + "learning_rate": 1.5232765866564008e-06, + "loss": 0.6713, + "step": 8567 + }, + { + "epoch": 0.75, + "grad_norm": 3.366930652475933, + "learning_rate": 1.5222545485469992e-06, + "loss": 0.5107, + "step": 8568 + }, + { + "epoch": 0.75, + "grad_norm": 3.852678565023859, + "learning_rate": 1.5212327918509746e-06, + "loss": 0.5128, + "step": 8569 + }, + { + "epoch": 0.75, + "grad_norm": 2.501791673645262, + "learning_rate": 1.5202113166510058e-06, + "loss": 0.5615, + "step": 8570 + }, + { + "epoch": 0.75, + "grad_norm": 10.083560740901278, + "learning_rate": 1.5191901230297479e-06, + "loss": 0.7033, + "step": 8571 + }, + { + "epoch": 0.75, + "grad_norm": 7.151569980176252, + "learning_rate": 1.518169211069836e-06, + "loss": 0.5953, + "step": 8572 + }, + { + "epoch": 0.75, + "grad_norm": 2.362504454695799, + "learning_rate": 1.5171485808538805e-06, + "loss": 0.5403, + "step": 8573 + }, + { + "epoch": 0.75, + "grad_norm": 20.13495985926574, + "learning_rate": 1.5161282324644677e-06, + "loss": 0.7557, + "step": 8574 + }, + { + "epoch": 0.75, + "grad_norm": 6.636352580801394, + "learning_rate": 1.5151081659841637e-06, + "loss": 0.8396, + "step": 8575 + }, + { + "epoch": 0.75, + "grad_norm": 8.413487969617869, + "learning_rate": 1.5140883814955098e-06, + "loss": 0.7888, + "step": 8576 + }, + { + "epoch": 0.75, + "grad_norm": 7.594903811874022, + "learning_rate": 1.5130688790810245e-06, + "loss": 0.5991, + "step": 8577 + }, + { + "epoch": 0.75, + "grad_norm": 6.103117054421952, + "learning_rate": 1.5120496588232064e-06, + "loss": 0.7405, + "step": 8578 + }, + { + "epoch": 0.75, + "grad_norm": 5.853156201506522, + "learning_rate": 1.5110307208045276e-06, + "loss": 0.6849, + "step": 8579 + }, + { + "epoch": 0.75, + "grad_norm": 7.025900373854029, + "learning_rate": 1.5100120651074402e-06, + "loss": 0.8434, + "step": 8580 + }, + { + "epoch": 0.75, + "grad_norm": 9.269889018219232, + "learning_rate": 1.5089936918143705e-06, + "loss": 0.7657, + "step": 8581 + }, + { + "epoch": 0.75, + "grad_norm": 8.95294946546739, + "learning_rate": 1.5079756010077246e-06, + "loss": 0.6653, + "step": 8582 + }, + { + "epoch": 0.75, + "grad_norm": 7.416903548152963, + "learning_rate": 1.506957792769883e-06, + "loss": 0.7257, + "step": 8583 + }, + { + "epoch": 0.75, + "grad_norm": 9.635353210345, + "learning_rate": 1.5059402671832074e-06, + "loss": 0.6949, + "step": 8584 + }, + { + "epoch": 0.75, + "grad_norm": 8.952602838064735, + "learning_rate": 1.504923024330034e-06, + "loss": 0.7208, + "step": 8585 + }, + { + "epoch": 0.75, + "grad_norm": 3.0449894668617463, + "learning_rate": 1.503906064292675e-06, + "loss": 0.4734, + "step": 8586 + }, + { + "epoch": 0.75, + "grad_norm": 12.813262142613844, + "learning_rate": 1.5028893871534222e-06, + "loss": 0.6691, + "step": 8587 + }, + { + "epoch": 0.75, + "grad_norm": 5.963288372561439, + "learning_rate": 1.5018729929945419e-06, + "loss": 0.7866, + "step": 8588 + }, + { + "epoch": 0.75, + "grad_norm": 9.205979626015985, + "learning_rate": 1.500856881898281e-06, + "loss": 0.7463, + "step": 8589 + }, + { + "epoch": 0.75, + "grad_norm": 6.680866513270182, + "learning_rate": 1.499841053946861e-06, + "loss": 0.7819, + "step": 8590 + }, + { + "epoch": 0.75, + "grad_norm": 6.317456982996292, + "learning_rate": 1.4988255092224807e-06, + "loss": 0.7218, + "step": 8591 + }, + { + "epoch": 0.75, + "grad_norm": 10.64039907740373, + "learning_rate": 1.4978102478073165e-06, + 
"loss": 0.9669, + "step": 8592 + }, + { + "epoch": 0.75, + "grad_norm": 7.030823987843608, + "learning_rate": 1.4967952697835215e-06, + "loss": 0.635, + "step": 8593 + }, + { + "epoch": 0.75, + "grad_norm": 7.368592926441498, + "learning_rate": 1.4957805752332238e-06, + "loss": 0.6712, + "step": 8594 + }, + { + "epoch": 0.75, + "grad_norm": 8.735990256527868, + "learning_rate": 1.4947661642385348e-06, + "loss": 0.6835, + "step": 8595 + }, + { + "epoch": 0.75, + "grad_norm": 14.178039071394313, + "learning_rate": 1.4937520368815373e-06, + "loss": 0.7241, + "step": 8596 + }, + { + "epoch": 0.76, + "grad_norm": 7.760659674023067, + "learning_rate": 1.4927381932442924e-06, + "loss": 0.6831, + "step": 8597 + }, + { + "epoch": 0.76, + "grad_norm": 9.512942472345003, + "learning_rate": 1.4917246334088392e-06, + "loss": 0.6711, + "step": 8598 + }, + { + "epoch": 0.76, + "grad_norm": 9.55721544020012, + "learning_rate": 1.490711357457193e-06, + "loss": 0.762, + "step": 8599 + }, + { + "epoch": 0.76, + "grad_norm": 8.12630125265379, + "learning_rate": 1.4896983654713442e-06, + "loss": 0.7379, + "step": 8600 + }, + { + "epoch": 0.76, + "grad_norm": 7.22758666669917, + "learning_rate": 1.488685657533267e-06, + "loss": 0.8363, + "step": 8601 + }, + { + "epoch": 0.76, + "grad_norm": 3.6743187209754744, + "learning_rate": 1.487673233724905e-06, + "loss": 0.5894, + "step": 8602 + }, + { + "epoch": 0.76, + "grad_norm": 9.432268072949796, + "learning_rate": 1.4866610941281823e-06, + "loss": 0.8046, + "step": 8603 + }, + { + "epoch": 0.76, + "grad_norm": 2.5814207984856115, + "learning_rate": 1.4856492388250005e-06, + "loss": 0.4714, + "step": 8604 + }, + { + "epoch": 0.76, + "grad_norm": 2.375906674814552, + "learning_rate": 1.4846376678972356e-06, + "loss": 0.5159, + "step": 8605 + }, + { + "epoch": 0.76, + "grad_norm": 2.5840452200785125, + "learning_rate": 1.4836263814267421e-06, + "loss": 0.5057, + "step": 8606 + }, + { + "epoch": 0.76, + "grad_norm": 7.461582609559691, + "learning_rate": 1.482615379495354e-06, + "loss": 0.7276, + "step": 8607 + }, + { + "epoch": 0.76, + "grad_norm": 11.055823122640868, + "learning_rate": 1.4816046621848784e-06, + "loss": 0.6627, + "step": 8608 + }, + { + "epoch": 0.76, + "grad_norm": 7.0920168861877935, + "learning_rate": 1.4805942295771013e-06, + "loss": 0.7991, + "step": 8609 + }, + { + "epoch": 0.76, + "grad_norm": 10.702272333582668, + "learning_rate": 1.479584081753785e-06, + "loss": 0.8009, + "step": 8610 + }, + { + "epoch": 0.76, + "grad_norm": 10.875675122685227, + "learning_rate": 1.4785742187966667e-06, + "loss": 0.7326, + "step": 8611 + }, + { + "epoch": 0.76, + "grad_norm": 7.299474266758159, + "learning_rate": 1.4775646407874672e-06, + "loss": 0.7056, + "step": 8612 + }, + { + "epoch": 0.76, + "grad_norm": 6.77970915403319, + "learning_rate": 1.4765553478078777e-06, + "loss": 0.6586, + "step": 8613 + }, + { + "epoch": 0.76, + "grad_norm": 50.907772554188064, + "learning_rate": 1.475546339939568e-06, + "loss": 0.865, + "step": 8614 + }, + { + "epoch": 0.76, + "grad_norm": 5.210011326825359, + "learning_rate": 1.4745376172641862e-06, + "loss": 0.7386, + "step": 8615 + }, + { + "epoch": 0.76, + "grad_norm": 14.202068782864504, + "learning_rate": 1.4735291798633561e-06, + "loss": 0.6448, + "step": 8616 + }, + { + "epoch": 0.76, + "grad_norm": 16.58633954863907, + "learning_rate": 1.4725210278186769e-06, + "loss": 0.8219, + "step": 8617 + }, + { + "epoch": 0.76, + "grad_norm": 6.683813587639699, + "learning_rate": 1.4715131612117305e-06, + "loss": 0.7425, + 
"step": 8618 + }, + { + "epoch": 0.76, + "grad_norm": 12.856415127747056, + "learning_rate": 1.4705055801240692e-06, + "loss": 0.6943, + "step": 8619 + }, + { + "epoch": 0.76, + "grad_norm": 2.853787291282398, + "learning_rate": 1.4694982846372258e-06, + "loss": 0.5224, + "step": 8620 + }, + { + "epoch": 0.76, + "grad_norm": 9.671782951675317, + "learning_rate": 1.4684912748327085e-06, + "loss": 0.7289, + "step": 8621 + }, + { + "epoch": 0.76, + "grad_norm": 12.15851716988949, + "learning_rate": 1.4674845507920027e-06, + "loss": 0.8522, + "step": 8622 + }, + { + "epoch": 0.76, + "grad_norm": 5.968242503130659, + "learning_rate": 1.4664781125965694e-06, + "loss": 0.7154, + "step": 8623 + }, + { + "epoch": 0.76, + "grad_norm": 5.650334658114343, + "learning_rate": 1.4654719603278516e-06, + "loss": 0.5904, + "step": 8624 + }, + { + "epoch": 0.76, + "grad_norm": 7.187417658507275, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.6623, + "step": 8625 + }, + { + "epoch": 0.76, + "grad_norm": 9.21928807742645, + "learning_rate": 1.4634605138961965e-06, + "loss": 0.8839, + "step": 8626 + }, + { + "epoch": 0.76, + "grad_norm": 21.863451825481434, + "learning_rate": 1.4624552198960235e-06, + "loss": 0.8139, + "step": 8627 + }, + { + "epoch": 0.76, + "grad_norm": 9.826284501606244, + "learning_rate": 1.4614502121480873e-06, + "loss": 0.6361, + "step": 8628 + }, + { + "epoch": 0.76, + "grad_norm": 3.618622967407926, + "learning_rate": 1.4604454907337157e-06, + "loss": 0.5529, + "step": 8629 + }, + { + "epoch": 0.76, + "grad_norm": 6.696074724093164, + "learning_rate": 1.4594410557342076e-06, + "loss": 0.5685, + "step": 8630 + }, + { + "epoch": 0.76, + "grad_norm": 13.413783018672186, + "learning_rate": 1.4584369072308396e-06, + "loss": 0.7677, + "step": 8631 + }, + { + "epoch": 0.76, + "grad_norm": 7.9266477878877195, + "learning_rate": 1.4574330453048657e-06, + "loss": 0.7337, + "step": 8632 + }, + { + "epoch": 0.76, + "grad_norm": 7.269577912517483, + "learning_rate": 1.456429470037517e-06, + "loss": 0.8271, + "step": 8633 + }, + { + "epoch": 0.76, + "grad_norm": 8.157559658364619, + "learning_rate": 1.45542618151e-06, + "loss": 0.7523, + "step": 8634 + }, + { + "epoch": 0.76, + "grad_norm": 7.319763001344733, + "learning_rate": 1.454423179803502e-06, + "loss": 0.7247, + "step": 8635 + }, + { + "epoch": 0.76, + "grad_norm": 53.13099662581687, + "learning_rate": 1.4534204649991817e-06, + "loss": 0.6461, + "step": 8636 + }, + { + "epoch": 0.76, + "grad_norm": 7.161358992164923, + "learning_rate": 1.4524180371781788e-06, + "loss": 0.7696, + "step": 8637 + }, + { + "epoch": 0.76, + "grad_norm": 5.809352688698878, + "learning_rate": 1.4514158964216069e-06, + "loss": 0.6689, + "step": 8638 + }, + { + "epoch": 0.76, + "grad_norm": 2.4265176321837507, + "learning_rate": 1.4504140428105578e-06, + "loss": 0.4461, + "step": 8639 + }, + { + "epoch": 0.76, + "grad_norm": 10.847915500040301, + "learning_rate": 1.4494124764260986e-06, + "loss": 0.6864, + "step": 8640 + }, + { + "epoch": 0.76, + "grad_norm": 7.892460175040077, + "learning_rate": 1.448411197349277e-06, + "loss": 0.657, + "step": 8641 + }, + { + "epoch": 0.76, + "grad_norm": 5.717873759494398, + "learning_rate": 1.4474102056611134e-06, + "loss": 0.734, + "step": 8642 + }, + { + "epoch": 0.76, + "grad_norm": 6.694649780237207, + "learning_rate": 1.4464095014426072e-06, + "loss": 0.7707, + "step": 8643 + }, + { + "epoch": 0.76, + "grad_norm": 7.781175248553418, + "learning_rate": 1.4454090847747326e-06, + "loss": 0.6396, + "step": 8644 + }, + { + 
"epoch": 0.76, + "grad_norm": 2.6582746058652624, + "learning_rate": 1.4444089557384404e-06, + "loss": 0.5121, + "step": 8645 + }, + { + "epoch": 0.76, + "grad_norm": 5.968932846541165, + "learning_rate": 1.443409114414663e-06, + "loss": 0.8975, + "step": 8646 + }, + { + "epoch": 0.76, + "grad_norm": 5.139171197189469, + "learning_rate": 1.4424095608843036e-06, + "loss": 0.6045, + "step": 8647 + }, + { + "epoch": 0.76, + "grad_norm": 8.255833057675488, + "learning_rate": 1.441410295228245e-06, + "loss": 0.6101, + "step": 8648 + }, + { + "epoch": 0.76, + "grad_norm": 8.6359606946305, + "learning_rate": 1.440411317527346e-06, + "loss": 0.7633, + "step": 8649 + }, + { + "epoch": 0.76, + "grad_norm": 7.676150857092457, + "learning_rate": 1.4394126278624414e-06, + "loss": 0.6144, + "step": 8650 + }, + { + "epoch": 0.76, + "grad_norm": 6.172553207188935, + "learning_rate": 1.4384142263143437e-06, + "loss": 0.6379, + "step": 8651 + }, + { + "epoch": 0.76, + "grad_norm": 8.98979771527368, + "learning_rate": 1.4374161129638437e-06, + "loss": 0.7283, + "step": 8652 + }, + { + "epoch": 0.76, + "grad_norm": 7.954030659100848, + "learning_rate": 1.4364182878917055e-06, + "loss": 0.7374, + "step": 8653 + }, + { + "epoch": 0.76, + "grad_norm": 7.368387587239321, + "learning_rate": 1.4354207511786717e-06, + "loss": 0.7247, + "step": 8654 + }, + { + "epoch": 0.76, + "grad_norm": 5.748386382608836, + "learning_rate": 1.4344235029054616e-06, + "loss": 0.7909, + "step": 8655 + }, + { + "epoch": 0.76, + "grad_norm": 10.970352676337706, + "learning_rate": 1.4334265431527706e-06, + "loss": 0.6708, + "step": 8656 + }, + { + "epoch": 0.76, + "grad_norm": 11.084757603801766, + "learning_rate": 1.432429872001269e-06, + "loss": 0.6787, + "step": 8657 + }, + { + "epoch": 0.76, + "grad_norm": 15.700518259820024, + "learning_rate": 1.4314334895316095e-06, + "loss": 0.7163, + "step": 8658 + }, + { + "epoch": 0.76, + "grad_norm": 6.221512806766231, + "learning_rate": 1.4304373958244167e-06, + "loss": 0.6752, + "step": 8659 + }, + { + "epoch": 0.76, + "grad_norm": 6.069871288123253, + "learning_rate": 1.4294415909602915e-06, + "loss": 0.7046, + "step": 8660 + }, + { + "epoch": 0.76, + "grad_norm": 7.234255150111992, + "learning_rate": 1.4284460750198137e-06, + "loss": 0.6313, + "step": 8661 + }, + { + "epoch": 0.76, + "grad_norm": 8.331510696351522, + "learning_rate": 1.4274508480835387e-06, + "loss": 0.6737, + "step": 8662 + }, + { + "epoch": 0.76, + "grad_norm": 10.858353563436932, + "learning_rate": 1.4264559102319964e-06, + "loss": 0.7781, + "step": 8663 + }, + { + "epoch": 0.76, + "grad_norm": 8.438448782622634, + "learning_rate": 1.4254612615456998e-06, + "loss": 0.713, + "step": 8664 + }, + { + "epoch": 0.76, + "grad_norm": 12.52702422549637, + "learning_rate": 1.424466902105131e-06, + "loss": 0.8141, + "step": 8665 + }, + { + "epoch": 0.76, + "grad_norm": 9.78914949752711, + "learning_rate": 1.4234728319907537e-06, + "loss": 0.7962, + "step": 8666 + }, + { + "epoch": 0.76, + "grad_norm": 12.335745174892548, + "learning_rate": 1.4224790512830056e-06, + "loss": 0.7543, + "step": 8667 + }, + { + "epoch": 0.76, + "grad_norm": 9.439931977376983, + "learning_rate": 1.4214855600622995e-06, + "loss": 0.8156, + "step": 8668 + }, + { + "epoch": 0.76, + "grad_norm": 2.2082669950494114, + "learning_rate": 1.4204923584090314e-06, + "loss": 0.4673, + "step": 8669 + }, + { + "epoch": 0.76, + "grad_norm": 8.572354160200108, + "learning_rate": 1.4194994464035678e-06, + "loss": 0.7335, + "step": 8670 + }, + { + "epoch": 0.76, + 
"grad_norm": 8.940618688142402, + "learning_rate": 1.4185068241262522e-06, + "loss": 0.8885, + "step": 8671 + }, + { + "epoch": 0.76, + "grad_norm": 8.187054858775998, + "learning_rate": 1.417514491657408e-06, + "loss": 0.5938, + "step": 8672 + }, + { + "epoch": 0.76, + "grad_norm": 7.741388011560046, + "learning_rate": 1.416522449077331e-06, + "loss": 0.6732, + "step": 8673 + }, + { + "epoch": 0.76, + "grad_norm": 8.205572284872352, + "learning_rate": 1.4155306964662957e-06, + "loss": 0.7754, + "step": 8674 + }, + { + "epoch": 0.76, + "grad_norm": 10.142888837061443, + "learning_rate": 1.414539233904555e-06, + "loss": 0.6512, + "step": 8675 + }, + { + "epoch": 0.76, + "grad_norm": 7.978050653115836, + "learning_rate": 1.4135480614723357e-06, + "loss": 0.5958, + "step": 8676 + }, + { + "epoch": 0.76, + "grad_norm": 5.3388836895661305, + "learning_rate": 1.4125571792498415e-06, + "loss": 0.6346, + "step": 8677 + }, + { + "epoch": 0.76, + "grad_norm": 9.083196272209833, + "learning_rate": 1.411566587317253e-06, + "loss": 0.5989, + "step": 8678 + }, + { + "epoch": 0.76, + "grad_norm": 5.666472227598377, + "learning_rate": 1.4105762857547273e-06, + "loss": 0.667, + "step": 8679 + }, + { + "epoch": 0.76, + "grad_norm": 5.0406150427107255, + "learning_rate": 1.4095862746423961e-06, + "loss": 0.684, + "step": 8680 + }, + { + "epoch": 0.76, + "grad_norm": 8.225988750611352, + "learning_rate": 1.4085965540603724e-06, + "loss": 0.7782, + "step": 8681 + }, + { + "epoch": 0.76, + "grad_norm": 3.232867460878122, + "learning_rate": 1.4076071240887423e-06, + "loss": 0.4889, + "step": 8682 + }, + { + "epoch": 0.76, + "grad_norm": 8.733145947158091, + "learning_rate": 1.4066179848075678e-06, + "loss": 0.7147, + "step": 8683 + }, + { + "epoch": 0.76, + "grad_norm": 7.219964506965332, + "learning_rate": 1.4056291362968882e-06, + "loss": 0.6203, + "step": 8684 + }, + { + "epoch": 0.76, + "grad_norm": 7.069421342823745, + "learning_rate": 1.404640578636719e-06, + "loss": 0.758, + "step": 8685 + }, + { + "epoch": 0.76, + "grad_norm": 8.971810726211828, + "learning_rate": 1.4036523119070549e-06, + "loss": 0.7312, + "step": 8686 + }, + { + "epoch": 0.76, + "grad_norm": 6.46611575069243, + "learning_rate": 1.4026643361878638e-06, + "loss": 0.7265, + "step": 8687 + }, + { + "epoch": 0.76, + "grad_norm": 10.69884120471074, + "learning_rate": 1.4016766515590906e-06, + "loss": 0.836, + "step": 8688 + }, + { + "epoch": 0.76, + "grad_norm": 10.94444680887318, + "learning_rate": 1.400689258100657e-06, + "loss": 0.5131, + "step": 8689 + }, + { + "epoch": 0.76, + "grad_norm": 5.563155092929684, + "learning_rate": 1.3997021558924617e-06, + "loss": 0.8229, + "step": 8690 + }, + { + "epoch": 0.76, + "grad_norm": 6.548819563316428, + "learning_rate": 1.3987153450143775e-06, + "loss": 0.7773, + "step": 8691 + }, + { + "epoch": 0.76, + "grad_norm": 9.070919995406312, + "learning_rate": 1.397728825546259e-06, + "loss": 0.7192, + "step": 8692 + }, + { + "epoch": 0.76, + "grad_norm": 1.851750268393453, + "learning_rate": 1.3967425975679317e-06, + "loss": 0.4427, + "step": 8693 + }, + { + "epoch": 0.76, + "grad_norm": 12.169232404921456, + "learning_rate": 1.3957566611591995e-06, + "loss": 0.6924, + "step": 8694 + }, + { + "epoch": 0.76, + "grad_norm": 6.084187337144143, + "learning_rate": 1.3947710163998428e-06, + "loss": 0.6337, + "step": 8695 + }, + { + "epoch": 0.76, + "grad_norm": 5.6324800063329485, + "learning_rate": 1.393785663369619e-06, + "loss": 0.642, + "step": 8696 + }, + { + "epoch": 0.76, + "grad_norm": 
10.413253657059709, + "learning_rate": 1.3928006021482588e-06, + "loss": 0.6542, + "step": 8697 + }, + { + "epoch": 0.76, + "grad_norm": 7.465066141922781, + "learning_rate": 1.3918158328154753e-06, + "loss": 0.8429, + "step": 8698 + }, + { + "epoch": 0.76, + "grad_norm": 8.044220797945703, + "learning_rate": 1.3908313554509527e-06, + "loss": 0.7168, + "step": 8699 + }, + { + "epoch": 0.76, + "grad_norm": 15.602181659189382, + "learning_rate": 1.389847170134353e-06, + "loss": 0.5457, + "step": 8700 + }, + { + "epoch": 0.76, + "grad_norm": 8.12739619984938, + "learning_rate": 1.3888632769453154e-06, + "loss": 0.6582, + "step": 8701 + }, + { + "epoch": 0.76, + "grad_norm": 6.298056528537876, + "learning_rate": 1.3878796759634544e-06, + "loss": 0.6646, + "step": 8702 + }, + { + "epoch": 0.76, + "grad_norm": 11.26680402115239, + "learning_rate": 1.3868963672683604e-06, + "loss": 0.5961, + "step": 8703 + }, + { + "epoch": 0.76, + "grad_norm": 5.883542306718527, + "learning_rate": 1.3859133509396034e-06, + "loss": 0.5041, + "step": 8704 + }, + { + "epoch": 0.76, + "grad_norm": 6.260680417649375, + "learning_rate": 1.3849306270567263e-06, + "loss": 0.779, + "step": 8705 + }, + { + "epoch": 0.76, + "grad_norm": 5.539076811787292, + "learning_rate": 1.38394819569925e-06, + "loss": 0.8026, + "step": 8706 + }, + { + "epoch": 0.76, + "grad_norm": 8.518213147373457, + "learning_rate": 1.3829660569466701e-06, + "loss": 0.8169, + "step": 8707 + }, + { + "epoch": 0.76, + "grad_norm": 6.853878577496333, + "learning_rate": 1.3819842108784593e-06, + "loss": 0.7464, + "step": 8708 + }, + { + "epoch": 0.76, + "grad_norm": 7.352029826437412, + "learning_rate": 1.3810026575740692e-06, + "loss": 0.6761, + "step": 8709 + }, + { + "epoch": 0.76, + "grad_norm": 5.814427168586443, + "learning_rate": 1.380021397112924e-06, + "loss": 0.6956, + "step": 8710 + }, + { + "epoch": 0.77, + "grad_norm": 10.102854892321195, + "learning_rate": 1.3790404295744258e-06, + "loss": 0.6751, + "step": 8711 + }, + { + "epoch": 0.77, + "grad_norm": 16.78424235500768, + "learning_rate": 1.3780597550379532e-06, + "loss": 0.732, + "step": 8712 + }, + { + "epoch": 0.77, + "grad_norm": 7.554781805470691, + "learning_rate": 1.3770793735828603e-06, + "loss": 0.6635, + "step": 8713 + }, + { + "epoch": 0.77, + "grad_norm": 6.88162417559707, + "learning_rate": 1.3760992852884764e-06, + "loss": 0.7253, + "step": 8714 + }, + { + "epoch": 0.77, + "grad_norm": 7.683823913924015, + "learning_rate": 1.3751194902341114e-06, + "loss": 0.7108, + "step": 8715 + }, + { + "epoch": 0.77, + "grad_norm": 13.347118445667016, + "learning_rate": 1.374139988499048e-06, + "loss": 0.752, + "step": 8716 + }, + { + "epoch": 0.77, + "grad_norm": 7.423331145037352, + "learning_rate": 1.3731607801625457e-06, + "loss": 0.6476, + "step": 8717 + }, + { + "epoch": 0.77, + "grad_norm": 2.7746872511784897, + "learning_rate": 1.37218186530384e-06, + "loss": 0.5991, + "step": 8718 + }, + { + "epoch": 0.77, + "grad_norm": 5.349443386543477, + "learning_rate": 1.3712032440021428e-06, + "loss": 0.634, + "step": 8719 + }, + { + "epoch": 0.77, + "grad_norm": 2.6651434091544384, + "learning_rate": 1.3702249163366422e-06, + "loss": 0.5034, + "step": 8720 + }, + { + "epoch": 0.77, + "grad_norm": 9.520575587656925, + "learning_rate": 1.369246882386504e-06, + "loss": 0.7875, + "step": 8721 + }, + { + "epoch": 0.77, + "grad_norm": 5.944055698989413, + "learning_rate": 1.3682691422308692e-06, + "loss": 0.8515, + "step": 8722 + }, + { + "epoch": 0.77, + "grad_norm": 14.18333588284536, + 
"learning_rate": 1.3672916959488546e-06, + "loss": 0.7519, + "step": 8723 + }, + { + "epoch": 0.77, + "grad_norm": 8.923217702083239, + "learning_rate": 1.366314543619553e-06, + "loss": 0.8723, + "step": 8724 + }, + { + "epoch": 0.77, + "grad_norm": 5.977551589731624, + "learning_rate": 1.3653376853220323e-06, + "loss": 0.7122, + "step": 8725 + }, + { + "epoch": 0.77, + "grad_norm": 12.174133519545524, + "learning_rate": 1.3643611211353424e-06, + "loss": 0.8001, + "step": 8726 + }, + { + "epoch": 0.77, + "grad_norm": 6.427539184857809, + "learning_rate": 1.3633848511385023e-06, + "loss": 0.6211, + "step": 8727 + }, + { + "epoch": 0.77, + "grad_norm": 14.324551716358974, + "learning_rate": 1.3624088754105113e-06, + "loss": 0.7305, + "step": 8728 + }, + { + "epoch": 0.77, + "grad_norm": 8.846335783959164, + "learning_rate": 1.3614331940303432e-06, + "loss": 0.8434, + "step": 8729 + }, + { + "epoch": 0.77, + "grad_norm": 6.412195015255476, + "learning_rate": 1.3604578070769486e-06, + "loss": 0.7981, + "step": 8730 + }, + { + "epoch": 0.77, + "grad_norm": 14.61526697629878, + "learning_rate": 1.3594827146292528e-06, + "loss": 0.6766, + "step": 8731 + }, + { + "epoch": 0.77, + "grad_norm": 6.227683798722631, + "learning_rate": 1.3585079167661613e-06, + "loss": 0.7539, + "step": 8732 + }, + { + "epoch": 0.77, + "grad_norm": 8.723051375087946, + "learning_rate": 1.3575334135665519e-06, + "loss": 0.8016, + "step": 8733 + }, + { + "epoch": 0.77, + "grad_norm": 7.902960411410596, + "learning_rate": 1.3565592051092795e-06, + "loss": 0.7282, + "step": 8734 + }, + { + "epoch": 0.77, + "grad_norm": 6.482803773160826, + "learning_rate": 1.355585291473176e-06, + "loss": 0.7114, + "step": 8735 + }, + { + "epoch": 0.77, + "grad_norm": 8.667881692178176, + "learning_rate": 1.3546116727370484e-06, + "loss": 0.713, + "step": 8736 + }, + { + "epoch": 0.77, + "grad_norm": 8.354027025901473, + "learning_rate": 1.353638348979679e-06, + "loss": 0.638, + "step": 8737 + }, + { + "epoch": 0.77, + "grad_norm": 11.131549439947584, + "learning_rate": 1.3526653202798306e-06, + "loss": 0.8163, + "step": 8738 + }, + { + "epoch": 0.77, + "grad_norm": 5.871978000606811, + "learning_rate": 1.3516925867162372e-06, + "loss": 0.6614, + "step": 8739 + }, + { + "epoch": 0.77, + "grad_norm": 7.35578514694306, + "learning_rate": 1.3507201483676113e-06, + "loss": 0.5809, + "step": 8740 + }, + { + "epoch": 0.77, + "grad_norm": 9.18289677944259, + "learning_rate": 1.349748005312641e-06, + "loss": 0.7099, + "step": 8741 + }, + { + "epoch": 0.77, + "grad_norm": 7.1157310462287136, + "learning_rate": 1.3487761576299896e-06, + "loss": 0.6251, + "step": 8742 + }, + { + "epoch": 0.77, + "grad_norm": 7.1867207358640215, + "learning_rate": 1.3478046053982968e-06, + "loss": 0.7154, + "step": 8743 + }, + { + "epoch": 0.77, + "grad_norm": 8.832369275422275, + "learning_rate": 1.3468333486961815e-06, + "loss": 0.5398, + "step": 8744 + }, + { + "epoch": 0.77, + "grad_norm": 11.38489262543658, + "learning_rate": 1.3458623876022353e-06, + "loss": 0.7685, + "step": 8745 + }, + { + "epoch": 0.77, + "grad_norm": 5.337314442985692, + "learning_rate": 1.3448917221950264e-06, + "loss": 0.6518, + "step": 8746 + }, + { + "epoch": 0.77, + "grad_norm": 7.0694552409690115, + "learning_rate": 1.3439213525530993e-06, + "loss": 0.6447, + "step": 8747 + }, + { + "epoch": 0.77, + "grad_norm": 7.902068266986551, + "learning_rate": 1.3429512787549725e-06, + "loss": 0.6236, + "step": 8748 + }, + { + "epoch": 0.77, + "grad_norm": 14.947922079658946, + "learning_rate": 
1.3419815008791476e-06, + "loss": 0.714, + "step": 8749 + }, + { + "epoch": 0.77, + "grad_norm": 5.441620795696565, + "learning_rate": 1.3410120190040944e-06, + "loss": 0.5572, + "step": 8750 + }, + { + "epoch": 0.77, + "grad_norm": 3.7770413665445246, + "learning_rate": 1.340042833208262e-06, + "loss": 0.5015, + "step": 8751 + }, + { + "epoch": 0.77, + "grad_norm": 8.902448394418544, + "learning_rate": 1.3390739435700757e-06, + "loss": 0.7518, + "step": 8752 + }, + { + "epoch": 0.77, + "grad_norm": 11.102933224163156, + "learning_rate": 1.3381053501679364e-06, + "loss": 0.6895, + "step": 8753 + }, + { + "epoch": 0.77, + "grad_norm": 14.26604598474782, + "learning_rate": 1.3371370530802192e-06, + "loss": 0.7286, + "step": 8754 + }, + { + "epoch": 0.77, + "grad_norm": 2.776177603486869, + "learning_rate": 1.3361690523852805e-06, + "loss": 0.5927, + "step": 8755 + }, + { + "epoch": 0.77, + "grad_norm": 10.576588742753177, + "learning_rate": 1.3352013481614474e-06, + "loss": 0.5924, + "step": 8756 + }, + { + "epoch": 0.77, + "grad_norm": 7.49891904203642, + "learning_rate": 1.3342339404870253e-06, + "loss": 0.9322, + "step": 8757 + }, + { + "epoch": 0.77, + "grad_norm": 7.8442552334593, + "learning_rate": 1.3332668294402946e-06, + "loss": 0.6265, + "step": 8758 + }, + { + "epoch": 0.77, + "grad_norm": 6.731657778287067, + "learning_rate": 1.3323000150995135e-06, + "loss": 0.724, + "step": 8759 + }, + { + "epoch": 0.77, + "grad_norm": 10.085486250746031, + "learning_rate": 1.331333497542912e-06, + "loss": 0.8792, + "step": 8760 + }, + { + "epoch": 0.77, + "grad_norm": 6.93201009903357, + "learning_rate": 1.3303672768487036e-06, + "loss": 0.8441, + "step": 8761 + }, + { + "epoch": 0.77, + "grad_norm": 6.371595699589806, + "learning_rate": 1.3294013530950706e-06, + "loss": 0.7347, + "step": 8762 + }, + { + "epoch": 0.77, + "grad_norm": 14.270783180599054, + "learning_rate": 1.3284357263601744e-06, + "loss": 0.8248, + "step": 8763 + }, + { + "epoch": 0.77, + "grad_norm": 23.0701867825164, + "learning_rate": 1.3274703967221519e-06, + "loss": 0.7816, + "step": 8764 + }, + { + "epoch": 0.77, + "grad_norm": 9.535126326160935, + "learning_rate": 1.3265053642591147e-06, + "loss": 0.7379, + "step": 8765 + }, + { + "epoch": 0.77, + "grad_norm": 10.286050864187809, + "learning_rate": 1.3255406290491541e-06, + "loss": 0.7734, + "step": 8766 + }, + { + "epoch": 0.77, + "grad_norm": 7.512645554724417, + "learning_rate": 1.3245761911703336e-06, + "loss": 0.733, + "step": 8767 + }, + { + "epoch": 0.77, + "grad_norm": 7.72173052760993, + "learning_rate": 1.3236120507006945e-06, + "loss": 0.8537, + "step": 8768 + }, + { + "epoch": 0.77, + "grad_norm": 10.233358690923374, + "learning_rate": 1.3226482077182523e-06, + "loss": 0.929, + "step": 8769 + }, + { + "epoch": 0.77, + "grad_norm": 8.550549776216473, + "learning_rate": 1.3216846623010004e-06, + "loss": 0.6701, + "step": 8770 + }, + { + "epoch": 0.77, + "grad_norm": 3.4510142552719474, + "learning_rate": 1.3207214145269054e-06, + "loss": 0.5546, + "step": 8771 + }, + { + "epoch": 0.77, + "grad_norm": 6.020525600560378, + "learning_rate": 1.3197584644739153e-06, + "loss": 0.7389, + "step": 8772 + }, + { + "epoch": 0.77, + "grad_norm": 12.260868094638964, + "learning_rate": 1.3187958122199478e-06, + "loss": 0.7592, + "step": 8773 + }, + { + "epoch": 0.77, + "grad_norm": 10.057910364285778, + "learning_rate": 1.3178334578429003e-06, + "loss": 0.8225, + "step": 8774 + }, + { + "epoch": 0.77, + "grad_norm": 2.946316218076359, + "learning_rate": 
1.3168714014206446e-06, + "loss": 0.5518, + "step": 8775 + }, + { + "epoch": 0.77, + "grad_norm": 8.66947847635663, + "learning_rate": 1.315909643031028e-06, + "loss": 0.6794, + "step": 8776 + }, + { + "epoch": 0.77, + "grad_norm": 8.893215912346303, + "learning_rate": 1.314948182751874e-06, + "loss": 0.803, + "step": 8777 + }, + { + "epoch": 0.77, + "grad_norm": 11.335062815749591, + "learning_rate": 1.313987020660984e-06, + "loss": 0.6603, + "step": 8778 + }, + { + "epoch": 0.77, + "grad_norm": 13.613149759016531, + "learning_rate": 1.3130261568361335e-06, + "loss": 0.6204, + "step": 8779 + }, + { + "epoch": 0.77, + "grad_norm": 17.265682660076656, + "learning_rate": 1.3120655913550734e-06, + "loss": 0.7138, + "step": 8780 + }, + { + "epoch": 0.77, + "grad_norm": 5.050708713485543, + "learning_rate": 1.3111053242955307e-06, + "loss": 0.6544, + "step": 8781 + }, + { + "epoch": 0.77, + "grad_norm": 8.320275354151176, + "learning_rate": 1.3101453557352096e-06, + "loss": 0.6964, + "step": 8782 + }, + { + "epoch": 0.77, + "grad_norm": 7.477956627091802, + "learning_rate": 1.3091856857517866e-06, + "loss": 0.8169, + "step": 8783 + }, + { + "epoch": 0.77, + "grad_norm": 11.008261400012582, + "learning_rate": 1.3082263144229201e-06, + "loss": 0.8255, + "step": 8784 + }, + { + "epoch": 0.77, + "grad_norm": 6.752883102161081, + "learning_rate": 1.3072672418262394e-06, + "loss": 0.766, + "step": 8785 + }, + { + "epoch": 0.77, + "grad_norm": 9.823726383953764, + "learning_rate": 1.3063084680393511e-06, + "loss": 0.6491, + "step": 8786 + }, + { + "epoch": 0.77, + "grad_norm": 9.703797719095418, + "learning_rate": 1.3053499931398373e-06, + "loss": 0.6446, + "step": 8787 + }, + { + "epoch": 0.77, + "grad_norm": 6.915994682912011, + "learning_rate": 1.3043918172052545e-06, + "loss": 0.8304, + "step": 8788 + }, + { + "epoch": 0.77, + "grad_norm": 2.3790927656260847, + "learning_rate": 1.3034339403131402e-06, + "loss": 0.5265, + "step": 8789 + }, + { + "epoch": 0.77, + "grad_norm": 14.804791312383543, + "learning_rate": 1.3024763625410025e-06, + "loss": 0.8515, + "step": 8790 + }, + { + "epoch": 0.77, + "grad_norm": 7.2115349397227595, + "learning_rate": 1.3015190839663272e-06, + "loss": 0.7307, + "step": 8791 + }, + { + "epoch": 0.77, + "grad_norm": 7.436209346617742, + "learning_rate": 1.3005621046665756e-06, + "loss": 0.7666, + "step": 8792 + }, + { + "epoch": 0.77, + "grad_norm": 7.0446754822119475, + "learning_rate": 1.2996054247191848e-06, + "loss": 0.7345, + "step": 8793 + }, + { + "epoch": 0.77, + "grad_norm": 7.860697336192859, + "learning_rate": 1.2986490442015659e-06, + "loss": 0.6159, + "step": 8794 + }, + { + "epoch": 0.77, + "grad_norm": 8.563089950325496, + "learning_rate": 1.2976929631911112e-06, + "loss": 0.7376, + "step": 8795 + }, + { + "epoch": 0.77, + "grad_norm": 13.773104641685098, + "learning_rate": 1.2967371817651836e-06, + "loss": 0.7483, + "step": 8796 + }, + { + "epoch": 0.77, + "grad_norm": 42.66805204340833, + "learning_rate": 1.2957817000011224e-06, + "loss": 0.7848, + "step": 8797 + }, + { + "epoch": 0.77, + "grad_norm": 9.720647777247015, + "learning_rate": 1.2948265179762453e-06, + "loss": 0.6808, + "step": 8798 + }, + { + "epoch": 0.77, + "grad_norm": 8.58175639559625, + "learning_rate": 1.2938716357678427e-06, + "loss": 0.8258, + "step": 8799 + }, + { + "epoch": 0.77, + "grad_norm": 8.471174712916202, + "learning_rate": 1.292917053453181e-06, + "loss": 0.5928, + "step": 8800 + }, + { + "epoch": 0.77, + "grad_norm": 7.8546848995613825, + "learning_rate": 
1.2919627711095068e-06, + "loss": 0.5341, + "step": 8801 + }, + { + "epoch": 0.77, + "grad_norm": 8.74273907286214, + "learning_rate": 1.2910087888140365e-06, + "loss": 0.8288, + "step": 8802 + }, + { + "epoch": 0.77, + "grad_norm": 2.6834806612948863, + "learning_rate": 1.290055106643966e-06, + "loss": 0.4717, + "step": 8803 + }, + { + "epoch": 0.77, + "grad_norm": 8.437491751269093, + "learning_rate": 1.2891017246764648e-06, + "loss": 0.6009, + "step": 8804 + }, + { + "epoch": 0.77, + "grad_norm": 5.897247005150633, + "learning_rate": 1.288148642988678e-06, + "loss": 0.7165, + "step": 8805 + }, + { + "epoch": 0.77, + "grad_norm": 11.506781872955159, + "learning_rate": 1.28719586165773e-06, + "loss": 0.7025, + "step": 8806 + }, + { + "epoch": 0.77, + "grad_norm": 5.518847018465194, + "learning_rate": 1.2862433807607172e-06, + "loss": 0.8113, + "step": 8807 + }, + { + "epoch": 0.77, + "grad_norm": 9.74551322271975, + "learning_rate": 1.285291200374712e-06, + "loss": 0.769, + "step": 8808 + }, + { + "epoch": 0.77, + "grad_norm": 10.646275154395655, + "learning_rate": 1.284339320576764e-06, + "loss": 0.8651, + "step": 8809 + }, + { + "epoch": 0.77, + "grad_norm": 6.966958472836764, + "learning_rate": 1.2833877414438977e-06, + "loss": 0.681, + "step": 8810 + }, + { + "epoch": 0.77, + "grad_norm": 10.564562616606226, + "learning_rate": 1.2824364630531117e-06, + "loss": 0.7611, + "step": 8811 + }, + { + "epoch": 0.77, + "grad_norm": 8.266729401987941, + "learning_rate": 1.281485485481384e-06, + "loss": 0.6654, + "step": 8812 + }, + { + "epoch": 0.77, + "grad_norm": 10.624595179569754, + "learning_rate": 1.2805348088056663e-06, + "loss": 0.7639, + "step": 8813 + }, + { + "epoch": 0.77, + "grad_norm": 8.850519413097532, + "learning_rate": 1.2795844331028844e-06, + "loss": 0.6401, + "step": 8814 + }, + { + "epoch": 0.77, + "grad_norm": 7.426800121114585, + "learning_rate": 1.2786343584499416e-06, + "loss": 0.6442, + "step": 8815 + }, + { + "epoch": 0.77, + "grad_norm": 9.811951512161551, + "learning_rate": 1.2776845849237164e-06, + "loss": 0.8081, + "step": 8816 + }, + { + "epoch": 0.77, + "grad_norm": 22.263991294525365, + "learning_rate": 1.2767351126010612e-06, + "loss": 0.6839, + "step": 8817 + }, + { + "epoch": 0.77, + "grad_norm": 8.233167302066029, + "learning_rate": 1.2757859415588087e-06, + "loss": 0.7313, + "step": 8818 + }, + { + "epoch": 0.77, + "grad_norm": 32.63879436647405, + "learning_rate": 1.2748370718737636e-06, + "loss": 0.7895, + "step": 8819 + }, + { + "epoch": 0.77, + "grad_norm": 8.2917878464905, + "learning_rate": 1.2738885036227055e-06, + "loss": 0.6698, + "step": 8820 + }, + { + "epoch": 0.77, + "grad_norm": 8.444025286833845, + "learning_rate": 1.2729402368823917e-06, + "loss": 0.8007, + "step": 8821 + }, + { + "epoch": 0.77, + "grad_norm": 18.075259690358553, + "learning_rate": 1.271992271729554e-06, + "loss": 0.6727, + "step": 8822 + }, + { + "epoch": 0.77, + "grad_norm": 5.529209357492956, + "learning_rate": 1.2710446082408996e-06, + "loss": 0.6108, + "step": 8823 + }, + { + "epoch": 0.77, + "grad_norm": 7.458525966493961, + "learning_rate": 1.2700972464931139e-06, + "loss": 0.7715, + "step": 8824 + }, + { + "epoch": 0.78, + "grad_norm": 11.531188421815044, + "learning_rate": 1.2691501865628547e-06, + "loss": 0.5588, + "step": 8825 + }, + { + "epoch": 0.78, + "grad_norm": 36.24522517195984, + "learning_rate": 1.2682034285267564e-06, + "loss": 0.6711, + "step": 8826 + }, + { + "epoch": 0.78, + "grad_norm": 2.356142658327255, + "learning_rate": 1.2672569724614291e-06, 
+ "loss": 0.4568, + "step": 8827 + }, + { + "epoch": 0.78, + "grad_norm": 5.780235090282325, + "learning_rate": 1.2663108184434574e-06, + "loss": 0.6864, + "step": 8828 + }, + { + "epoch": 0.78, + "grad_norm": 10.999669522769254, + "learning_rate": 1.2653649665494044e-06, + "loss": 0.711, + "step": 8829 + }, + { + "epoch": 0.78, + "grad_norm": 6.263805208724202, + "learning_rate": 1.2644194168558066e-06, + "loss": 0.7761, + "step": 8830 + }, + { + "epoch": 0.78, + "grad_norm": 10.712246743462776, + "learning_rate": 1.2634741694391756e-06, + "loss": 0.8649, + "step": 8831 + }, + { + "epoch": 0.78, + "grad_norm": 7.054005434650165, + "learning_rate": 1.2625292243759995e-06, + "loss": 0.7196, + "step": 8832 + }, + { + "epoch": 0.78, + "grad_norm": 7.908291084011567, + "learning_rate": 1.2615845817427414e-06, + "loss": 0.7618, + "step": 8833 + }, + { + "epoch": 0.78, + "grad_norm": 11.805890465624469, + "learning_rate": 1.2606402416158391e-06, + "loss": 0.72, + "step": 8834 + }, + { + "epoch": 0.78, + "grad_norm": 7.907206366019394, + "learning_rate": 1.2596962040717098e-06, + "loss": 0.5523, + "step": 8835 + }, + { + "epoch": 0.78, + "grad_norm": 7.266406529928336, + "learning_rate": 1.258752469186742e-06, + "loss": 0.5613, + "step": 8836 + }, + { + "epoch": 0.78, + "grad_norm": 5.154906579871336, + "learning_rate": 1.257809037037301e-06, + "loss": 0.5724, + "step": 8837 + }, + { + "epoch": 0.78, + "grad_norm": 4.500458703688294, + "learning_rate": 1.256865907699728e-06, + "loss": 0.6615, + "step": 8838 + }, + { + "epoch": 0.78, + "grad_norm": 8.623894028689657, + "learning_rate": 1.2559230812503392e-06, + "loss": 0.6887, + "step": 8839 + }, + { + "epoch": 0.78, + "grad_norm": 5.9596611473625725, + "learning_rate": 1.2549805577654257e-06, + "loss": 0.6701, + "step": 8840 + }, + { + "epoch": 0.78, + "grad_norm": 6.883953117115505, + "learning_rate": 1.2540383373212572e-06, + "loss": 0.7716, + "step": 8841 + }, + { + "epoch": 0.78, + "grad_norm": 1.9497414259789048, + "learning_rate": 1.2530964199940748e-06, + "loss": 0.4889, + "step": 8842 + }, + { + "epoch": 0.78, + "grad_norm": 8.359237983842947, + "learning_rate": 1.252154805860098e-06, + "loss": 0.8861, + "step": 8843 + }, + { + "epoch": 0.78, + "grad_norm": 9.392325542355689, + "learning_rate": 1.2512134949955202e-06, + "loss": 0.7503, + "step": 8844 + }, + { + "epoch": 0.78, + "grad_norm": 4.346159421395749, + "learning_rate": 1.2502724874765087e-06, + "loss": 0.74, + "step": 8845 + }, + { + "epoch": 0.78, + "grad_norm": 5.790696259676842, + "learning_rate": 1.2493317833792122e-06, + "loss": 0.6884, + "step": 8846 + }, + { + "epoch": 0.78, + "grad_norm": 8.815589372787583, + "learning_rate": 1.2483913827797483e-06, + "loss": 0.6577, + "step": 8847 + }, + { + "epoch": 0.78, + "grad_norm": 5.4843782233388465, + "learning_rate": 1.2474512857542137e-06, + "loss": 0.661, + "step": 8848 + }, + { + "epoch": 0.78, + "grad_norm": 8.082242702324754, + "learning_rate": 1.2465114923786786e-06, + "loss": 0.7123, + "step": 8849 + }, + { + "epoch": 0.78, + "grad_norm": 2.004099731224515, + "learning_rate": 1.2455720027291902e-06, + "loss": 0.5197, + "step": 8850 + }, + { + "epoch": 0.78, + "grad_norm": 12.566209820826785, + "learning_rate": 1.2446328168817684e-06, + "loss": 0.776, + "step": 8851 + }, + { + "epoch": 0.78, + "grad_norm": 14.74090305247015, + "learning_rate": 1.2436939349124144e-06, + "loss": 0.6308, + "step": 8852 + }, + { + "epoch": 0.78, + "grad_norm": 7.68992886461999, + "learning_rate": 1.2427553568970986e-06, + "loss": 0.7564, + 
"step": 8853 + }, + { + "epoch": 0.78, + "grad_norm": 6.419207869401286, + "learning_rate": 1.2418170829117692e-06, + "loss": 0.6939, + "step": 8854 + }, + { + "epoch": 0.78, + "grad_norm": 6.461137825997049, + "learning_rate": 1.24087911303235e-06, + "loss": 0.6791, + "step": 8855 + }, + { + "epoch": 0.78, + "grad_norm": 16.72107039808863, + "learning_rate": 1.2399414473347405e-06, + "loss": 0.7835, + "step": 8856 + }, + { + "epoch": 0.78, + "grad_norm": 8.464506517571404, + "learning_rate": 1.2390040858948126e-06, + "loss": 0.8291, + "step": 8857 + }, + { + "epoch": 0.78, + "grad_norm": 6.245213439743416, + "learning_rate": 1.2380670287884194e-06, + "loss": 0.5986, + "step": 8858 + }, + { + "epoch": 0.78, + "grad_norm": 6.925614040620021, + "learning_rate": 1.237130276091385e-06, + "loss": 0.8226, + "step": 8859 + }, + { + "epoch": 0.78, + "grad_norm": 5.335691975805414, + "learning_rate": 1.2361938278795093e-06, + "loss": 0.5944, + "step": 8860 + }, + { + "epoch": 0.78, + "grad_norm": 10.538024427456389, + "learning_rate": 1.235257684228568e-06, + "loss": 0.8062, + "step": 8861 + }, + { + "epoch": 0.78, + "grad_norm": 13.031378560145821, + "learning_rate": 1.234321845214313e-06, + "loss": 0.6231, + "step": 8862 + }, + { + "epoch": 0.78, + "grad_norm": 6.507309456665436, + "learning_rate": 1.2333863109124684e-06, + "loss": 0.8301, + "step": 8863 + }, + { + "epoch": 0.78, + "grad_norm": 10.779294578827422, + "learning_rate": 1.23245108139874e-06, + "loss": 0.7991, + "step": 8864 + }, + { + "epoch": 0.78, + "grad_norm": 10.069303764752775, + "learning_rate": 1.2315161567488027e-06, + "loss": 0.9342, + "step": 8865 + }, + { + "epoch": 0.78, + "grad_norm": 8.472143094563824, + "learning_rate": 1.23058153703831e-06, + "loss": 0.6666, + "step": 8866 + }, + { + "epoch": 0.78, + "grad_norm": 7.826490363258301, + "learning_rate": 1.229647222342889e-06, + "loss": 0.6683, + "step": 8867 + }, + { + "epoch": 0.78, + "grad_norm": 7.959893804972599, + "learning_rate": 1.2287132127381412e-06, + "loss": 0.8157, + "step": 8868 + }, + { + "epoch": 0.78, + "grad_norm": 9.181976571443847, + "learning_rate": 1.227779508299649e-06, + "loss": 0.6915, + "step": 8869 + }, + { + "epoch": 0.78, + "grad_norm": 6.93403581505879, + "learning_rate": 1.2268461091029638e-06, + "loss": 0.8963, + "step": 8870 + }, + { + "epoch": 0.78, + "grad_norm": 10.434983687603143, + "learning_rate": 1.2259130152236153e-06, + "loss": 0.641, + "step": 8871 + }, + { + "epoch": 0.78, + "grad_norm": 6.59067425077541, + "learning_rate": 1.224980226737108e-06, + "loss": 0.6878, + "step": 8872 + }, + { + "epoch": 0.78, + "grad_norm": 6.353403441171738, + "learning_rate": 1.2240477437189212e-06, + "loss": 0.8094, + "step": 8873 + }, + { + "epoch": 0.78, + "grad_norm": 21.45817439876477, + "learning_rate": 1.2231155662445087e-06, + "loss": 0.7607, + "step": 8874 + }, + { + "epoch": 0.78, + "grad_norm": 8.408315079348904, + "learning_rate": 1.2221836943893034e-06, + "loss": 0.7425, + "step": 8875 + }, + { + "epoch": 0.78, + "grad_norm": 12.130437827598493, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.8125, + "step": 8876 + }, + { + "epoch": 0.78, + "grad_norm": 14.562521308334752, + "learning_rate": 1.2203208678381073e-06, + "loss": 0.6999, + "step": 8877 + }, + { + "epoch": 0.78, + "grad_norm": 8.20500740293367, + "learning_rate": 1.2193899132928539e-06, + "loss": 0.7693, + "step": 8878 + }, + { + "epoch": 0.78, + "grad_norm": 6.188782480582762, + "learning_rate": 1.2184592646682797e-06, + "loss": 0.7479, + "step": 8879 + }, + { + 
"epoch": 0.78, + "grad_norm": 12.890849315937144, + "learning_rate": 1.2175289220396897e-06, + "loss": 0.8055, + "step": 8880 + }, + { + "epoch": 0.78, + "grad_norm": 9.113921263937561, + "learning_rate": 1.2165988854823696e-06, + "loss": 0.667, + "step": 8881 + }, + { + "epoch": 0.78, + "grad_norm": 9.390988168229857, + "learning_rate": 1.2156691550715737e-06, + "loss": 0.7694, + "step": 8882 + }, + { + "epoch": 0.78, + "grad_norm": 7.720098276523093, + "learning_rate": 1.2147397308825354e-06, + "loss": 0.7901, + "step": 8883 + }, + { + "epoch": 0.78, + "grad_norm": 7.845400058637514, + "learning_rate": 1.2138106129904613e-06, + "loss": 0.7575, + "step": 8884 + }, + { + "epoch": 0.78, + "grad_norm": 3.273742727807788, + "learning_rate": 1.2128818014705334e-06, + "loss": 0.4746, + "step": 8885 + }, + { + "epoch": 0.78, + "grad_norm": 6.082466960181305, + "learning_rate": 1.211953296397912e-06, + "loss": 0.6886, + "step": 8886 + }, + { + "epoch": 0.78, + "grad_norm": 7.447634915556724, + "learning_rate": 1.2110250978477284e-06, + "loss": 0.7011, + "step": 8887 + }, + { + "epoch": 0.78, + "grad_norm": 7.041956182948543, + "learning_rate": 1.210097205895091e-06, + "loss": 0.715, + "step": 8888 + }, + { + "epoch": 0.78, + "grad_norm": 7.9196378386288675, + "learning_rate": 1.2091696206150843e-06, + "loss": 0.5785, + "step": 8889 + }, + { + "epoch": 0.78, + "grad_norm": 9.195468952703317, + "learning_rate": 1.2082423420827655e-06, + "loss": 0.9572, + "step": 8890 + }, + { + "epoch": 0.78, + "grad_norm": 11.0361458160725, + "learning_rate": 1.2073153703731683e-06, + "loss": 0.6811, + "step": 8891 + }, + { + "epoch": 0.78, + "grad_norm": 6.769571264543007, + "learning_rate": 1.2063887055613033e-06, + "loss": 0.6877, + "step": 8892 + }, + { + "epoch": 0.78, + "grad_norm": 3.721544414236292, + "learning_rate": 1.2054623477221545e-06, + "loss": 0.5449, + "step": 8893 + }, + { + "epoch": 0.78, + "grad_norm": 7.884931436801484, + "learning_rate": 1.20453629693068e-06, + "loss": 0.662, + "step": 8894 + }, + { + "epoch": 0.78, + "grad_norm": 10.555819475489395, + "learning_rate": 1.2036105532618154e-06, + "loss": 0.791, + "step": 8895 + }, + { + "epoch": 0.78, + "grad_norm": 7.310820465429855, + "learning_rate": 1.2026851167904697e-06, + "loss": 0.6816, + "step": 8896 + }, + { + "epoch": 0.78, + "grad_norm": 7.445830051516451, + "learning_rate": 1.2017599875915264e-06, + "loss": 0.7647, + "step": 8897 + }, + { + "epoch": 0.78, + "grad_norm": 13.27179718012978, + "learning_rate": 1.2008351657398481e-06, + "loss": 0.9307, + "step": 8898 + }, + { + "epoch": 0.78, + "grad_norm": 10.41260010113509, + "learning_rate": 1.1999106513102687e-06, + "loss": 0.6018, + "step": 8899 + }, + { + "epoch": 0.78, + "grad_norm": 6.572623367003266, + "learning_rate": 1.1989864443775984e-06, + "loss": 0.8579, + "step": 8900 + }, + { + "epoch": 0.78, + "grad_norm": 8.071479197837485, + "learning_rate": 1.1980625450166222e-06, + "loss": 0.7608, + "step": 8901 + }, + { + "epoch": 0.78, + "grad_norm": 13.342484097770717, + "learning_rate": 1.1971389533020995e-06, + "loss": 0.8077, + "step": 8902 + }, + { + "epoch": 0.78, + "grad_norm": 12.485047310610621, + "learning_rate": 1.1962156693087684e-06, + "loss": 0.8467, + "step": 8903 + }, + { + "epoch": 0.78, + "grad_norm": 7.727790715453803, + "learning_rate": 1.195292693111338e-06, + "loss": 0.7329, + "step": 8904 + }, + { + "epoch": 0.78, + "grad_norm": 8.56398931069056, + "learning_rate": 1.1943700247844936e-06, + "loss": 0.7998, + "step": 8905 + }, + { + "epoch": 0.78, + 
"grad_norm": 6.3083132583624275, + "learning_rate": 1.193447664402897e-06, + "loss": 0.5958, + "step": 8906 + }, + { + "epoch": 0.78, + "grad_norm": 14.418807143697423, + "learning_rate": 1.1925256120411831e-06, + "loss": 0.8472, + "step": 8907 + }, + { + "epoch": 0.78, + "grad_norm": 9.04856357349795, + "learning_rate": 1.1916038677739623e-06, + "loss": 0.7155, + "step": 8908 + }, + { + "epoch": 0.78, + "grad_norm": 15.707981505845416, + "learning_rate": 1.190682431675823e-06, + "loss": 0.7011, + "step": 8909 + }, + { + "epoch": 0.78, + "grad_norm": 9.95681715277562, + "learning_rate": 1.1897613038213246e-06, + "loss": 0.7836, + "step": 8910 + }, + { + "epoch": 0.78, + "grad_norm": 6.124942776493706, + "learning_rate": 1.1888404842850031e-06, + "loss": 0.7936, + "step": 8911 + }, + { + "epoch": 0.78, + "grad_norm": 7.491331429786411, + "learning_rate": 1.1879199731413709e-06, + "loss": 0.6791, + "step": 8912 + }, + { + "epoch": 0.78, + "grad_norm": 5.235815607213947, + "learning_rate": 1.1869997704649127e-06, + "loss": 0.7803, + "step": 8913 + }, + { + "epoch": 0.78, + "grad_norm": 8.753094867266459, + "learning_rate": 1.1860798763300896e-06, + "loss": 0.746, + "step": 8914 + }, + { + "epoch": 0.78, + "grad_norm": 4.8557798427885555, + "learning_rate": 1.1851602908113403e-06, + "loss": 0.6567, + "step": 8915 + }, + { + "epoch": 0.78, + "grad_norm": 7.745230220887685, + "learning_rate": 1.184241013983074e-06, + "loss": 0.8062, + "step": 8916 + }, + { + "epoch": 0.78, + "grad_norm": 10.866806135434258, + "learning_rate": 1.183322045919678e-06, + "loss": 0.6695, + "step": 8917 + }, + { + "epoch": 0.78, + "grad_norm": 17.80104313490844, + "learning_rate": 1.182403386695513e-06, + "loss": 0.8983, + "step": 8918 + }, + { + "epoch": 0.78, + "grad_norm": 18.087607102981103, + "learning_rate": 1.181485036384916e-06, + "loss": 0.688, + "step": 8919 + }, + { + "epoch": 0.78, + "grad_norm": 7.98200138423517, + "learning_rate": 1.1805669950621968e-06, + "loss": 0.8358, + "step": 8920 + }, + { + "epoch": 0.78, + "grad_norm": 7.316066329001479, + "learning_rate": 1.1796492628016443e-06, + "loss": 0.6944, + "step": 8921 + }, + { + "epoch": 0.78, + "grad_norm": 8.005784065017325, + "learning_rate": 1.1787318396775188e-06, + "loss": 0.6395, + "step": 8922 + }, + { + "epoch": 0.78, + "grad_norm": 13.522072972673689, + "learning_rate": 1.177814725764056e-06, + "loss": 0.7062, + "step": 8923 + }, + { + "epoch": 0.78, + "grad_norm": 7.67147573392786, + "learning_rate": 1.176897921135468e-06, + "loss": 0.6902, + "step": 8924 + }, + { + "epoch": 0.78, + "grad_norm": 6.984837632256193, + "learning_rate": 1.1759814258659397e-06, + "loss": 0.742, + "step": 8925 + }, + { + "epoch": 0.78, + "grad_norm": 6.481491120327404, + "learning_rate": 1.1750652400296342e-06, + "loss": 0.6711, + "step": 8926 + }, + { + "epoch": 0.78, + "grad_norm": 6.289403958659639, + "learning_rate": 1.1741493637006878e-06, + "loss": 0.7253, + "step": 8927 + }, + { + "epoch": 0.78, + "grad_norm": 7.387227536921199, + "learning_rate": 1.1732337969532105e-06, + "loss": 0.71, + "step": 8928 + }, + { + "epoch": 0.78, + "grad_norm": 5.599201568082832, + "learning_rate": 1.1723185398612891e-06, + "loss": 0.7817, + "step": 8929 + }, + { + "epoch": 0.78, + "grad_norm": 5.745156938433286, + "learning_rate": 1.1714035924989842e-06, + "loss": 0.6026, + "step": 8930 + }, + { + "epoch": 0.78, + "grad_norm": 8.419540475658666, + "learning_rate": 1.1704889549403303e-06, + "loss": 0.6908, + "step": 8931 + }, + { + "epoch": 0.78, + "grad_norm": 
6.703894869568693, + "learning_rate": 1.1695746272593422e-06, + "loss": 0.6461, + "step": 8932 + }, + { + "epoch": 0.78, + "grad_norm": 30.70533740802035, + "learning_rate": 1.1686606095300034e-06, + "loss": 0.7309, + "step": 8933 + }, + { + "epoch": 0.78, + "grad_norm": 11.145965573556648, + "learning_rate": 1.167746901826275e-06, + "loss": 0.8027, + "step": 8934 + }, + { + "epoch": 0.78, + "grad_norm": 5.701364062493659, + "learning_rate": 1.1668335042220924e-06, + "loss": 0.6928, + "step": 8935 + }, + { + "epoch": 0.78, + "grad_norm": 11.591571181175345, + "learning_rate": 1.1659204167913669e-06, + "loss": 0.9074, + "step": 8936 + }, + { + "epoch": 0.78, + "grad_norm": 6.96618084000368, + "learning_rate": 1.1650076396079818e-06, + "loss": 0.7628, + "step": 8937 + }, + { + "epoch": 0.78, + "grad_norm": 8.407322216019653, + "learning_rate": 1.1640951727458011e-06, + "loss": 0.7261, + "step": 8938 + }, + { + "epoch": 0.79, + "grad_norm": 5.503392393727406, + "learning_rate": 1.163183016278658e-06, + "loss": 0.6796, + "step": 8939 + }, + { + "epoch": 0.79, + "grad_norm": 8.02915828148453, + "learning_rate": 1.162271170280363e-06, + "loss": 0.6865, + "step": 8940 + }, + { + "epoch": 0.79, + "grad_norm": 6.663340694555714, + "learning_rate": 1.1613596348247015e-06, + "loss": 0.7252, + "step": 8941 + }, + { + "epoch": 0.79, + "grad_norm": 15.18605336981366, + "learning_rate": 1.1604484099854319e-06, + "loss": 0.7462, + "step": 8942 + }, + { + "epoch": 0.79, + "grad_norm": 9.410614255263635, + "learning_rate": 1.159537495836291e-06, + "loss": 0.5976, + "step": 8943 + }, + { + "epoch": 0.79, + "grad_norm": 3.9032750720246594, + "learning_rate": 1.158626892450988e-06, + "loss": 0.7232, + "step": 8944 + }, + { + "epoch": 0.79, + "grad_norm": 16.861908122810554, + "learning_rate": 1.157716599903207e-06, + "loss": 0.7734, + "step": 8945 + }, + { + "epoch": 0.79, + "grad_norm": 6.449455642620154, + "learning_rate": 1.1568066182666072e-06, + "loss": 0.7559, + "step": 8946 + }, + { + "epoch": 0.79, + "grad_norm": 6.598309401009037, + "learning_rate": 1.155896947614823e-06, + "loss": 0.6304, + "step": 8947 + }, + { + "epoch": 0.79, + "grad_norm": 7.05104860639195, + "learning_rate": 1.1549875880214624e-06, + "loss": 0.8143, + "step": 8948 + }, + { + "epoch": 0.79, + "grad_norm": 11.623922584630233, + "learning_rate": 1.1540785395601118e-06, + "loss": 0.8378, + "step": 8949 + }, + { + "epoch": 0.79, + "grad_norm": 9.232866990255507, + "learning_rate": 1.153169802304328e-06, + "loss": 0.6743, + "step": 8950 + }, + { + "epoch": 0.79, + "grad_norm": 7.4651375938766975, + "learning_rate": 1.1522613763276453e-06, + "loss": 0.6212, + "step": 8951 + }, + { + "epoch": 0.79, + "grad_norm": 5.942557646008107, + "learning_rate": 1.151353261703571e-06, + "loss": 0.6632, + "step": 8952 + }, + { + "epoch": 0.79, + "grad_norm": 6.200714052609229, + "learning_rate": 1.1504454585055897e-06, + "loss": 0.7109, + "step": 8953 + }, + { + "epoch": 0.79, + "grad_norm": 7.29427424583139, + "learning_rate": 1.1495379668071566e-06, + "loss": 0.6296, + "step": 8954 + }, + { + "epoch": 0.79, + "grad_norm": 6.09058122845786, + "learning_rate": 1.1486307866817082e-06, + "loss": 0.5694, + "step": 8955 + }, + { + "epoch": 0.79, + "grad_norm": 8.68953731324268, + "learning_rate": 1.1477239182026494e-06, + "loss": 0.6273, + "step": 8956 + }, + { + "epoch": 0.79, + "grad_norm": 6.235358742031987, + "learning_rate": 1.1468173614433637e-06, + "loss": 0.6904, + "step": 8957 + }, + { + "epoch": 0.79, + "grad_norm": 7.634596870582409, + 
"learning_rate": 1.1459111164772073e-06, + "loss": 0.868, + "step": 8958 + }, + { + "epoch": 0.79, + "grad_norm": 5.826482941211521, + "learning_rate": 1.1450051833775127e-06, + "loss": 0.6661, + "step": 8959 + }, + { + "epoch": 0.79, + "grad_norm": 8.542269755730917, + "learning_rate": 1.1440995622175849e-06, + "loss": 0.7943, + "step": 8960 + }, + { + "epoch": 0.79, + "grad_norm": 7.246236550545804, + "learning_rate": 1.143194253070708e-06, + "loss": 0.8932, + "step": 8961 + }, + { + "epoch": 0.79, + "grad_norm": 10.054946474860742, + "learning_rate": 1.1422892560101363e-06, + "loss": 0.7709, + "step": 8962 + }, + { + "epoch": 0.79, + "grad_norm": 7.444494547150574, + "learning_rate": 1.1413845711091015e-06, + "loss": 0.7263, + "step": 8963 + }, + { + "epoch": 0.79, + "grad_norm": 17.32185963413271, + "learning_rate": 1.140480198440808e-06, + "loss": 0.6899, + "step": 8964 + }, + { + "epoch": 0.79, + "grad_norm": 7.5595244458856365, + "learning_rate": 1.139576138078436e-06, + "loss": 0.8143, + "step": 8965 + }, + { + "epoch": 0.79, + "grad_norm": 9.071307983714867, + "learning_rate": 1.138672390095143e-06, + "loss": 0.8274, + "step": 8966 + }, + { + "epoch": 0.79, + "grad_norm": 8.971691814145007, + "learning_rate": 1.137768954564057e-06, + "loss": 0.7765, + "step": 8967 + }, + { + "epoch": 0.79, + "grad_norm": 5.214145488915406, + "learning_rate": 1.1368658315582825e-06, + "loss": 0.7573, + "step": 8968 + }, + { + "epoch": 0.79, + "grad_norm": 17.590376357268685, + "learning_rate": 1.1359630211508987e-06, + "loss": 0.8101, + "step": 8969 + }, + { + "epoch": 0.79, + "grad_norm": 9.304580235078092, + "learning_rate": 1.1350605234149604e-06, + "loss": 0.7318, + "step": 8970 + }, + { + "epoch": 0.79, + "grad_norm": 14.627688101517352, + "learning_rate": 1.134158338423494e-06, + "loss": 0.8192, + "step": 8971 + }, + { + "epoch": 0.79, + "grad_norm": 11.910426645911475, + "learning_rate": 1.1332564662495055e-06, + "loss": 0.6375, + "step": 8972 + }, + { + "epoch": 0.79, + "grad_norm": 7.165162995209304, + "learning_rate": 1.1323549069659717e-06, + "loss": 0.6854, + "step": 8973 + }, + { + "epoch": 0.79, + "grad_norm": 9.9296552183297, + "learning_rate": 1.1314536606458454e-06, + "loss": 0.7648, + "step": 8974 + }, + { + "epoch": 0.79, + "grad_norm": 6.681060744135518, + "learning_rate": 1.1305527273620542e-06, + "loss": 0.7499, + "step": 8975 + }, + { + "epoch": 0.79, + "grad_norm": 8.33524372346871, + "learning_rate": 1.1296521071874994e-06, + "loss": 0.7797, + "step": 8976 + }, + { + "epoch": 0.79, + "grad_norm": 9.841520723769099, + "learning_rate": 1.128751800195057e-06, + "loss": 0.7483, + "step": 8977 + }, + { + "epoch": 0.79, + "grad_norm": 8.361982033374975, + "learning_rate": 1.1278518064575805e-06, + "loss": 0.6775, + "step": 8978 + }, + { + "epoch": 0.79, + "grad_norm": 8.836388872670856, + "learning_rate": 1.126952126047895e-06, + "loss": 0.7881, + "step": 8979 + }, + { + "epoch": 0.79, + "grad_norm": 5.60799718090706, + "learning_rate": 1.126052759038801e-06, + "loss": 0.6845, + "step": 8980 + }, + { + "epoch": 0.79, + "grad_norm": 11.236204331711983, + "learning_rate": 1.1251537055030742e-06, + "loss": 0.7313, + "step": 8981 + }, + { + "epoch": 0.79, + "grad_norm": 6.1833567164614776, + "learning_rate": 1.124254965513462e-06, + "loss": 0.7352, + "step": 8982 + }, + { + "epoch": 0.79, + "grad_norm": 13.63444088373223, + "learning_rate": 1.1233565391426926e-06, + "loss": 0.6426, + "step": 8983 + }, + { + "epoch": 0.79, + "grad_norm": 8.558444004076424, + "learning_rate": 
1.1224584264634636e-06, + "loss": 0.7663, + "step": 8984 + }, + { + "epoch": 0.79, + "grad_norm": 6.240299320580057, + "learning_rate": 1.1215606275484492e-06, + "loss": 0.6599, + "step": 8985 + }, + { + "epoch": 0.79, + "grad_norm": 3.1055724776156652, + "learning_rate": 1.1206631424702969e-06, + "loss": 0.5945, + "step": 8986 + }, + { + "epoch": 0.79, + "grad_norm": 5.6307920928521815, + "learning_rate": 1.11976597130163e-06, + "loss": 0.575, + "step": 8987 + }, + { + "epoch": 0.79, + "grad_norm": 6.75947200112567, + "learning_rate": 1.1188691141150455e-06, + "loss": 0.6835, + "step": 8988 + }, + { + "epoch": 0.79, + "grad_norm": 8.532983037883604, + "learning_rate": 1.1179725709831168e-06, + "loss": 0.7445, + "step": 8989 + }, + { + "epoch": 0.79, + "grad_norm": 7.7225589285950145, + "learning_rate": 1.117076341978391e-06, + "loss": 0.6268, + "step": 8990 + }, + { + "epoch": 0.79, + "grad_norm": 6.796211980979798, + "learning_rate": 1.1161804271733883e-06, + "loss": 0.6899, + "step": 8991 + }, + { + "epoch": 0.79, + "grad_norm": 6.373662043686975, + "learning_rate": 1.1152848266406052e-06, + "loss": 0.7877, + "step": 8992 + }, + { + "epoch": 0.79, + "grad_norm": 9.338618122796712, + "learning_rate": 1.1143895404525124e-06, + "loss": 0.65, + "step": 8993 + }, + { + "epoch": 0.79, + "grad_norm": 7.834097396588196, + "learning_rate": 1.113494568681553e-06, + "loss": 0.7536, + "step": 8994 + }, + { + "epoch": 0.79, + "grad_norm": 2.2334060845906185, + "learning_rate": 1.1125999114001495e-06, + "loss": 0.5463, + "step": 8995 + }, + { + "epoch": 0.79, + "grad_norm": 7.207403147224385, + "learning_rate": 1.111705568680695e-06, + "loss": 0.776, + "step": 8996 + }, + { + "epoch": 0.79, + "grad_norm": 2.4419009683043895, + "learning_rate": 1.1108115405955577e-06, + "loss": 0.4551, + "step": 8997 + }, + { + "epoch": 0.79, + "grad_norm": 11.748441443961566, + "learning_rate": 1.1099178272170813e-06, + "loss": 0.7392, + "step": 8998 + }, + { + "epoch": 0.79, + "grad_norm": 22.330432988518318, + "learning_rate": 1.1090244286175834e-06, + "loss": 0.7739, + "step": 8999 + }, + { + "epoch": 0.79, + "grad_norm": 10.061364823312184, + "learning_rate": 1.1081313448693555e-06, + "loss": 0.6123, + "step": 9000 + }, + { + "epoch": 0.79, + "grad_norm": 9.818310930157564, + "learning_rate": 1.1072385760446663e-06, + "loss": 0.6925, + "step": 9001 + }, + { + "epoch": 0.79, + "grad_norm": 8.665127007685236, + "learning_rate": 1.1063461222157561e-06, + "loss": 0.7355, + "step": 9002 + }, + { + "epoch": 0.79, + "grad_norm": 6.206767593116703, + "learning_rate": 1.1054539834548411e-06, + "loss": 0.7438, + "step": 9003 + }, + { + "epoch": 0.79, + "grad_norm": 7.593518658468566, + "learning_rate": 1.1045621598341121e-06, + "loss": 0.6365, + "step": 9004 + }, + { + "epoch": 0.79, + "grad_norm": 13.694834493893739, + "learning_rate": 1.1036706514257311e-06, + "loss": 0.7739, + "step": 9005 + }, + { + "epoch": 0.79, + "grad_norm": 14.240727752569521, + "learning_rate": 1.102779458301842e-06, + "loss": 0.9515, + "step": 9006 + }, + { + "epoch": 0.79, + "grad_norm": 2.609239136338328, + "learning_rate": 1.1018885805345558e-06, + "loss": 0.4855, + "step": 9007 + }, + { + "epoch": 0.79, + "grad_norm": 7.408454950068642, + "learning_rate": 1.1009980181959618e-06, + "loss": 0.7846, + "step": 9008 + }, + { + "epoch": 0.79, + "grad_norm": 5.413880578734823, + "learning_rate": 1.1001077713581225e-06, + "loss": 0.6437, + "step": 9009 + }, + { + "epoch": 0.79, + "grad_norm": 2.412565752440684, + "learning_rate": 
1.0992178400930753e-06, + "loss": 0.5162, + "step": 9010 + }, + { + "epoch": 0.79, + "grad_norm": 8.27694052684623, + "learning_rate": 1.0983282244728306e-06, + "loss": 0.7384, + "step": 9011 + }, + { + "epoch": 0.79, + "grad_norm": 8.930848748074082, + "learning_rate": 1.0974389245693766e-06, + "loss": 0.6598, + "step": 9012 + }, + { + "epoch": 0.79, + "grad_norm": 8.530129453294355, + "learning_rate": 1.0965499404546736e-06, + "loss": 0.7026, + "step": 9013 + }, + { + "epoch": 0.79, + "grad_norm": 5.290680514174833, + "learning_rate": 1.0956612722006565e-06, + "loss": 0.6444, + "step": 9014 + }, + { + "epoch": 0.79, + "grad_norm": 2.7271413675969947, + "learning_rate": 1.0947729198792346e-06, + "loss": 0.5465, + "step": 9015 + }, + { + "epoch": 0.79, + "grad_norm": 23.979568323060242, + "learning_rate": 1.0938848835622924e-06, + "loss": 0.6157, + "step": 9016 + }, + { + "epoch": 0.79, + "grad_norm": 6.251996834507902, + "learning_rate": 1.0929971633216863e-06, + "loss": 0.6915, + "step": 9017 + }, + { + "epoch": 0.79, + "grad_norm": 6.575543615042543, + "learning_rate": 1.0921097592292518e-06, + "loss": 0.6601, + "step": 9018 + }, + { + "epoch": 0.79, + "grad_norm": 13.289579632687508, + "learning_rate": 1.0912226713567953e-06, + "loss": 0.6895, + "step": 9019 + }, + { + "epoch": 0.79, + "grad_norm": 10.285123091544268, + "learning_rate": 1.0903358997760982e-06, + "loss": 0.6869, + "step": 9020 + }, + { + "epoch": 0.79, + "grad_norm": 12.472103868543392, + "learning_rate": 1.0894494445589171e-06, + "loss": 0.7463, + "step": 9021 + }, + { + "epoch": 0.79, + "grad_norm": 7.921188997630975, + "learning_rate": 1.0885633057769806e-06, + "loss": 0.6239, + "step": 9022 + }, + { + "epoch": 0.79, + "grad_norm": 3.02764934284269, + "learning_rate": 1.0876774835019965e-06, + "loss": 0.4274, + "step": 9023 + }, + { + "epoch": 0.79, + "grad_norm": 2.514484210581527, + "learning_rate": 1.0867919778056424e-06, + "loss": 0.5567, + "step": 9024 + }, + { + "epoch": 0.79, + "grad_norm": 8.724929699655084, + "learning_rate": 1.085906788759572e-06, + "loss": 0.6224, + "step": 9025 + }, + { + "epoch": 0.79, + "grad_norm": 16.474047954076543, + "learning_rate": 1.0850219164354136e-06, + "loss": 0.8301, + "step": 9026 + }, + { + "epoch": 0.79, + "grad_norm": 11.179235041440561, + "learning_rate": 1.08413736090477e-06, + "loss": 0.7278, + "step": 9027 + }, + { + "epoch": 0.79, + "grad_norm": 7.410494249623599, + "learning_rate": 1.0832531222392157e-06, + "loss": 0.7585, + "step": 9028 + }, + { + "epoch": 0.79, + "grad_norm": 11.117351640756734, + "learning_rate": 1.0823692005103048e-06, + "loss": 0.6636, + "step": 9029 + }, + { + "epoch": 0.79, + "grad_norm": 6.582018127051647, + "learning_rate": 1.0814855957895614e-06, + "loss": 0.6943, + "step": 9030 + }, + { + "epoch": 0.79, + "grad_norm": 7.120698596387841, + "learning_rate": 1.0806023081484861e-06, + "loss": 0.7579, + "step": 9031 + }, + { + "epoch": 0.79, + "grad_norm": 9.037262227266785, + "learning_rate": 1.0797193376585518e-06, + "loss": 0.6858, + "step": 9032 + }, + { + "epoch": 0.79, + "grad_norm": 8.679517212944871, + "learning_rate": 1.0788366843912086e-06, + "loss": 0.7866, + "step": 9033 + }, + { + "epoch": 0.79, + "grad_norm": 9.051051358260983, + "learning_rate": 1.0779543484178762e-06, + "loss": 0.8207, + "step": 9034 + }, + { + "epoch": 0.79, + "grad_norm": 5.624877318079726, + "learning_rate": 1.0770723298099555e-06, + "loss": 0.7467, + "step": 9035 + }, + { + "epoch": 0.79, + "grad_norm": 11.350523058979917, + "learning_rate": 
1.0761906286388162e-06, + "loss": 0.6277, + "step": 9036 + }, + { + "epoch": 0.79, + "grad_norm": 7.395557600183917, + "learning_rate": 1.0753092449758046e-06, + "loss": 0.6387, + "step": 9037 + }, + { + "epoch": 0.79, + "grad_norm": 8.81615484043506, + "learning_rate": 1.0744281788922401e-06, + "loss": 0.6439, + "step": 9038 + }, + { + "epoch": 0.79, + "grad_norm": 2.764124860113596, + "learning_rate": 1.0735474304594178e-06, + "loss": 0.5013, + "step": 9039 + }, + { + "epoch": 0.79, + "grad_norm": 12.246339491231687, + "learning_rate": 1.072666999748605e-06, + "loss": 0.706, + "step": 9040 + }, + { + "epoch": 0.79, + "grad_norm": 7.47590478297457, + "learning_rate": 1.0717868868310466e-06, + "loss": 0.721, + "step": 9041 + }, + { + "epoch": 0.79, + "grad_norm": 7.811964980450763, + "learning_rate": 1.0709070917779595e-06, + "loss": 0.8099, + "step": 9042 + }, + { + "epoch": 0.79, + "grad_norm": 11.947991788243373, + "learning_rate": 1.0700276146605349e-06, + "loss": 0.6986, + "step": 9043 + }, + { + "epoch": 0.79, + "grad_norm": 11.398062610580636, + "learning_rate": 1.0691484555499382e-06, + "loss": 0.7277, + "step": 9044 + }, + { + "epoch": 0.79, + "grad_norm": 7.430842590716504, + "learning_rate": 1.0682696145173087e-06, + "loss": 0.7245, + "step": 9045 + }, + { + "epoch": 0.79, + "grad_norm": 11.5686843411137, + "learning_rate": 1.0673910916337633e-06, + "loss": 0.667, + "step": 9046 + }, + { + "epoch": 0.79, + "grad_norm": 7.770746343150389, + "learning_rate": 1.066512886970389e-06, + "loss": 0.7302, + "step": 9047 + }, + { + "epoch": 0.79, + "grad_norm": 12.925113982269815, + "learning_rate": 1.0656350005982491e-06, + "loss": 0.7733, + "step": 9048 + }, + { + "epoch": 0.79, + "grad_norm": 3.071082010773156, + "learning_rate": 1.0647574325883803e-06, + "loss": 0.5135, + "step": 9049 + }, + { + "epoch": 0.79, + "grad_norm": 10.428263724342186, + "learning_rate": 1.063880183011794e-06, + "loss": 0.788, + "step": 9050 + }, + { + "epoch": 0.79, + "grad_norm": 2.9223303885906233, + "learning_rate": 1.0630032519394746e-06, + "loss": 0.4956, + "step": 9051 + }, + { + "epoch": 0.79, + "grad_norm": 7.473279560245256, + "learning_rate": 1.0621266394423845e-06, + "loss": 0.7188, + "step": 9052 + }, + { + "epoch": 0.8, + "grad_norm": 9.858528495437891, + "learning_rate": 1.061250345591457e-06, + "loss": 0.7857, + "step": 9053 + }, + { + "epoch": 0.8, + "grad_norm": 7.674459967864784, + "learning_rate": 1.0603743704575992e-06, + "loss": 0.6143, + "step": 9054 + }, + { + "epoch": 0.8, + "grad_norm": 12.47274872684439, + "learning_rate": 1.0594987141116942e-06, + "loss": 0.6942, + "step": 9055 + }, + { + "epoch": 0.8, + "grad_norm": 9.900467476632773, + "learning_rate": 1.0586233766245985e-06, + "loss": 0.796, + "step": 9056 + }, + { + "epoch": 0.8, + "grad_norm": 2.8659407757628994, + "learning_rate": 1.0577483580671417e-06, + "loss": 0.5468, + "step": 9057 + }, + { + "epoch": 0.8, + "grad_norm": 43.14774023021034, + "learning_rate": 1.0568736585101319e-06, + "loss": 0.8191, + "step": 9058 + }, + { + "epoch": 0.8, + "grad_norm": 7.287430109319763, + "learning_rate": 1.0559992780243461e-06, + "loss": 0.6695, + "step": 9059 + }, + { + "epoch": 0.8, + "grad_norm": 7.735146019476751, + "learning_rate": 1.0551252166805387e-06, + "loss": 0.7603, + "step": 9060 + }, + { + "epoch": 0.8, + "grad_norm": 7.503788492225641, + "learning_rate": 1.0542514745494363e-06, + "loss": 0.5478, + "step": 9061 + }, + { + "epoch": 0.8, + "grad_norm": 13.074529493926983, + "learning_rate": 1.05337805170174e-06, + "loss": 
0.7158, + "step": 9062 + }, + { + "epoch": 0.8, + "grad_norm": 7.22009290900595, + "learning_rate": 1.0525049482081285e-06, + "loss": 0.6165, + "step": 9063 + }, + { + "epoch": 0.8, + "grad_norm": 9.681872662917455, + "learning_rate": 1.0516321641392497e-06, + "loss": 0.6769, + "step": 9064 + }, + { + "epoch": 0.8, + "grad_norm": 7.0455495115056825, + "learning_rate": 1.0507596995657288e-06, + "loss": 0.7445, + "step": 9065 + }, + { + "epoch": 0.8, + "grad_norm": 6.176676819743537, + "learning_rate": 1.0498875545581639e-06, + "loss": 0.6582, + "step": 9066 + }, + { + "epoch": 0.8, + "grad_norm": 10.068632088561937, + "learning_rate": 1.0490157291871268e-06, + "loss": 0.796, + "step": 9067 + }, + { + "epoch": 0.8, + "grad_norm": 2.732190970462313, + "learning_rate": 1.0481442235231637e-06, + "loss": 0.5159, + "step": 9068 + }, + { + "epoch": 0.8, + "grad_norm": 23.115212917843962, + "learning_rate": 1.0472730376367983e-06, + "loss": 0.6814, + "step": 9069 + }, + { + "epoch": 0.8, + "grad_norm": 14.616650459166136, + "learning_rate": 1.0464021715985234e-06, + "loss": 0.8036, + "step": 9070 + }, + { + "epoch": 0.8, + "grad_norm": 7.4615543813950325, + "learning_rate": 1.0455316254788079e-06, + "loss": 0.7731, + "step": 9071 + }, + { + "epoch": 0.8, + "grad_norm": 7.236796513031183, + "learning_rate": 1.0446613993480958e-06, + "loss": 0.6485, + "step": 9072 + }, + { + "epoch": 0.8, + "grad_norm": 6.258702399382285, + "learning_rate": 1.043791493276804e-06, + "loss": 0.6108, + "step": 9073 + }, + { + "epoch": 0.8, + "grad_norm": 10.635481862740809, + "learning_rate": 1.0429219073353219e-06, + "loss": 0.5856, + "step": 9074 + }, + { + "epoch": 0.8, + "grad_norm": 7.414030488881851, + "learning_rate": 1.0420526415940185e-06, + "loss": 0.816, + "step": 9075 + }, + { + "epoch": 0.8, + "grad_norm": 3.0428377671034426, + "learning_rate": 1.0411836961232312e-06, + "loss": 0.5096, + "step": 9076 + }, + { + "epoch": 0.8, + "grad_norm": 7.665437750171081, + "learning_rate": 1.0403150709932747e-06, + "loss": 0.7963, + "step": 9077 + }, + { + "epoch": 0.8, + "grad_norm": 10.522424526269505, + "learning_rate": 1.0394467662744357e-06, + "loss": 0.8174, + "step": 9078 + }, + { + "epoch": 0.8, + "grad_norm": 7.6424717315520905, + "learning_rate": 1.0385787820369764e-06, + "loss": 0.7616, + "step": 9079 + }, + { + "epoch": 0.8, + "grad_norm": 8.734335788109684, + "learning_rate": 1.0377111183511313e-06, + "loss": 0.7275, + "step": 9080 + }, + { + "epoch": 0.8, + "grad_norm": 6.89831786267206, + "learning_rate": 1.0368437752871125e-06, + "loss": 0.6601, + "step": 9081 + }, + { + "epoch": 0.8, + "grad_norm": 6.860671413726487, + "learning_rate": 1.0359767529151038e-06, + "loss": 0.7471, + "step": 9082 + }, + { + "epoch": 0.8, + "grad_norm": 10.18499931277872, + "learning_rate": 1.035110051305262e-06, + "loss": 0.778, + "step": 9083 + }, + { + "epoch": 0.8, + "grad_norm": 27.491118087677307, + "learning_rate": 1.0342436705277193e-06, + "loss": 0.5971, + "step": 9084 + }, + { + "epoch": 0.8, + "grad_norm": 9.16568256643771, + "learning_rate": 1.033377610652581e-06, + "loss": 0.7365, + "step": 9085 + }, + { + "epoch": 0.8, + "grad_norm": 3.4027197359150607, + "learning_rate": 1.03251187174993e-06, + "loss": 0.4222, + "step": 9086 + }, + { + "epoch": 0.8, + "grad_norm": 6.354334747469086, + "learning_rate": 1.031646453889818e-06, + "loss": 0.8131, + "step": 9087 + }, + { + "epoch": 0.8, + "grad_norm": 10.14563777259893, + "learning_rate": 1.0307813571422748e-06, + "loss": 0.778, + "step": 9088 + }, + { + "epoch": 
0.8, + "grad_norm": 8.925240351031993, + "learning_rate": 1.029916581577301e-06, + "loss": 0.6308, + "step": 9089 + }, + { + "epoch": 0.8, + "grad_norm": 7.528601325574248, + "learning_rate": 1.029052127264874e-06, + "loss": 0.6357, + "step": 9090 + }, + { + "epoch": 0.8, + "grad_norm": 10.476999973342902, + "learning_rate": 1.0281879942749418e-06, + "loss": 0.6855, + "step": 9091 + }, + { + "epoch": 0.8, + "grad_norm": 3.7187471744071607, + "learning_rate": 1.0273241826774321e-06, + "loss": 0.6487, + "step": 9092 + }, + { + "epoch": 0.8, + "grad_norm": 12.427500638108157, + "learning_rate": 1.026460692542241e-06, + "loss": 0.796, + "step": 9093 + }, + { + "epoch": 0.8, + "grad_norm": 3.280164360431218, + "learning_rate": 1.0255975239392414e-06, + "loss": 0.54, + "step": 9094 + }, + { + "epoch": 0.8, + "grad_norm": 5.9011371782635385, + "learning_rate": 1.0247346769382782e-06, + "loss": 0.7128, + "step": 9095 + }, + { + "epoch": 0.8, + "grad_norm": 6.129177366712064, + "learning_rate": 1.0238721516091731e-06, + "loss": 0.8009, + "step": 9096 + }, + { + "epoch": 0.8, + "grad_norm": 6.865824549638402, + "learning_rate": 1.0230099480217182e-06, + "loss": 0.7363, + "step": 9097 + }, + { + "epoch": 0.8, + "grad_norm": 10.132235993618679, + "learning_rate": 1.0221480662456845e-06, + "loss": 0.8014, + "step": 9098 + }, + { + "epoch": 0.8, + "grad_norm": 12.760391236164795, + "learning_rate": 1.021286506350812e-06, + "loss": 0.7188, + "step": 9099 + }, + { + "epoch": 0.8, + "grad_norm": 14.79344970243782, + "learning_rate": 1.0204252684068172e-06, + "loss": 0.7728, + "step": 9100 + }, + { + "epoch": 0.8, + "grad_norm": 6.805965389449348, + "learning_rate": 1.0195643524833903e-06, + "loss": 0.7426, + "step": 9101 + }, + { + "epoch": 0.8, + "grad_norm": 7.44771651078443, + "learning_rate": 1.0187037586501935e-06, + "loss": 0.6538, + "step": 9102 + }, + { + "epoch": 0.8, + "grad_norm": 7.4519634424398875, + "learning_rate": 1.0178434869768672e-06, + "loss": 0.8494, + "step": 9103 + }, + { + "epoch": 0.8, + "grad_norm": 6.341622602270738, + "learning_rate": 1.0169835375330218e-06, + "loss": 0.6771, + "step": 9104 + }, + { + "epoch": 0.8, + "grad_norm": 8.620609663701945, + "learning_rate": 1.016123910388243e-06, + "loss": 0.7743, + "step": 9105 + }, + { + "epoch": 0.8, + "grad_norm": 16.553403632826406, + "learning_rate": 1.0152646056120913e-06, + "loss": 0.7975, + "step": 9106 + }, + { + "epoch": 0.8, + "grad_norm": 7.334622613570645, + "learning_rate": 1.014405623274099e-06, + "loss": 0.7359, + "step": 9107 + }, + { + "epoch": 0.8, + "grad_norm": 18.715529275081593, + "learning_rate": 1.0135469634437723e-06, + "loss": 0.7349, + "step": 9108 + }, + { + "epoch": 0.8, + "grad_norm": 3.1157059592065077, + "learning_rate": 1.012688626190596e-06, + "loss": 0.5167, + "step": 9109 + }, + { + "epoch": 0.8, + "grad_norm": 9.885631775038458, + "learning_rate": 1.0118306115840233e-06, + "loss": 0.6534, + "step": 9110 + }, + { + "epoch": 0.8, + "grad_norm": 3.0937613722181334, + "learning_rate": 1.0109729196934837e-06, + "loss": 0.542, + "step": 9111 + }, + { + "epoch": 0.8, + "grad_norm": 8.11497497306201, + "learning_rate": 1.0101155505883797e-06, + "loss": 0.7268, + "step": 9112 + }, + { + "epoch": 0.8, + "grad_norm": 6.300467877310917, + "learning_rate": 1.0092585043380886e-06, + "loss": 0.7031, + "step": 9113 + }, + { + "epoch": 0.8, + "grad_norm": 7.241526309635458, + "learning_rate": 1.00840178101196e-06, + "loss": 0.7856, + "step": 9114 + }, + { + "epoch": 0.8, + "grad_norm": 9.596869422745545, + 
"learning_rate": 1.0075453806793206e-06, + "loss": 0.7974, + "step": 9115 + }, + { + "epoch": 0.8, + "grad_norm": 9.427190295661566, + "learning_rate": 1.0066893034094683e-06, + "loss": 0.6435, + "step": 9116 + }, + { + "epoch": 0.8, + "grad_norm": 12.553269032503021, + "learning_rate": 1.0058335492716748e-06, + "loss": 0.7181, + "step": 9117 + }, + { + "epoch": 0.8, + "grad_norm": 5.409195189681279, + "learning_rate": 1.0049781183351863e-06, + "loss": 0.7469, + "step": 9118 + }, + { + "epoch": 0.8, + "grad_norm": 13.129715815841564, + "learning_rate": 1.0041230106692235e-06, + "loss": 0.7296, + "step": 9119 + }, + { + "epoch": 0.8, + "grad_norm": 12.860757332559496, + "learning_rate": 1.0032682263429788e-06, + "loss": 0.7684, + "step": 9120 + }, + { + "epoch": 0.8, + "grad_norm": 10.284899326967398, + "learning_rate": 1.0024137654256215e-06, + "loss": 0.8271, + "step": 9121 + }, + { + "epoch": 0.8, + "grad_norm": 2.0157023364542765, + "learning_rate": 1.0015596279862933e-06, + "loss": 0.4677, + "step": 9122 + }, + { + "epoch": 0.8, + "grad_norm": 10.853624550045152, + "learning_rate": 1.0007058140941089e-06, + "loss": 0.7318, + "step": 9123 + }, + { + "epoch": 0.8, + "grad_norm": 8.39353127612747, + "learning_rate": 9.998523238181573e-07, + "loss": 0.7677, + "step": 9124 + }, + { + "epoch": 0.8, + "grad_norm": 9.075978148204632, + "learning_rate": 9.989991572275004e-07, + "loss": 0.7343, + "step": 9125 + }, + { + "epoch": 0.8, + "grad_norm": 8.973406288625885, + "learning_rate": 9.98146314391178e-07, + "loss": 0.7981, + "step": 9126 + }, + { + "epoch": 0.8, + "grad_norm": 15.403541114749547, + "learning_rate": 9.972937953781985e-07, + "loss": 0.8045, + "step": 9127 + }, + { + "epoch": 0.8, + "grad_norm": 7.173366429959118, + "learning_rate": 9.964416002575472e-07, + "loss": 0.7797, + "step": 9128 + }, + { + "epoch": 0.8, + "grad_norm": 9.90647541186054, + "learning_rate": 9.955897290981825e-07, + "loss": 0.8708, + "step": 9129 + }, + { + "epoch": 0.8, + "grad_norm": 10.490246093902956, + "learning_rate": 9.947381819690354e-07, + "loss": 0.6343, + "step": 9130 + }, + { + "epoch": 0.8, + "grad_norm": 10.16330447415262, + "learning_rate": 9.93886958939011e-07, + "loss": 0.6675, + "step": 9131 + }, + { + "epoch": 0.8, + "grad_norm": 3.9397446123372406, + "learning_rate": 9.93036060076991e-07, + "loss": 0.5261, + "step": 9132 + }, + { + "epoch": 0.8, + "grad_norm": 6.709353680927766, + "learning_rate": 9.921854854518287e-07, + "loss": 0.8551, + "step": 9133 + }, + { + "epoch": 0.8, + "grad_norm": 8.41128646099276, + "learning_rate": 9.913352351323497e-07, + "loss": 0.6614, + "step": 9134 + }, + { + "epoch": 0.8, + "grad_norm": 5.124086852368999, + "learning_rate": 9.904853091873557e-07, + "loss": 0.5631, + "step": 9135 + }, + { + "epoch": 0.8, + "grad_norm": 8.978301255945286, + "learning_rate": 9.896357076856205e-07, + "loss": 0.6061, + "step": 9136 + }, + { + "epoch": 0.8, + "grad_norm": 6.569307184437193, + "learning_rate": 9.887864306958916e-07, + "loss": 0.6784, + "step": 9137 + }, + { + "epoch": 0.8, + "grad_norm": 9.46183546312742, + "learning_rate": 9.879374782868944e-07, + "loss": 0.721, + "step": 9138 + }, + { + "epoch": 0.8, + "grad_norm": 10.022749446844777, + "learning_rate": 9.870888505273223e-07, + "loss": 0.8341, + "step": 9139 + }, + { + "epoch": 0.8, + "grad_norm": 9.31134485003876, + "learning_rate": 9.86240547485845e-07, + "loss": 0.814, + "step": 9140 + }, + { + "epoch": 0.8, + "grad_norm": 10.04283039983028, + "learning_rate": 9.853925692311068e-07, + "loss": 0.7806, + 
"step": 9141 + }, + { + "epoch": 0.8, + "grad_norm": 7.544808712398108, + "learning_rate": 9.845449158317216e-07, + "loss": 0.7147, + "step": 9142 + }, + { + "epoch": 0.8, + "grad_norm": 7.715635483758614, + "learning_rate": 9.836975873562848e-07, + "loss": 0.6288, + "step": 9143 + }, + { + "epoch": 0.8, + "grad_norm": 8.847700431248766, + "learning_rate": 9.828505838733576e-07, + "loss": 0.8664, + "step": 9144 + }, + { + "epoch": 0.8, + "grad_norm": 10.149258238132514, + "learning_rate": 9.820039054514795e-07, + "loss": 0.725, + "step": 9145 + }, + { + "epoch": 0.8, + "grad_norm": 11.030247157501112, + "learning_rate": 9.811575521591615e-07, + "loss": 0.8077, + "step": 9146 + }, + { + "epoch": 0.8, + "grad_norm": 7.031743563817695, + "learning_rate": 9.803115240648898e-07, + "loss": 0.6571, + "step": 9147 + }, + { + "epoch": 0.8, + "grad_norm": 8.697591073431594, + "learning_rate": 9.794658212371217e-07, + "loss": 0.5926, + "step": 9148 + }, + { + "epoch": 0.8, + "grad_norm": 5.796179043757467, + "learning_rate": 9.786204437442925e-07, + "loss": 0.6541, + "step": 9149 + }, + { + "epoch": 0.8, + "grad_norm": 6.972235239030435, + "learning_rate": 9.777753916548083e-07, + "loss": 0.6869, + "step": 9150 + }, + { + "epoch": 0.8, + "grad_norm": 7.228589233223445, + "learning_rate": 9.76930665037048e-07, + "loss": 0.6635, + "step": 9151 + }, + { + "epoch": 0.8, + "grad_norm": 57.0824244814583, + "learning_rate": 9.760862639593672e-07, + "loss": 0.719, + "step": 9152 + }, + { + "epoch": 0.8, + "grad_norm": 7.010877791479401, + "learning_rate": 9.752421884900915e-07, + "loss": 0.8166, + "step": 9153 + }, + { + "epoch": 0.8, + "grad_norm": 6.408652464403094, + "learning_rate": 9.743984386975225e-07, + "loss": 0.5722, + "step": 9154 + }, + { + "epoch": 0.8, + "grad_norm": 6.439476136167754, + "learning_rate": 9.735550146499368e-07, + "loss": 0.8027, + "step": 9155 + }, + { + "epoch": 0.8, + "grad_norm": 2.133026748909734, + "learning_rate": 9.727119164155813e-07, + "loss": 0.3953, + "step": 9156 + }, + { + "epoch": 0.8, + "grad_norm": 6.126321447892382, + "learning_rate": 9.718691440626788e-07, + "loss": 0.8934, + "step": 9157 + }, + { + "epoch": 0.8, + "grad_norm": 5.212310819060483, + "learning_rate": 9.710266976594245e-07, + "loss": 0.7749, + "step": 9158 + }, + { + "epoch": 0.8, + "grad_norm": 9.806675022559117, + "learning_rate": 9.701845772739882e-07, + "loss": 0.7281, + "step": 9159 + }, + { + "epoch": 0.8, + "grad_norm": 7.3465156394856725, + "learning_rate": 9.693427829745111e-07, + "loss": 0.7653, + "step": 9160 + }, + { + "epoch": 0.8, + "grad_norm": 7.0519519853833375, + "learning_rate": 9.685013148291127e-07, + "loss": 0.8044, + "step": 9161 + }, + { + "epoch": 0.8, + "grad_norm": 9.675934394912074, + "learning_rate": 9.67660172905882e-07, + "loss": 0.6883, + "step": 9162 + }, + { + "epoch": 0.8, + "grad_norm": 9.730255780637618, + "learning_rate": 9.66819357272883e-07, + "loss": 0.6479, + "step": 9163 + }, + { + "epoch": 0.8, + "grad_norm": 8.608917064418815, + "learning_rate": 9.65978867998152e-07, + "loss": 0.6219, + "step": 9164 + }, + { + "epoch": 0.8, + "grad_norm": 7.413971636343903, + "learning_rate": 9.651387051496997e-07, + "loss": 0.7616, + "step": 9165 + }, + { + "epoch": 0.8, + "grad_norm": 11.27053003870301, + "learning_rate": 9.642988687955136e-07, + "loss": 0.614, + "step": 9166 + }, + { + "epoch": 0.81, + "grad_norm": 9.361591378800194, + "learning_rate": 9.63459359003549e-07, + "loss": 0.6145, + "step": 9167 + }, + { + "epoch": 0.81, + "grad_norm": 8.789025828709784, + 
"learning_rate": 9.626201758417391e-07, + "loss": 0.7702, + "step": 9168 + }, + { + "epoch": 0.81, + "grad_norm": 9.004501981938727, + "learning_rate": 9.61781319377989e-07, + "loss": 0.7368, + "step": 9169 + }, + { + "epoch": 0.81, + "grad_norm": 10.67662533071018, + "learning_rate": 9.60942789680177e-07, + "loss": 0.9406, + "step": 9170 + }, + { + "epoch": 0.81, + "grad_norm": 6.407092090522071, + "learning_rate": 9.601045868161546e-07, + "loss": 0.6789, + "step": 9171 + }, + { + "epoch": 0.81, + "grad_norm": 10.04487438542271, + "learning_rate": 9.592667108537506e-07, + "loss": 0.6953, + "step": 9172 + }, + { + "epoch": 0.81, + "grad_norm": 8.455787212537734, + "learning_rate": 9.584291618607627e-07, + "loss": 0.74, + "step": 9173 + }, + { + "epoch": 0.81, + "grad_norm": 10.21073285683193, + "learning_rate": 9.575919399049639e-07, + "loss": 0.7724, + "step": 9174 + }, + { + "epoch": 0.81, + "grad_norm": 7.14352510285547, + "learning_rate": 9.567550450541012e-07, + "loss": 0.7808, + "step": 9175 + }, + { + "epoch": 0.81, + "grad_norm": 10.757499597821012, + "learning_rate": 9.55918477375895e-07, + "loss": 0.6628, + "step": 9176 + }, + { + "epoch": 0.81, + "grad_norm": 6.755578181586543, + "learning_rate": 9.550822369380374e-07, + "loss": 0.6141, + "step": 9177 + }, + { + "epoch": 0.81, + "grad_norm": 7.536593903791943, + "learning_rate": 9.542463238081979e-07, + "loss": 0.9105, + "step": 9178 + }, + { + "epoch": 0.81, + "grad_norm": 5.051801501709059, + "learning_rate": 9.534107380540159e-07, + "loss": 0.7276, + "step": 9179 + }, + { + "epoch": 0.81, + "grad_norm": 1.9517480698771705, + "learning_rate": 9.525754797431063e-07, + "loss": 0.4368, + "step": 9180 + }, + { + "epoch": 0.81, + "grad_norm": 6.876010854795496, + "learning_rate": 9.517405489430559e-07, + "loss": 0.7568, + "step": 9181 + }, + { + "epoch": 0.81, + "grad_norm": 6.256301213858471, + "learning_rate": 9.509059457214253e-07, + "loss": 0.7303, + "step": 9182 + }, + { + "epoch": 0.81, + "grad_norm": 8.366880533284233, + "learning_rate": 9.500716701457513e-07, + "loss": 0.5829, + "step": 9183 + }, + { + "epoch": 0.81, + "grad_norm": 5.741783692026903, + "learning_rate": 9.492377222835414e-07, + "loss": 0.6762, + "step": 9184 + }, + { + "epoch": 0.81, + "grad_norm": 5.765855178581994, + "learning_rate": 9.484041022022766e-07, + "loss": 0.7005, + "step": 9185 + }, + { + "epoch": 0.81, + "grad_norm": 9.419428166658662, + "learning_rate": 9.475708099694125e-07, + "loss": 0.7074, + "step": 9186 + }, + { + "epoch": 0.81, + "grad_norm": 3.147385170172356, + "learning_rate": 9.467378456523779e-07, + "loss": 0.5256, + "step": 9187 + }, + { + "epoch": 0.81, + "grad_norm": 8.100547551112578, + "learning_rate": 9.459052093185728e-07, + "loss": 0.6602, + "step": 9188 + }, + { + "epoch": 0.81, + "grad_norm": 9.432634098329748, + "learning_rate": 9.450729010353759e-07, + "loss": 0.6663, + "step": 9189 + }, + { + "epoch": 0.81, + "grad_norm": 7.055208575196027, + "learning_rate": 9.442409208701347e-07, + "loss": 0.6478, + "step": 9190 + }, + { + "epoch": 0.81, + "grad_norm": 2.2813166501277498, + "learning_rate": 9.434092688901724e-07, + "loss": 0.4676, + "step": 9191 + }, + { + "epoch": 0.81, + "grad_norm": 9.058835860283908, + "learning_rate": 9.425779451627837e-07, + "loss": 0.6452, + "step": 9192 + }, + { + "epoch": 0.81, + "grad_norm": 3.0765477794061233, + "learning_rate": 9.417469497552384e-07, + "loss": 0.5562, + "step": 9193 + }, + { + "epoch": 0.81, + "grad_norm": 8.831486766703488, + "learning_rate": 9.409162827347784e-07, + 
"loss": 0.693, + "step": 9194 + }, + { + "epoch": 0.81, + "grad_norm": 3.9227744774577866, + "learning_rate": 9.400859441686222e-07, + "loss": 0.4907, + "step": 9195 + }, + { + "epoch": 0.81, + "grad_norm": 18.450113918066265, + "learning_rate": 9.392559341239582e-07, + "loss": 0.7325, + "step": 9196 + }, + { + "epoch": 0.81, + "grad_norm": 9.136310358366678, + "learning_rate": 9.384262526679488e-07, + "loss": 0.8124, + "step": 9197 + }, + { + "epoch": 0.81, + "grad_norm": 5.982202662400923, + "learning_rate": 9.375968998677309e-07, + "loss": 0.5987, + "step": 9198 + }, + { + "epoch": 0.81, + "grad_norm": 8.200682618439494, + "learning_rate": 9.367678757904136e-07, + "loss": 0.6305, + "step": 9199 + }, + { + "epoch": 0.81, + "grad_norm": 9.067993349223283, + "learning_rate": 9.35939180503082e-07, + "loss": 0.7299, + "step": 9200 + }, + { + "epoch": 0.81, + "grad_norm": 7.376432390776689, + "learning_rate": 9.351108140727921e-07, + "loss": 0.7191, + "step": 9201 + }, + { + "epoch": 0.81, + "grad_norm": 8.630784800653208, + "learning_rate": 9.342827765665729e-07, + "loss": 0.7146, + "step": 9202 + }, + { + "epoch": 0.81, + "grad_norm": 10.843734972431202, + "learning_rate": 9.334550680514287e-07, + "loss": 0.6767, + "step": 9203 + }, + { + "epoch": 0.81, + "grad_norm": 6.412046317337681, + "learning_rate": 9.326276885943364e-07, + "loss": 0.6365, + "step": 9204 + }, + { + "epoch": 0.81, + "grad_norm": 9.645456903555596, + "learning_rate": 9.31800638262244e-07, + "loss": 0.7218, + "step": 9205 + }, + { + "epoch": 0.81, + "grad_norm": 2.631618651928423, + "learning_rate": 9.309739171220783e-07, + "loss": 0.5105, + "step": 9206 + }, + { + "epoch": 0.81, + "grad_norm": 9.627814937709983, + "learning_rate": 9.301475252407349e-07, + "loss": 0.8887, + "step": 9207 + }, + { + "epoch": 0.81, + "grad_norm": 14.064480724976232, + "learning_rate": 9.293214626850838e-07, + "loss": 0.6773, + "step": 9208 + }, + { + "epoch": 0.81, + "grad_norm": 6.792407957780757, + "learning_rate": 9.284957295219688e-07, + "loss": 0.7372, + "step": 9209 + }, + { + "epoch": 0.81, + "grad_norm": 10.903093667653398, + "learning_rate": 9.276703258182068e-07, + "loss": 0.6314, + "step": 9210 + }, + { + "epoch": 0.81, + "grad_norm": 6.112965990092264, + "learning_rate": 9.268452516405862e-07, + "loss": 0.6961, + "step": 9211 + }, + { + "epoch": 0.81, + "grad_norm": 2.9489826757628554, + "learning_rate": 9.260205070558742e-07, + "loss": 0.5135, + "step": 9212 + }, + { + "epoch": 0.81, + "grad_norm": 7.764720504233326, + "learning_rate": 9.251960921308062e-07, + "loss": 0.6939, + "step": 9213 + }, + { + "epoch": 0.81, + "grad_norm": 8.027065013071878, + "learning_rate": 9.243720069320922e-07, + "loss": 0.6433, + "step": 9214 + }, + { + "epoch": 0.81, + "grad_norm": 7.279991582047705, + "learning_rate": 9.23548251526416e-07, + "loss": 0.6311, + "step": 9215 + }, + { + "epoch": 0.81, + "grad_norm": 5.493626857503991, + "learning_rate": 9.227248259804344e-07, + "loss": 0.6111, + "step": 9216 + }, + { + "epoch": 0.81, + "grad_norm": 5.322399074164602, + "learning_rate": 9.219017303607764e-07, + "loss": 0.7604, + "step": 9217 + }, + { + "epoch": 0.81, + "grad_norm": 7.353136470518077, + "learning_rate": 9.21078964734048e-07, + "loss": 0.7788, + "step": 9218 + }, + { + "epoch": 0.81, + "grad_norm": 63.978229126292035, + "learning_rate": 9.202565291668253e-07, + "loss": 0.6317, + "step": 9219 + }, + { + "epoch": 0.81, + "grad_norm": 8.781679019052822, + "learning_rate": 9.194344237256575e-07, + "loss": 0.7203, + "step": 9220 + }, + { + 
"epoch": 0.81, + "grad_norm": 8.725377048449685, + "learning_rate": 9.186126484770686e-07, + "loss": 0.6878, + "step": 9221 + }, + { + "epoch": 0.81, + "grad_norm": 6.45772502472633, + "learning_rate": 9.177912034875535e-07, + "loss": 0.7986, + "step": 9222 + }, + { + "epoch": 0.81, + "grad_norm": 9.668618334847581, + "learning_rate": 9.169700888235855e-07, + "loss": 0.8263, + "step": 9223 + }, + { + "epoch": 0.81, + "grad_norm": 9.362279936047926, + "learning_rate": 9.161493045516062e-07, + "loss": 0.7061, + "step": 9224 + }, + { + "epoch": 0.81, + "grad_norm": 6.906272394697493, + "learning_rate": 9.15328850738032e-07, + "loss": 0.6323, + "step": 9225 + }, + { + "epoch": 0.81, + "grad_norm": 5.510877616087022, + "learning_rate": 9.145087274492526e-07, + "loss": 0.8144, + "step": 9226 + }, + { + "epoch": 0.81, + "grad_norm": 7.051944654958101, + "learning_rate": 9.136889347516309e-07, + "loss": 0.6651, + "step": 9227 + }, + { + "epoch": 0.81, + "grad_norm": 3.9893391277514407, + "learning_rate": 9.128694727115023e-07, + "loss": 0.6276, + "step": 9228 + }, + { + "epoch": 0.81, + "grad_norm": 9.32559320619942, + "learning_rate": 9.120503413951787e-07, + "loss": 0.7818, + "step": 9229 + }, + { + "epoch": 0.81, + "grad_norm": 7.82332646348571, + "learning_rate": 9.112315408689415e-07, + "loss": 0.709, + "step": 9230 + }, + { + "epoch": 0.81, + "grad_norm": 17.245510424322276, + "learning_rate": 9.104130711990467e-07, + "loss": 0.8027, + "step": 9231 + }, + { + "epoch": 0.81, + "grad_norm": 9.820867565414948, + "learning_rate": 9.095949324517234e-07, + "loss": 0.942, + "step": 9232 + }, + { + "epoch": 0.81, + "grad_norm": 6.608257609148904, + "learning_rate": 9.087771246931743e-07, + "loss": 0.8281, + "step": 9233 + }, + { + "epoch": 0.81, + "grad_norm": 5.77548950463403, + "learning_rate": 9.07959647989573e-07, + "loss": 0.6599, + "step": 9234 + }, + { + "epoch": 0.81, + "grad_norm": 14.518580216663638, + "learning_rate": 9.071425024070724e-07, + "loss": 0.6748, + "step": 9235 + }, + { + "epoch": 0.81, + "grad_norm": 10.041583403653801, + "learning_rate": 9.063256880117916e-07, + "loss": 0.601, + "step": 9236 + }, + { + "epoch": 0.81, + "grad_norm": 8.135412053608107, + "learning_rate": 9.055092048698272e-07, + "loss": 0.5529, + "step": 9237 + }, + { + "epoch": 0.81, + "grad_norm": 6.185807895556572, + "learning_rate": 9.04693053047247e-07, + "loss": 0.7925, + "step": 9238 + }, + { + "epoch": 0.81, + "grad_norm": 9.543492588032095, + "learning_rate": 9.038772326100914e-07, + "loss": 0.7338, + "step": 9239 + }, + { + "epoch": 0.81, + "grad_norm": 5.8084792678597, + "learning_rate": 9.030617436243777e-07, + "loss": 0.7233, + "step": 9240 + }, + { + "epoch": 0.81, + "grad_norm": 42.902664802336375, + "learning_rate": 9.022465861560931e-07, + "loss": 0.6921, + "step": 9241 + }, + { + "epoch": 0.81, + "grad_norm": 13.00463326594889, + "learning_rate": 9.01431760271198e-07, + "loss": 0.685, + "step": 9242 + }, + { + "epoch": 0.81, + "grad_norm": 7.873876118284987, + "learning_rate": 9.006172660356278e-07, + "loss": 0.6035, + "step": 9243 + }, + { + "epoch": 0.81, + "grad_norm": 12.26203605910899, + "learning_rate": 8.998031035152893e-07, + "loss": 0.5977, + "step": 9244 + }, + { + "epoch": 0.81, + "grad_norm": 10.18883486999286, + "learning_rate": 8.989892727760619e-07, + "loss": 0.7659, + "step": 9245 + }, + { + "epoch": 0.81, + "grad_norm": 29.915774612949985, + "learning_rate": 8.98175773883802e-07, + "loss": 0.6162, + "step": 9246 + }, + { + "epoch": 0.81, + "grad_norm": 2.7573820076884155, + 
"learning_rate": 8.973626069043356e-07, + "loss": 0.4172, + "step": 9247 + }, + { + "epoch": 0.81, + "grad_norm": 12.795756138604867, + "learning_rate": 8.965497719034627e-07, + "loss": 0.8444, + "step": 9248 + }, + { + "epoch": 0.81, + "grad_norm": 12.368950556231617, + "learning_rate": 8.957372689469568e-07, + "loss": 0.7094, + "step": 9249 + }, + { + "epoch": 0.81, + "grad_norm": 6.94989266942595, + "learning_rate": 8.949250981005637e-07, + "loss": 0.4798, + "step": 9250 + }, + { + "epoch": 0.81, + "grad_norm": 11.257404502511541, + "learning_rate": 8.941132594300023e-07, + "loss": 0.8098, + "step": 9251 + }, + { + "epoch": 0.81, + "grad_norm": 2.1191061205805997, + "learning_rate": 8.933017530009669e-07, + "loss": 0.4665, + "step": 9252 + }, + { + "epoch": 0.81, + "grad_norm": 8.61673295800796, + "learning_rate": 8.924905788791227e-07, + "loss": 0.6644, + "step": 9253 + }, + { + "epoch": 0.81, + "grad_norm": 7.648197362537582, + "learning_rate": 8.916797371301083e-07, + "loss": 0.87, + "step": 9254 + }, + { + "epoch": 0.81, + "grad_norm": 7.457146648812867, + "learning_rate": 8.908692278195358e-07, + "loss": 0.8362, + "step": 9255 + }, + { + "epoch": 0.81, + "grad_norm": 5.788296051719646, + "learning_rate": 8.900590510129903e-07, + "loss": 0.6923, + "step": 9256 + }, + { + "epoch": 0.81, + "grad_norm": 7.959399014811093, + "learning_rate": 8.892492067760284e-07, + "loss": 0.8296, + "step": 9257 + }, + { + "epoch": 0.81, + "grad_norm": 2.4169896014942664, + "learning_rate": 8.884396951741841e-07, + "loss": 0.4921, + "step": 9258 + }, + { + "epoch": 0.81, + "grad_norm": 8.088791981142418, + "learning_rate": 8.87630516272961e-07, + "loss": 0.7854, + "step": 9259 + }, + { + "epoch": 0.81, + "grad_norm": 5.9333375420587195, + "learning_rate": 8.868216701378352e-07, + "loss": 0.6465, + "step": 9260 + }, + { + "epoch": 0.81, + "grad_norm": 23.957386665471343, + "learning_rate": 8.860131568342584e-07, + "loss": 0.5995, + "step": 9261 + }, + { + "epoch": 0.81, + "grad_norm": 8.757023622529534, + "learning_rate": 8.852049764276522e-07, + "loss": 0.8161, + "step": 9262 + }, + { + "epoch": 0.81, + "grad_norm": 2.475389745282986, + "learning_rate": 8.843971289834157e-07, + "loss": 0.5019, + "step": 9263 + }, + { + "epoch": 0.81, + "grad_norm": 5.44328627504726, + "learning_rate": 8.835896145669182e-07, + "loss": 0.7078, + "step": 9264 + }, + { + "epoch": 0.81, + "grad_norm": 8.485057508832215, + "learning_rate": 8.827824332435014e-07, + "loss": 0.779, + "step": 9265 + }, + { + "epoch": 0.81, + "grad_norm": 14.098469494197388, + "learning_rate": 8.819755850784811e-07, + "loss": 0.6114, + "step": 9266 + }, + { + "epoch": 0.81, + "grad_norm": 7.976269169319653, + "learning_rate": 8.811690701371473e-07, + "loss": 0.7669, + "step": 9267 + }, + { + "epoch": 0.81, + "grad_norm": 10.557853644055609, + "learning_rate": 8.803628884847593e-07, + "loss": 0.6614, + "step": 9268 + }, + { + "epoch": 0.81, + "grad_norm": 6.416972709394762, + "learning_rate": 8.79557040186555e-07, + "loss": 0.6321, + "step": 9269 + }, + { + "epoch": 0.81, + "grad_norm": 6.789426896418713, + "learning_rate": 8.787515253077411e-07, + "loss": 0.7863, + "step": 9270 + }, + { + "epoch": 0.81, + "grad_norm": 6.196083723481281, + "learning_rate": 8.779463439134983e-07, + "loss": 0.8861, + "step": 9271 + }, + { + "epoch": 0.81, + "grad_norm": 5.491875219100366, + "learning_rate": 8.771414960689806e-07, + "loss": 0.5694, + "step": 9272 + }, + { + "epoch": 0.81, + "grad_norm": 5.890908136322314, + "learning_rate": 8.763369818393153e-07, + 
"loss": 0.6718, + "step": 9273 + }, + { + "epoch": 0.81, + "grad_norm": 5.875628435466241, + "learning_rate": 8.755328012896002e-07, + "loss": 0.7405, + "step": 9274 + }, + { + "epoch": 0.81, + "grad_norm": 6.216489007084825, + "learning_rate": 8.747289544849119e-07, + "loss": 0.7189, + "step": 9275 + }, + { + "epoch": 0.81, + "grad_norm": 9.163644652465688, + "learning_rate": 8.739254414902942e-07, + "loss": 0.7445, + "step": 9276 + }, + { + "epoch": 0.81, + "grad_norm": 7.575911478170872, + "learning_rate": 8.731222623707669e-07, + "loss": 0.7411, + "step": 9277 + }, + { + "epoch": 0.81, + "grad_norm": 8.906200845147456, + "learning_rate": 8.723194171913213e-07, + "loss": 0.6381, + "step": 9278 + }, + { + "epoch": 0.81, + "grad_norm": 7.13503404603286, + "learning_rate": 8.715169060169209e-07, + "loss": 0.8378, + "step": 9279 + }, + { + "epoch": 0.81, + "grad_norm": 5.7572287365080514, + "learning_rate": 8.707147289125067e-07, + "loss": 0.6576, + "step": 9280 + }, + { + "epoch": 0.82, + "grad_norm": 6.456078818569949, + "learning_rate": 8.699128859429879e-07, + "loss": 0.6858, + "step": 9281 + }, + { + "epoch": 0.82, + "grad_norm": 8.694320890297362, + "learning_rate": 8.691113771732479e-07, + "loss": 0.7446, + "step": 9282 + }, + { + "epoch": 0.82, + "grad_norm": 7.309003413247973, + "learning_rate": 8.683102026681439e-07, + "loss": 0.7446, + "step": 9283 + }, + { + "epoch": 0.82, + "grad_norm": 3.404052450457528, + "learning_rate": 8.675093624925058e-07, + "loss": 0.4764, + "step": 9284 + }, + { + "epoch": 0.82, + "grad_norm": 15.04020312814935, + "learning_rate": 8.667088567111348e-07, + "loss": 0.7328, + "step": 9285 + }, + { + "epoch": 0.82, + "grad_norm": 2.305860099458438, + "learning_rate": 8.659086853888093e-07, + "loss": 0.4251, + "step": 9286 + }, + { + "epoch": 0.82, + "grad_norm": 8.098203651982752, + "learning_rate": 8.651088485902759e-07, + "loss": 0.5829, + "step": 9287 + }, + { + "epoch": 0.82, + "grad_norm": 9.087727093444776, + "learning_rate": 8.643093463802565e-07, + "loss": 0.6797, + "step": 9288 + }, + { + "epoch": 0.82, + "grad_norm": 7.17409230129124, + "learning_rate": 8.635101788234451e-07, + "loss": 0.8137, + "step": 9289 + }, + { + "epoch": 0.82, + "grad_norm": 6.294470877963959, + "learning_rate": 8.627113459845099e-07, + "loss": 0.673, + "step": 9290 + }, + { + "epoch": 0.82, + "grad_norm": 2.2266496090673304, + "learning_rate": 8.619128479280892e-07, + "loss": 0.4991, + "step": 9291 + }, + { + "epoch": 0.82, + "grad_norm": 14.430430021399202, + "learning_rate": 8.611146847187984e-07, + "loss": 0.9373, + "step": 9292 + }, + { + "epoch": 0.82, + "grad_norm": 8.552244898980994, + "learning_rate": 8.603168564212228e-07, + "loss": 0.7105, + "step": 9293 + }, + { + "epoch": 0.82, + "grad_norm": 7.920196999755695, + "learning_rate": 8.59519363099921e-07, + "loss": 0.5868, + "step": 9294 + }, + { + "epoch": 0.82, + "grad_norm": 6.92826904234767, + "learning_rate": 8.587222048194255e-07, + "loss": 0.7131, + "step": 9295 + }, + { + "epoch": 0.82, + "grad_norm": 6.879361090727755, + "learning_rate": 8.579253816442401e-07, + "loss": 0.706, + "step": 9296 + }, + { + "epoch": 0.82, + "grad_norm": 12.014851388327353, + "learning_rate": 8.571288936388417e-07, + "loss": 0.5942, + "step": 9297 + }, + { + "epoch": 0.82, + "grad_norm": 9.852376251180017, + "learning_rate": 8.56332740867683e-07, + "loss": 0.6672, + "step": 9298 + }, + { + "epoch": 0.82, + "grad_norm": 7.2062598937276015, + "learning_rate": 8.555369233951866e-07, + "loss": 0.6423, + "step": 9299 + }, + { + 
"epoch": 0.82, + "grad_norm": 8.67835055639959, + "learning_rate": 8.54741441285748e-07, + "loss": 0.6154, + "step": 9300 + }, + { + "epoch": 0.82, + "grad_norm": 10.08868683210892, + "learning_rate": 8.539462946037369e-07, + "loss": 0.8163, + "step": 9301 + }, + { + "epoch": 0.82, + "grad_norm": 4.7249488178136465, + "learning_rate": 8.531514834134936e-07, + "loss": 0.6906, + "step": 9302 + }, + { + "epoch": 0.82, + "grad_norm": 8.472699431270849, + "learning_rate": 8.523570077793358e-07, + "loss": 0.6623, + "step": 9303 + }, + { + "epoch": 0.82, + "grad_norm": 7.417983823152164, + "learning_rate": 8.515628677655497e-07, + "loss": 0.7492, + "step": 9304 + }, + { + "epoch": 0.82, + "grad_norm": 11.618891396207351, + "learning_rate": 8.507690634363958e-07, + "loss": 0.7614, + "step": 9305 + }, + { + "epoch": 0.82, + "grad_norm": 7.864586349596326, + "learning_rate": 8.499755948561072e-07, + "loss": 0.5932, + "step": 9306 + }, + { + "epoch": 0.82, + "grad_norm": 13.485928289947749, + "learning_rate": 8.491824620888906e-07, + "loss": 0.7306, + "step": 9307 + }, + { + "epoch": 0.82, + "grad_norm": 7.337487277336428, + "learning_rate": 8.483896651989231e-07, + "loss": 0.7642, + "step": 9308 + }, + { + "epoch": 0.82, + "grad_norm": 5.787281243631815, + "learning_rate": 8.475972042503594e-07, + "loss": 0.6913, + "step": 9309 + }, + { + "epoch": 0.82, + "grad_norm": 9.056419021480833, + "learning_rate": 8.468050793073224e-07, + "loss": 0.7021, + "step": 9310 + }, + { + "epoch": 0.82, + "grad_norm": 11.535950026956248, + "learning_rate": 8.460132904339103e-07, + "loss": 0.7231, + "step": 9311 + }, + { + "epoch": 0.82, + "grad_norm": 7.358526450076581, + "learning_rate": 8.452218376941928e-07, + "loss": 0.6946, + "step": 9312 + }, + { + "epoch": 0.82, + "grad_norm": 7.4752247986094265, + "learning_rate": 8.444307211522135e-07, + "loss": 0.7519, + "step": 9313 + }, + { + "epoch": 0.82, + "grad_norm": 2.991685857405032, + "learning_rate": 8.436399408719859e-07, + "loss": 0.528, + "step": 9314 + }, + { + "epoch": 0.82, + "grad_norm": 6.281594498623618, + "learning_rate": 8.428494969175021e-07, + "loss": 0.8204, + "step": 9315 + }, + { + "epoch": 0.82, + "grad_norm": 12.984412535457198, + "learning_rate": 8.420593893527207e-07, + "loss": 0.8001, + "step": 9316 + }, + { + "epoch": 0.82, + "grad_norm": 9.593281521260066, + "learning_rate": 8.412696182415786e-07, + "loss": 0.8011, + "step": 9317 + }, + { + "epoch": 0.82, + "grad_norm": 2.375184286249789, + "learning_rate": 8.404801836479809e-07, + "loss": 0.4779, + "step": 9318 + }, + { + "epoch": 0.82, + "grad_norm": 7.339479619967032, + "learning_rate": 8.396910856358076e-07, + "loss": 0.7601, + "step": 9319 + }, + { + "epoch": 0.82, + "grad_norm": 7.0304566852571, + "learning_rate": 8.389023242689115e-07, + "loss": 0.8408, + "step": 9320 + }, + { + "epoch": 0.82, + "grad_norm": 9.110779082417247, + "learning_rate": 8.381138996111166e-07, + "loss": 1.0034, + "step": 9321 + }, + { + "epoch": 0.82, + "grad_norm": 10.085666578201373, + "learning_rate": 8.373258117262234e-07, + "loss": 0.6755, + "step": 9322 + }, + { + "epoch": 0.82, + "grad_norm": 10.044369353609175, + "learning_rate": 8.36538060678001e-07, + "loss": 0.7471, + "step": 9323 + }, + { + "epoch": 0.82, + "grad_norm": 6.437354666935691, + "learning_rate": 8.357506465301934e-07, + "loss": 0.6148, + "step": 9324 + }, + { + "epoch": 0.82, + "grad_norm": 7.204717258861933, + "learning_rate": 8.349635693465163e-07, + "loss": 0.6775, + "step": 9325 + }, + { + "epoch": 0.82, + "grad_norm": 
7.4474648051260965, + "learning_rate": 8.341768291906577e-07, + "loss": 0.683, + "step": 9326 + }, + { + "epoch": 0.82, + "grad_norm": 12.205858696833879, + "learning_rate": 8.333904261262821e-07, + "loss": 0.8173, + "step": 9327 + }, + { + "epoch": 0.82, + "grad_norm": 13.789317727151639, + "learning_rate": 8.326043602170226e-07, + "loss": 0.7742, + "step": 9328 + }, + { + "epoch": 0.82, + "grad_norm": 6.543511569772944, + "learning_rate": 8.318186315264859e-07, + "loss": 0.6073, + "step": 9329 + }, + { + "epoch": 0.82, + "grad_norm": 10.519317588152179, + "learning_rate": 8.310332401182519e-07, + "loss": 0.7047, + "step": 9330 + }, + { + "epoch": 0.82, + "grad_norm": 7.698356935054897, + "learning_rate": 8.302481860558737e-07, + "loss": 0.8056, + "step": 9331 + }, + { + "epoch": 0.82, + "grad_norm": 8.420586541757169, + "learning_rate": 8.294634694028747e-07, + "loss": 0.6131, + "step": 9332 + }, + { + "epoch": 0.82, + "grad_norm": 7.488137233148498, + "learning_rate": 8.28679090222756e-07, + "loss": 0.6141, + "step": 9333 + }, + { + "epoch": 0.82, + "grad_norm": 9.90621697166471, + "learning_rate": 8.278950485789866e-07, + "loss": 0.7491, + "step": 9334 + }, + { + "epoch": 0.82, + "grad_norm": 2.966331679163147, + "learning_rate": 8.271113445350099e-07, + "loss": 0.4955, + "step": 9335 + }, + { + "epoch": 0.82, + "grad_norm": 7.0641327859471135, + "learning_rate": 8.263279781542421e-07, + "loss": 0.8127, + "step": 9336 + }, + { + "epoch": 0.82, + "grad_norm": 7.492502033079217, + "learning_rate": 8.255449495000712e-07, + "loss": 0.6735, + "step": 9337 + }, + { + "epoch": 0.82, + "grad_norm": 7.729044244674042, + "learning_rate": 8.247622586358583e-07, + "loss": 0.6577, + "step": 9338 + }, + { + "epoch": 0.82, + "grad_norm": 2.7471745652706896, + "learning_rate": 8.239799056249398e-07, + "loss": 0.4971, + "step": 9339 + }, + { + "epoch": 0.82, + "grad_norm": 10.036544799756554, + "learning_rate": 8.231978905306204e-07, + "loss": 0.8312, + "step": 9340 + }, + { + "epoch": 0.82, + "grad_norm": 6.669803480151421, + "learning_rate": 8.2241621341618e-07, + "loss": 0.7514, + "step": 9341 + }, + { + "epoch": 0.82, + "grad_norm": 11.528968467207866, + "learning_rate": 8.21634874344871e-07, + "loss": 0.6198, + "step": 9342 + }, + { + "epoch": 0.82, + "grad_norm": 12.379495020819641, + "learning_rate": 8.20853873379916e-07, + "loss": 0.7784, + "step": 9343 + }, + { + "epoch": 0.82, + "grad_norm": 11.020311282500089, + "learning_rate": 8.200732105845155e-07, + "loss": 0.7537, + "step": 9344 + }, + { + "epoch": 0.82, + "grad_norm": 9.685925868322588, + "learning_rate": 8.192928860218374e-07, + "loss": 0.6428, + "step": 9345 + }, + { + "epoch": 0.82, + "grad_norm": 10.129134107372566, + "learning_rate": 8.185128997550251e-07, + "loss": 0.7032, + "step": 9346 + }, + { + "epoch": 0.82, + "grad_norm": 10.588372068409598, + "learning_rate": 8.177332518471931e-07, + "loss": 0.6138, + "step": 9347 + }, + { + "epoch": 0.82, + "grad_norm": 6.2141143629663995, + "learning_rate": 8.169539423614298e-07, + "loss": 0.6633, + "step": 9348 + }, + { + "epoch": 0.82, + "grad_norm": 7.61529383108552, + "learning_rate": 8.161749713607936e-07, + "loss": 0.7867, + "step": 9349 + }, + { + "epoch": 0.82, + "grad_norm": 9.06504011438458, + "learning_rate": 8.153963389083208e-07, + "loss": 0.8264, + "step": 9350 + }, + { + "epoch": 0.82, + "grad_norm": 3.168726611176923, + "learning_rate": 8.146180450670155e-07, + "loss": 0.5047, + "step": 9351 + }, + { + "epoch": 0.82, + "grad_norm": 6.352415553904741, + "learning_rate": 
8.13840089899856e-07, + "loss": 0.7611, + "step": 9352 + }, + { + "epoch": 0.82, + "grad_norm": 6.7956869098404304, + "learning_rate": 8.130624734697934e-07, + "loss": 0.6884, + "step": 9353 + }, + { + "epoch": 0.82, + "grad_norm": 11.582841701558648, + "learning_rate": 8.122851958397504e-07, + "loss": 0.8769, + "step": 9354 + }, + { + "epoch": 0.82, + "grad_norm": 4.624712226129316, + "learning_rate": 8.115082570726224e-07, + "loss": 0.5562, + "step": 9355 + }, + { + "epoch": 0.82, + "grad_norm": 3.22023391726779, + "learning_rate": 8.107316572312801e-07, + "loss": 0.5976, + "step": 9356 + }, + { + "epoch": 0.82, + "grad_norm": 6.121599496185174, + "learning_rate": 8.099553963785639e-07, + "loss": 0.8137, + "step": 9357 + }, + { + "epoch": 0.82, + "grad_norm": 6.08618767276797, + "learning_rate": 8.091794745772874e-07, + "loss": 0.7676, + "step": 9358 + }, + { + "epoch": 0.82, + "grad_norm": 21.82365078291684, + "learning_rate": 8.084038918902365e-07, + "loss": 0.5637, + "step": 9359 + }, + { + "epoch": 0.82, + "grad_norm": 6.283027298222416, + "learning_rate": 8.076286483801698e-07, + "loss": 0.5512, + "step": 9360 + }, + { + "epoch": 0.82, + "grad_norm": 2.207047911226063, + "learning_rate": 8.06853744109819e-07, + "loss": 0.4531, + "step": 9361 + }, + { + "epoch": 0.82, + "grad_norm": 13.887355129695631, + "learning_rate": 8.060791791418887e-07, + "loss": 0.7249, + "step": 9362 + }, + { + "epoch": 0.82, + "grad_norm": 11.762280892914161, + "learning_rate": 8.053049535390551e-07, + "loss": 0.7263, + "step": 9363 + }, + { + "epoch": 0.82, + "grad_norm": 9.563753290181623, + "learning_rate": 8.045310673639672e-07, + "loss": 0.7976, + "step": 9364 + }, + { + "epoch": 0.82, + "grad_norm": 10.303478678700754, + "learning_rate": 8.037575206792469e-07, + "loss": 0.8118, + "step": 9365 + }, + { + "epoch": 0.82, + "grad_norm": 4.488611157184617, + "learning_rate": 8.029843135474863e-07, + "loss": 0.5516, + "step": 9366 + }, + { + "epoch": 0.82, + "grad_norm": 9.611588029288322, + "learning_rate": 8.022114460312552e-07, + "loss": 0.8256, + "step": 9367 + }, + { + "epoch": 0.82, + "grad_norm": 10.30911602213159, + "learning_rate": 8.014389181930909e-07, + "loss": 0.717, + "step": 9368 + }, + { + "epoch": 0.82, + "grad_norm": 22.89657816070596, + "learning_rate": 8.006667300955056e-07, + "loss": 0.847, + "step": 9369 + }, + { + "epoch": 0.82, + "grad_norm": 35.28395601033886, + "learning_rate": 7.998948818009833e-07, + "loss": 0.8294, + "step": 9370 + }, + { + "epoch": 0.82, + "grad_norm": 8.92658680925399, + "learning_rate": 7.991233733719805e-07, + "loss": 0.6611, + "step": 9371 + }, + { + "epoch": 0.82, + "grad_norm": 10.450745190115347, + "learning_rate": 7.983522048709252e-07, + "loss": 0.9019, + "step": 9372 + }, + { + "epoch": 0.82, + "grad_norm": 7.157934538946371, + "learning_rate": 7.975813763602219e-07, + "loss": 0.5845, + "step": 9373 + }, + { + "epoch": 0.82, + "grad_norm": 9.434700910653651, + "learning_rate": 7.968108879022434e-07, + "loss": 0.7573, + "step": 9374 + }, + { + "epoch": 0.82, + "grad_norm": 4.995479332856048, + "learning_rate": 7.960407395593356e-07, + "loss": 0.7108, + "step": 9375 + }, + { + "epoch": 0.82, + "grad_norm": 8.630104069317637, + "learning_rate": 7.952709313938189e-07, + "loss": 0.6875, + "step": 9376 + }, + { + "epoch": 0.82, + "grad_norm": 8.98237105281694, + "learning_rate": 7.945014634679838e-07, + "loss": 0.7531, + "step": 9377 + }, + { + "epoch": 0.82, + "grad_norm": 7.204661126146342, + "learning_rate": 7.937323358440935e-07, + "loss": 0.7246, + 
"step": 9378 + }, + { + "epoch": 0.82, + "grad_norm": 15.49184059368478, + "learning_rate": 7.929635485843873e-07, + "loss": 0.7579, + "step": 9379 + }, + { + "epoch": 0.82, + "grad_norm": 2.727759279015298, + "learning_rate": 7.921951017510726e-07, + "loss": 0.515, + "step": 9380 + }, + { + "epoch": 0.82, + "grad_norm": 10.407738960089752, + "learning_rate": 7.914269954063309e-07, + "loss": 0.6689, + "step": 9381 + }, + { + "epoch": 0.82, + "grad_norm": 13.583803850879532, + "learning_rate": 7.906592296123161e-07, + "loss": 0.8143, + "step": 9382 + }, + { + "epoch": 0.82, + "grad_norm": 10.037968688989945, + "learning_rate": 7.898918044311537e-07, + "loss": 0.6626, + "step": 9383 + }, + { + "epoch": 0.82, + "grad_norm": 5.037055454863252, + "learning_rate": 7.891247199249441e-07, + "loss": 0.6399, + "step": 9384 + }, + { + "epoch": 0.82, + "grad_norm": 4.804964792164879, + "learning_rate": 7.88357976155758e-07, + "loss": 0.7304, + "step": 9385 + }, + { + "epoch": 0.82, + "grad_norm": 6.474903196293046, + "learning_rate": 7.875915731856387e-07, + "loss": 0.7521, + "step": 9386 + }, + { + "epoch": 0.82, + "grad_norm": 10.000902079306039, + "learning_rate": 7.868255110766021e-07, + "loss": 0.6046, + "step": 9387 + }, + { + "epoch": 0.82, + "grad_norm": 9.2389488969994, + "learning_rate": 7.86059789890637e-07, + "loss": 0.6998, + "step": 9388 + }, + { + "epoch": 0.82, + "grad_norm": 15.206189359634177, + "learning_rate": 7.85294409689703e-07, + "loss": 0.6674, + "step": 9389 + }, + { + "epoch": 0.82, + "grad_norm": 7.030909082602173, + "learning_rate": 7.845293705357359e-07, + "loss": 0.6371, + "step": 9390 + }, + { + "epoch": 0.82, + "grad_norm": 14.165352167693255, + "learning_rate": 7.8376467249064e-07, + "loss": 0.7248, + "step": 9391 + }, + { + "epoch": 0.82, + "grad_norm": 7.591313235047662, + "learning_rate": 7.830003156162935e-07, + "loss": 0.7593, + "step": 9392 + }, + { + "epoch": 0.82, + "grad_norm": 4.082807112799994, + "learning_rate": 7.822362999745469e-07, + "loss": 0.5539, + "step": 9393 + }, + { + "epoch": 0.83, + "grad_norm": 22.068215412569877, + "learning_rate": 7.814726256272226e-07, + "loss": 0.6464, + "step": 9394 + }, + { + "epoch": 0.83, + "grad_norm": 7.046599593211527, + "learning_rate": 7.807092926361154e-07, + "loss": 0.6713, + "step": 9395 + }, + { + "epoch": 0.83, + "grad_norm": 8.782757626941665, + "learning_rate": 7.799463010629949e-07, + "loss": 0.6899, + "step": 9396 + }, + { + "epoch": 0.83, + "grad_norm": 8.339109993601559, + "learning_rate": 7.791836509696e-07, + "loss": 0.7675, + "step": 9397 + }, + { + "epoch": 0.83, + "grad_norm": 8.903401780719449, + "learning_rate": 7.784213424176434e-07, + "loss": 0.5628, + "step": 9398 + }, + { + "epoch": 0.83, + "grad_norm": 8.102547655830792, + "learning_rate": 7.776593754688094e-07, + "loss": 0.7819, + "step": 9399 + }, + { + "epoch": 0.83, + "grad_norm": 7.3829381983270075, + "learning_rate": 7.768977501847553e-07, + "loss": 0.7035, + "step": 9400 + }, + { + "epoch": 0.83, + "grad_norm": 9.404852234155648, + "learning_rate": 7.761364666271087e-07, + "loss": 0.826, + "step": 9401 + }, + { + "epoch": 0.83, + "grad_norm": 6.066346441922094, + "learning_rate": 7.753755248574751e-07, + "loss": 0.6761, + "step": 9402 + }, + { + "epoch": 0.83, + "grad_norm": 13.023804072722648, + "learning_rate": 7.746149249374263e-07, + "loss": 0.7568, + "step": 9403 + }, + { + "epoch": 0.83, + "grad_norm": 6.321210801927498, + "learning_rate": 7.73854666928509e-07, + "loss": 0.6038, + "step": 9404 + }, + { + "epoch": 0.83, + 
"grad_norm": 8.310353742031296, + "learning_rate": 7.730947508922427e-07, + "loss": 0.6923, + "step": 9405 + }, + { + "epoch": 0.83, + "grad_norm": 5.876831422747912, + "learning_rate": 7.723351768901172e-07, + "loss": 0.6586, + "step": 9406 + }, + { + "epoch": 0.83, + "grad_norm": 5.9939074938586065, + "learning_rate": 7.715759449835975e-07, + "loss": 0.6615, + "step": 9407 + }, + { + "epoch": 0.83, + "grad_norm": 7.755204515035598, + "learning_rate": 7.708170552341188e-07, + "loss": 0.6143, + "step": 9408 + }, + { + "epoch": 0.83, + "grad_norm": 10.107242375266651, + "learning_rate": 7.700585077030887e-07, + "loss": 0.6711, + "step": 9409 + }, + { + "epoch": 0.83, + "grad_norm": 13.122208736837033, + "learning_rate": 7.693003024518886e-07, + "loss": 0.6978, + "step": 9410 + }, + { + "epoch": 0.83, + "grad_norm": 4.715681683099378, + "learning_rate": 7.6854243954187e-07, + "loss": 0.6683, + "step": 9411 + }, + { + "epoch": 0.83, + "grad_norm": 7.362850500902117, + "learning_rate": 7.677849190343578e-07, + "loss": 0.7119, + "step": 9412 + }, + { + "epoch": 0.83, + "grad_norm": 10.874104980373005, + "learning_rate": 7.670277409906513e-07, + "loss": 0.8244, + "step": 9413 + }, + { + "epoch": 0.83, + "grad_norm": 8.161893131868421, + "learning_rate": 7.662709054720185e-07, + "loss": 0.6789, + "step": 9414 + }, + { + "epoch": 0.83, + "grad_norm": 6.647908800607688, + "learning_rate": 7.655144125397019e-07, + "loss": 0.6548, + "step": 9415 + }, + { + "epoch": 0.83, + "grad_norm": 5.7844405166886546, + "learning_rate": 7.647582622549149e-07, + "loss": 0.6191, + "step": 9416 + }, + { + "epoch": 0.83, + "grad_norm": 11.201632743950533, + "learning_rate": 7.640024546788449e-07, + "loss": 0.8435, + "step": 9417 + }, + { + "epoch": 0.83, + "grad_norm": 5.987021934625724, + "learning_rate": 7.632469898726485e-07, + "loss": 0.8967, + "step": 9418 + }, + { + "epoch": 0.83, + "grad_norm": 9.943324618144977, + "learning_rate": 7.624918678974596e-07, + "loss": 0.7279, + "step": 9419 + }, + { + "epoch": 0.83, + "grad_norm": 6.375816978137355, + "learning_rate": 7.6173708881438e-07, + "loss": 0.5937, + "step": 9420 + }, + { + "epoch": 0.83, + "grad_norm": 2.302921937862531, + "learning_rate": 7.609826526844849e-07, + "loss": 0.5136, + "step": 9421 + }, + { + "epoch": 0.83, + "grad_norm": 2.7081148256325496, + "learning_rate": 7.602285595688224e-07, + "loss": 0.5304, + "step": 9422 + }, + { + "epoch": 0.83, + "grad_norm": 10.096620458273174, + "learning_rate": 7.59474809528411e-07, + "loss": 0.7842, + "step": 9423 + }, + { + "epoch": 0.83, + "grad_norm": 9.100007614819203, + "learning_rate": 7.587214026242457e-07, + "loss": 0.805, + "step": 9424 + }, + { + "epoch": 0.83, + "grad_norm": 10.839722893298307, + "learning_rate": 7.579683389172898e-07, + "loss": 0.885, + "step": 9425 + }, + { + "epoch": 0.83, + "grad_norm": 20.126969751272707, + "learning_rate": 7.572156184684792e-07, + "loss": 0.772, + "step": 9426 + }, + { + "epoch": 0.83, + "grad_norm": 11.713374928481315, + "learning_rate": 7.564632413387235e-07, + "loss": 0.7005, + "step": 9427 + }, + { + "epoch": 0.83, + "grad_norm": 7.325216175364524, + "learning_rate": 7.557112075889034e-07, + "loss": 0.7741, + "step": 9428 + }, + { + "epoch": 0.83, + "grad_norm": 9.710538854478758, + "learning_rate": 7.549595172798713e-07, + "loss": 0.6143, + "step": 9429 + }, + { + "epoch": 0.83, + "grad_norm": 9.763462350448227, + "learning_rate": 7.542081704724547e-07, + "loss": 0.7417, + "step": 9430 + }, + { + "epoch": 0.83, + "grad_norm": 13.787860017399883, + 
"learning_rate": 7.53457167227451e-07, + "loss": 0.8801, + "step": 9431 + }, + { + "epoch": 0.83, + "grad_norm": 6.741138491082276, + "learning_rate": 7.527065076056289e-07, + "loss": 0.7415, + "step": 9432 + }, + { + "epoch": 0.83, + "grad_norm": 10.271504313218799, + "learning_rate": 7.519561916677315e-07, + "loss": 0.7365, + "step": 9433 + }, + { + "epoch": 0.83, + "grad_norm": 9.279027251334517, + "learning_rate": 7.512062194744729e-07, + "loss": 0.7018, + "step": 9434 + }, + { + "epoch": 0.83, + "grad_norm": 8.543180957675201, + "learning_rate": 7.50456591086538e-07, + "loss": 0.7295, + "step": 9435 + }, + { + "epoch": 0.83, + "grad_norm": 8.814412684216544, + "learning_rate": 7.497073065645888e-07, + "loss": 0.6613, + "step": 9436 + }, + { + "epoch": 0.83, + "grad_norm": 5.798131931968293, + "learning_rate": 7.48958365969254e-07, + "loss": 0.7557, + "step": 9437 + }, + { + "epoch": 0.83, + "grad_norm": 25.326722053313024, + "learning_rate": 7.482097693611368e-07, + "loss": 0.7211, + "step": 9438 + }, + { + "epoch": 0.83, + "grad_norm": 6.236452337488048, + "learning_rate": 7.474615168008126e-07, + "loss": 0.5614, + "step": 9439 + }, + { + "epoch": 0.83, + "grad_norm": 6.733434060864437, + "learning_rate": 7.467136083488291e-07, + "loss": 0.7392, + "step": 9440 + }, + { + "epoch": 0.83, + "grad_norm": 8.610832761640292, + "learning_rate": 7.459660440657035e-07, + "loss": 0.7213, + "step": 9441 + }, + { + "epoch": 0.83, + "grad_norm": 11.102757668071238, + "learning_rate": 7.452188240119313e-07, + "loss": 0.6721, + "step": 9442 + }, + { + "epoch": 0.83, + "grad_norm": 11.630113869881615, + "learning_rate": 7.44471948247974e-07, + "loss": 0.7842, + "step": 9443 + }, + { + "epoch": 0.83, + "grad_norm": 8.190757630370104, + "learning_rate": 7.437254168342683e-07, + "loss": 0.7858, + "step": 9444 + }, + { + "epoch": 0.83, + "grad_norm": 11.405714829806723, + "learning_rate": 7.429792298312217e-07, + "loss": 0.7808, + "step": 9445 + }, + { + "epoch": 0.83, + "grad_norm": 10.628852907560475, + "learning_rate": 7.422333872992138e-07, + "loss": 0.6561, + "step": 9446 + }, + { + "epoch": 0.83, + "grad_norm": 8.612108903309892, + "learning_rate": 7.414878892985983e-07, + "loss": 0.7638, + "step": 9447 + }, + { + "epoch": 0.83, + "grad_norm": 8.259408758622694, + "learning_rate": 7.407427358896996e-07, + "loss": 0.7228, + "step": 9448 + }, + { + "epoch": 0.83, + "grad_norm": 4.948395051821875, + "learning_rate": 7.399979271328134e-07, + "loss": 0.6405, + "step": 9449 + }, + { + "epoch": 0.83, + "grad_norm": 9.87094451003289, + "learning_rate": 7.392534630882092e-07, + "loss": 0.781, + "step": 9450 + }, + { + "epoch": 0.83, + "grad_norm": 8.565798643391666, + "learning_rate": 7.385093438161273e-07, + "loss": 0.7775, + "step": 9451 + }, + { + "epoch": 0.83, + "grad_norm": 2.5066185984080227, + "learning_rate": 7.377655693767793e-07, + "loss": 0.5126, + "step": 9452 + }, + { + "epoch": 0.83, + "grad_norm": 8.894468493145071, + "learning_rate": 7.370221398303529e-07, + "loss": 0.7305, + "step": 9453 + }, + { + "epoch": 0.83, + "grad_norm": 16.469891738822056, + "learning_rate": 7.362790552370036e-07, + "loss": 0.7354, + "step": 9454 + }, + { + "epoch": 0.83, + "grad_norm": 8.20474091133914, + "learning_rate": 7.355363156568612e-07, + "loss": 0.7318, + "step": 9455 + }, + { + "epoch": 0.83, + "grad_norm": 7.284761776566771, + "learning_rate": 7.347939211500266e-07, + "loss": 0.7241, + "step": 9456 + }, + { + "epoch": 0.83, + "grad_norm": 10.610545965208276, + "learning_rate": 7.340518717765726e-07, + 
"loss": 0.6666, + "step": 9457 + }, + { + "epoch": 0.83, + "grad_norm": 6.105201408721838, + "learning_rate": 7.333101675965443e-07, + "loss": 0.7361, + "step": 9458 + }, + { + "epoch": 0.83, + "grad_norm": 6.459998541132067, + "learning_rate": 7.325688086699612e-07, + "loss": 0.7616, + "step": 9459 + }, + { + "epoch": 0.83, + "grad_norm": 12.0527287454475, + "learning_rate": 7.318277950568114e-07, + "loss": 0.7496, + "step": 9460 + }, + { + "epoch": 0.83, + "grad_norm": 8.498455549426357, + "learning_rate": 7.310871268170566e-07, + "loss": 0.8934, + "step": 9461 + }, + { + "epoch": 0.83, + "grad_norm": 7.242868484116865, + "learning_rate": 7.303468040106304e-07, + "loss": 0.8033, + "step": 9462 + }, + { + "epoch": 0.83, + "grad_norm": 18.60241717006607, + "learning_rate": 7.296068266974377e-07, + "loss": 0.7915, + "step": 9463 + }, + { + "epoch": 0.83, + "grad_norm": 14.377833586702502, + "learning_rate": 7.288671949373582e-07, + "loss": 1.0445, + "step": 9464 + }, + { + "epoch": 0.83, + "grad_norm": 8.140842928590262, + "learning_rate": 7.28127908790241e-07, + "loss": 0.6618, + "step": 9465 + }, + { + "epoch": 0.83, + "grad_norm": 14.035732445597572, + "learning_rate": 7.273889683159074e-07, + "loss": 0.6493, + "step": 9466 + }, + { + "epoch": 0.83, + "grad_norm": 7.594034002889208, + "learning_rate": 7.26650373574151e-07, + "loss": 0.765, + "step": 9467 + }, + { + "epoch": 0.83, + "grad_norm": 11.340109360614228, + "learning_rate": 7.25912124624738e-07, + "loss": 0.6429, + "step": 9468 + }, + { + "epoch": 0.83, + "grad_norm": 11.920606937364703, + "learning_rate": 7.251742215274054e-07, + "loss": 0.6895, + "step": 9469 + }, + { + "epoch": 0.83, + "grad_norm": 13.183158002639288, + "learning_rate": 7.244366643418655e-07, + "loss": 0.6271, + "step": 9470 + }, + { + "epoch": 0.83, + "grad_norm": 6.8157094706201615, + "learning_rate": 7.23699453127798e-07, + "loss": 0.752, + "step": 9471 + }, + { + "epoch": 0.83, + "grad_norm": 6.284589133281334, + "learning_rate": 7.229625879448577e-07, + "loss": 0.6677, + "step": 9472 + }, + { + "epoch": 0.83, + "grad_norm": 9.378861188788939, + "learning_rate": 7.222260688526705e-07, + "loss": 0.8705, + "step": 9473 + }, + { + "epoch": 0.83, + "grad_norm": 3.5644556118927686, + "learning_rate": 7.214898959108341e-07, + "loss": 0.4609, + "step": 9474 + }, + { + "epoch": 0.83, + "grad_norm": 8.461857099485401, + "learning_rate": 7.207540691789172e-07, + "loss": 0.7776, + "step": 9475 + }, + { + "epoch": 0.83, + "grad_norm": 7.140576394075574, + "learning_rate": 7.200185887164641e-07, + "loss": 0.6221, + "step": 9476 + }, + { + "epoch": 0.83, + "grad_norm": 2.5300624249070136, + "learning_rate": 7.192834545829869e-07, + "loss": 0.4472, + "step": 9477 + }, + { + "epoch": 0.83, + "grad_norm": 7.6032241908534175, + "learning_rate": 7.18548666837972e-07, + "loss": 0.7248, + "step": 9478 + }, + { + "epoch": 0.83, + "grad_norm": 9.693406976342965, + "learning_rate": 7.178142255408776e-07, + "loss": 0.67, + "step": 9479 + }, + { + "epoch": 0.83, + "grad_norm": 9.376287983345318, + "learning_rate": 7.170801307511327e-07, + "loss": 0.7494, + "step": 9480 + }, + { + "epoch": 0.83, + "grad_norm": 3.022932750232056, + "learning_rate": 7.163463825281386e-07, + "loss": 0.5173, + "step": 9481 + }, + { + "epoch": 0.83, + "grad_norm": 9.03979289283667, + "learning_rate": 7.156129809312706e-07, + "loss": 0.702, + "step": 9482 + }, + { + "epoch": 0.83, + "grad_norm": 12.136126234886744, + "learning_rate": 7.148799260198736e-07, + "loss": 0.7224, + "step": 9483 + }, + { + 
"epoch": 0.83, + "grad_norm": 16.46274389856982, + "learning_rate": 7.141472178532649e-07, + "loss": 0.6604, + "step": 9484 + }, + { + "epoch": 0.83, + "grad_norm": 9.053313344655809, + "learning_rate": 7.134148564907345e-07, + "loss": 0.7421, + "step": 9485 + }, + { + "epoch": 0.83, + "grad_norm": 10.841610923196303, + "learning_rate": 7.126828419915422e-07, + "loss": 0.6824, + "step": 9486 + }, + { + "epoch": 0.83, + "grad_norm": 7.712184319439009, + "learning_rate": 7.119511744149238e-07, + "loss": 0.6863, + "step": 9487 + }, + { + "epoch": 0.83, + "grad_norm": 8.950251558978415, + "learning_rate": 7.112198538200832e-07, + "loss": 0.7668, + "step": 9488 + }, + { + "epoch": 0.83, + "grad_norm": 3.504148458240387, + "learning_rate": 7.104888802661985e-07, + "loss": 0.5751, + "step": 9489 + }, + { + "epoch": 0.83, + "grad_norm": 9.655631224628618, + "learning_rate": 7.097582538124181e-07, + "loss": 0.7208, + "step": 9490 + }, + { + "epoch": 0.83, + "grad_norm": 10.540418098514964, + "learning_rate": 7.090279745178635e-07, + "loss": 0.7468, + "step": 9491 + }, + { + "epoch": 0.83, + "grad_norm": 3.233197400623047, + "learning_rate": 7.082980424416264e-07, + "loss": 0.499, + "step": 9492 + }, + { + "epoch": 0.83, + "grad_norm": 5.839037661841943, + "learning_rate": 7.075684576427744e-07, + "loss": 0.541, + "step": 9493 + }, + { + "epoch": 0.83, + "grad_norm": 2.148657517971902, + "learning_rate": 7.06839220180342e-07, + "loss": 0.438, + "step": 9494 + }, + { + "epoch": 0.83, + "grad_norm": 7.815478406539521, + "learning_rate": 7.061103301133387e-07, + "loss": 0.6498, + "step": 9495 + }, + { + "epoch": 0.83, + "grad_norm": 9.383730201846236, + "learning_rate": 7.053817875007457e-07, + "loss": 0.6212, + "step": 9496 + }, + { + "epoch": 0.83, + "grad_norm": 12.166331792938806, + "learning_rate": 7.046535924015141e-07, + "loss": 0.8651, + "step": 9497 + }, + { + "epoch": 0.83, + "grad_norm": 81.72212203875863, + "learning_rate": 7.039257448745679e-07, + "loss": 0.7352, + "step": 9498 + }, + { + "epoch": 0.83, + "grad_norm": 18.603597056422633, + "learning_rate": 7.03198244978805e-07, + "loss": 0.6204, + "step": 9499 + }, + { + "epoch": 0.83, + "grad_norm": 2.2633740145935155, + "learning_rate": 7.024710927730932e-07, + "loss": 0.5258, + "step": 9500 + }, + { + "epoch": 0.83, + "grad_norm": 10.093264185779388, + "learning_rate": 7.017442883162718e-07, + "loss": 0.6412, + "step": 9501 + }, + { + "epoch": 0.83, + "grad_norm": 2.613467903016073, + "learning_rate": 7.010178316671528e-07, + "loss": 0.4701, + "step": 9502 + }, + { + "epoch": 0.83, + "grad_norm": 11.715805018152546, + "learning_rate": 7.002917228845191e-07, + "loss": 0.9386, + "step": 9503 + }, + { + "epoch": 0.83, + "grad_norm": 9.52055479561859, + "learning_rate": 6.995659620271277e-07, + "loss": 0.6478, + "step": 9504 + }, + { + "epoch": 0.83, + "grad_norm": 6.647944628569995, + "learning_rate": 6.988405491537054e-07, + "loss": 0.6048, + "step": 9505 + }, + { + "epoch": 0.83, + "grad_norm": 9.00480622529574, + "learning_rate": 6.98115484322951e-07, + "loss": 0.8276, + "step": 9506 + }, + { + "epoch": 0.83, + "grad_norm": 2.5287509687856717, + "learning_rate": 6.97390767593536e-07, + "loss": 0.4623, + "step": 9507 + }, + { + "epoch": 0.84, + "grad_norm": 23.800103076471256, + "learning_rate": 6.966663990241035e-07, + "loss": 0.6341, + "step": 9508 + }, + { + "epoch": 0.84, + "grad_norm": 7.950333153478232, + "learning_rate": 6.959423786732661e-07, + "loss": 0.6286, + "step": 9509 + }, + { + "epoch": 0.84, + "grad_norm": 
8.254264345989403, + "learning_rate": 6.95218706599613e-07, + "loss": 0.5569, + "step": 9510 + }, + { + "epoch": 0.84, + "grad_norm": 11.740919941908855, + "learning_rate": 6.944953828617018e-07, + "loss": 0.5653, + "step": 9511 + }, + { + "epoch": 0.84, + "grad_norm": 19.934361336314492, + "learning_rate": 6.937724075180624e-07, + "loss": 0.6671, + "step": 9512 + }, + { + "epoch": 0.84, + "grad_norm": 14.545396758275343, + "learning_rate": 6.930497806271969e-07, + "loss": 0.6455, + "step": 9513 + }, + { + "epoch": 0.84, + "grad_norm": 16.19114228148146, + "learning_rate": 6.923275022475784e-07, + "loss": 0.7481, + "step": 9514 + }, + { + "epoch": 0.84, + "grad_norm": 2.5555843872810176, + "learning_rate": 6.916055724376519e-07, + "loss": 0.5359, + "step": 9515 + }, + { + "epoch": 0.84, + "grad_norm": 7.244952413292187, + "learning_rate": 6.908839912558374e-07, + "loss": 0.8904, + "step": 9516 + }, + { + "epoch": 0.84, + "grad_norm": 7.506024505489445, + "learning_rate": 6.901627587605225e-07, + "loss": 0.5456, + "step": 9517 + }, + { + "epoch": 0.84, + "grad_norm": 10.28176675009726, + "learning_rate": 6.894418750100679e-07, + "loss": 0.7937, + "step": 9518 + }, + { + "epoch": 0.84, + "grad_norm": 9.4325500975101, + "learning_rate": 6.887213400628062e-07, + "loss": 0.766, + "step": 9519 + }, + { + "epoch": 0.84, + "grad_norm": 13.598938845497955, + "learning_rate": 6.880011539770426e-07, + "loss": 0.847, + "step": 9520 + }, + { + "epoch": 0.84, + "grad_norm": 13.415029677323174, + "learning_rate": 6.87281316811052e-07, + "loss": 0.8088, + "step": 9521 + }, + { + "epoch": 0.84, + "grad_norm": 6.040759851823647, + "learning_rate": 6.86561828623084e-07, + "loss": 0.589, + "step": 9522 + }, + { + "epoch": 0.84, + "grad_norm": 14.407942859204157, + "learning_rate": 6.858426894713582e-07, + "loss": 0.6768, + "step": 9523 + }, + { + "epoch": 0.84, + "grad_norm": 6.280375358525902, + "learning_rate": 6.851238994140658e-07, + "loss": 0.8051, + "step": 9524 + }, + { + "epoch": 0.84, + "grad_norm": 8.897779259330854, + "learning_rate": 6.844054585093696e-07, + "loss": 0.8437, + "step": 9525 + }, + { + "epoch": 0.84, + "grad_norm": 6.550806640424179, + "learning_rate": 6.836873668154042e-07, + "loss": 0.7245, + "step": 9526 + }, + { + "epoch": 0.84, + "grad_norm": 6.92025847636815, + "learning_rate": 6.829696243902784e-07, + "loss": 0.7342, + "step": 9527 + }, + { + "epoch": 0.84, + "grad_norm": 2.9155285430833957, + "learning_rate": 6.822522312920698e-07, + "loss": 0.5649, + "step": 9528 + }, + { + "epoch": 0.84, + "grad_norm": 10.345953817500328, + "learning_rate": 6.815351875788284e-07, + "loss": 0.6751, + "step": 9529 + }, + { + "epoch": 0.84, + "grad_norm": 8.722939718660795, + "learning_rate": 6.80818493308576e-07, + "loss": 0.5891, + "step": 9530 + }, + { + "epoch": 0.84, + "grad_norm": 8.005984600738593, + "learning_rate": 6.801021485393066e-07, + "loss": 0.6573, + "step": 9531 + }, + { + "epoch": 0.84, + "grad_norm": 10.662792810172473, + "learning_rate": 6.793861533289842e-07, + "loss": 0.7666, + "step": 9532 + }, + { + "epoch": 0.84, + "grad_norm": 12.334588861436547, + "learning_rate": 6.786705077355488e-07, + "loss": 0.7252, + "step": 9533 + }, + { + "epoch": 0.84, + "grad_norm": 10.839235741841874, + "learning_rate": 6.779552118169074e-07, + "loss": 0.7048, + "step": 9534 + }, + { + "epoch": 0.84, + "grad_norm": 7.300818463457132, + "learning_rate": 6.772402656309413e-07, + "loss": 0.7029, + "step": 9535 + }, + { + "epoch": 0.84, + "grad_norm": 6.958569033892228, + "learning_rate": 
6.765256692355021e-07, + "loss": 0.6621, + "step": 9536 + }, + { + "epoch": 0.84, + "grad_norm": 9.128899230696415, + "learning_rate": 6.758114226884138e-07, + "loss": 0.581, + "step": 9537 + }, + { + "epoch": 0.84, + "grad_norm": 11.853064918615695, + "learning_rate": 6.750975260474718e-07, + "loss": 0.7677, + "step": 9538 + }, + { + "epoch": 0.84, + "grad_norm": 13.418743363980537, + "learning_rate": 6.743839793704443e-07, + "loss": 0.7283, + "step": 9539 + }, + { + "epoch": 0.84, + "grad_norm": 7.9389030363260975, + "learning_rate": 6.736707827150701e-07, + "loss": 0.702, + "step": 9540 + }, + { + "epoch": 0.84, + "grad_norm": 13.94705560963349, + "learning_rate": 6.729579361390598e-07, + "loss": 0.6515, + "step": 9541 + }, + { + "epoch": 0.84, + "grad_norm": 8.522406631535652, + "learning_rate": 6.72245439700096e-07, + "loss": 0.6742, + "step": 9542 + }, + { + "epoch": 0.84, + "grad_norm": 9.270049836233534, + "learning_rate": 6.715332934558311e-07, + "loss": 0.7381, + "step": 9543 + }, + { + "epoch": 0.84, + "grad_norm": 9.388660510333814, + "learning_rate": 6.708214974638927e-07, + "loss": 0.6634, + "step": 9544 + }, + { + "epoch": 0.84, + "grad_norm": 11.946589594673755, + "learning_rate": 6.701100517818776e-07, + "loss": 0.8038, + "step": 9545 + }, + { + "epoch": 0.84, + "grad_norm": 17.369012743817827, + "learning_rate": 6.693989564673548e-07, + "loss": 0.7047, + "step": 9546 + }, + { + "epoch": 0.84, + "grad_norm": 8.147307104926439, + "learning_rate": 6.686882115778653e-07, + "loss": 0.728, + "step": 9547 + }, + { + "epoch": 0.84, + "grad_norm": 2.4264504919417056, + "learning_rate": 6.679778171709206e-07, + "loss": 0.4638, + "step": 9548 + }, + { + "epoch": 0.84, + "grad_norm": 12.041547159866631, + "learning_rate": 6.67267773304004e-07, + "loss": 0.7087, + "step": 9549 + }, + { + "epoch": 0.84, + "grad_norm": 2.3657158594264316, + "learning_rate": 6.665580800345728e-07, + "loss": 0.4756, + "step": 9550 + }, + { + "epoch": 0.84, + "grad_norm": 9.42118149333652, + "learning_rate": 6.658487374200534e-07, + "loss": 0.6332, + "step": 9551 + }, + { + "epoch": 0.84, + "grad_norm": 7.494149408035652, + "learning_rate": 6.651397455178449e-07, + "loss": 0.7691, + "step": 9552 + }, + { + "epoch": 0.84, + "grad_norm": 8.80188054813868, + "learning_rate": 6.644311043853179e-07, + "loss": 0.9665, + "step": 9553 + }, + { + "epoch": 0.84, + "grad_norm": 10.258891505494601, + "learning_rate": 6.637228140798135e-07, + "loss": 0.659, + "step": 9554 + }, + { + "epoch": 0.84, + "grad_norm": 7.552383488827012, + "learning_rate": 6.630148746586451e-07, + "loss": 0.7615, + "step": 9555 + }, + { + "epoch": 0.84, + "grad_norm": 15.457587953171089, + "learning_rate": 6.623072861791002e-07, + "loss": 0.6445, + "step": 9556 + }, + { + "epoch": 0.84, + "grad_norm": 20.121391650737152, + "learning_rate": 6.616000486984342e-07, + "loss": 0.573, + "step": 9557 + }, + { + "epoch": 0.84, + "grad_norm": 6.197180231231266, + "learning_rate": 6.608931622738757e-07, + "loss": 0.7649, + "step": 9558 + }, + { + "epoch": 0.84, + "grad_norm": 12.326670195481835, + "learning_rate": 6.601866269626245e-07, + "loss": 0.7736, + "step": 9559 + }, + { + "epoch": 0.84, + "grad_norm": 11.957752055676073, + "learning_rate": 6.594804428218527e-07, + "loss": 0.7469, + "step": 9560 + }, + { + "epoch": 0.84, + "grad_norm": 22.788850253956877, + "learning_rate": 6.587746099087022e-07, + "loss": 0.7008, + "step": 9561 + }, + { + "epoch": 0.84, + "grad_norm": 2.390588336526356, + "learning_rate": 6.580691282802903e-07, + "loss": 
0.4471, + "step": 9562 + }, + { + "epoch": 0.84, + "grad_norm": 6.5344411873595964, + "learning_rate": 6.573639979937019e-07, + "loss": 0.7988, + "step": 9563 + }, + { + "epoch": 0.84, + "grad_norm": 6.484771503410733, + "learning_rate": 6.56659219105995e-07, + "loss": 0.746, + "step": 9564 + }, + { + "epoch": 0.84, + "grad_norm": 10.373052723549524, + "learning_rate": 6.559547916741999e-07, + "loss": 0.672, + "step": 9565 + }, + { + "epoch": 0.84, + "grad_norm": 7.107510223384763, + "learning_rate": 6.552507157553156e-07, + "loss": 0.7849, + "step": 9566 + }, + { + "epoch": 0.84, + "grad_norm": 10.55868611253363, + "learning_rate": 6.545469914063174e-07, + "loss": 0.8138, + "step": 9567 + }, + { + "epoch": 0.84, + "grad_norm": 5.780575916329172, + "learning_rate": 6.538436186841484e-07, + "loss": 0.6512, + "step": 9568 + }, + { + "epoch": 0.84, + "grad_norm": 8.471798024453975, + "learning_rate": 6.531405976457245e-07, + "loss": 0.6291, + "step": 9569 + }, + { + "epoch": 0.84, + "grad_norm": 8.067223315767896, + "learning_rate": 6.524379283479326e-07, + "loss": 0.6087, + "step": 9570 + }, + { + "epoch": 0.84, + "grad_norm": 9.46045017980365, + "learning_rate": 6.517356108476314e-07, + "loss": 0.669, + "step": 9571 + }, + { + "epoch": 0.84, + "grad_norm": 12.407284403929326, + "learning_rate": 6.510336452016514e-07, + "loss": 0.6719, + "step": 9572 + }, + { + "epoch": 0.84, + "grad_norm": 8.280041335372257, + "learning_rate": 6.503320314667949e-07, + "loss": 0.8179, + "step": 9573 + }, + { + "epoch": 0.84, + "grad_norm": 7.170816352074637, + "learning_rate": 6.496307696998355e-07, + "loss": 0.6552, + "step": 9574 + }, + { + "epoch": 0.84, + "grad_norm": 6.238007940527223, + "learning_rate": 6.489298599575177e-07, + "loss": 0.6777, + "step": 9575 + }, + { + "epoch": 0.84, + "grad_norm": 10.566994174913665, + "learning_rate": 6.48229302296558e-07, + "loss": 0.8302, + "step": 9576 + }, + { + "epoch": 0.84, + "grad_norm": 9.485424580916915, + "learning_rate": 6.475290967736448e-07, + "loss": 0.6315, + "step": 9577 + }, + { + "epoch": 0.84, + "grad_norm": 13.55378269464555, + "learning_rate": 6.468292434454354e-07, + "loss": 0.6968, + "step": 9578 + }, + { + "epoch": 0.84, + "grad_norm": 6.7636317930535865, + "learning_rate": 6.461297423685637e-07, + "loss": 0.664, + "step": 9579 + }, + { + "epoch": 0.84, + "grad_norm": 9.75361134885829, + "learning_rate": 6.454305935996314e-07, + "loss": 0.7216, + "step": 9580 + }, + { + "epoch": 0.84, + "grad_norm": 16.946637072724652, + "learning_rate": 6.447317971952116e-07, + "loss": 0.684, + "step": 9581 + }, + { + "epoch": 0.84, + "grad_norm": 7.893540107865847, + "learning_rate": 6.440333532118503e-07, + "loss": 0.7564, + "step": 9582 + }, + { + "epoch": 0.84, + "grad_norm": 7.026050949140311, + "learning_rate": 6.433352617060629e-07, + "loss": 0.6831, + "step": 9583 + }, + { + "epoch": 0.84, + "grad_norm": 8.857611687132179, + "learning_rate": 6.426375227343407e-07, + "loss": 0.6281, + "step": 9584 + }, + { + "epoch": 0.84, + "grad_norm": 5.601117649873591, + "learning_rate": 6.419401363531419e-07, + "loss": 0.55, + "step": 9585 + }, + { + "epoch": 0.84, + "grad_norm": 10.475372604389735, + "learning_rate": 6.412431026188975e-07, + "loss": 0.7453, + "step": 9586 + }, + { + "epoch": 0.84, + "grad_norm": 3.6833219493153204, + "learning_rate": 6.405464215880114e-07, + "loss": 0.5208, + "step": 9587 + }, + { + "epoch": 0.84, + "grad_norm": 7.933130487539849, + "learning_rate": 6.398500933168573e-07, + "loss": 0.6785, + "step": 9588 + }, + { + "epoch": 
0.84, + "grad_norm": 9.362122557240987, + "learning_rate": 6.391541178617794e-07, + "loss": 0.677, + "step": 9589 + }, + { + "epoch": 0.84, + "grad_norm": 12.943564899868798, + "learning_rate": 6.384584952790979e-07, + "loss": 0.7159, + "step": 9590 + }, + { + "epoch": 0.84, + "grad_norm": 8.58461110783394, + "learning_rate": 6.377632256250999e-07, + "loss": 0.6809, + "step": 9591 + }, + { + "epoch": 0.84, + "grad_norm": 7.711161548834147, + "learning_rate": 6.370683089560453e-07, + "loss": 0.7547, + "step": 9592 + }, + { + "epoch": 0.84, + "grad_norm": 10.123991356933997, + "learning_rate": 6.36373745328166e-07, + "loss": 0.7227, + "step": 9593 + }, + { + "epoch": 0.84, + "grad_norm": 18.324842519553684, + "learning_rate": 6.35679534797664e-07, + "loss": 0.7734, + "step": 9594 + }, + { + "epoch": 0.84, + "grad_norm": 19.973401887304764, + "learning_rate": 6.349856774207142e-07, + "loss": 0.7072, + "step": 9595 + }, + { + "epoch": 0.84, + "grad_norm": 16.434821019934557, + "learning_rate": 6.34292173253463e-07, + "loss": 0.6589, + "step": 9596 + }, + { + "epoch": 0.84, + "grad_norm": 5.81031693412713, + "learning_rate": 6.335990223520277e-07, + "loss": 0.6781, + "step": 9597 + }, + { + "epoch": 0.84, + "grad_norm": 42.98963116753113, + "learning_rate": 6.32906224772496e-07, + "loss": 0.6296, + "step": 9598 + }, + { + "epoch": 0.84, + "grad_norm": 9.480352809585703, + "learning_rate": 6.322137805709283e-07, + "loss": 0.6369, + "step": 9599 + }, + { + "epoch": 0.84, + "grad_norm": 8.490012895153063, + "learning_rate": 6.31521689803355e-07, + "loss": 0.6158, + "step": 9600 + }, + { + "epoch": 0.84, + "grad_norm": 9.688014067702102, + "learning_rate": 6.308299525257811e-07, + "loss": 0.8244, + "step": 9601 + }, + { + "epoch": 0.84, + "grad_norm": 11.26816332515487, + "learning_rate": 6.301385687941796e-07, + "loss": 0.702, + "step": 9602 + }, + { + "epoch": 0.84, + "grad_norm": 8.330580057518818, + "learning_rate": 6.294475386644966e-07, + "loss": 0.7753, + "step": 9603 + }, + { + "epoch": 0.84, + "grad_norm": 6.61987509980103, + "learning_rate": 6.287568621926482e-07, + "loss": 0.6875, + "step": 9604 + }, + { + "epoch": 0.84, + "grad_norm": 11.094679889058785, + "learning_rate": 6.280665394345237e-07, + "loss": 0.6857, + "step": 9605 + }, + { + "epoch": 0.84, + "grad_norm": 3.004574205664053, + "learning_rate": 6.273765704459811e-07, + "loss": 0.5505, + "step": 9606 + }, + { + "epoch": 0.84, + "grad_norm": 8.242726269366347, + "learning_rate": 6.266869552828541e-07, + "loss": 0.7099, + "step": 9607 + }, + { + "epoch": 0.84, + "grad_norm": 10.13443362254515, + "learning_rate": 6.259976940009444e-07, + "loss": 0.5647, + "step": 9608 + }, + { + "epoch": 0.84, + "grad_norm": 12.037319129647528, + "learning_rate": 6.253087866560254e-07, + "loss": 0.7794, + "step": 9609 + }, + { + "epoch": 0.84, + "grad_norm": 2.2087746265177404, + "learning_rate": 6.24620233303842e-07, + "loss": 0.5055, + "step": 9610 + }, + { + "epoch": 0.84, + "grad_norm": 9.067580153887382, + "learning_rate": 6.239320340001115e-07, + "loss": 0.7812, + "step": 9611 + }, + { + "epoch": 0.84, + "grad_norm": 8.250796324074107, + "learning_rate": 6.232441888005208e-07, + "loss": 0.5882, + "step": 9612 + }, + { + "epoch": 0.84, + "grad_norm": 5.313322597754515, + "learning_rate": 6.225566977607306e-07, + "loss": 0.6462, + "step": 9613 + }, + { + "epoch": 0.84, + "grad_norm": 8.554097143089603, + "learning_rate": 6.218695609363711e-07, + "loss": 0.8271, + "step": 9614 + }, + { + "epoch": 0.84, + "grad_norm": 2.9236746964734386, + 
"learning_rate": 6.211827783830443e-07, + "loss": 0.5202, + "step": 9615 + }, + { + "epoch": 0.84, + "grad_norm": 9.72350630302397, + "learning_rate": 6.204963501563232e-07, + "loss": 0.7628, + "step": 9616 + }, + { + "epoch": 0.84, + "grad_norm": 11.485026185878116, + "learning_rate": 6.198102763117525e-07, + "loss": 0.6207, + "step": 9617 + }, + { + "epoch": 0.84, + "grad_norm": 2.5454201867879602, + "learning_rate": 6.19124556904847e-07, + "loss": 0.524, + "step": 9618 + }, + { + "epoch": 0.84, + "grad_norm": 6.703984661249591, + "learning_rate": 6.184391919910965e-07, + "loss": 0.7684, + "step": 9619 + }, + { + "epoch": 0.84, + "grad_norm": 8.36113495200022, + "learning_rate": 6.177541816259574e-07, + "loss": 0.7479, + "step": 9620 + }, + { + "epoch": 0.84, + "grad_norm": 2.8135827407254417, + "learning_rate": 6.170695258648613e-07, + "loss": 0.4915, + "step": 9621 + }, + { + "epoch": 0.85, + "grad_norm": 13.197686876669929, + "learning_rate": 6.163852247632085e-07, + "loss": 0.7405, + "step": 9622 + }, + { + "epoch": 0.85, + "grad_norm": 7.585080659507356, + "learning_rate": 6.1570127837637e-07, + "loss": 0.6629, + "step": 9623 + }, + { + "epoch": 0.85, + "grad_norm": 10.280692860340267, + "learning_rate": 6.150176867596924e-07, + "loss": 0.7153, + "step": 9624 + }, + { + "epoch": 0.85, + "grad_norm": 3.1347964342748194, + "learning_rate": 6.143344499684894e-07, + "loss": 0.5239, + "step": 9625 + }, + { + "epoch": 0.85, + "grad_norm": 5.468073361343271, + "learning_rate": 6.136515680580479e-07, + "loss": 0.7076, + "step": 9626 + }, + { + "epoch": 0.85, + "grad_norm": 8.544809952585062, + "learning_rate": 6.129690410836247e-07, + "loss": 0.6421, + "step": 9627 + }, + { + "epoch": 0.85, + "grad_norm": 6.559778410871057, + "learning_rate": 6.122868691004497e-07, + "loss": 0.7037, + "step": 9628 + }, + { + "epoch": 0.85, + "grad_norm": 5.860290632511405, + "learning_rate": 6.116050521637218e-07, + "loss": 0.5552, + "step": 9629 + }, + { + "epoch": 0.85, + "grad_norm": 9.34574844827051, + "learning_rate": 6.109235903286137e-07, + "loss": 0.7071, + "step": 9630 + }, + { + "epoch": 0.85, + "grad_norm": 6.083121200833571, + "learning_rate": 6.102424836502685e-07, + "loss": 0.6027, + "step": 9631 + }, + { + "epoch": 0.85, + "grad_norm": 8.424545179091039, + "learning_rate": 6.095617321837988e-07, + "loss": 0.7754, + "step": 9632 + }, + { + "epoch": 0.85, + "grad_norm": 5.09981690475628, + "learning_rate": 6.08881335984291e-07, + "loss": 0.5982, + "step": 9633 + }, + { + "epoch": 0.85, + "grad_norm": 12.970066093609942, + "learning_rate": 6.082012951068017e-07, + "loss": 0.6887, + "step": 9634 + }, + { + "epoch": 0.85, + "grad_norm": 29.71760074622384, + "learning_rate": 6.075216096063563e-07, + "loss": 0.6875, + "step": 9635 + }, + { + "epoch": 0.85, + "grad_norm": 8.455059921603182, + "learning_rate": 6.068422795379575e-07, + "loss": 0.8223, + "step": 9636 + }, + { + "epoch": 0.85, + "grad_norm": 10.262216845850427, + "learning_rate": 6.061633049565735e-07, + "loss": 0.7901, + "step": 9637 + }, + { + "epoch": 0.85, + "grad_norm": 8.115468551362397, + "learning_rate": 6.054846859171459e-07, + "loss": 0.8765, + "step": 9638 + }, + { + "epoch": 0.85, + "grad_norm": 12.436610241007251, + "learning_rate": 6.048064224745881e-07, + "loss": 0.7282, + "step": 9639 + }, + { + "epoch": 0.85, + "grad_norm": 8.349814091722449, + "learning_rate": 6.041285146837822e-07, + "loss": 0.6862, + "step": 9640 + }, + { + "epoch": 0.85, + "grad_norm": 8.055901488185071, + "learning_rate": 6.03450962599586e-07, + 
"loss": 0.5868, + "step": 9641 + }, + { + "epoch": 0.85, + "grad_norm": 8.190738214513534, + "learning_rate": 6.02773766276824e-07, + "loss": 0.7667, + "step": 9642 + }, + { + "epoch": 0.85, + "grad_norm": 8.1445873408532, + "learning_rate": 6.020969257702952e-07, + "loss": 0.7307, + "step": 9643 + }, + { + "epoch": 0.85, + "grad_norm": 6.597781561653013, + "learning_rate": 6.014204411347674e-07, + "loss": 0.7288, + "step": 9644 + }, + { + "epoch": 0.85, + "grad_norm": 7.778855207928751, + "learning_rate": 6.007443124249806e-07, + "loss": 0.7452, + "step": 9645 + }, + { + "epoch": 0.85, + "grad_norm": 14.153095237198993, + "learning_rate": 6.000685396956452e-07, + "loss": 0.7134, + "step": 9646 + }, + { + "epoch": 0.85, + "grad_norm": 8.089766340317512, + "learning_rate": 5.993931230014461e-07, + "loss": 0.7959, + "step": 9647 + }, + { + "epoch": 0.85, + "grad_norm": 7.106105513219367, + "learning_rate": 5.987180623970351e-07, + "loss": 0.6519, + "step": 9648 + }, + { + "epoch": 0.85, + "grad_norm": 8.65460264013132, + "learning_rate": 5.980433579370376e-07, + "loss": 0.8577, + "step": 9649 + }, + { + "epoch": 0.85, + "grad_norm": 9.391841594087918, + "learning_rate": 5.973690096760487e-07, + "loss": 0.8566, + "step": 9650 + }, + { + "epoch": 0.85, + "grad_norm": 9.467609087953246, + "learning_rate": 5.96695017668637e-07, + "loss": 0.6781, + "step": 9651 + }, + { + "epoch": 0.85, + "grad_norm": 9.554484769081709, + "learning_rate": 5.960213819693383e-07, + "loss": 0.883, + "step": 9652 + }, + { + "epoch": 0.85, + "grad_norm": 9.364734764339678, + "learning_rate": 5.953481026326646e-07, + "loss": 0.6703, + "step": 9653 + }, + { + "epoch": 0.85, + "grad_norm": 2.208870542457413, + "learning_rate": 5.946751797130961e-07, + "loss": 0.4805, + "step": 9654 + }, + { + "epoch": 0.85, + "grad_norm": 7.722566753305962, + "learning_rate": 5.940026132650845e-07, + "loss": 0.8048, + "step": 9655 + }, + { + "epoch": 0.85, + "grad_norm": 8.350254893064855, + "learning_rate": 5.93330403343052e-07, + "loss": 0.6908, + "step": 9656 + }, + { + "epoch": 0.85, + "grad_norm": 4.444029217697824, + "learning_rate": 5.926585500013932e-07, + "loss": 0.6403, + "step": 9657 + }, + { + "epoch": 0.85, + "grad_norm": 7.115268623847213, + "learning_rate": 5.919870532944721e-07, + "loss": 0.7289, + "step": 9658 + }, + { + "epoch": 0.85, + "grad_norm": 3.3175619328117287, + "learning_rate": 5.913159132766272e-07, + "loss": 0.479, + "step": 9659 + }, + { + "epoch": 0.85, + "grad_norm": 7.044737916175612, + "learning_rate": 5.906451300021654e-07, + "loss": 0.7656, + "step": 9660 + }, + { + "epoch": 0.85, + "grad_norm": 5.86192646845315, + "learning_rate": 5.89974703525365e-07, + "loss": 0.7313, + "step": 9661 + }, + { + "epoch": 0.85, + "grad_norm": 8.754863675979529, + "learning_rate": 5.89304633900476e-07, + "loss": 0.7267, + "step": 9662 + }, + { + "epoch": 0.85, + "grad_norm": 9.731457775364749, + "learning_rate": 5.886349211817183e-07, + "loss": 0.887, + "step": 9663 + }, + { + "epoch": 0.85, + "grad_norm": 9.387432134512258, + "learning_rate": 5.879655654232857e-07, + "loss": 0.7194, + "step": 9664 + }, + { + "epoch": 0.85, + "grad_norm": 7.359108402478368, + "learning_rate": 5.872965666793407e-07, + "loss": 0.8026, + "step": 9665 + }, + { + "epoch": 0.85, + "grad_norm": 6.334977192122494, + "learning_rate": 5.866279250040168e-07, + "loss": 0.7171, + "step": 9666 + }, + { + "epoch": 0.85, + "grad_norm": 2.714206244863842, + "learning_rate": 5.859596404514206e-07, + "loss": 0.5281, + "step": 9667 + }, + { + "epoch": 
0.85, + "grad_norm": 7.057090019556384, + "learning_rate": 5.852917130756275e-07, + "loss": 0.6865, + "step": 9668 + }, + { + "epoch": 0.85, + "grad_norm": 5.206576736192515, + "learning_rate": 5.846241429306849e-07, + "loss": 0.8268, + "step": 9669 + }, + { + "epoch": 0.85, + "grad_norm": 7.609950373857141, + "learning_rate": 5.839569300706127e-07, + "loss": 0.748, + "step": 9670 + }, + { + "epoch": 0.85, + "grad_norm": 8.234384792556735, + "learning_rate": 5.832900745494003e-07, + "loss": 0.8375, + "step": 9671 + }, + { + "epoch": 0.85, + "grad_norm": 21.578957575378393, + "learning_rate": 5.826235764210081e-07, + "loss": 0.7516, + "step": 9672 + }, + { + "epoch": 0.85, + "grad_norm": 2.98048376354057, + "learning_rate": 5.819574357393676e-07, + "loss": 0.5334, + "step": 9673 + }, + { + "epoch": 0.85, + "grad_norm": 10.922101882657264, + "learning_rate": 5.812916525583833e-07, + "loss": 0.6078, + "step": 9674 + }, + { + "epoch": 0.85, + "grad_norm": 3.4500122147631176, + "learning_rate": 5.806262269319269e-07, + "loss": 0.5766, + "step": 9675 + }, + { + "epoch": 0.85, + "grad_norm": 7.391627749710389, + "learning_rate": 5.799611589138459e-07, + "loss": 0.705, + "step": 9676 + }, + { + "epoch": 0.85, + "grad_norm": 7.814876591821448, + "learning_rate": 5.792964485579556e-07, + "loss": 0.767, + "step": 9677 + }, + { + "epoch": 0.85, + "grad_norm": 8.873868025865027, + "learning_rate": 5.78632095918043e-07, + "loss": 0.761, + "step": 9678 + }, + { + "epoch": 0.85, + "grad_norm": 8.907830325340184, + "learning_rate": 5.779681010478671e-07, + "loss": 0.6468, + "step": 9679 + }, + { + "epoch": 0.85, + "grad_norm": 6.840469940658134, + "learning_rate": 5.773044640011555e-07, + "loss": 0.6819, + "step": 9680 + }, + { + "epoch": 0.85, + "grad_norm": 9.850879684836517, + "learning_rate": 5.766411848316111e-07, + "loss": 0.6841, + "step": 9681 + }, + { + "epoch": 0.85, + "grad_norm": 10.492458426961717, + "learning_rate": 5.75978263592904e-07, + "loss": 0.7676, + "step": 9682 + }, + { + "epoch": 0.85, + "grad_norm": 10.102705768989694, + "learning_rate": 5.753157003386767e-07, + "loss": 0.8268, + "step": 9683 + }, + { + "epoch": 0.85, + "grad_norm": 5.679462058790516, + "learning_rate": 5.74653495122543e-07, + "loss": 0.6839, + "step": 9684 + }, + { + "epoch": 0.85, + "grad_norm": 7.793881119247856, + "learning_rate": 5.739916479980873e-07, + "loss": 0.7483, + "step": 9685 + }, + { + "epoch": 0.85, + "grad_norm": 8.39220573848827, + "learning_rate": 5.73330159018864e-07, + "loss": 0.6795, + "step": 9686 + }, + { + "epoch": 0.85, + "grad_norm": 7.849682184739373, + "learning_rate": 5.726690282384023e-07, + "loss": 0.6823, + "step": 9687 + }, + { + "epoch": 0.85, + "grad_norm": 9.6868139436996, + "learning_rate": 5.72008255710198e-07, + "loss": 0.79, + "step": 9688 + }, + { + "epoch": 0.85, + "grad_norm": 9.295863018492144, + "learning_rate": 5.713478414877199e-07, + "loss": 0.6882, + "step": 9689 + }, + { + "epoch": 0.85, + "grad_norm": 5.913542890504663, + "learning_rate": 5.706877856244081e-07, + "loss": 0.7065, + "step": 9690 + }, + { + "epoch": 0.85, + "grad_norm": 7.036126410066496, + "learning_rate": 5.700280881736725e-07, + "loss": 0.832, + "step": 9691 + }, + { + "epoch": 0.85, + "grad_norm": 8.033844754318766, + "learning_rate": 5.693687491888944e-07, + "loss": 0.8147, + "step": 9692 + }, + { + "epoch": 0.85, + "grad_norm": 6.657780445929791, + "learning_rate": 5.687097687234283e-07, + "loss": 0.6131, + "step": 9693 + }, + { + "epoch": 0.85, + "grad_norm": 31.2290615066493, + 
"learning_rate": 5.680511468305966e-07, + "loss": 0.811, + "step": 9694 + }, + { + "epoch": 0.85, + "grad_norm": 5.68899710055598, + "learning_rate": 5.673928835636938e-07, + "loss": 0.6594, + "step": 9695 + }, + { + "epoch": 0.85, + "grad_norm": 9.656988444027489, + "learning_rate": 5.667349789759852e-07, + "loss": 0.8966, + "step": 9696 + }, + { + "epoch": 0.85, + "grad_norm": 10.33271723446793, + "learning_rate": 5.660774331207086e-07, + "loss": 0.8662, + "step": 9697 + }, + { + "epoch": 0.85, + "grad_norm": 14.029069645118675, + "learning_rate": 5.654202460510689e-07, + "loss": 0.8737, + "step": 9698 + }, + { + "epoch": 0.85, + "grad_norm": 10.701000053648542, + "learning_rate": 5.647634178202477e-07, + "loss": 0.6657, + "step": 9699 + }, + { + "epoch": 0.85, + "grad_norm": 5.754838205083736, + "learning_rate": 5.641069484813932e-07, + "loss": 0.7753, + "step": 9700 + }, + { + "epoch": 0.85, + "grad_norm": 9.62502538691945, + "learning_rate": 5.634508380876252e-07, + "loss": 0.6666, + "step": 9701 + }, + { + "epoch": 0.85, + "grad_norm": 78.91903711238572, + "learning_rate": 5.627950866920362e-07, + "loss": 0.7855, + "step": 9702 + }, + { + "epoch": 0.85, + "grad_norm": 13.841107932591106, + "learning_rate": 5.621396943476865e-07, + "loss": 0.8022, + "step": 9703 + }, + { + "epoch": 0.85, + "grad_norm": 16.54994086480411, + "learning_rate": 5.614846611076119e-07, + "loss": 0.7861, + "step": 9704 + }, + { + "epoch": 0.85, + "grad_norm": 7.1336839336540105, + "learning_rate": 5.608299870248151e-07, + "loss": 0.7748, + "step": 9705 + }, + { + "epoch": 0.85, + "grad_norm": 8.27808670247799, + "learning_rate": 5.601756721522716e-07, + "loss": 0.8952, + "step": 9706 + }, + { + "epoch": 0.85, + "grad_norm": 11.372785345907824, + "learning_rate": 5.595217165429279e-07, + "loss": 0.7842, + "step": 9707 + }, + { + "epoch": 0.85, + "grad_norm": 8.876072612750109, + "learning_rate": 5.588681202497004e-07, + "loss": 0.7191, + "step": 9708 + }, + { + "epoch": 0.85, + "grad_norm": 9.757158944226994, + "learning_rate": 5.582148833254758e-07, + "loss": 0.8253, + "step": 9709 + }, + { + "epoch": 0.85, + "grad_norm": 10.54336491077582, + "learning_rate": 5.575620058231157e-07, + "loss": 0.8277, + "step": 9710 + }, + { + "epoch": 0.85, + "grad_norm": 6.097361570770077, + "learning_rate": 5.569094877954484e-07, + "loss": 0.6851, + "step": 9711 + }, + { + "epoch": 0.85, + "grad_norm": 10.585202606637756, + "learning_rate": 5.562573292952744e-07, + "loss": 0.7457, + "step": 9712 + }, + { + "epoch": 0.85, + "grad_norm": 5.6429951248318195, + "learning_rate": 5.55605530375366e-07, + "loss": 0.7093, + "step": 9713 + }, + { + "epoch": 0.85, + "grad_norm": 12.674382769912192, + "learning_rate": 5.549540910884649e-07, + "loss": 0.6997, + "step": 9714 + }, + { + "epoch": 0.85, + "grad_norm": 6.695244889617863, + "learning_rate": 5.543030114872838e-07, + "loss": 0.7849, + "step": 9715 + }, + { + "epoch": 0.85, + "grad_norm": 7.0890814547323595, + "learning_rate": 5.536522916245091e-07, + "loss": 0.711, + "step": 9716 + }, + { + "epoch": 0.85, + "grad_norm": 6.3787789443694365, + "learning_rate": 5.530019315527946e-07, + "loss": 0.7534, + "step": 9717 + }, + { + "epoch": 0.85, + "grad_norm": 8.79938953816969, + "learning_rate": 5.523519313247666e-07, + "loss": 0.5959, + "step": 9718 + }, + { + "epoch": 0.85, + "grad_norm": 7.7128482749443235, + "learning_rate": 5.517022909930219e-07, + "loss": 0.602, + "step": 9719 + }, + { + "epoch": 0.85, + "grad_norm": 9.357820540515629, + "learning_rate": 5.51053010610128e-07, + 
"loss": 0.6977, + "step": 9720 + }, + { + "epoch": 0.85, + "grad_norm": 6.4101570927590625, + "learning_rate": 5.504040902286245e-07, + "loss": 0.6161, + "step": 9721 + }, + { + "epoch": 0.85, + "grad_norm": 6.083335317870826, + "learning_rate": 5.49755529901021e-07, + "loss": 0.6637, + "step": 9722 + }, + { + "epoch": 0.85, + "grad_norm": 2.836791268542405, + "learning_rate": 5.491073296797972e-07, + "loss": 0.4883, + "step": 9723 + }, + { + "epoch": 0.85, + "grad_norm": 8.153886354571409, + "learning_rate": 5.484594896174045e-07, + "loss": 0.881, + "step": 9724 + }, + { + "epoch": 0.85, + "grad_norm": 6.887338158461261, + "learning_rate": 5.478120097662654e-07, + "loss": 0.6352, + "step": 9725 + }, + { + "epoch": 0.85, + "grad_norm": 9.751541249783397, + "learning_rate": 5.471648901787713e-07, + "loss": 0.6344, + "step": 9726 + }, + { + "epoch": 0.85, + "grad_norm": 12.954250535749726, + "learning_rate": 5.465181309072886e-07, + "loss": 0.7525, + "step": 9727 + }, + { + "epoch": 0.85, + "grad_norm": 7.482539487146004, + "learning_rate": 5.458717320041507e-07, + "loss": 0.7546, + "step": 9728 + }, + { + "epoch": 0.85, + "grad_norm": 10.574947650022132, + "learning_rate": 5.452256935216633e-07, + "loss": 0.7624, + "step": 9729 + }, + { + "epoch": 0.85, + "grad_norm": 10.200550146745396, + "learning_rate": 5.445800155121022e-07, + "loss": 0.7471, + "step": 9730 + }, + { + "epoch": 0.85, + "grad_norm": 8.638537986432201, + "learning_rate": 5.439346980277155e-07, + "loss": 0.747, + "step": 9731 + }, + { + "epoch": 0.85, + "grad_norm": 2.4362897600749434, + "learning_rate": 5.432897411207195e-07, + "loss": 0.4686, + "step": 9732 + }, + { + "epoch": 0.85, + "grad_norm": 9.094393995980624, + "learning_rate": 5.426451448433051e-07, + "loss": 0.7358, + "step": 9733 + }, + { + "epoch": 0.85, + "grad_norm": 9.11199240975103, + "learning_rate": 5.420009092476314e-07, + "loss": 0.7316, + "step": 9734 + }, + { + "epoch": 0.85, + "grad_norm": 9.741033736520302, + "learning_rate": 5.413570343858288e-07, + "loss": 0.8332, + "step": 9735 + }, + { + "epoch": 0.86, + "grad_norm": 9.251953758458466, + "learning_rate": 5.407135203099984e-07, + "loss": 0.897, + "step": 9736 + }, + { + "epoch": 0.86, + "grad_norm": 8.106887510654186, + "learning_rate": 5.400703670722118e-07, + "loss": 0.7237, + "step": 9737 + }, + { + "epoch": 0.86, + "grad_norm": 3.3304967335784856, + "learning_rate": 5.39427574724512e-07, + "loss": 0.485, + "step": 9738 + }, + { + "epoch": 0.86, + "grad_norm": 8.892146311996779, + "learning_rate": 5.38785143318914e-07, + "loss": 0.6565, + "step": 9739 + }, + { + "epoch": 0.86, + "grad_norm": 7.3748924624032055, + "learning_rate": 5.38143072907401e-07, + "loss": 0.6859, + "step": 9740 + }, + { + "epoch": 0.86, + "grad_norm": 9.041394330044215, + "learning_rate": 5.375013635419285e-07, + "loss": 0.6458, + "step": 9741 + }, + { + "epoch": 0.86, + "grad_norm": 10.19472049591071, + "learning_rate": 5.36860015274423e-07, + "loss": 0.7162, + "step": 9742 + }, + { + "epoch": 0.86, + "grad_norm": 8.571299742700726, + "learning_rate": 5.362190281567803e-07, + "loss": 0.7304, + "step": 9743 + }, + { + "epoch": 0.86, + "grad_norm": 9.49344996118587, + "learning_rate": 5.355784022408694e-07, + "loss": 0.6707, + "step": 9744 + }, + { + "epoch": 0.86, + "grad_norm": 8.864029126260435, + "learning_rate": 5.349381375785279e-07, + "loss": 0.6492, + "step": 9745 + }, + { + "epoch": 0.86, + "grad_norm": 9.805092685529248, + "learning_rate": 5.342982342215652e-07, + "loss": 0.7169, + "step": 9746 + }, + { + 
"epoch": 0.86, + "grad_norm": 7.169842426796706, + "learning_rate": 5.336586922217607e-07, + "loss": 0.6668, + "step": 9747 + }, + { + "epoch": 0.86, + "grad_norm": 9.215133241394195, + "learning_rate": 5.330195116308662e-07, + "loss": 0.6619, + "step": 9748 + }, + { + "epoch": 0.86, + "grad_norm": 8.524295136102406, + "learning_rate": 5.323806925006008e-07, + "loss": 0.7449, + "step": 9749 + }, + { + "epoch": 0.86, + "grad_norm": 19.39129387733728, + "learning_rate": 5.317422348826595e-07, + "loss": 0.673, + "step": 9750 + }, + { + "epoch": 0.86, + "grad_norm": 7.852788939215467, + "learning_rate": 5.311041388287036e-07, + "loss": 0.7551, + "step": 9751 + }, + { + "epoch": 0.86, + "grad_norm": 6.818254658262373, + "learning_rate": 5.304664043903674e-07, + "loss": 0.7446, + "step": 9752 + }, + { + "epoch": 0.86, + "grad_norm": 10.444003908649107, + "learning_rate": 5.298290316192556e-07, + "loss": 0.5199, + "step": 9753 + }, + { + "epoch": 0.86, + "grad_norm": 15.405810398808605, + "learning_rate": 5.29192020566942e-07, + "loss": 0.6986, + "step": 9754 + }, + { + "epoch": 0.86, + "grad_norm": 13.19117868468781, + "learning_rate": 5.285553712849728e-07, + "loss": 0.7264, + "step": 9755 + }, + { + "epoch": 0.86, + "grad_norm": 4.703805828318245, + "learning_rate": 5.27919083824866e-07, + "loss": 0.6653, + "step": 9756 + }, + { + "epoch": 0.86, + "grad_norm": 11.37371264867726, + "learning_rate": 5.272831582381078e-07, + "loss": 0.6689, + "step": 9757 + }, + { + "epoch": 0.86, + "grad_norm": 5.323856728312404, + "learning_rate": 5.266475945761562e-07, + "loss": 0.647, + "step": 9758 + }, + { + "epoch": 0.86, + "grad_norm": 7.923218178458161, + "learning_rate": 5.260123928904409e-07, + "loss": 0.6951, + "step": 9759 + }, + { + "epoch": 0.86, + "grad_norm": 8.311329430571854, + "learning_rate": 5.253775532323596e-07, + "loss": 0.7655, + "step": 9760 + }, + { + "epoch": 0.86, + "grad_norm": 11.831186001908502, + "learning_rate": 5.24743075653284e-07, + "loss": 0.7408, + "step": 9761 + }, + { + "epoch": 0.86, + "grad_norm": 6.96221092928022, + "learning_rate": 5.241089602045546e-07, + "loss": 0.7102, + "step": 9762 + }, + { + "epoch": 0.86, + "grad_norm": 5.914002687838653, + "learning_rate": 5.234752069374833e-07, + "loss": 0.7985, + "step": 9763 + }, + { + "epoch": 0.86, + "grad_norm": 6.669058971408071, + "learning_rate": 5.228418159033516e-07, + "loss": 0.7395, + "step": 9764 + }, + { + "epoch": 0.86, + "grad_norm": 2.4988902378949343, + "learning_rate": 5.222087871534132e-07, + "loss": 0.4875, + "step": 9765 + }, + { + "epoch": 0.86, + "grad_norm": 19.8526291678453, + "learning_rate": 5.215761207388897e-07, + "loss": 0.6082, + "step": 9766 + }, + { + "epoch": 0.86, + "grad_norm": 26.968151928298525, + "learning_rate": 5.209438167109787e-07, + "loss": 0.7225, + "step": 9767 + }, + { + "epoch": 0.86, + "grad_norm": 5.6963004308958824, + "learning_rate": 5.20311875120843e-07, + "loss": 0.6673, + "step": 9768 + }, + { + "epoch": 0.86, + "grad_norm": 8.426896062913544, + "learning_rate": 5.19680296019619e-07, + "loss": 0.7365, + "step": 9769 + }, + { + "epoch": 0.86, + "grad_norm": 9.860326486359604, + "learning_rate": 5.190490794584129e-07, + "loss": 0.5515, + "step": 9770 + }, + { + "epoch": 0.86, + "grad_norm": 10.237493580310028, + "learning_rate": 5.184182254883014e-07, + "loss": 0.7182, + "step": 9771 + }, + { + "epoch": 0.86, + "grad_norm": 7.787787606767485, + "learning_rate": 5.177877341603316e-07, + "loss": 0.728, + "step": 9772 + }, + { + "epoch": 0.86, + "grad_norm": 
2.9548831129271775, + "learning_rate": 5.171576055255234e-07, + "loss": 0.4742, + "step": 9773 + }, + { + "epoch": 0.86, + "grad_norm": 5.982698902422188, + "learning_rate": 5.16527839634865e-07, + "loss": 0.7641, + "step": 9774 + }, + { + "epoch": 0.86, + "grad_norm": 2.3029774624223416, + "learning_rate": 5.15898436539316e-07, + "loss": 0.4756, + "step": 9775 + }, + { + "epoch": 0.86, + "grad_norm": 6.356453844827146, + "learning_rate": 5.152693962898064e-07, + "loss": 0.6697, + "step": 9776 + }, + { + "epoch": 0.86, + "grad_norm": 8.21805171922595, + "learning_rate": 5.146407189372371e-07, + "loss": 0.6698, + "step": 9777 + }, + { + "epoch": 0.86, + "grad_norm": 2.530029612570942, + "learning_rate": 5.140124045324791e-07, + "loss": 0.4723, + "step": 9778 + }, + { + "epoch": 0.86, + "grad_norm": 7.9700257525750695, + "learning_rate": 5.133844531263765e-07, + "loss": 0.6872, + "step": 9779 + }, + { + "epoch": 0.86, + "grad_norm": 7.729111105912597, + "learning_rate": 5.127568647697407e-07, + "loss": 0.8384, + "step": 9780 + }, + { + "epoch": 0.86, + "grad_norm": 5.822712000444037, + "learning_rate": 5.121296395133551e-07, + "loss": 0.7221, + "step": 9781 + }, + { + "epoch": 0.86, + "grad_norm": 3.1650066232558607, + "learning_rate": 5.11502777407974e-07, + "loss": 0.5037, + "step": 9782 + }, + { + "epoch": 0.86, + "grad_norm": 10.44698604426762, + "learning_rate": 5.108762785043209e-07, + "loss": 0.5755, + "step": 9783 + }, + { + "epoch": 0.86, + "grad_norm": 8.599074798495877, + "learning_rate": 5.102501428530931e-07, + "loss": 0.8413, + "step": 9784 + }, + { + "epoch": 0.86, + "grad_norm": 12.669665214214357, + "learning_rate": 5.096243705049553e-07, + "loss": 0.6627, + "step": 9785 + }, + { + "epoch": 0.86, + "grad_norm": 7.72043534981417, + "learning_rate": 5.08998961510544e-07, + "loss": 0.7511, + "step": 9786 + }, + { + "epoch": 0.86, + "grad_norm": 7.0920314494831915, + "learning_rate": 5.083739159204665e-07, + "loss": 0.6711, + "step": 9787 + }, + { + "epoch": 0.86, + "grad_norm": 7.61188982057318, + "learning_rate": 5.077492337853001e-07, + "loss": 0.7603, + "step": 9788 + }, + { + "epoch": 0.86, + "grad_norm": 11.982482717811608, + "learning_rate": 5.071249151555929e-07, + "loss": 0.7056, + "step": 9789 + }, + { + "epoch": 0.86, + "grad_norm": 7.979978284730719, + "learning_rate": 5.065009600818643e-07, + "loss": 0.656, + "step": 9790 + }, + { + "epoch": 0.86, + "grad_norm": 10.924440912872432, + "learning_rate": 5.05877368614604e-07, + "loss": 0.6695, + "step": 9791 + }, + { + "epoch": 0.86, + "grad_norm": 2.1770051980651592, + "learning_rate": 5.052541408042711e-07, + "loss": 0.4278, + "step": 9792 + }, + { + "epoch": 0.86, + "grad_norm": 8.039121553500186, + "learning_rate": 5.046312767012962e-07, + "loss": 0.723, + "step": 9793 + }, + { + "epoch": 0.86, + "grad_norm": 9.130532005744127, + "learning_rate": 5.040087763560814e-07, + "loss": 0.823, + "step": 9794 + }, + { + "epoch": 0.86, + "grad_norm": 5.9487600072846964, + "learning_rate": 5.033866398189963e-07, + "loss": 0.6469, + "step": 9795 + }, + { + "epoch": 0.86, + "grad_norm": 9.471258106446935, + "learning_rate": 5.027648671403857e-07, + "loss": 0.7255, + "step": 9796 + }, + { + "epoch": 0.86, + "grad_norm": 7.54076244177735, + "learning_rate": 5.021434583705615e-07, + "loss": 0.7906, + "step": 9797 + }, + { + "epoch": 0.86, + "grad_norm": 8.013135608946751, + "learning_rate": 5.015224135598063e-07, + "loss": 0.7775, + "step": 9798 + }, + { + "epoch": 0.86, + "grad_norm": 7.35597201909561, + "learning_rate": 
5.009017327583748e-07, + "loss": 0.6872, + "step": 9799 + }, + { + "epoch": 0.86, + "grad_norm": 6.6053305073816295, + "learning_rate": 5.002814160164904e-07, + "loss": 0.6962, + "step": 9800 + }, + { + "epoch": 0.86, + "grad_norm": 7.092561297725261, + "learning_rate": 4.996614633843499e-07, + "loss": 0.6141, + "step": 9801 + }, + { + "epoch": 0.86, + "grad_norm": 2.5285252928215605, + "learning_rate": 4.990418749121179e-07, + "loss": 0.5124, + "step": 9802 + }, + { + "epoch": 0.86, + "grad_norm": 14.928184213051738, + "learning_rate": 4.984226506499301e-07, + "loss": 0.7282, + "step": 9803 + }, + { + "epoch": 0.86, + "grad_norm": 7.953356825554693, + "learning_rate": 4.978037906478933e-07, + "loss": 0.7518, + "step": 9804 + }, + { + "epoch": 0.86, + "grad_norm": 7.388643129169732, + "learning_rate": 4.971852949560852e-07, + "loss": 0.6024, + "step": 9805 + }, + { + "epoch": 0.86, + "grad_norm": 8.321466909850221, + "learning_rate": 4.965671636245517e-07, + "loss": 0.7041, + "step": 9806 + }, + { + "epoch": 0.86, + "grad_norm": 15.970924095939756, + "learning_rate": 4.959493967033135e-07, + "loss": 0.7248, + "step": 9807 + }, + { + "epoch": 0.86, + "grad_norm": 5.856223167542266, + "learning_rate": 4.953319942423579e-07, + "loss": 0.5832, + "step": 9808 + }, + { + "epoch": 0.86, + "grad_norm": 8.170324435988464, + "learning_rate": 4.947149562916442e-07, + "loss": 0.7457, + "step": 9809 + }, + { + "epoch": 0.86, + "grad_norm": 5.980427722032299, + "learning_rate": 4.940982829011021e-07, + "loss": 0.7267, + "step": 9810 + }, + { + "epoch": 0.86, + "grad_norm": 7.736012882962191, + "learning_rate": 4.934819741206315e-07, + "loss": 0.6086, + "step": 9811 + }, + { + "epoch": 0.86, + "grad_norm": 13.912215188035516, + "learning_rate": 4.928660300001026e-07, + "loss": 0.6432, + "step": 9812 + }, + { + "epoch": 0.86, + "grad_norm": 2.8148702515833373, + "learning_rate": 4.922504505893583e-07, + "loss": 0.4771, + "step": 9813 + }, + { + "epoch": 0.86, + "grad_norm": 12.453472133699373, + "learning_rate": 4.916352359382093e-07, + "loss": 0.6483, + "step": 9814 + }, + { + "epoch": 0.86, + "grad_norm": 6.110754322626885, + "learning_rate": 4.91020386096438e-07, + "loss": 0.6361, + "step": 9815 + }, + { + "epoch": 0.86, + "grad_norm": 3.256218491165295, + "learning_rate": 4.904059011137962e-07, + "loss": 0.62, + "step": 9816 + }, + { + "epoch": 0.86, + "grad_norm": 11.950928830043175, + "learning_rate": 4.897917810400077e-07, + "loss": 0.7627, + "step": 9817 + }, + { + "epoch": 0.86, + "grad_norm": 6.589658877756415, + "learning_rate": 4.891780259247653e-07, + "loss": 0.8999, + "step": 9818 + }, + { + "epoch": 0.86, + "grad_norm": 9.205493750774874, + "learning_rate": 4.885646358177343e-07, + "loss": 0.7111, + "step": 9819 + }, + { + "epoch": 0.86, + "grad_norm": 6.780215431222568, + "learning_rate": 4.879516107685489e-07, + "loss": 0.6627, + "step": 9820 + }, + { + "epoch": 0.86, + "grad_norm": 8.85205461484518, + "learning_rate": 4.873389508268133e-07, + "loss": 0.6789, + "step": 9821 + }, + { + "epoch": 0.86, + "grad_norm": 11.035545348196193, + "learning_rate": 4.867266560421035e-07, + "loss": 0.8074, + "step": 9822 + }, + { + "epoch": 0.86, + "grad_norm": 10.092505239134113, + "learning_rate": 4.861147264639649e-07, + "loss": 0.6075, + "step": 9823 + }, + { + "epoch": 0.86, + "grad_norm": 9.552331274141942, + "learning_rate": 4.855031621419143e-07, + "loss": 0.6325, + "step": 9824 + }, + { + "epoch": 0.86, + "grad_norm": 11.420785853632593, + "learning_rate": 4.84891963125439e-07, + "loss": 
0.6953, + "step": 9825 + }, + { + "epoch": 0.86, + "grad_norm": 6.122012463762052, + "learning_rate": 4.842811294639948e-07, + "loss": 0.7298, + "step": 9826 + }, + { + "epoch": 0.86, + "grad_norm": 16.913208123369905, + "learning_rate": 4.836706612070108e-07, + "loss": 0.6355, + "step": 9827 + }, + { + "epoch": 0.86, + "grad_norm": 8.980583779165812, + "learning_rate": 4.830605584038839e-07, + "loss": 0.7988, + "step": 9828 + }, + { + "epoch": 0.86, + "grad_norm": 8.299767882244236, + "learning_rate": 4.824508211039824e-07, + "loss": 0.6822, + "step": 9829 + }, + { + "epoch": 0.86, + "grad_norm": 6.972263357304276, + "learning_rate": 4.81841449356647e-07, + "loss": 0.7231, + "step": 9830 + }, + { + "epoch": 0.86, + "grad_norm": 9.86344283987726, + "learning_rate": 4.812324432111853e-07, + "loss": 0.7852, + "step": 9831 + }, + { + "epoch": 0.86, + "grad_norm": 5.550228366361908, + "learning_rate": 4.80623802716878e-07, + "loss": 0.6911, + "step": 9832 + }, + { + "epoch": 0.86, + "grad_norm": 9.558215176324856, + "learning_rate": 4.800155279229751e-07, + "loss": 0.7503, + "step": 9833 + }, + { + "epoch": 0.86, + "grad_norm": 8.794061010508472, + "learning_rate": 4.794076188786973e-07, + "loss": 0.6412, + "step": 9834 + }, + { + "epoch": 0.86, + "grad_norm": 6.172939245706113, + "learning_rate": 4.788000756332339e-07, + "loss": 0.7254, + "step": 9835 + }, + { + "epoch": 0.86, + "grad_norm": 3.0152938360649504, + "learning_rate": 4.781928982357492e-07, + "loss": 0.5217, + "step": 9836 + }, + { + "epoch": 0.86, + "grad_norm": 11.965849331987505, + "learning_rate": 4.775860867353732e-07, + "loss": 0.759, + "step": 9837 + }, + { + "epoch": 0.86, + "grad_norm": 8.445489730674181, + "learning_rate": 4.769796411812088e-07, + "loss": 0.6666, + "step": 9838 + }, + { + "epoch": 0.86, + "grad_norm": 11.705233092881569, + "learning_rate": 4.763735616223275e-07, + "loss": 0.7586, + "step": 9839 + }, + { + "epoch": 0.86, + "grad_norm": 9.896988384476186, + "learning_rate": 4.7576784810777267e-07, + "loss": 0.6801, + "step": 9840 + }, + { + "epoch": 0.86, + "grad_norm": 11.098431480054634, + "learning_rate": 4.7516250068655847e-07, + "loss": 0.583, + "step": 9841 + }, + { + "epoch": 0.86, + "grad_norm": 12.59842026599163, + "learning_rate": 4.7455751940766794e-07, + "loss": 0.6803, + "step": 9842 + }, + { + "epoch": 0.86, + "grad_norm": 26.06806896066989, + "learning_rate": 4.739529043200558e-07, + "loss": 0.6604, + "step": 9843 + }, + { + "epoch": 0.86, + "grad_norm": 22.98028749329932, + "learning_rate": 4.733486554726452e-07, + "loss": 0.6308, + "step": 9844 + }, + { + "epoch": 0.86, + "grad_norm": 6.670767373951702, + "learning_rate": 4.727447729143325e-07, + "loss": 0.6621, + "step": 9845 + }, + { + "epoch": 0.86, + "grad_norm": 9.09230710829091, + "learning_rate": 4.721412566939804e-07, + "loss": 0.7592, + "step": 9846 + }, + { + "epoch": 0.86, + "grad_norm": 10.272758146589243, + "learning_rate": 4.7153810686042746e-07, + "loss": 0.728, + "step": 9847 + }, + { + "epoch": 0.86, + "grad_norm": 13.93897414501462, + "learning_rate": 4.7093532346247807e-07, + "loss": 0.6991, + "step": 9848 + }, + { + "epoch": 0.86, + "grad_norm": 12.101441983788696, + "learning_rate": 4.7033290654890875e-07, + "loss": 0.6627, + "step": 9849 + }, + { + "epoch": 0.87, + "grad_norm": 2.1748383280833905, + "learning_rate": 4.697308561684655e-07, + "loss": 0.4653, + "step": 9850 + }, + { + "epoch": 0.87, + "grad_norm": 2.3171944843366767, + "learning_rate": 4.6912917236986597e-07, + "loss": 0.4857, + "step": 9851 + }, + { + 
"epoch": 0.87, + "grad_norm": 6.285804452632416, + "learning_rate": 4.685278552017963e-07, + "loss": 0.7975, + "step": 9852 + }, + { + "epoch": 0.87, + "grad_norm": 8.024095285009784, + "learning_rate": 4.6792690471291537e-07, + "loss": 0.6836, + "step": 9853 + }, + { + "epoch": 0.87, + "grad_norm": 12.61468739858051, + "learning_rate": 4.673263209518508e-07, + "loss": 0.6593, + "step": 9854 + }, + { + "epoch": 0.87, + "grad_norm": 9.062098408773396, + "learning_rate": 4.667261039672011e-07, + "loss": 0.7148, + "step": 9855 + }, + { + "epoch": 0.87, + "grad_norm": 11.507174152974633, + "learning_rate": 4.6612625380753397e-07, + "loss": 0.6963, + "step": 9856 + }, + { + "epoch": 0.87, + "grad_norm": 6.181619749867034, + "learning_rate": 4.655267705213884e-07, + "loss": 0.6176, + "step": 9857 + }, + { + "epoch": 0.87, + "grad_norm": 18.22633453689926, + "learning_rate": 4.6492765415727335e-07, + "loss": 0.6987, + "step": 9858 + }, + { + "epoch": 0.87, + "grad_norm": 8.659653041351268, + "learning_rate": 4.643289047636695e-07, + "loss": 0.8596, + "step": 9859 + }, + { + "epoch": 0.87, + "grad_norm": 13.478331588049793, + "learning_rate": 4.6373052238902595e-07, + "loss": 0.6665, + "step": 9860 + }, + { + "epoch": 0.87, + "grad_norm": 6.143246601183669, + "learning_rate": 4.631325070817627e-07, + "loss": 0.6486, + "step": 9861 + }, + { + "epoch": 0.87, + "grad_norm": 8.504005973251713, + "learning_rate": 4.625348588902706e-07, + "loss": 0.8794, + "step": 9862 + }, + { + "epoch": 0.87, + "grad_norm": 14.700546919561079, + "learning_rate": 4.619375778629087e-07, + "loss": 0.6453, + "step": 9863 + }, + { + "epoch": 0.87, + "grad_norm": 8.971417420239481, + "learning_rate": 4.6134066404801057e-07, + "loss": 0.687, + "step": 9864 + }, + { + "epoch": 0.87, + "grad_norm": 5.425567227290899, + "learning_rate": 4.6074411749387647e-07, + "loss": 0.5625, + "step": 9865 + }, + { + "epoch": 0.87, + "grad_norm": 7.482738554165864, + "learning_rate": 4.601479382487772e-07, + "loss": 0.7401, + "step": 9866 + }, + { + "epoch": 0.87, + "grad_norm": 5.839030686304922, + "learning_rate": 4.5955212636095535e-07, + "loss": 0.6297, + "step": 9867 + }, + { + "epoch": 0.87, + "grad_norm": 6.2753135897951315, + "learning_rate": 4.5895668187862283e-07, + "loss": 0.6722, + "step": 9868 + }, + { + "epoch": 0.87, + "grad_norm": 10.013833629542072, + "learning_rate": 4.5836160484996117e-07, + "loss": 0.6615, + "step": 9869 + }, + { + "epoch": 0.87, + "grad_norm": 10.6862926090822, + "learning_rate": 4.5776689532312456e-07, + "loss": 0.7648, + "step": 9870 + }, + { + "epoch": 0.87, + "grad_norm": 25.88357544966205, + "learning_rate": 4.571725533462351e-07, + "loss": 0.4448, + "step": 9871 + }, + { + "epoch": 0.87, + "grad_norm": 9.749203742816428, + "learning_rate": 4.56578578967386e-07, + "loss": 0.6669, + "step": 9872 + }, + { + "epoch": 0.87, + "grad_norm": 2.748661282413972, + "learning_rate": 4.55984972234641e-07, + "loss": 0.4957, + "step": 9873 + }, + { + "epoch": 0.87, + "grad_norm": 5.49511836061474, + "learning_rate": 4.553917331960334e-07, + "loss": 0.6944, + "step": 9874 + }, + { + "epoch": 0.87, + "grad_norm": 2.8562548491229656, + "learning_rate": 4.547988618995658e-07, + "loss": 0.4695, + "step": 9875 + }, + { + "epoch": 0.87, + "grad_norm": 10.08520090450851, + "learning_rate": 4.5420635839321494e-07, + "loss": 0.7401, + "step": 9876 + }, + { + "epoch": 0.87, + "grad_norm": 2.231806464216396, + "learning_rate": 4.5361422272492405e-07, + "loss": 0.4882, + "step": 9877 + }, + { + "epoch": 0.87, + "grad_norm": 
11.636414748777026, + "learning_rate": 4.530224549426082e-07, + "loss": 0.7058, + "step": 9878 + }, + { + "epoch": 0.87, + "grad_norm": 2.8267801281724156, + "learning_rate": 4.524310550941513e-07, + "loss": 0.4964, + "step": 9879 + }, + { + "epoch": 0.87, + "grad_norm": 7.461939557502427, + "learning_rate": 4.5184002322740784e-07, + "loss": 0.8689, + "step": 9880 + }, + { + "epoch": 0.87, + "grad_norm": 10.883928759390301, + "learning_rate": 4.5124935939020566e-07, + "loss": 0.7623, + "step": 9881 + }, + { + "epoch": 0.87, + "grad_norm": 14.973548181104082, + "learning_rate": 4.506590636303382e-07, + "loss": 0.6439, + "step": 9882 + }, + { + "epoch": 0.87, + "grad_norm": 18.389987239427732, + "learning_rate": 4.5006913599557224e-07, + "loss": 0.6688, + "step": 9883 + }, + { + "epoch": 0.87, + "grad_norm": 2.5556518660459973, + "learning_rate": 4.494795765336435e-07, + "loss": 0.4927, + "step": 9884 + }, + { + "epoch": 0.87, + "grad_norm": 9.45486754699668, + "learning_rate": 4.488903852922577e-07, + "loss": 0.7256, + "step": 9885 + }, + { + "epoch": 0.87, + "grad_norm": 11.13221535568157, + "learning_rate": 4.4830156231909114e-07, + "loss": 0.8349, + "step": 9886 + }, + { + "epoch": 0.87, + "grad_norm": 8.729921426612469, + "learning_rate": 4.477131076617913e-07, + "loss": 0.6529, + "step": 9887 + }, + { + "epoch": 0.87, + "grad_norm": 5.145021280016165, + "learning_rate": 4.471250213679745e-07, + "loss": 0.6922, + "step": 9888 + }, + { + "epoch": 0.87, + "grad_norm": 7.843706337922591, + "learning_rate": 4.4653730348522773e-07, + "loss": 0.7006, + "step": 9889 + }, + { + "epoch": 0.87, + "grad_norm": 10.426304820522677, + "learning_rate": 4.4594995406110785e-07, + "loss": 0.6012, + "step": 9890 + }, + { + "epoch": 0.87, + "grad_norm": 15.151480559926304, + "learning_rate": 4.453629731431425e-07, + "loss": 0.7573, + "step": 9891 + }, + { + "epoch": 0.87, + "grad_norm": 2.527990013865563, + "learning_rate": 4.44776360778828e-07, + "loss": 0.504, + "step": 9892 + }, + { + "epoch": 0.87, + "grad_norm": 6.999353797836969, + "learning_rate": 4.4419011701563486e-07, + "loss": 0.9659, + "step": 9893 + }, + { + "epoch": 0.87, + "grad_norm": 16.990467625987982, + "learning_rate": 4.436042419009984e-07, + "loss": 0.8817, + "step": 9894 + }, + { + "epoch": 0.87, + "grad_norm": 2.4990949018704827, + "learning_rate": 4.430187354823279e-07, + "loss": 0.4904, + "step": 9895 + }, + { + "epoch": 0.87, + "grad_norm": 2.5152499971630684, + "learning_rate": 4.42433597807001e-07, + "loss": 0.4193, + "step": 9896 + }, + { + "epoch": 0.87, + "grad_norm": 6.020448172583634, + "learning_rate": 4.4184882892236545e-07, + "loss": 0.9255, + "step": 9897 + }, + { + "epoch": 0.87, + "grad_norm": 7.578579851259254, + "learning_rate": 4.4126442887574174e-07, + "loss": 0.8114, + "step": 9898 + }, + { + "epoch": 0.87, + "grad_norm": 8.441913649839428, + "learning_rate": 4.406803977144169e-07, + "loss": 0.7424, + "step": 9899 + }, + { + "epoch": 0.87, + "grad_norm": 7.005770546889629, + "learning_rate": 4.4009673548565057e-07, + "loss": 0.7791, + "step": 9900 + }, + { + "epoch": 0.87, + "grad_norm": 13.001581328735162, + "learning_rate": 4.395134422366715e-07, + "loss": 0.6823, + "step": 9901 + }, + { + "epoch": 0.87, + "grad_norm": 2.5485753577128856, + "learning_rate": 4.38930518014678e-07, + "loss": 0.4949, + "step": 9902 + }, + { + "epoch": 0.87, + "grad_norm": 10.208445723608673, + "learning_rate": 4.3834796286683967e-07, + "loss": 0.7897, + "step": 9903 + }, + { + "epoch": 0.87, + "grad_norm": 2.8931436197123164, + 
"learning_rate": 4.377657768402971e-07, + "loss": 0.5359, + "step": 9904 + }, + { + "epoch": 0.87, + "grad_norm": 16.495929623796115, + "learning_rate": 4.3718395998215923e-07, + "loss": 0.6761, + "step": 9905 + }, + { + "epoch": 0.87, + "grad_norm": 10.03926246718842, + "learning_rate": 4.3660251233950514e-07, + "loss": 0.6513, + "step": 9906 + }, + { + "epoch": 0.87, + "grad_norm": 10.474892571059112, + "learning_rate": 4.360214339593849e-07, + "loss": 0.7109, + "step": 9907 + }, + { + "epoch": 0.87, + "grad_norm": 9.09267683705839, + "learning_rate": 4.3544072488881874e-07, + "loss": 0.8505, + "step": 9908 + }, + { + "epoch": 0.87, + "grad_norm": 9.388642211073185, + "learning_rate": 4.3486038517479513e-07, + "loss": 0.8039, + "step": 9909 + }, + { + "epoch": 0.87, + "grad_norm": 2.611070828853342, + "learning_rate": 4.3428041486427654e-07, + "loss": 0.5035, + "step": 9910 + }, + { + "epoch": 0.87, + "grad_norm": 15.862403862973459, + "learning_rate": 4.337008140041921e-07, + "loss": 0.5909, + "step": 9911 + }, + { + "epoch": 0.87, + "grad_norm": 8.580677445000985, + "learning_rate": 4.33121582641442e-07, + "loss": 0.7085, + "step": 9912 + }, + { + "epoch": 0.87, + "grad_norm": 14.528348930341236, + "learning_rate": 4.325427208228966e-07, + "loss": 0.7747, + "step": 9913 + }, + { + "epoch": 0.87, + "grad_norm": 6.386892665245769, + "learning_rate": 4.3196422859539735e-07, + "loss": 0.651, + "step": 9914 + }, + { + "epoch": 0.87, + "grad_norm": 7.337813004033316, + "learning_rate": 4.313861060057528e-07, + "loss": 0.7133, + "step": 9915 + }, + { + "epoch": 0.87, + "grad_norm": 6.163669329944335, + "learning_rate": 4.308083531007462e-07, + "loss": 0.6959, + "step": 9916 + }, + { + "epoch": 0.87, + "grad_norm": 10.16151585514107, + "learning_rate": 4.3023096992712677e-07, + "loss": 0.6181, + "step": 9917 + }, + { + "epoch": 0.87, + "grad_norm": 9.490884672116964, + "learning_rate": 4.29653956531616e-07, + "loss": 0.587, + "step": 9918 + }, + { + "epoch": 0.87, + "grad_norm": 7.994175432617073, + "learning_rate": 4.2907731296090495e-07, + "loss": 0.6198, + "step": 9919 + }, + { + "epoch": 0.87, + "grad_norm": 17.51215498766391, + "learning_rate": 4.285010392616534e-07, + "loss": 0.6697, + "step": 9920 + }, + { + "epoch": 0.87, + "grad_norm": 8.163889217189492, + "learning_rate": 4.279251354804942e-07, + "loss": 0.6174, + "step": 9921 + }, + { + "epoch": 0.87, + "grad_norm": 6.149563404382077, + "learning_rate": 4.2734960166402773e-07, + "loss": 0.65, + "step": 9922 + }, + { + "epoch": 0.87, + "grad_norm": 8.922534807636119, + "learning_rate": 4.2677443785882566e-07, + "loss": 0.7507, + "step": 9923 + }, + { + "epoch": 0.87, + "grad_norm": 5.9976639478722715, + "learning_rate": 4.2619964411142853e-07, + "loss": 0.5519, + "step": 9924 + }, + { + "epoch": 0.87, + "grad_norm": 7.8089728817864605, + "learning_rate": 4.256252204683481e-07, + "loss": 0.7691, + "step": 9925 + }, + { + "epoch": 0.87, + "grad_norm": 12.38219135642577, + "learning_rate": 4.2505116697606487e-07, + "loss": 0.7284, + "step": 9926 + }, + { + "epoch": 0.87, + "grad_norm": 8.371017455358041, + "learning_rate": 4.2447748368103225e-07, + "loss": 0.7102, + "step": 9927 + }, + { + "epoch": 0.87, + "grad_norm": 2.7279503060781773, + "learning_rate": 4.239041706296709e-07, + "loss": 0.517, + "step": 9928 + }, + { + "epoch": 0.87, + "grad_norm": 5.8787377791253475, + "learning_rate": 4.2333122786837153e-07, + "loss": 0.678, + "step": 9929 + }, + { + "epoch": 0.87, + "grad_norm": 7.601053143335021, + "learning_rate": 
4.227586554434965e-07, + "loss": 0.8448, + "step": 9930 + }, + { + "epoch": 0.87, + "grad_norm": 9.834709692821946, + "learning_rate": 4.221864534013775e-07, + "loss": 0.57, + "step": 9931 + }, + { + "epoch": 0.87, + "grad_norm": 8.471495424509895, + "learning_rate": 4.2161462178831436e-07, + "loss": 0.7053, + "step": 9932 + }, + { + "epoch": 0.87, + "grad_norm": 6.952671013872077, + "learning_rate": 4.210431606505816e-07, + "loss": 0.6912, + "step": 9933 + }, + { + "epoch": 0.87, + "grad_norm": 7.311675903893994, + "learning_rate": 4.2047207003442003e-07, + "loss": 0.7691, + "step": 9934 + }, + { + "epoch": 0.87, + "grad_norm": 7.638896407006472, + "learning_rate": 4.199013499860399e-07, + "loss": 0.6913, + "step": 9935 + }, + { + "epoch": 0.87, + "grad_norm": 7.357067867771796, + "learning_rate": 4.193310005516249e-07, + "loss": 0.7003, + "step": 9936 + }, + { + "epoch": 0.87, + "grad_norm": 5.7072488883320345, + "learning_rate": 4.187610217773241e-07, + "loss": 0.6939, + "step": 9937 + }, + { + "epoch": 0.87, + "grad_norm": 6.950154048242517, + "learning_rate": 4.181914137092624e-07, + "loss": 0.7128, + "step": 9938 + }, + { + "epoch": 0.87, + "grad_norm": 7.40362037003597, + "learning_rate": 4.1762217639352956e-07, + "loss": 0.9024, + "step": 9939 + }, + { + "epoch": 0.87, + "grad_norm": 7.263791770961607, + "learning_rate": 4.1705330987618806e-07, + "loss": 0.777, + "step": 9940 + }, + { + "epoch": 0.87, + "grad_norm": 6.215078400451917, + "learning_rate": 4.1648481420326956e-07, + "loss": 0.6956, + "step": 9941 + }, + { + "epoch": 0.87, + "grad_norm": 8.374820941177655, + "learning_rate": 4.1591668942077557e-07, + "loss": 0.731, + "step": 9942 + }, + { + "epoch": 0.87, + "grad_norm": 16.007799076942796, + "learning_rate": 4.1534893557467646e-07, + "loss": 0.6037, + "step": 9943 + }, + { + "epoch": 0.87, + "grad_norm": 7.639591391080686, + "learning_rate": 4.147815527109167e-07, + "loss": 0.6303, + "step": 9944 + }, + { + "epoch": 0.87, + "grad_norm": 7.822911750209867, + "learning_rate": 4.142145408754061e-07, + "loss": 0.8013, + "step": 9945 + }, + { + "epoch": 0.87, + "grad_norm": 8.688737402361655, + "learning_rate": 4.136479001140276e-07, + "loss": 0.7059, + "step": 9946 + }, + { + "epoch": 0.87, + "grad_norm": 9.579788743191354, + "learning_rate": 4.130816304726315e-07, + "loss": 0.7947, + "step": 9947 + }, + { + "epoch": 0.87, + "grad_norm": 6.320944251965223, + "learning_rate": 4.1251573199703964e-07, + "loss": 0.6853, + "step": 9948 + }, + { + "epoch": 0.87, + "grad_norm": 2.9457806567025635, + "learning_rate": 4.1195020473304315e-07, + "loss": 0.5415, + "step": 9949 + }, + { + "epoch": 0.87, + "grad_norm": 6.559340926040853, + "learning_rate": 4.113850487264054e-07, + "loss": 0.6731, + "step": 9950 + }, + { + "epoch": 0.87, + "grad_norm": 10.489330588555953, + "learning_rate": 4.1082026402285647e-07, + "loss": 0.6289, + "step": 9951 + }, + { + "epoch": 0.87, + "grad_norm": 11.652170301749607, + "learning_rate": 4.102558506680987e-07, + "loss": 0.7689, + "step": 9952 + }, + { + "epoch": 0.87, + "grad_norm": 8.721230866050302, + "learning_rate": 4.0969180870780223e-07, + "loss": 0.6446, + "step": 9953 + }, + { + "epoch": 0.87, + "grad_norm": 15.50752617563307, + "learning_rate": 4.091281381876089e-07, + "loss": 0.6615, + "step": 9954 + }, + { + "epoch": 0.87, + "grad_norm": 6.780249800749842, + "learning_rate": 4.085648391531294e-07, + "loss": 0.7501, + "step": 9955 + }, + { + "epoch": 0.87, + "grad_norm": 3.4245147317872813, + "learning_rate": 4.0800191164994675e-07, + 
"loss": 0.5773, + "step": 9956 + }, + { + "epoch": 0.87, + "grad_norm": 13.08738275974049, + "learning_rate": 4.074393557236106e-07, + "loss": 0.7004, + "step": 9957 + }, + { + "epoch": 0.87, + "grad_norm": 6.107312404054119, + "learning_rate": 4.068771714196429e-07, + "loss": 0.7631, + "step": 9958 + }, + { + "epoch": 0.87, + "grad_norm": 10.60689433944994, + "learning_rate": 4.063153587835339e-07, + "loss": 0.7694, + "step": 9959 + }, + { + "epoch": 0.87, + "grad_norm": 11.839694511675447, + "learning_rate": 4.0575391786074446e-07, + "loss": 0.745, + "step": 9960 + }, + { + "epoch": 0.87, + "grad_norm": 7.364657784202583, + "learning_rate": 4.0519284869670604e-07, + "loss": 0.6496, + "step": 9961 + }, + { + "epoch": 0.87, + "grad_norm": 11.593645427324379, + "learning_rate": 4.0463215133681954e-07, + "loss": 0.6058, + "step": 9962 + }, + { + "epoch": 0.87, + "grad_norm": 2.555988316665421, + "learning_rate": 4.0407182582645533e-07, + "loss": 0.4797, + "step": 9963 + }, + { + "epoch": 0.88, + "grad_norm": 4.890717492835877, + "learning_rate": 4.0351187221095436e-07, + "loss": 0.7737, + "step": 9964 + }, + { + "epoch": 0.88, + "grad_norm": 6.685277585175096, + "learning_rate": 4.0295229053562646e-07, + "loss": 0.6698, + "step": 9965 + }, + { + "epoch": 0.88, + "grad_norm": 7.2320904922136116, + "learning_rate": 4.023930808457516e-07, + "loss": 0.6404, + "step": 9966 + }, + { + "epoch": 0.88, + "grad_norm": 7.300443715612432, + "learning_rate": 4.018342431865818e-07, + "loss": 0.8221, + "step": 9967 + }, + { + "epoch": 0.88, + "grad_norm": 9.040117303373572, + "learning_rate": 4.012757776033366e-07, + "loss": 0.6339, + "step": 9968 + }, + { + "epoch": 0.88, + "grad_norm": 9.485448850590133, + "learning_rate": 4.007176841412058e-07, + "loss": 0.8958, + "step": 9969 + }, + { + "epoch": 0.88, + "grad_norm": 5.692179321749371, + "learning_rate": 4.0015996284534897e-07, + "loss": 0.6275, + "step": 9970 + }, + { + "epoch": 0.88, + "grad_norm": 7.56784664393148, + "learning_rate": 3.9960261376089717e-07, + "loss": 0.7011, + "step": 9971 + }, + { + "epoch": 0.88, + "grad_norm": 5.491157865854683, + "learning_rate": 3.990456369329482e-07, + "loss": 0.6824, + "step": 9972 + }, + { + "epoch": 0.88, + "grad_norm": 6.169277481345289, + "learning_rate": 3.9848903240657433e-07, + "loss": 0.818, + "step": 9973 + }, + { + "epoch": 0.88, + "grad_norm": 6.171541247549411, + "learning_rate": 3.9793280022681346e-07, + "loss": 0.6276, + "step": 9974 + }, + { + "epoch": 0.88, + "grad_norm": 8.566049999860846, + "learning_rate": 3.9737694043867513e-07, + "loss": 0.7416, + "step": 9975 + }, + { + "epoch": 0.88, + "grad_norm": 11.188698694088046, + "learning_rate": 3.9682145308713894e-07, + "loss": 0.8484, + "step": 9976 + }, + { + "epoch": 0.88, + "grad_norm": 10.218201390966994, + "learning_rate": 3.9626633821715277e-07, + "loss": 0.6122, + "step": 9977 + }, + { + "epoch": 0.88, + "grad_norm": 9.424120093910805, + "learning_rate": 3.957115958736374e-07, + "loss": 0.6107, + "step": 9978 + }, + { + "epoch": 0.88, + "grad_norm": 5.722200621106396, + "learning_rate": 3.951572261014808e-07, + "loss": 0.6052, + "step": 9979 + }, + { + "epoch": 0.88, + "grad_norm": 5.804490759833662, + "learning_rate": 3.9460322894554204e-07, + "loss": 0.7268, + "step": 9980 + }, + { + "epoch": 0.88, + "grad_norm": 7.746210812074317, + "learning_rate": 3.9404960445064864e-07, + "loss": 0.5999, + "step": 9981 + }, + { + "epoch": 0.88, + "grad_norm": 11.701315939440764, + "learning_rate": 3.9349635266160036e-07, + "loss": 0.6538, + "step": 
9982 + }, + { + "epoch": 0.88, + "grad_norm": 8.248750986630421, + "learning_rate": 3.9294347362316354e-07, + "loss": 0.6797, + "step": 9983 + }, + { + "epoch": 0.88, + "grad_norm": 8.41959748977922, + "learning_rate": 3.9239096738007856e-07, + "loss": 0.7064, + "step": 9984 + }, + { + "epoch": 0.88, + "grad_norm": 7.771859283544009, + "learning_rate": 3.918388339770518e-07, + "loss": 0.7088, + "step": 9985 + }, + { + "epoch": 0.88, + "grad_norm": 8.904282344259082, + "learning_rate": 3.9128707345876096e-07, + "loss": 0.7131, + "step": 9986 + }, + { + "epoch": 0.88, + "grad_norm": 5.863113521460494, + "learning_rate": 3.9073568586985464e-07, + "loss": 0.6184, + "step": 9987 + }, + { + "epoch": 0.88, + "grad_norm": 14.153961732900012, + "learning_rate": 3.9018467125494894e-07, + "loss": 0.7432, + "step": 9988 + }, + { + "epoch": 0.88, + "grad_norm": 5.336707381258269, + "learning_rate": 3.8963402965863094e-07, + "loss": 0.6004, + "step": 9989 + }, + { + "epoch": 0.88, + "grad_norm": 8.640645949080106, + "learning_rate": 3.890837611254594e-07, + "loss": 0.6306, + "step": 9990 + }, + { + "epoch": 0.88, + "grad_norm": 26.54335427099032, + "learning_rate": 3.8853386569995985e-07, + "loss": 0.8371, + "step": 9991 + }, + { + "epoch": 0.88, + "grad_norm": 5.905755250349547, + "learning_rate": 3.8798434342662894e-07, + "loss": 0.6523, + "step": 9992 + }, + { + "epoch": 0.88, + "grad_norm": 13.363107375197051, + "learning_rate": 3.8743519434993384e-07, + "loss": 0.6726, + "step": 9993 + }, + { + "epoch": 0.88, + "grad_norm": 10.190656523433717, + "learning_rate": 3.8688641851430953e-07, + "loss": 0.7444, + "step": 9994 + }, + { + "epoch": 0.88, + "grad_norm": 7.769033565979149, + "learning_rate": 3.863380159641622e-07, + "loss": 0.656, + "step": 9995 + }, + { + "epoch": 0.88, + "grad_norm": 6.343497944491213, + "learning_rate": 3.857899867438691e-07, + "loss": 0.7392, + "step": 9996 + }, + { + "epoch": 0.88, + "grad_norm": 7.1970306656338785, + "learning_rate": 3.8524233089777484e-07, + "loss": 0.8815, + "step": 9997 + }, + { + "epoch": 0.88, + "grad_norm": 8.032852222991858, + "learning_rate": 3.84695048470195e-07, + "loss": 0.7504, + "step": 9998 + }, + { + "epoch": 0.88, + "grad_norm": 8.324498611456105, + "learning_rate": 3.8414813950541416e-07, + "loss": 0.8512, + "step": 9999 + }, + { + "epoch": 0.88, + "grad_norm": 7.327056253175744, + "learning_rate": 3.8360160404768755e-07, + "loss": 0.789, + "step": 10000 + }, + { + "epoch": 0.88, + "grad_norm": 10.864250240361931, + "learning_rate": 3.830554421412408e-07, + "loss": 0.7642, + "step": 10001 + }, + { + "epoch": 0.88, + "grad_norm": 5.164055741425719, + "learning_rate": 3.8250965383026806e-07, + "loss": 0.7705, + "step": 10002 + }, + { + "epoch": 0.88, + "grad_norm": 10.213320676748065, + "learning_rate": 3.819642391589329e-07, + "loss": 0.7406, + "step": 10003 + }, + { + "epoch": 0.88, + "grad_norm": 7.323286632988952, + "learning_rate": 3.8141919817136994e-07, + "loss": 0.7604, + "step": 10004 + }, + { + "epoch": 0.88, + "grad_norm": 5.822830829490459, + "learning_rate": 3.808745309116829e-07, + "loss": 0.6683, + "step": 10005 + }, + { + "epoch": 0.88, + "grad_norm": 5.482280742823281, + "learning_rate": 3.8033023742394426e-07, + "loss": 0.707, + "step": 10006 + }, + { + "epoch": 0.88, + "grad_norm": 17.2736905809593, + "learning_rate": 3.797863177521993e-07, + "loss": 0.7046, + "step": 10007 + }, + { + "epoch": 0.88, + "grad_norm": 8.139000340765678, + "learning_rate": 3.792427719404601e-07, + "loss": 0.731, + "step": 10008 + }, + { + 
"epoch": 0.88, + "grad_norm": 15.554105787709156, + "learning_rate": 3.7869960003270977e-07, + "loss": 0.7531, + "step": 10009 + }, + { + "epoch": 0.88, + "grad_norm": 9.012413643750742, + "learning_rate": 3.7815680207290094e-07, + "loss": 0.6213, + "step": 10010 + }, + { + "epoch": 0.88, + "grad_norm": 9.760990749842998, + "learning_rate": 3.7761437810495517e-07, + "loss": 0.8232, + "step": 10011 + }, + { + "epoch": 0.88, + "grad_norm": 14.089538931277279, + "learning_rate": 3.770723281727645e-07, + "loss": 0.7827, + "step": 10012 + }, + { + "epoch": 0.88, + "grad_norm": 10.12955199345157, + "learning_rate": 3.765306523201917e-07, + "loss": 0.6272, + "step": 10013 + }, + { + "epoch": 0.88, + "grad_norm": 18.622786072551715, + "learning_rate": 3.759893505910678e-07, + "loss": 0.4881, + "step": 10014 + }, + { + "epoch": 0.88, + "grad_norm": 6.790045329834725, + "learning_rate": 3.754484230291938e-07, + "loss": 0.6719, + "step": 10015 + }, + { + "epoch": 0.88, + "grad_norm": 4.955648461838887, + "learning_rate": 3.749078696783415e-07, + "loss": 0.6334, + "step": 10016 + }, + { + "epoch": 0.88, + "grad_norm": 8.477656915017649, + "learning_rate": 3.743676905822496e-07, + "loss": 0.681, + "step": 10017 + }, + { + "epoch": 0.88, + "grad_norm": 6.872086012070903, + "learning_rate": 3.7382788578463056e-07, + "loss": 0.7298, + "step": 10018 + }, + { + "epoch": 0.88, + "grad_norm": 6.3132528217163735, + "learning_rate": 3.7328845532916435e-07, + "loss": 0.7755, + "step": 10019 + }, + { + "epoch": 0.88, + "grad_norm": 3.1681567538458353, + "learning_rate": 3.727493992594999e-07, + "loss": 0.4879, + "step": 10020 + }, + { + "epoch": 0.88, + "grad_norm": 10.012574817855558, + "learning_rate": 3.7221071761925687e-07, + "loss": 0.5755, + "step": 10021 + }, + { + "epoch": 0.88, + "grad_norm": 11.342696507564359, + "learning_rate": 3.7167241045202474e-07, + "loss": 0.8202, + "step": 10022 + }, + { + "epoch": 0.88, + "grad_norm": 6.100138502186559, + "learning_rate": 3.7113447780136146e-07, + "loss": 0.5843, + "step": 10023 + }, + { + "epoch": 0.88, + "grad_norm": 8.933759853191336, + "learning_rate": 3.7059691971079727e-07, + "loss": 0.7042, + "step": 10024 + }, + { + "epoch": 0.88, + "grad_norm": 25.897487738607627, + "learning_rate": 3.7005973622383006e-07, + "loss": 0.6669, + "step": 10025 + }, + { + "epoch": 0.88, + "grad_norm": 6.730589115997797, + "learning_rate": 3.695229273839274e-07, + "loss": 0.6225, + "step": 10026 + }, + { + "epoch": 0.88, + "grad_norm": 7.578764117789533, + "learning_rate": 3.689864932345266e-07, + "loss": 0.7032, + "step": 10027 + }, + { + "epoch": 0.88, + "grad_norm": 14.440782999005886, + "learning_rate": 3.684504338190359e-07, + "loss": 0.6706, + "step": 10028 + }, + { + "epoch": 0.88, + "grad_norm": 6.617442545752646, + "learning_rate": 3.6791474918083104e-07, + "loss": 0.6013, + "step": 10029 + }, + { + "epoch": 0.88, + "grad_norm": 7.782219929117034, + "learning_rate": 3.6737943936326017e-07, + "loss": 0.7345, + "step": 10030 + }, + { + "epoch": 0.88, + "grad_norm": 6.933328825220593, + "learning_rate": 3.668445044096397e-07, + "loss": 0.7549, + "step": 10031 + }, + { + "epoch": 0.88, + "grad_norm": 7.332967674930606, + "learning_rate": 3.663099443632545e-07, + "loss": 0.6507, + "step": 10032 + }, + { + "epoch": 0.88, + "grad_norm": 7.463289485303232, + "learning_rate": 3.657757592673611e-07, + "loss": 0.6461, + "step": 10033 + }, + { + "epoch": 0.88, + "grad_norm": 9.15832918853818, + "learning_rate": 3.6524194916518427e-07, + "loss": 0.7353, + "step": 10034 + }, + { 
+ "epoch": 0.88, + "grad_norm": 8.590193724437896, + "learning_rate": 3.64708514099919e-07, + "loss": 0.769, + "step": 10035 + }, + { + "epoch": 0.88, + "grad_norm": 8.20332943717909, + "learning_rate": 3.6417545411473077e-07, + "loss": 0.6315, + "step": 10036 + }, + { + "epoch": 0.88, + "grad_norm": 6.875966733924914, + "learning_rate": 3.6364276925275387e-07, + "loss": 0.6779, + "step": 10037 + }, + { + "epoch": 0.88, + "grad_norm": 2.8640586328553392, + "learning_rate": 3.6311045955709166e-07, + "loss": 0.5626, + "step": 10038 + }, + { + "epoch": 0.88, + "grad_norm": 8.479709208140296, + "learning_rate": 3.6257852507081805e-07, + "loss": 0.7128, + "step": 10039 + }, + { + "epoch": 0.88, + "grad_norm": 2.6867913454749868, + "learning_rate": 3.6204696583697575e-07, + "loss": 0.4353, + "step": 10040 + }, + { + "epoch": 0.88, + "grad_norm": 9.977123045328847, + "learning_rate": 3.6151578189857873e-07, + "loss": 0.6968, + "step": 10041 + }, + { + "epoch": 0.88, + "grad_norm": 2.6730049181109377, + "learning_rate": 3.609849732986087e-07, + "loss": 0.5911, + "step": 10042 + }, + { + "epoch": 0.88, + "grad_norm": 21.430919326679046, + "learning_rate": 3.60454540080018e-07, + "loss": 0.788, + "step": 10043 + }, + { + "epoch": 0.88, + "grad_norm": 5.894014779682331, + "learning_rate": 3.5992448228572895e-07, + "loss": 0.6307, + "step": 10044 + }, + { + "epoch": 0.88, + "grad_norm": 6.652435993482048, + "learning_rate": 3.593947999586317e-07, + "loss": 0.5215, + "step": 10045 + }, + { + "epoch": 0.88, + "grad_norm": 11.111918231468566, + "learning_rate": 3.588654931415875e-07, + "loss": 0.6989, + "step": 10046 + }, + { + "epoch": 0.88, + "grad_norm": 22.85104729574058, + "learning_rate": 3.583365618774287e-07, + "loss": 0.8117, + "step": 10047 + }, + { + "epoch": 0.88, + "grad_norm": 6.958789049462743, + "learning_rate": 3.5780800620895394e-07, + "loss": 0.5226, + "step": 10048 + }, + { + "epoch": 0.88, + "grad_norm": 6.444762212755099, + "learning_rate": 3.572798261789334e-07, + "loss": 0.7231, + "step": 10049 + }, + { + "epoch": 0.88, + "grad_norm": 9.198565550849978, + "learning_rate": 3.5675202183010625e-07, + "loss": 0.7292, + "step": 10050 + }, + { + "epoch": 0.88, + "grad_norm": 10.804628780921494, + "learning_rate": 3.562245932051822e-07, + "loss": 0.7419, + "step": 10051 + }, + { + "epoch": 0.88, + "grad_norm": 20.99870559213333, + "learning_rate": 3.556975403468388e-07, + "loss": 0.7407, + "step": 10052 + }, + { + "epoch": 0.88, + "grad_norm": 6.463919719012958, + "learning_rate": 3.5517086329772576e-07, + "loss": 0.6746, + "step": 10053 + }, + { + "epoch": 0.88, + "grad_norm": 7.884058825897231, + "learning_rate": 3.5464456210046016e-07, + "loss": 0.7711, + "step": 10054 + }, + { + "epoch": 0.88, + "grad_norm": 7.065183063641823, + "learning_rate": 3.5411863679762956e-07, + "loss": 0.6191, + "step": 10055 + }, + { + "epoch": 0.88, + "grad_norm": 8.807606157052696, + "learning_rate": 3.5359308743179056e-07, + "loss": 0.7166, + "step": 10056 + }, + { + "epoch": 0.88, + "grad_norm": 15.699081158985233, + "learning_rate": 3.530679140454696e-07, + "loss": 0.8223, + "step": 10057 + }, + { + "epoch": 0.88, + "grad_norm": 6.427465236223481, + "learning_rate": 3.525431166811638e-07, + "loss": 0.7474, + "step": 10058 + }, + { + "epoch": 0.88, + "grad_norm": 9.853025540840656, + "learning_rate": 3.520186953813381e-07, + "loss": 0.7981, + "step": 10059 + }, + { + "epoch": 0.88, + "grad_norm": 9.675448416145604, + "learning_rate": 3.5149465018842863e-07, + "loss": 0.5622, + "step": 10060 + }, + { + 
"epoch": 0.88, + "grad_norm": 8.689331797673216, + "learning_rate": 3.509709811448392e-07, + "loss": 0.587, + "step": 10061 + }, + { + "epoch": 0.88, + "grad_norm": 7.276236590350457, + "learning_rate": 3.504476882929453e-07, + "loss": 0.7089, + "step": 10062 + }, + { + "epoch": 0.88, + "grad_norm": 6.873524158131042, + "learning_rate": 3.4992477167508864e-07, + "loss": 0.7987, + "step": 10063 + }, + { + "epoch": 0.88, + "grad_norm": 11.978062733862817, + "learning_rate": 3.4940223133358596e-07, + "loss": 0.6974, + "step": 10064 + }, + { + "epoch": 0.88, + "grad_norm": 7.878597081619637, + "learning_rate": 3.4888006731071844e-07, + "loss": 0.6805, + "step": 10065 + }, + { + "epoch": 0.88, + "grad_norm": 25.089053061804588, + "learning_rate": 3.483582796487395e-07, + "loss": 0.6015, + "step": 10066 + }, + { + "epoch": 0.88, + "grad_norm": 12.791883645175458, + "learning_rate": 3.4783686838987083e-07, + "loss": 0.6576, + "step": 10067 + }, + { + "epoch": 0.88, + "grad_norm": 12.597263009273517, + "learning_rate": 3.4731583357630437e-07, + "loss": 0.8268, + "step": 10068 + }, + { + "epoch": 0.88, + "grad_norm": 6.252075433159021, + "learning_rate": 3.467951752502002e-07, + "loss": 0.6403, + "step": 10069 + }, + { + "epoch": 0.88, + "grad_norm": 8.47953744638179, + "learning_rate": 3.4627489345369134e-07, + "loss": 0.6561, + "step": 10070 + }, + { + "epoch": 0.88, + "grad_norm": 6.0977105169785295, + "learning_rate": 3.4575498822887685e-07, + "loss": 0.682, + "step": 10071 + }, + { + "epoch": 0.88, + "grad_norm": 3.0556093120348966, + "learning_rate": 3.4523545961782703e-07, + "loss": 0.5003, + "step": 10072 + }, + { + "epoch": 0.88, + "grad_norm": 8.819163464858146, + "learning_rate": 3.447163076625809e-07, + "loss": 0.6281, + "step": 10073 + }, + { + "epoch": 0.88, + "grad_norm": 11.593346039231564, + "learning_rate": 3.441975324051472e-07, + "loss": 0.684, + "step": 10074 + }, + { + "epoch": 0.88, + "grad_norm": 5.996088311060018, + "learning_rate": 3.4367913388750397e-07, + "loss": 0.6747, + "step": 10075 + }, + { + "epoch": 0.88, + "grad_norm": 10.954546072966174, + "learning_rate": 3.4316111215160097e-07, + "loss": 0.8623, + "step": 10076 + }, + { + "epoch": 0.88, + "grad_norm": 8.074049149072199, + "learning_rate": 3.426434672393542e-07, + "loss": 0.6502, + "step": 10077 + }, + { + "epoch": 0.89, + "grad_norm": 8.302285393329816, + "learning_rate": 3.421261991926511e-07, + "loss": 0.6349, + "step": 10078 + }, + { + "epoch": 0.89, + "grad_norm": 9.341377047369624, + "learning_rate": 3.416093080533478e-07, + "loss": 0.8666, + "step": 10079 + }, + { + "epoch": 0.89, + "grad_norm": 5.81618561580951, + "learning_rate": 3.410927938632697e-07, + "loss": 0.6794, + "step": 10080 + }, + { + "epoch": 0.89, + "grad_norm": 2.925137642224505, + "learning_rate": 3.405766566642144e-07, + "loss": 0.4885, + "step": 10081 + }, + { + "epoch": 0.89, + "grad_norm": 5.937236086621267, + "learning_rate": 3.400608964979446e-07, + "loss": 0.7808, + "step": 10082 + }, + { + "epoch": 0.89, + "grad_norm": 6.910907339851369, + "learning_rate": 3.395455134061959e-07, + "loss": 0.6507, + "step": 10083 + }, + { + "epoch": 0.89, + "grad_norm": 4.93552427331875, + "learning_rate": 3.3903050743067213e-07, + "loss": 0.7247, + "step": 10084 + }, + { + "epoch": 0.89, + "grad_norm": 10.096330278478103, + "learning_rate": 3.3851587861304657e-07, + "loss": 0.6934, + "step": 10085 + }, + { + "epoch": 0.89, + "grad_norm": 8.864395437483065, + "learning_rate": 3.3800162699496154e-07, + "loss": 0.6538, + "step": 10086 + }, + { + 
"epoch": 0.89, + "grad_norm": 7.210952729346783, + "learning_rate": 3.374877526180309e-07, + "loss": 0.7246, + "step": 10087 + }, + { + "epoch": 0.89, + "grad_norm": 5.146562495011624, + "learning_rate": 3.3697425552383536e-07, + "loss": 0.7445, + "step": 10088 + }, + { + "epoch": 0.89, + "grad_norm": 8.863421927479413, + "learning_rate": 3.3646113575392605e-07, + "loss": 0.6438, + "step": 10089 + }, + { + "epoch": 0.89, + "grad_norm": 2.5447188688250995, + "learning_rate": 3.3594839334982475e-07, + "loss": 0.4858, + "step": 10090 + }, + { + "epoch": 0.89, + "grad_norm": 8.347602610530123, + "learning_rate": 3.3543602835302113e-07, + "loss": 0.7559, + "step": 10091 + }, + { + "epoch": 0.89, + "grad_norm": 12.048475576839989, + "learning_rate": 3.349240408049742e-07, + "loss": 0.5544, + "step": 10092 + }, + { + "epoch": 0.89, + "grad_norm": 2.642314002808744, + "learning_rate": 3.3441243074711473e-07, + "loss": 0.5003, + "step": 10093 + }, + { + "epoch": 0.89, + "grad_norm": 5.629131001424464, + "learning_rate": 3.3390119822084065e-07, + "loss": 0.6106, + "step": 10094 + }, + { + "epoch": 0.89, + "grad_norm": 3.510012033123733, + "learning_rate": 3.333903432675201e-07, + "loss": 0.5092, + "step": 10095 + }, + { + "epoch": 0.89, + "grad_norm": 2.5610154460027967, + "learning_rate": 3.3287986592848987e-07, + "loss": 0.5038, + "step": 10096 + }, + { + "epoch": 0.89, + "grad_norm": 16.88259002201013, + "learning_rate": 3.32369766245057e-07, + "loss": 0.7334, + "step": 10097 + }, + { + "epoch": 0.89, + "grad_norm": 10.360686557714091, + "learning_rate": 3.318600442584996e-07, + "loss": 0.6282, + "step": 10098 + }, + { + "epoch": 0.89, + "grad_norm": 2.4565113456048415, + "learning_rate": 3.3135070001006186e-07, + "loss": 0.489, + "step": 10099 + }, + { + "epoch": 0.89, + "grad_norm": 6.548189488869825, + "learning_rate": 3.308417335409597e-07, + "loss": 0.7949, + "step": 10100 + }, + { + "epoch": 0.89, + "grad_norm": 10.68250303440932, + "learning_rate": 3.30333144892378e-07, + "loss": 0.7161, + "step": 10101 + }, + { + "epoch": 0.89, + "grad_norm": 7.866957124690832, + "learning_rate": 3.2982493410547045e-07, + "loss": 0.7029, + "step": 10102 + }, + { + "epoch": 0.89, + "grad_norm": 2.267758046014054, + "learning_rate": 3.2931710122135973e-07, + "loss": 0.5847, + "step": 10103 + }, + { + "epoch": 0.89, + "grad_norm": 9.278621025077745, + "learning_rate": 3.2880964628114133e-07, + "loss": 0.7776, + "step": 10104 + }, + { + "epoch": 0.89, + "grad_norm": 2.606644210165436, + "learning_rate": 3.283025693258757e-07, + "loss": 0.5325, + "step": 10105 + }, + { + "epoch": 0.89, + "grad_norm": 9.245775777384813, + "learning_rate": 3.2779587039659555e-07, + "loss": 0.6307, + "step": 10106 + }, + { + "epoch": 0.89, + "grad_norm": 7.788501141055241, + "learning_rate": 3.2728954953430147e-07, + "loss": 0.739, + "step": 10107 + }, + { + "epoch": 0.89, + "grad_norm": 11.773353045327989, + "learning_rate": 3.267836067799646e-07, + "loss": 0.6941, + "step": 10108 + }, + { + "epoch": 0.89, + "grad_norm": 8.29280186553623, + "learning_rate": 3.2627804217452375e-07, + "loss": 0.7166, + "step": 10109 + }, + { + "epoch": 0.89, + "grad_norm": 7.160862654696645, + "learning_rate": 3.257728557588902e-07, + "loss": 0.7496, + "step": 10110 + }, + { + "epoch": 0.89, + "grad_norm": 13.81969093480385, + "learning_rate": 3.252680475739423e-07, + "loss": 0.6664, + "step": 10111 + }, + { + "epoch": 0.89, + "grad_norm": 7.0062588642066554, + "learning_rate": 3.2476361766052735e-07, + "loss": 0.6588, + "step": 10112 + }, + { + 
"epoch": 0.89, + "grad_norm": 7.426828341484632, + "learning_rate": 3.2425956605946386e-07, + "loss": 0.7572, + "step": 10113 + }, + { + "epoch": 0.89, + "grad_norm": 30.40731087121036, + "learning_rate": 3.237558928115392e-07, + "loss": 0.574, + "step": 10114 + }, + { + "epoch": 0.89, + "grad_norm": 7.743641867813294, + "learning_rate": 3.232525979575074e-07, + "loss": 0.7432, + "step": 10115 + }, + { + "epoch": 0.89, + "grad_norm": 7.277910786822903, + "learning_rate": 3.227496815380976e-07, + "loss": 0.7985, + "step": 10116 + }, + { + "epoch": 0.89, + "grad_norm": 6.07405678949519, + "learning_rate": 3.222471435940028e-07, + "loss": 0.6759, + "step": 10117 + }, + { + "epoch": 0.89, + "grad_norm": 13.10951750167491, + "learning_rate": 3.2174498416588817e-07, + "loss": 0.6447, + "step": 10118 + }, + { + "epoch": 0.89, + "grad_norm": 9.123669079935834, + "learning_rate": 3.2124320329438795e-07, + "loss": 0.707, + "step": 10119 + }, + { + "epoch": 0.89, + "grad_norm": 7.243071459292973, + "learning_rate": 3.207418010201041e-07, + "loss": 0.6584, + "step": 10120 + }, + { + "epoch": 0.89, + "grad_norm": 2.5319487427440697, + "learning_rate": 3.202407773836108e-07, + "loss": 0.5248, + "step": 10121 + }, + { + "epoch": 0.89, + "grad_norm": 6.617352763438745, + "learning_rate": 3.1974013242544954e-07, + "loss": 0.6903, + "step": 10122 + }, + { + "epoch": 0.89, + "grad_norm": 15.834326936477042, + "learning_rate": 3.1923986618613124e-07, + "loss": 0.7859, + "step": 10123 + }, + { + "epoch": 0.89, + "grad_norm": 15.473038167518144, + "learning_rate": 3.1873997870613736e-07, + "loss": 0.7275, + "step": 10124 + }, + { + "epoch": 0.89, + "grad_norm": 7.599636126885713, + "learning_rate": 3.182404700259173e-07, + "loss": 0.8696, + "step": 10125 + }, + { + "epoch": 0.89, + "grad_norm": 9.013953360604514, + "learning_rate": 3.1774134018588977e-07, + "loss": 0.6478, + "step": 10126 + }, + { + "epoch": 0.89, + "grad_norm": 9.36785075195153, + "learning_rate": 3.1724258922644523e-07, + "loss": 0.6939, + "step": 10127 + }, + { + "epoch": 0.89, + "grad_norm": 10.586222360497574, + "learning_rate": 3.167442171879415e-07, + "loss": 0.5647, + "step": 10128 + }, + { + "epoch": 0.89, + "grad_norm": 5.5179828511606175, + "learning_rate": 3.1624622411070504e-07, + "loss": 0.7924, + "step": 10129 + }, + { + "epoch": 0.89, + "grad_norm": 9.428534939292172, + "learning_rate": 3.157486100350332e-07, + "loss": 0.7744, + "step": 10130 + }, + { + "epoch": 0.89, + "grad_norm": 2.422409338900954, + "learning_rate": 3.1525137500119207e-07, + "loss": 0.524, + "step": 10131 + }, + { + "epoch": 0.89, + "grad_norm": 2.6552885034881126, + "learning_rate": 3.1475451904941613e-07, + "loss": 0.4777, + "step": 10132 + }, + { + "epoch": 0.89, + "grad_norm": 2.9337976063518036, + "learning_rate": 3.142580422199115e-07, + "loss": 0.4923, + "step": 10133 + }, + { + "epoch": 0.89, + "grad_norm": 7.79957053516311, + "learning_rate": 3.1376194455285225e-07, + "loss": 0.614, + "step": 10134 + }, + { + "epoch": 0.89, + "grad_norm": 6.586974994508568, + "learning_rate": 3.132662260883812e-07, + "loss": 0.6233, + "step": 10135 + }, + { + "epoch": 0.89, + "grad_norm": 9.630716723961871, + "learning_rate": 3.1277088686661127e-07, + "loss": 0.6727, + "step": 10136 + }, + { + "epoch": 0.89, + "grad_norm": 10.439601351958895, + "learning_rate": 3.122759269276232e-07, + "loss": 0.7313, + "step": 10137 + }, + { + "epoch": 0.89, + "grad_norm": 8.223543202774206, + "learning_rate": 3.117813463114705e-07, + "loss": 0.6991, + "step": 10138 + }, + { + 
"epoch": 0.89, + "grad_norm": 10.833895799869001, + "learning_rate": 3.112871450581728e-07, + "loss": 0.6412, + "step": 10139 + }, + { + "epoch": 0.89, + "grad_norm": 8.655076703253572, + "learning_rate": 3.107933232077198e-07, + "loss": 0.6776, + "step": 10140 + }, + { + "epoch": 0.89, + "grad_norm": 7.431495053471824, + "learning_rate": 3.102998808000712e-07, + "loss": 0.6271, + "step": 10141 + }, + { + "epoch": 0.89, + "grad_norm": 7.820944533714346, + "learning_rate": 3.0980681787515565e-07, + "loss": 0.683, + "step": 10142 + }, + { + "epoch": 0.89, + "grad_norm": 2.6518568133363813, + "learning_rate": 3.093141344728695e-07, + "loss": 0.4243, + "step": 10143 + }, + { + "epoch": 0.89, + "grad_norm": 7.220316777430393, + "learning_rate": 3.08821830633082e-07, + "loss": 0.692, + "step": 10144 + }, + { + "epoch": 0.89, + "grad_norm": 12.203404523777259, + "learning_rate": 3.08329906395628e-07, + "loss": 0.5826, + "step": 10145 + }, + { + "epoch": 0.89, + "grad_norm": 6.472949384420213, + "learning_rate": 3.0783836180031444e-07, + "loss": 0.805, + "step": 10146 + }, + { + "epoch": 0.89, + "grad_norm": 8.384770423927955, + "learning_rate": 3.073471968869157e-07, + "loss": 0.6425, + "step": 10147 + }, + { + "epoch": 0.89, + "grad_norm": 8.947478121725345, + "learning_rate": 3.0685641169517554e-07, + "loss": 0.6272, + "step": 10148 + }, + { + "epoch": 0.89, + "grad_norm": 7.11677021864978, + "learning_rate": 3.063660062648072e-07, + "loss": 0.6865, + "step": 10149 + }, + { + "epoch": 0.89, + "grad_norm": 12.601088361668804, + "learning_rate": 3.0587598063549495e-07, + "loss": 0.7435, + "step": 10150 + }, + { + "epoch": 0.89, + "grad_norm": 5.844898109089921, + "learning_rate": 3.053863348468905e-07, + "loss": 0.8001, + "step": 10151 + }, + { + "epoch": 0.89, + "grad_norm": 6.611822385969128, + "learning_rate": 3.0489706893861435e-07, + "loss": 0.6611, + "step": 10152 + }, + { + "epoch": 0.89, + "grad_norm": 5.283844374227724, + "learning_rate": 3.0440818295025766e-07, + "loss": 0.6618, + "step": 10153 + }, + { + "epoch": 0.89, + "grad_norm": 8.404401381598653, + "learning_rate": 3.039196769213787e-07, + "loss": 0.69, + "step": 10154 + }, + { + "epoch": 0.89, + "grad_norm": 10.96653425926643, + "learning_rate": 3.0343155089150924e-07, + "loss": 0.7902, + "step": 10155 + }, + { + "epoch": 0.89, + "grad_norm": 8.541759505183926, + "learning_rate": 3.0294380490014654e-07, + "loss": 0.6637, + "step": 10156 + }, + { + "epoch": 0.89, + "grad_norm": 6.246221554126944, + "learning_rate": 3.0245643898675793e-07, + "loss": 0.6367, + "step": 10157 + }, + { + "epoch": 0.89, + "grad_norm": 12.291004567786203, + "learning_rate": 3.0196945319078017e-07, + "loss": 0.7866, + "step": 10158 + }, + { + "epoch": 0.89, + "grad_norm": 3.496444286463535, + "learning_rate": 3.0148284755161906e-07, + "loss": 0.5037, + "step": 10159 + }, + { + "epoch": 0.89, + "grad_norm": 7.055979639463636, + "learning_rate": 3.009966221086502e-07, + "loss": 0.6947, + "step": 10160 + }, + { + "epoch": 0.89, + "grad_norm": 8.495348759840352, + "learning_rate": 3.005107769012189e-07, + "loss": 0.72, + "step": 10161 + }, + { + "epoch": 0.89, + "grad_norm": 17.37473838915695, + "learning_rate": 3.0002531196863803e-07, + "loss": 0.7275, + "step": 10162 + }, + { + "epoch": 0.89, + "grad_norm": 10.37621546214236, + "learning_rate": 2.995402273501913e-07, + "loss": 0.7643, + "step": 10163 + }, + { + "epoch": 0.89, + "grad_norm": 6.91391728130077, + "learning_rate": 2.990555230851305e-07, + "loss": 0.6581, + "step": 10164 + }, + { + "epoch": 
0.89, + "grad_norm": 7.43037207653117, + "learning_rate": 2.985711992126772e-07, + "loss": 0.7867, + "step": 10165 + }, + { + "epoch": 0.89, + "grad_norm": 9.109281042583445, + "learning_rate": 2.98087255772021e-07, + "loss": 0.7086, + "step": 10166 + }, + { + "epoch": 0.89, + "grad_norm": 6.423206109484461, + "learning_rate": 2.976036928023235e-07, + "loss": 0.6853, + "step": 10167 + }, + { + "epoch": 0.89, + "grad_norm": 7.1059619420090225, + "learning_rate": 2.971205103427138e-07, + "loss": 0.8316, + "step": 10168 + }, + { + "epoch": 0.89, + "grad_norm": 2.6526592656286687, + "learning_rate": 2.966377084322891e-07, + "loss": 0.4945, + "step": 10169 + }, + { + "epoch": 0.89, + "grad_norm": 6.3139589279823545, + "learning_rate": 2.9615528711011696e-07, + "loss": 0.7459, + "step": 10170 + }, + { + "epoch": 0.89, + "grad_norm": 8.992803502796162, + "learning_rate": 2.956732464152351e-07, + "loss": 0.6032, + "step": 10171 + }, + { + "epoch": 0.89, + "grad_norm": 7.934423508913631, + "learning_rate": 2.951915863866484e-07, + "loss": 0.8427, + "step": 10172 + }, + { + "epoch": 0.89, + "grad_norm": 10.115412795634692, + "learning_rate": 2.9471030706333284e-07, + "loss": 0.7068, + "step": 10173 + }, + { + "epoch": 0.89, + "grad_norm": 8.900211364067584, + "learning_rate": 2.942294084842323e-07, + "loss": 0.8001, + "step": 10174 + }, + { + "epoch": 0.89, + "grad_norm": 13.598876750136226, + "learning_rate": 2.9374889068826074e-07, + "loss": 0.7117, + "step": 10175 + }, + { + "epoch": 0.89, + "grad_norm": 10.771720700127187, + "learning_rate": 2.932687537143003e-07, + "loss": 0.7512, + "step": 10176 + }, + { + "epoch": 0.89, + "grad_norm": 8.432418639773603, + "learning_rate": 2.9278899760120215e-07, + "loss": 0.7459, + "step": 10177 + }, + { + "epoch": 0.89, + "grad_norm": 14.479999863578765, + "learning_rate": 2.9230962238778916e-07, + "loss": 0.6178, + "step": 10178 + }, + { + "epoch": 0.89, + "grad_norm": 7.5932524218982005, + "learning_rate": 2.9183062811285077e-07, + "loss": 0.6548, + "step": 10179 + }, + { + "epoch": 0.89, + "grad_norm": 8.728106697735637, + "learning_rate": 2.913520148151461e-07, + "loss": 0.6315, + "step": 10180 + }, + { + "epoch": 0.89, + "grad_norm": 16.292236068195077, + "learning_rate": 2.9087378253340403e-07, + "loss": 0.7307, + "step": 10181 + }, + { + "epoch": 0.89, + "grad_norm": 14.547138513688754, + "learning_rate": 2.90395931306322e-07, + "loss": 0.6792, + "step": 10182 + }, + { + "epoch": 0.89, + "grad_norm": 7.083166162388786, + "learning_rate": 2.8991846117256694e-07, + "loss": 0.7581, + "step": 10183 + }, + { + "epoch": 0.89, + "grad_norm": 4.4337489463706214, + "learning_rate": 2.8944137217077564e-07, + "loss": 0.5376, + "step": 10184 + }, + { + "epoch": 0.89, + "grad_norm": 6.59074513694634, + "learning_rate": 2.8896466433955327e-07, + "loss": 0.6235, + "step": 10185 + }, + { + "epoch": 0.89, + "grad_norm": 7.302700332323829, + "learning_rate": 2.8848833771747355e-07, + "loss": 0.6972, + "step": 10186 + }, + { + "epoch": 0.89, + "grad_norm": 25.340638881400018, + "learning_rate": 2.8801239234308e-07, + "loss": 0.817, + "step": 10187 + }, + { + "epoch": 0.89, + "grad_norm": 7.4041211353288645, + "learning_rate": 2.8753682825488627e-07, + "loss": 0.6977, + "step": 10188 + }, + { + "epoch": 0.89, + "grad_norm": 8.369509304881966, + "learning_rate": 2.870616454913727e-07, + "loss": 0.6192, + "step": 10189 + }, + { + "epoch": 0.89, + "grad_norm": 14.221571087156054, + "learning_rate": 2.865868440909919e-07, + "loss": 0.7002, + "step": 10190 + }, + { + "epoch": 
0.89, + "grad_norm": 12.082465790828515, + "learning_rate": 2.8611242409216313e-07, + "loss": 0.5656, + "step": 10191 + }, + { + "epoch": 0.9, + "grad_norm": 11.427936039794334, + "learning_rate": 2.8563838553327614e-07, + "loss": 0.8699, + "step": 10192 + }, + { + "epoch": 0.9, + "grad_norm": 2.5740915893930825, + "learning_rate": 2.851647284526887e-07, + "loss": 0.4846, + "step": 10193 + }, + { + "epoch": 0.9, + "grad_norm": 13.079950002678238, + "learning_rate": 2.846914528887285e-07, + "loss": 0.8217, + "step": 10194 + }, + { + "epoch": 0.9, + "grad_norm": 9.057080218133558, + "learning_rate": 2.842185588796925e-07, + "loss": 0.6366, + "step": 10195 + }, + { + "epoch": 0.9, + "grad_norm": 14.981855041360877, + "learning_rate": 2.83746046463847e-07, + "loss": 0.7125, + "step": 10196 + }, + { + "epoch": 0.9, + "grad_norm": 5.423955657188307, + "learning_rate": 2.832739156794262e-07, + "loss": 0.7163, + "step": 10197 + }, + { + "epoch": 0.9, + "grad_norm": 6.260001528368528, + "learning_rate": 2.828021665646341e-07, + "loss": 0.7429, + "step": 10198 + }, + { + "epoch": 0.9, + "grad_norm": 5.2876459118702215, + "learning_rate": 2.8233079915764406e-07, + "loss": 0.7682, + "step": 10199 + }, + { + "epoch": 0.9, + "grad_norm": 10.386150835457045, + "learning_rate": 2.8185981349659776e-07, + "loss": 0.721, + "step": 10200 + }, + { + "epoch": 0.9, + "grad_norm": 11.557717598711912, + "learning_rate": 2.8138920961960746e-07, + "loss": 0.6921, + "step": 10201 + }, + { + "epoch": 0.9, + "grad_norm": 8.460261215014093, + "learning_rate": 2.809189875647539e-07, + "loss": 0.7333, + "step": 10202 + }, + { + "epoch": 0.9, + "grad_norm": 8.39583322667892, + "learning_rate": 2.8044914737008545e-07, + "loss": 0.6459, + "step": 10203 + }, + { + "epoch": 0.9, + "grad_norm": 5.7589653090324395, + "learning_rate": 2.799796890736217e-07, + "loss": 0.7897, + "step": 10204 + }, + { + "epoch": 0.9, + "grad_norm": 10.016133134813538, + "learning_rate": 2.795106127133501e-07, + "loss": 0.6195, + "step": 10205 + }, + { + "epoch": 0.9, + "grad_norm": 17.176438540981003, + "learning_rate": 2.7904191832722694e-07, + "loss": 0.6241, + "step": 10206 + }, + { + "epoch": 0.9, + "grad_norm": 11.156867563003633, + "learning_rate": 2.785736059531796e-07, + "loss": 0.698, + "step": 10207 + }, + { + "epoch": 0.9, + "grad_norm": 11.615287316685025, + "learning_rate": 2.781056756291023e-07, + "loss": 0.836, + "step": 10208 + }, + { + "epoch": 0.9, + "grad_norm": 5.966086570487633, + "learning_rate": 2.776381273928597e-07, + "loss": 0.6644, + "step": 10209 + }, + { + "epoch": 0.9, + "grad_norm": 20.265202328932165, + "learning_rate": 2.771709612822837e-07, + "loss": 0.8114, + "step": 10210 + }, + { + "epoch": 0.9, + "grad_norm": 7.102771082019412, + "learning_rate": 2.767041773351781e-07, + "loss": 0.7676, + "step": 10211 + }, + { + "epoch": 0.9, + "grad_norm": 11.476121592022142, + "learning_rate": 2.762377755893131e-07, + "loss": 0.6771, + "step": 10212 + }, + { + "epoch": 0.9, + "grad_norm": 7.366847857781416, + "learning_rate": 2.7577175608243023e-07, + "loss": 0.8406, + "step": 10213 + }, + { + "epoch": 0.9, + "grad_norm": 9.29528874063108, + "learning_rate": 2.753061188522382e-07, + "loss": 0.8863, + "step": 10214 + }, + { + "epoch": 0.9, + "grad_norm": 24.186561723989882, + "learning_rate": 2.748408639364164e-07, + "loss": 0.7877, + "step": 10215 + }, + { + "epoch": 0.9, + "grad_norm": 8.201926095917521, + "learning_rate": 2.743759913726118e-07, + "loss": 0.7308, + "step": 10216 + }, + { + "epoch": 0.9, + "grad_norm": 
2.5437732060240354, + "learning_rate": 2.739115011984406e-07, + "loss": 0.5259, + "step": 10217 + }, + { + "epoch": 0.9, + "grad_norm": 7.4557888435920585, + "learning_rate": 2.7344739345148976e-07, + "loss": 0.6238, + "step": 10218 + }, + { + "epoch": 0.9, + "grad_norm": 7.232483018853214, + "learning_rate": 2.7298366816931375e-07, + "loss": 0.7544, + "step": 10219 + }, + { + "epoch": 0.9, + "grad_norm": 7.917824351782294, + "learning_rate": 2.725203253894365e-07, + "loss": 0.733, + "step": 10220 + }, + { + "epoch": 0.9, + "grad_norm": 13.995932813372116, + "learning_rate": 2.720573651493502e-07, + "loss": 0.6225, + "step": 10221 + }, + { + "epoch": 0.9, + "grad_norm": 3.792802483099117, + "learning_rate": 2.7159478748651827e-07, + "loss": 0.5297, + "step": 10222 + }, + { + "epoch": 0.9, + "grad_norm": 5.761845704865696, + "learning_rate": 2.711325924383695e-07, + "loss": 0.7531, + "step": 10223 + }, + { + "epoch": 0.9, + "grad_norm": 8.327234618383853, + "learning_rate": 2.7067078004230585e-07, + "loss": 0.8809, + "step": 10224 + }, + { + "epoch": 0.9, + "grad_norm": 9.014052011708289, + "learning_rate": 2.7020935033569616e-07, + "loss": 0.6468, + "step": 10225 + }, + { + "epoch": 0.9, + "grad_norm": 8.19175330376363, + "learning_rate": 2.6974830335587843e-07, + "loss": 0.7967, + "step": 10226 + }, + { + "epoch": 0.9, + "grad_norm": 7.56876301441583, + "learning_rate": 2.6928763914015945e-07, + "loss": 0.7726, + "step": 10227 + }, + { + "epoch": 0.9, + "grad_norm": 7.639606844462045, + "learning_rate": 2.688273577258155e-07, + "loss": 0.7237, + "step": 10228 + }, + { + "epoch": 0.9, + "grad_norm": 16.53555854203909, + "learning_rate": 2.6836745915009123e-07, + "loss": 0.7111, + "step": 10229 + }, + { + "epoch": 0.9, + "grad_norm": 12.226349190543681, + "learning_rate": 2.6790794345020243e-07, + "loss": 0.5975, + "step": 10230 + }, + { + "epoch": 0.9, + "grad_norm": 9.495167523101218, + "learning_rate": 2.6744881066333104e-07, + "loss": 0.7118, + "step": 10231 + }, + { + "epoch": 0.9, + "grad_norm": 7.795836483777883, + "learning_rate": 2.669900608266296e-07, + "loss": 0.6904, + "step": 10232 + }, + { + "epoch": 0.9, + "grad_norm": 13.13339113466498, + "learning_rate": 2.6653169397722e-07, + "loss": 0.6145, + "step": 10233 + }, + { + "epoch": 0.9, + "grad_norm": 7.12209434399075, + "learning_rate": 2.6607371015219097e-07, + "loss": 0.7718, + "step": 10234 + }, + { + "epoch": 0.9, + "grad_norm": 11.01352817243986, + "learning_rate": 2.656161093886034e-07, + "loss": 0.5948, + "step": 10235 + }, + { + "epoch": 0.9, + "grad_norm": 5.389068772503965, + "learning_rate": 2.6515889172348544e-07, + "loss": 0.6127, + "step": 10236 + }, + { + "epoch": 0.9, + "grad_norm": 6.392521831929519, + "learning_rate": 2.6470205719383357e-07, + "loss": 0.6464, + "step": 10237 + }, + { + "epoch": 0.9, + "grad_norm": 8.52346856328717, + "learning_rate": 2.642456058366144e-07, + "loss": 0.514, + "step": 10238 + }, + { + "epoch": 0.9, + "grad_norm": 11.629059182549376, + "learning_rate": 2.637895376887634e-07, + "loss": 0.8529, + "step": 10239 + }, + { + "epoch": 0.9, + "grad_norm": 8.188640856556958, + "learning_rate": 2.6333385278718426e-07, + "loss": 0.7421, + "step": 10240 + }, + { + "epoch": 0.9, + "grad_norm": 9.640975741162345, + "learning_rate": 2.628785511687515e-07, + "loss": 0.6196, + "step": 10241 + }, + { + "epoch": 0.9, + "grad_norm": 2.845009589662916, + "learning_rate": 2.6242363287030617e-07, + "loss": 0.4297, + "step": 10242 + }, + { + "epoch": 0.9, + "grad_norm": 6.897255343083837, + 
"learning_rate": 2.619690979286599e-07, + "loss": 0.6184, + "step": 10243 + }, + { + "epoch": 0.9, + "grad_norm": 8.745216422395044, + "learning_rate": 2.6151494638059325e-07, + "loss": 0.8001, + "step": 10244 + }, + { + "epoch": 0.9, + "grad_norm": 6.945833832153886, + "learning_rate": 2.610611782628547e-07, + "loss": 0.6597, + "step": 10245 + }, + { + "epoch": 0.9, + "grad_norm": 8.531177430713, + "learning_rate": 2.606077936121626e-07, + "loss": 0.6622, + "step": 10246 + }, + { + "epoch": 0.9, + "grad_norm": 2.8795616972439975, + "learning_rate": 2.601547924652048e-07, + "loss": 0.5061, + "step": 10247 + }, + { + "epoch": 0.9, + "grad_norm": 5.16305588313321, + "learning_rate": 2.597021748586365e-07, + "loss": 0.6534, + "step": 10248 + }, + { + "epoch": 0.9, + "grad_norm": 2.5939539430215244, + "learning_rate": 2.592499408290833e-07, + "loss": 0.4932, + "step": 10249 + }, + { + "epoch": 0.9, + "grad_norm": 9.618980701519943, + "learning_rate": 2.587980904131393e-07, + "loss": 0.5395, + "step": 10250 + }, + { + "epoch": 0.9, + "grad_norm": 8.13887816884103, + "learning_rate": 2.5834662364736697e-07, + "loss": 0.7862, + "step": 10251 + }, + { + "epoch": 0.9, + "grad_norm": 6.493531841443051, + "learning_rate": 2.5789554056829814e-07, + "loss": 0.5685, + "step": 10252 + }, + { + "epoch": 0.9, + "grad_norm": 11.669321247915393, + "learning_rate": 2.5744484121243416e-07, + "loss": 0.4981, + "step": 10253 + }, + { + "epoch": 0.9, + "grad_norm": 6.2223635187447774, + "learning_rate": 2.569945256162454e-07, + "loss": 0.5263, + "step": 10254 + }, + { + "epoch": 0.9, + "grad_norm": 10.152529435809933, + "learning_rate": 2.565445938161698e-07, + "loss": 0.7262, + "step": 10255 + }, + { + "epoch": 0.9, + "grad_norm": 7.1534796715703575, + "learning_rate": 2.5609504584861557e-07, + "loss": 0.7237, + "step": 10256 + }, + { + "epoch": 0.9, + "grad_norm": 7.760120870315477, + "learning_rate": 2.5564588174995797e-07, + "loss": 0.6794, + "step": 10257 + }, + { + "epoch": 0.9, + "grad_norm": 6.129808404756435, + "learning_rate": 2.5519710155654467e-07, + "loss": 0.6924, + "step": 10258 + }, + { + "epoch": 0.9, + "grad_norm": 12.268190573575364, + "learning_rate": 2.5474870530468933e-07, + "loss": 0.7784, + "step": 10259 + }, + { + "epoch": 0.9, + "grad_norm": 8.925982654103281, + "learning_rate": 2.5430069303067574e-07, + "loss": 0.6761, + "step": 10260 + }, + { + "epoch": 0.9, + "grad_norm": 5.716719390211441, + "learning_rate": 2.53853064770756e-07, + "loss": 0.5827, + "step": 10261 + }, + { + "epoch": 0.9, + "grad_norm": 6.6493420851343235, + "learning_rate": 2.5340582056115114e-07, + "loss": 0.6973, + "step": 10262 + }, + { + "epoch": 0.9, + "grad_norm": 12.358830855016492, + "learning_rate": 2.5295896043805113e-07, + "loss": 0.663, + "step": 10263 + }, + { + "epoch": 0.9, + "grad_norm": 6.8054114176533265, + "learning_rate": 2.5251248443761644e-07, + "loss": 0.7562, + "step": 10264 + }, + { + "epoch": 0.9, + "grad_norm": 4.760722222640906, + "learning_rate": 2.5206639259597434e-07, + "loss": 0.692, + "step": 10265 + }, + { + "epoch": 0.9, + "grad_norm": 6.888527884531848, + "learning_rate": 2.51620684949222e-07, + "loss": 0.5921, + "step": 10266 + }, + { + "epoch": 0.9, + "grad_norm": 7.9462871987040335, + "learning_rate": 2.5117536153342557e-07, + "loss": 0.7569, + "step": 10267 + }, + { + "epoch": 0.9, + "grad_norm": 2.051520662723173, + "learning_rate": 2.5073042238461963e-07, + "loss": 0.4824, + "step": 10268 + }, + { + "epoch": 0.9, + "grad_norm": 7.656555261416299, + "learning_rate": 
2.50285867538807e-07, + "loss": 0.621, + "step": 10269 + }, + { + "epoch": 0.9, + "grad_norm": 9.92591450338821, + "learning_rate": 2.498416970319617e-07, + "loss": 0.742, + "step": 10270 + }, + { + "epoch": 0.9, + "grad_norm": 7.142304230677319, + "learning_rate": 2.4939791090002496e-07, + "loss": 0.591, + "step": 10271 + }, + { + "epoch": 0.9, + "grad_norm": 8.414524657769206, + "learning_rate": 2.489545091789075e-07, + "loss": 0.7937, + "step": 10272 + }, + { + "epoch": 0.9, + "grad_norm": 7.121085080877494, + "learning_rate": 2.485114919044879e-07, + "loss": 0.8495, + "step": 10273 + }, + { + "epoch": 0.9, + "grad_norm": 6.096764168147371, + "learning_rate": 2.4806885911261347e-07, + "loss": 0.654, + "step": 10274 + }, + { + "epoch": 0.9, + "grad_norm": 7.518136890950368, + "learning_rate": 2.47626610839104e-07, + "loss": 0.6703, + "step": 10275 + }, + { + "epoch": 0.9, + "grad_norm": 8.59728519951368, + "learning_rate": 2.471847471197436e-07, + "loss": 0.6734, + "step": 10276 + }, + { + "epoch": 0.9, + "grad_norm": 7.487640591770653, + "learning_rate": 2.467432679902876e-07, + "loss": 0.7996, + "step": 10277 + }, + { + "epoch": 0.9, + "grad_norm": 6.686933889379761, + "learning_rate": 2.4630217348645957e-07, + "loss": 0.6891, + "step": 10278 + }, + { + "epoch": 0.9, + "grad_norm": 7.672959557328759, + "learning_rate": 2.458614636439527e-07, + "loss": 0.6248, + "step": 10279 + }, + { + "epoch": 0.9, + "grad_norm": 9.186310016615645, + "learning_rate": 2.454211384984273e-07, + "loss": 0.7625, + "step": 10280 + }, + { + "epoch": 0.9, + "grad_norm": 12.771322221611127, + "learning_rate": 2.4498119808551493e-07, + "loss": 0.876, + "step": 10281 + }, + { + "epoch": 0.9, + "grad_norm": 8.04205475359824, + "learning_rate": 2.445416424408148e-07, + "loss": 0.6517, + "step": 10282 + }, + { + "epoch": 0.9, + "grad_norm": 8.30015977201916, + "learning_rate": 2.4410247159989466e-07, + "loss": 0.8282, + "step": 10283 + }, + { + "epoch": 0.9, + "grad_norm": 8.39105517998076, + "learning_rate": 2.43663685598291e-07, + "loss": 0.7189, + "step": 10284 + }, + { + "epoch": 0.9, + "grad_norm": 16.096562501001113, + "learning_rate": 2.432252844715105e-07, + "loss": 0.7535, + "step": 10285 + }, + { + "epoch": 0.9, + "grad_norm": 11.991028396588305, + "learning_rate": 2.4278726825502696e-07, + "loss": 0.6621, + "step": 10286 + }, + { + "epoch": 0.9, + "grad_norm": 10.410707097800298, + "learning_rate": 2.4234963698428484e-07, + "loss": 0.833, + "step": 10287 + }, + { + "epoch": 0.9, + "grad_norm": 5.449009199528339, + "learning_rate": 2.419123906946963e-07, + "loss": 0.665, + "step": 10288 + }, + { + "epoch": 0.9, + "grad_norm": 2.2131139897518874, + "learning_rate": 2.41475529421642e-07, + "loss": 0.4309, + "step": 10289 + }, + { + "epoch": 0.9, + "grad_norm": 9.077096779165888, + "learning_rate": 2.4103905320047306e-07, + "loss": 0.7244, + "step": 10290 + }, + { + "epoch": 0.9, + "grad_norm": 48.58447181360926, + "learning_rate": 2.406029620665079e-07, + "loss": 0.6856, + "step": 10291 + }, + { + "epoch": 0.9, + "grad_norm": 10.885094961540567, + "learning_rate": 2.4016725605503275e-07, + "loss": 0.7775, + "step": 10292 + }, + { + "epoch": 0.9, + "grad_norm": 28.16593124461696, + "learning_rate": 2.397319352013072e-07, + "loss": 0.7195, + "step": 10293 + }, + { + "epoch": 0.9, + "grad_norm": 8.586052378591317, + "learning_rate": 2.392969995405547e-07, + "loss": 0.7098, + "step": 10294 + }, + { + "epoch": 0.9, + "grad_norm": 13.733827575787478, + "learning_rate": 2.3886244910796997e-07, + "loss": 0.6673, + 
"step": 10295 + }, + { + "epoch": 0.9, + "grad_norm": 12.33143162710436, + "learning_rate": 2.3842828393871586e-07, + "loss": 0.7105, + "step": 10296 + }, + { + "epoch": 0.9, + "grad_norm": 7.077062008967772, + "learning_rate": 2.3799450406792435e-07, + "loss": 0.6576, + "step": 10297 + }, + { + "epoch": 0.9, + "grad_norm": 5.073057339579163, + "learning_rate": 2.3756110953069678e-07, + "loss": 0.707, + "step": 10298 + }, + { + "epoch": 0.9, + "grad_norm": 8.536452625323271, + "learning_rate": 2.3712810036210288e-07, + "loss": 0.6644, + "step": 10299 + }, + { + "epoch": 0.9, + "grad_norm": 7.269776813247736, + "learning_rate": 2.366954765971796e-07, + "loss": 0.7217, + "step": 10300 + }, + { + "epoch": 0.9, + "grad_norm": 14.143517215080326, + "learning_rate": 2.3626323827093566e-07, + "loss": 0.6825, + "step": 10301 + }, + { + "epoch": 0.9, + "grad_norm": 6.948941586564208, + "learning_rate": 2.3583138541834582e-07, + "loss": 0.7816, + "step": 10302 + }, + { + "epoch": 0.9, + "grad_norm": 7.176768718089735, + "learning_rate": 2.3539991807435546e-07, + "loss": 0.6059, + "step": 10303 + }, + { + "epoch": 0.9, + "grad_norm": 9.514353985812916, + "learning_rate": 2.3496883627387834e-07, + "loss": 0.7402, + "step": 10304 + }, + { + "epoch": 0.91, + "grad_norm": 6.960500253001259, + "learning_rate": 2.3453814005179708e-07, + "loss": 0.7397, + "step": 10305 + }, + { + "epoch": 0.91, + "grad_norm": 9.22279778511544, + "learning_rate": 2.3410782944296272e-07, + "loss": 0.7209, + "step": 10306 + }, + { + "epoch": 0.91, + "grad_norm": 10.41219873419728, + "learning_rate": 2.336779044821952e-07, + "loss": 0.6498, + "step": 10307 + }, + { + "epoch": 0.91, + "grad_norm": 6.33929051301109, + "learning_rate": 2.3324836520428275e-07, + "loss": 0.6918, + "step": 10308 + }, + { + "epoch": 0.91, + "grad_norm": 10.475171037136715, + "learning_rate": 2.3281921164398312e-07, + "loss": 0.767, + "step": 10309 + }, + { + "epoch": 0.91, + "grad_norm": 8.353729502941814, + "learning_rate": 2.323904438360236e-07, + "loss": 0.7189, + "step": 10310 + }, + { + "epoch": 0.91, + "grad_norm": 11.934163460988161, + "learning_rate": 2.3196206181509918e-07, + "loss": 0.623, + "step": 10311 + }, + { + "epoch": 0.91, + "grad_norm": 3.0824969426962596, + "learning_rate": 2.3153406561587322e-07, + "loss": 0.5739, + "step": 10312 + }, + { + "epoch": 0.91, + "grad_norm": 6.562954460900678, + "learning_rate": 2.3110645527297915e-07, + "loss": 0.7321, + "step": 10313 + }, + { + "epoch": 0.91, + "grad_norm": 9.505938600184825, + "learning_rate": 2.3067923082101706e-07, + "loss": 0.5668, + "step": 10314 + }, + { + "epoch": 0.91, + "grad_norm": 7.029544786528428, + "learning_rate": 2.3025239229455876e-07, + "loss": 0.6781, + "step": 10315 + }, + { + "epoch": 0.91, + "grad_norm": 9.779904489139911, + "learning_rate": 2.2982593972814327e-07, + "loss": 0.7953, + "step": 10316 + }, + { + "epoch": 0.91, + "grad_norm": 7.8744801313295705, + "learning_rate": 2.2939987315627744e-07, + "loss": 0.6942, + "step": 10317 + }, + { + "epoch": 0.91, + "grad_norm": 9.498061966296035, + "learning_rate": 2.2897419261343868e-07, + "loss": 0.6586, + "step": 10318 + }, + { + "epoch": 0.91, + "grad_norm": 5.2964805020591275, + "learning_rate": 2.285488981340722e-07, + "loss": 0.5884, + "step": 10319 + }, + { + "epoch": 0.91, + "grad_norm": 5.0915204923600195, + "learning_rate": 2.281239897525911e-07, + "loss": 0.5579, + "step": 10320 + }, + { + "epoch": 0.91, + "grad_norm": 8.424017410359221, + "learning_rate": 2.2769946750337946e-07, + "loss": 0.7628, + 
"step": 10321 + }, + { + "epoch": 0.91, + "grad_norm": 11.732538997806088, + "learning_rate": 2.2727533142078928e-07, + "loss": 0.7447, + "step": 10322 + }, + { + "epoch": 0.91, + "grad_norm": 15.29534605994493, + "learning_rate": 2.2685158153913977e-07, + "loss": 0.6037, + "step": 10323 + }, + { + "epoch": 0.91, + "grad_norm": 14.207546820061552, + "learning_rate": 2.2642821789272073e-07, + "loss": 0.723, + "step": 10324 + }, + { + "epoch": 0.91, + "grad_norm": 7.135248479110175, + "learning_rate": 2.2600524051578977e-07, + "loss": 0.8124, + "step": 10325 + }, + { + "epoch": 0.91, + "grad_norm": 10.45623531254303, + "learning_rate": 2.2558264944257335e-07, + "loss": 0.8282, + "step": 10326 + }, + { + "epoch": 0.91, + "grad_norm": 11.456210967130556, + "learning_rate": 2.251604447072675e-07, + "loss": 0.5863, + "step": 10327 + }, + { + "epoch": 0.91, + "grad_norm": 15.234838008507095, + "learning_rate": 2.24738626344036e-07, + "loss": 0.7797, + "step": 10328 + }, + { + "epoch": 0.91, + "grad_norm": 8.785839910555579, + "learning_rate": 2.243171943870115e-07, + "loss": 0.766, + "step": 10329 + }, + { + "epoch": 0.91, + "grad_norm": 12.97586298809517, + "learning_rate": 2.2389614887029564e-07, + "loss": 0.9922, + "step": 10330 + }, + { + "epoch": 0.91, + "grad_norm": 4.683775665356443, + "learning_rate": 2.234754898279584e-07, + "loss": 0.7247, + "step": 10331 + }, + { + "epoch": 0.91, + "grad_norm": 5.9963073870234345, + "learning_rate": 2.2305521729403923e-07, + "loss": 0.617, + "step": 10332 + }, + { + "epoch": 0.91, + "grad_norm": 13.265992554265631, + "learning_rate": 2.2263533130254645e-07, + "loss": 0.881, + "step": 10333 + }, + { + "epoch": 0.91, + "grad_norm": 7.101849095581412, + "learning_rate": 2.2221583188745567e-07, + "loss": 0.7807, + "step": 10334 + }, + { + "epoch": 0.91, + "grad_norm": 5.18758132522351, + "learning_rate": 2.2179671908271194e-07, + "loss": 0.7103, + "step": 10335 + }, + { + "epoch": 0.91, + "grad_norm": 5.922029060992653, + "learning_rate": 2.213779929222304e-07, + "loss": 0.7118, + "step": 10336 + }, + { + "epoch": 0.91, + "grad_norm": 11.788173525284439, + "learning_rate": 2.2095965343989169e-07, + "loss": 0.8412, + "step": 10337 + }, + { + "epoch": 0.91, + "grad_norm": 16.014796567138422, + "learning_rate": 2.2054170066954872e-07, + "loss": 0.6743, + "step": 10338 + }, + { + "epoch": 0.91, + "grad_norm": 9.10708912987797, + "learning_rate": 2.2012413464502168e-07, + "loss": 0.7442, + "step": 10339 + }, + { + "epoch": 0.91, + "grad_norm": 9.831469337575987, + "learning_rate": 2.1970695540009857e-07, + "loss": 0.719, + "step": 10340 + }, + { + "epoch": 0.91, + "grad_norm": 6.816382277683231, + "learning_rate": 2.1929016296853679e-07, + "loss": 0.7866, + "step": 10341 + }, + { + "epoch": 0.91, + "grad_norm": 6.257082092865937, + "learning_rate": 2.1887375738406268e-07, + "loss": 0.7529, + "step": 10342 + }, + { + "epoch": 0.91, + "grad_norm": 9.298516562639657, + "learning_rate": 2.1845773868037102e-07, + "loss": 0.7035, + "step": 10343 + }, + { + "epoch": 0.91, + "grad_norm": 9.468853542052395, + "learning_rate": 2.1804210689112538e-07, + "loss": 0.797, + "step": 10344 + }, + { + "epoch": 0.91, + "grad_norm": 8.237079036555997, + "learning_rate": 2.1762686204995887e-07, + "loss": 0.6524, + "step": 10345 + }, + { + "epoch": 0.91, + "grad_norm": 7.563000635298846, + "learning_rate": 2.1721200419047128e-07, + "loss": 0.6594, + "step": 10346 + }, + { + "epoch": 0.91, + "grad_norm": 5.5003550880892655, + "learning_rate": 2.1679753334623243e-07, + "loss": 0.7458, 
+ "step": 10347 + }, + { + "epoch": 0.91, + "grad_norm": 2.318669726699486, + "learning_rate": 2.163834495507805e-07, + "loss": 0.4711, + "step": 10348 + }, + { + "epoch": 0.91, + "grad_norm": 5.899126838314775, + "learning_rate": 2.1596975283762256e-07, + "loss": 0.7962, + "step": 10349 + }, + { + "epoch": 0.91, + "grad_norm": 7.290685871286693, + "learning_rate": 2.155564432402346e-07, + "loss": 0.6781, + "step": 10350 + }, + { + "epoch": 0.91, + "grad_norm": 7.137131713988452, + "learning_rate": 2.1514352079206103e-07, + "loss": 0.6828, + "step": 10351 + }, + { + "epoch": 0.91, + "grad_norm": 12.896226009775686, + "learning_rate": 2.147309855265145e-07, + "loss": 0.795, + "step": 10352 + }, + { + "epoch": 0.91, + "grad_norm": 9.922625472677005, + "learning_rate": 2.1431883747697669e-07, + "loss": 0.8917, + "step": 10353 + }, + { + "epoch": 0.91, + "grad_norm": 7.077140479264293, + "learning_rate": 2.13907076676797e-07, + "loss": 0.7014, + "step": 10354 + }, + { + "epoch": 0.91, + "grad_norm": 11.37454080484793, + "learning_rate": 2.1349570315929657e-07, + "loss": 0.717, + "step": 10355 + }, + { + "epoch": 0.91, + "grad_norm": 10.789066857007466, + "learning_rate": 2.1308471695776155e-07, + "loss": 0.7413, + "step": 10356 + }, + { + "epoch": 0.91, + "grad_norm": 6.79150945662821, + "learning_rate": 2.1267411810544924e-07, + "loss": 0.6839, + "step": 10357 + }, + { + "epoch": 0.91, + "grad_norm": 8.00755549510495, + "learning_rate": 2.1226390663558362e-07, + "loss": 0.7674, + "step": 10358 + }, + { + "epoch": 0.91, + "grad_norm": 9.177550147216305, + "learning_rate": 2.118540825813581e-07, + "loss": 0.7946, + "step": 10359 + }, + { + "epoch": 0.91, + "grad_norm": 12.90207236816434, + "learning_rate": 2.1144464597593562e-07, + "loss": 0.7489, + "step": 10360 + }, + { + "epoch": 0.91, + "grad_norm": 9.653611913151355, + "learning_rate": 2.1103559685244745e-07, + "loss": 0.6862, + "step": 10361 + }, + { + "epoch": 0.91, + "grad_norm": 2.988475509270275, + "learning_rate": 2.1062693524399268e-07, + "loss": 0.4466, + "step": 10362 + }, + { + "epoch": 0.91, + "grad_norm": 9.232530125452064, + "learning_rate": 2.1021866118363987e-07, + "loss": 0.682, + "step": 10363 + }, + { + "epoch": 0.91, + "grad_norm": 7.935223763883895, + "learning_rate": 2.0981077470442534e-07, + "loss": 0.6748, + "step": 10364 + }, + { + "epoch": 0.91, + "grad_norm": 9.129237905312127, + "learning_rate": 2.0940327583935494e-07, + "loss": 0.6367, + "step": 10365 + }, + { + "epoch": 0.91, + "grad_norm": 2.2709025540789765, + "learning_rate": 2.089961646214017e-07, + "loss": 0.5158, + "step": 10366 + }, + { + "epoch": 0.91, + "grad_norm": 6.930212233203686, + "learning_rate": 2.0858944108351043e-07, + "loss": 0.6812, + "step": 10367 + }, + { + "epoch": 0.91, + "grad_norm": 12.63394504173876, + "learning_rate": 2.0818310525859142e-07, + "loss": 0.6205, + "step": 10368 + }, + { + "epoch": 0.91, + "grad_norm": 5.567598444459076, + "learning_rate": 2.0777715717952507e-07, + "loss": 0.7645, + "step": 10369 + }, + { + "epoch": 0.91, + "grad_norm": 6.660400332815684, + "learning_rate": 2.0737159687915953e-07, + "loss": 0.7806, + "step": 10370 + }, + { + "epoch": 0.91, + "grad_norm": 17.88743868078455, + "learning_rate": 2.069664243903119e-07, + "loss": 0.7334, + "step": 10371 + }, + { + "epoch": 0.91, + "grad_norm": 8.724987660824857, + "learning_rate": 2.0656163974576815e-07, + "loss": 0.7435, + "step": 10372 + }, + { + "epoch": 0.91, + "grad_norm": 8.68994483716371, + "learning_rate": 2.061572429782832e-07, + "loss": 0.5641, + 
"step": 10373 + }, + { + "epoch": 0.91, + "grad_norm": 9.88428326775165, + "learning_rate": 2.0575323412058036e-07, + "loss": 0.6469, + "step": 10374 + }, + { + "epoch": 0.91, + "grad_norm": 6.6084244154330225, + "learning_rate": 2.053496132053512e-07, + "loss": 0.6708, + "step": 10375 + }, + { + "epoch": 0.91, + "grad_norm": 6.792087379897114, + "learning_rate": 2.0494638026525516e-07, + "loss": 0.5659, + "step": 10376 + }, + { + "epoch": 0.91, + "grad_norm": 6.292583386015013, + "learning_rate": 2.045435353329217e-07, + "loss": 0.6179, + "step": 10377 + }, + { + "epoch": 0.91, + "grad_norm": 7.490839046914845, + "learning_rate": 2.0414107844094923e-07, + "loss": 0.8522, + "step": 10378 + }, + { + "epoch": 0.91, + "grad_norm": 7.631341958754517, + "learning_rate": 2.037390096219033e-07, + "loss": 0.7082, + "step": 10379 + }, + { + "epoch": 0.91, + "grad_norm": 8.766176993891198, + "learning_rate": 2.0333732890831791e-07, + "loss": 0.7826, + "step": 10380 + }, + { + "epoch": 0.91, + "grad_norm": 6.045753053844553, + "learning_rate": 2.0293603633269764e-07, + "loss": 0.7864, + "step": 10381 + }, + { + "epoch": 0.91, + "grad_norm": 11.841539447809549, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.7853, + "step": 10382 + }, + { + "epoch": 0.91, + "grad_norm": 10.440582209236586, + "learning_rate": 2.0213461572520577e-07, + "loss": 0.6792, + "step": 10383 + }, + { + "epoch": 0.91, + "grad_norm": 6.2187966994503805, + "learning_rate": 2.0173448775818506e-07, + "loss": 0.6045, + "step": 10384 + }, + { + "epoch": 0.91, + "grad_norm": 10.650237102637252, + "learning_rate": 2.0133474805882735e-07, + "loss": 0.6474, + "step": 10385 + }, + { + "epoch": 0.91, + "grad_norm": 17.334808065273517, + "learning_rate": 2.0093539665948013e-07, + "loss": 0.6482, + "step": 10386 + }, + { + "epoch": 0.91, + "grad_norm": 6.615245352223833, + "learning_rate": 2.0053643359245812e-07, + "loss": 0.6649, + "step": 10387 + }, + { + "epoch": 0.91, + "grad_norm": 10.785576644768472, + "learning_rate": 2.0013785889004377e-07, + "loss": 0.7545, + "step": 10388 + }, + { + "epoch": 0.91, + "grad_norm": 11.913697907380842, + "learning_rate": 1.9973967258448912e-07, + "loss": 0.8002, + "step": 10389 + }, + { + "epoch": 0.91, + "grad_norm": 5.649404531883438, + "learning_rate": 1.9934187470801613e-07, + "loss": 0.5733, + "step": 10390 + }, + { + "epoch": 0.91, + "grad_norm": 6.887717715089273, + "learning_rate": 1.9894446529281352e-07, + "loss": 0.7994, + "step": 10391 + }, + { + "epoch": 0.91, + "grad_norm": 9.119413437025079, + "learning_rate": 1.9854744437103834e-07, + "loss": 0.7286, + "step": 10392 + }, + { + "epoch": 0.91, + "grad_norm": 3.0375652314491135, + "learning_rate": 1.9815081197481712e-07, + "loss": 0.4822, + "step": 10393 + }, + { + "epoch": 0.91, + "grad_norm": 10.751787937326341, + "learning_rate": 1.9775456813624417e-07, + "loss": 0.8774, + "step": 10394 + }, + { + "epoch": 0.91, + "grad_norm": 15.155514128171017, + "learning_rate": 1.9735871288738385e-07, + "loss": 0.7589, + "step": 10395 + }, + { + "epoch": 0.91, + "grad_norm": 8.51402425229619, + "learning_rate": 1.9696324626026774e-07, + "loss": 0.5628, + "step": 10396 + }, + { + "epoch": 0.91, + "grad_norm": 8.275999205062613, + "learning_rate": 1.965681682868964e-07, + "loss": 0.8399, + "step": 10397 + }, + { + "epoch": 0.91, + "grad_norm": 8.475442757986366, + "learning_rate": 1.9617347899923922e-07, + "loss": 0.5521, + "step": 10398 + }, + { + "epoch": 0.91, + "grad_norm": 6.199502717223992, + "learning_rate": 1.957791784292329e-07, + 
"loss": 0.5611, + "step": 10399 + }, + { + "epoch": 0.91, + "grad_norm": 28.308402305175782, + "learning_rate": 1.9538526660878355e-07, + "loss": 0.7236, + "step": 10400 + }, + { + "epoch": 0.91, + "grad_norm": 7.5648778177840725, + "learning_rate": 1.949917435697668e-07, + "loss": 0.663, + "step": 10401 + }, + { + "epoch": 0.91, + "grad_norm": 6.701808895161465, + "learning_rate": 1.9459860934402552e-07, + "loss": 0.7699, + "step": 10402 + }, + { + "epoch": 0.91, + "grad_norm": 1.7904266986041502, + "learning_rate": 1.9420586396337148e-07, + "loss": 0.474, + "step": 10403 + }, + { + "epoch": 0.91, + "grad_norm": 14.59136687598552, + "learning_rate": 1.9381350745958483e-07, + "loss": 0.74, + "step": 10404 + }, + { + "epoch": 0.91, + "grad_norm": 7.7715623481171265, + "learning_rate": 1.9342153986441403e-07, + "loss": 0.6442, + "step": 10405 + }, + { + "epoch": 0.91, + "grad_norm": 9.934015005999008, + "learning_rate": 1.9302996120957707e-07, + "loss": 0.9114, + "step": 10406 + }, + { + "epoch": 0.91, + "grad_norm": 7.636386861495865, + "learning_rate": 1.926387715267597e-07, + "loss": 0.9258, + "step": 10407 + }, + { + "epoch": 0.91, + "grad_norm": 18.575017436902495, + "learning_rate": 1.922479708476166e-07, + "loss": 0.7447, + "step": 10408 + }, + { + "epoch": 0.91, + "grad_norm": 7.45787816283097, + "learning_rate": 1.9185755920377025e-07, + "loss": 0.7871, + "step": 10409 + }, + { + "epoch": 0.91, + "grad_norm": 11.400028100320162, + "learning_rate": 1.914675366268126e-07, + "loss": 0.8481, + "step": 10410 + }, + { + "epoch": 0.91, + "grad_norm": 7.891575746009892, + "learning_rate": 1.9107790314830287e-07, + "loss": 0.6861, + "step": 10411 + }, + { + "epoch": 0.91, + "grad_norm": 5.386819232446326, + "learning_rate": 1.9068865879976972e-07, + "loss": 0.5833, + "step": 10412 + }, + { + "epoch": 0.91, + "grad_norm": 6.076118138514235, + "learning_rate": 1.902998036127107e-07, + "loss": 0.7598, + "step": 10413 + }, + { + "epoch": 0.91, + "grad_norm": 9.617011988167453, + "learning_rate": 1.8991133761859128e-07, + "loss": 0.7698, + "step": 10414 + }, + { + "epoch": 0.91, + "grad_norm": 7.3097134463559215, + "learning_rate": 1.8952326084884564e-07, + "loss": 0.6513, + "step": 10415 + }, + { + "epoch": 0.91, + "grad_norm": 10.891842261249685, + "learning_rate": 1.8913557333487542e-07, + "loss": 0.7713, + "step": 10416 + }, + { + "epoch": 0.91, + "grad_norm": 6.162947425326042, + "learning_rate": 1.887482751080516e-07, + "loss": 0.7135, + "step": 10417 + }, + { + "epoch": 0.91, + "grad_norm": 2.6840863667726365, + "learning_rate": 1.8836136619971468e-07, + "loss": 0.546, + "step": 10418 + }, + { + "epoch": 0.92, + "grad_norm": 8.541140013567418, + "learning_rate": 1.8797484664117237e-07, + "loss": 0.7583, + "step": 10419 + }, + { + "epoch": 0.92, + "grad_norm": 6.663262563165605, + "learning_rate": 1.8758871646370136e-07, + "loss": 0.5783, + "step": 10420 + }, + { + "epoch": 0.92, + "grad_norm": 6.200610099625044, + "learning_rate": 1.8720297569854606e-07, + "loss": 0.6429, + "step": 10421 + }, + { + "epoch": 0.92, + "grad_norm": 9.760032277725035, + "learning_rate": 1.868176243769204e-07, + "loss": 0.7463, + "step": 10422 + }, + { + "epoch": 0.92, + "grad_norm": 5.365332088151933, + "learning_rate": 1.864326625300056e-07, + "loss": 0.7248, + "step": 10423 + }, + { + "epoch": 0.92, + "grad_norm": 3.327560280400499, + "learning_rate": 1.860480901889533e-07, + "loss": 0.4936, + "step": 10424 + }, + { + "epoch": 0.92, + "grad_norm": 6.272987505437265, + "learning_rate": 1.8566390738488205e-07, 
+ "loss": 0.7532, + "step": 10425 + }, + { + "epoch": 0.92, + "grad_norm": 5.939647702392859, + "learning_rate": 1.8528011414887914e-07, + "loss": 0.5442, + "step": 10426 + }, + { + "epoch": 0.92, + "grad_norm": 11.150455692889441, + "learning_rate": 1.8489671051200087e-07, + "loss": 0.655, + "step": 10427 + }, + { + "epoch": 0.92, + "grad_norm": 9.562609301705715, + "learning_rate": 1.8451369650527073e-07, + "loss": 0.5889, + "step": 10428 + }, + { + "epoch": 0.92, + "grad_norm": 1.9713429083286784, + "learning_rate": 1.8413107215968174e-07, + "loss": 0.4932, + "step": 10429 + }, + { + "epoch": 0.92, + "grad_norm": 5.269768330814039, + "learning_rate": 1.8374883750619632e-07, + "loss": 0.5288, + "step": 10430 + }, + { + "epoch": 0.92, + "grad_norm": 16.576935128516133, + "learning_rate": 1.8336699257574365e-07, + "loss": 0.7661, + "step": 10431 + }, + { + "epoch": 0.92, + "grad_norm": 11.143611858887716, + "learning_rate": 1.8298553739922175e-07, + "loss": 0.7014, + "step": 10432 + }, + { + "epoch": 0.92, + "grad_norm": 12.958497838960097, + "learning_rate": 1.826044720074971e-07, + "loss": 0.6696, + "step": 10433 + }, + { + "epoch": 0.92, + "grad_norm": 3.797096101963879, + "learning_rate": 1.8222379643140498e-07, + "loss": 0.5411, + "step": 10434 + }, + { + "epoch": 0.92, + "grad_norm": 4.836695744948936, + "learning_rate": 1.8184351070175022e-07, + "loss": 0.7477, + "step": 10435 + }, + { + "epoch": 0.92, + "grad_norm": 3.7903596919596194, + "learning_rate": 1.8146361484930376e-07, + "loss": 0.5674, + "step": 10436 + }, + { + "epoch": 0.92, + "grad_norm": 7.313133437234341, + "learning_rate": 1.81084108904806e-07, + "loss": 0.7673, + "step": 10437 + }, + { + "epoch": 0.92, + "grad_norm": 9.125378003731786, + "learning_rate": 1.8070499289896626e-07, + "loss": 0.68, + "step": 10438 + }, + { + "epoch": 0.92, + "grad_norm": 9.944876374892925, + "learning_rate": 1.8032626686246224e-07, + "loss": 0.6166, + "step": 10439 + }, + { + "epoch": 0.92, + "grad_norm": 12.84452900451727, + "learning_rate": 1.7994793082593942e-07, + "loss": 0.9578, + "step": 10440 + }, + { + "epoch": 0.92, + "grad_norm": 16.883280418722016, + "learning_rate": 1.795699848200122e-07, + "loss": 0.7203, + "step": 10441 + }, + { + "epoch": 0.92, + "grad_norm": 2.542310043806184, + "learning_rate": 1.791924288752639e-07, + "loss": 0.4647, + "step": 10442 + }, + { + "epoch": 0.92, + "grad_norm": 21.579376893900243, + "learning_rate": 1.7881526302224505e-07, + "loss": 0.8317, + "step": 10443 + }, + { + "epoch": 0.92, + "grad_norm": 7.031664107276702, + "learning_rate": 1.7843848729147573e-07, + "loss": 0.752, + "step": 10444 + }, + { + "epoch": 0.92, + "grad_norm": 8.908363230677654, + "learning_rate": 1.780621017134432e-07, + "loss": 0.7552, + "step": 10445 + }, + { + "epoch": 0.92, + "grad_norm": 8.451539946346037, + "learning_rate": 1.7768610631860473e-07, + "loss": 0.5885, + "step": 10446 + }, + { + "epoch": 0.92, + "grad_norm": 15.10410175772494, + "learning_rate": 1.7731050113738491e-07, + "loss": 0.7238, + "step": 10447 + }, + { + "epoch": 0.92, + "grad_norm": 8.846641409963613, + "learning_rate": 1.7693528620017776e-07, + "loss": 0.7309, + "step": 10448 + }, + { + "epoch": 0.92, + "grad_norm": 7.913954972159215, + "learning_rate": 1.7656046153734396e-07, + "loss": 0.621, + "step": 10449 + }, + { + "epoch": 0.92, + "grad_norm": 10.349716338966253, + "learning_rate": 1.7618602717921428e-07, + "loss": 0.6232, + "step": 10450 + }, + { + "epoch": 0.92, + "grad_norm": 8.889236277856957, + "learning_rate": 
1.7581198315608727e-07, + "loss": 0.783, + "step": 10451 + }, + { + "epoch": 0.92, + "grad_norm": 11.338744379024144, + "learning_rate": 1.754383294982298e-07, + "loss": 0.6492, + "step": 10452 + }, + { + "epoch": 0.92, + "grad_norm": 14.789466480786745, + "learning_rate": 1.750650662358777e-07, + "loss": 0.7037, + "step": 10453 + }, + { + "epoch": 0.92, + "grad_norm": 5.90655132520562, + "learning_rate": 1.7469219339923516e-07, + "loss": 0.6996, + "step": 10454 + }, + { + "epoch": 0.92, + "grad_norm": 12.354214882321145, + "learning_rate": 1.74319711018473e-07, + "loss": 0.758, + "step": 10455 + }, + { + "epoch": 0.92, + "grad_norm": 13.831669369508985, + "learning_rate": 1.7394761912373326e-07, + "loss": 0.6952, + "step": 10456 + }, + { + "epoch": 0.92, + "grad_norm": 6.915348773012282, + "learning_rate": 1.735759177451235e-07, + "loss": 0.7281, + "step": 10457 + }, + { + "epoch": 0.92, + "grad_norm": 7.324220134847796, + "learning_rate": 1.73204606912723e-07, + "loss": 0.7374, + "step": 10458 + }, + { + "epoch": 0.92, + "grad_norm": 9.1632856859284, + "learning_rate": 1.7283368665657663e-07, + "loss": 0.7305, + "step": 10459 + }, + { + "epoch": 0.92, + "grad_norm": 8.003041830601978, + "learning_rate": 1.724631570066987e-07, + "loss": 0.6915, + "step": 10460 + }, + { + "epoch": 0.92, + "grad_norm": 6.957033291971113, + "learning_rate": 1.7209301799307243e-07, + "loss": 0.7774, + "step": 10461 + }, + { + "epoch": 0.92, + "grad_norm": 7.020195562868563, + "learning_rate": 1.7172326964564777e-07, + "loss": 0.5989, + "step": 10462 + }, + { + "epoch": 0.92, + "grad_norm": 7.518241156582654, + "learning_rate": 1.7135391199434413e-07, + "loss": 0.7383, + "step": 10463 + }, + { + "epoch": 0.92, + "grad_norm": 7.41690768956506, + "learning_rate": 1.7098494506905095e-07, + "loss": 0.6299, + "step": 10464 + }, + { + "epoch": 0.92, + "grad_norm": 6.73749538164051, + "learning_rate": 1.7061636889962264e-07, + "loss": 0.8023, + "step": 10465 + }, + { + "epoch": 0.92, + "grad_norm": 7.513174824356611, + "learning_rate": 1.702481835158848e-07, + "loss": 0.6316, + "step": 10466 + }, + { + "epoch": 0.92, + "grad_norm": 9.652812952839534, + "learning_rate": 1.6988038894763025e-07, + "loss": 0.848, + "step": 10467 + }, + { + "epoch": 0.92, + "grad_norm": 6.1108886003325225, + "learning_rate": 1.6951298522462024e-07, + "loss": 0.6392, + "step": 10468 + }, + { + "epoch": 0.92, + "grad_norm": 2.9720065654777525, + "learning_rate": 1.6914597237658315e-07, + "loss": 0.4696, + "step": 10469 + }, + { + "epoch": 0.92, + "grad_norm": 9.025515218424903, + "learning_rate": 1.687793504332197e-07, + "loss": 0.6056, + "step": 10470 + }, + { + "epoch": 0.92, + "grad_norm": 7.022649743358438, + "learning_rate": 1.684131194241939e-07, + "loss": 0.8138, + "step": 10471 + }, + { + "epoch": 0.92, + "grad_norm": 5.944147835224656, + "learning_rate": 1.6804727937914211e-07, + "loss": 0.6921, + "step": 10472 + }, + { + "epoch": 0.92, + "grad_norm": 2.7958054408670594, + "learning_rate": 1.6768183032766728e-07, + "loss": 0.5087, + "step": 10473 + }, + { + "epoch": 0.92, + "grad_norm": 8.377345250569519, + "learning_rate": 1.6731677229933963e-07, + "loss": 0.6598, + "step": 10474 + }, + { + "epoch": 0.92, + "grad_norm": 10.097721051308591, + "learning_rate": 1.6695210532370053e-07, + "loss": 0.8059, + "step": 10475 + }, + { + "epoch": 0.92, + "grad_norm": 9.278267767969671, + "learning_rate": 1.6658782943025753e-07, + "loss": 0.77, + "step": 10476 + }, + { + "epoch": 0.92, + "grad_norm": 7.31464249452488, + "learning_rate": 
1.6622394464848758e-07, + "loss": 0.6931, + "step": 10477 + }, + { + "epoch": 0.92, + "grad_norm": 8.957145135443199, + "learning_rate": 1.6586045100783598e-07, + "loss": 0.8274, + "step": 10478 + }, + { + "epoch": 0.92, + "grad_norm": 8.32931118080048, + "learning_rate": 1.6549734853771481e-07, + "loss": 0.6645, + "step": 10479 + }, + { + "epoch": 0.92, + "grad_norm": 14.38346347420433, + "learning_rate": 1.6513463726750612e-07, + "loss": 0.8388, + "step": 10480 + }, + { + "epoch": 0.92, + "grad_norm": 12.502393839947713, + "learning_rate": 1.6477231722656085e-07, + "loss": 0.6477, + "step": 10481 + }, + { + "epoch": 0.92, + "grad_norm": 6.362200844206364, + "learning_rate": 1.644103884441961e-07, + "loss": 0.8228, + "step": 10482 + }, + { + "epoch": 0.92, + "grad_norm": 14.34586504153744, + "learning_rate": 1.640488509496996e-07, + "loss": 0.645, + "step": 10483 + }, + { + "epoch": 0.92, + "grad_norm": 6.50463981868821, + "learning_rate": 1.6368770477232622e-07, + "loss": 0.835, + "step": 10484 + }, + { + "epoch": 0.92, + "grad_norm": 5.586190416038723, + "learning_rate": 1.633269499412987e-07, + "loss": 0.6714, + "step": 10485 + }, + { + "epoch": 0.92, + "grad_norm": 8.334313412570081, + "learning_rate": 1.6296658648580822e-07, + "loss": 0.8571, + "step": 10486 + }, + { + "epoch": 0.92, + "grad_norm": 6.956504900464916, + "learning_rate": 1.6260661443501635e-07, + "loss": 0.719, + "step": 10487 + }, + { + "epoch": 0.92, + "grad_norm": 3.464542385822016, + "learning_rate": 1.6224703381805042e-07, + "loss": 0.4615, + "step": 10488 + }, + { + "epoch": 0.92, + "grad_norm": 6.410378688744677, + "learning_rate": 1.6188784466400765e-07, + "loss": 0.7241, + "step": 10489 + }, + { + "epoch": 0.92, + "grad_norm": 8.275094151430999, + "learning_rate": 1.6152904700195204e-07, + "loss": 0.6693, + "step": 10490 + }, + { + "epoch": 0.92, + "grad_norm": 11.563122364295825, + "learning_rate": 1.6117064086091762e-07, + "loss": 0.667, + "step": 10491 + }, + { + "epoch": 0.92, + "grad_norm": 7.523557130731832, + "learning_rate": 1.6081262626990667e-07, + "loss": 0.6313, + "step": 10492 + }, + { + "epoch": 0.92, + "grad_norm": 7.969150889580358, + "learning_rate": 1.6045500325788776e-07, + "loss": 0.7461, + "step": 10493 + }, + { + "epoch": 0.92, + "grad_norm": 10.749873881905675, + "learning_rate": 1.600977718538005e-07, + "loss": 0.8344, + "step": 10494 + }, + { + "epoch": 0.92, + "grad_norm": 7.177674227930173, + "learning_rate": 1.597409320865506e-07, + "loss": 0.6776, + "step": 10495 + }, + { + "epoch": 0.92, + "grad_norm": 8.935682576491654, + "learning_rate": 1.5938448398501284e-07, + "loss": 0.77, + "step": 10496 + }, + { + "epoch": 0.92, + "grad_norm": 12.158846513849879, + "learning_rate": 1.5902842757803016e-07, + "loss": 0.647, + "step": 10497 + }, + { + "epoch": 0.92, + "grad_norm": 7.702066688239027, + "learning_rate": 1.5867276289441514e-07, + "loss": 0.7682, + "step": 10498 + }, + { + "epoch": 0.92, + "grad_norm": 10.16424775605604, + "learning_rate": 1.583174899629475e-07, + "loss": 0.7731, + "step": 10499 + }, + { + "epoch": 0.92, + "grad_norm": 7.516852303575782, + "learning_rate": 1.5796260881237424e-07, + "loss": 0.6587, + "step": 10500 + }, + { + "epoch": 0.92, + "grad_norm": 13.670204208558012, + "learning_rate": 1.576081194714124e-07, + "loss": 0.8051, + "step": 10501 + }, + { + "epoch": 0.92, + "grad_norm": 8.080362163396698, + "learning_rate": 1.5725402196874683e-07, + "loss": 0.7481, + "step": 10502 + }, + { + "epoch": 0.92, + "grad_norm": 4.6134058867429495, + "learning_rate": 
1.5690031633303016e-07, + "loss": 0.6235, + "step": 10503 + }, + { + "epoch": 0.92, + "grad_norm": 7.551265420478676, + "learning_rate": 1.565470025928839e-07, + "loss": 0.7673, + "step": 10504 + }, + { + "epoch": 0.92, + "grad_norm": 8.922385626403056, + "learning_rate": 1.5619408077689746e-07, + "loss": 0.7797, + "step": 10505 + }, + { + "epoch": 0.92, + "grad_norm": 2.8716261866786765, + "learning_rate": 1.5584155091362907e-07, + "loss": 0.5082, + "step": 10506 + }, + { + "epoch": 0.92, + "grad_norm": 6.91828375744929, + "learning_rate": 1.5548941303160424e-07, + "loss": 0.7309, + "step": 10507 + }, + { + "epoch": 0.92, + "grad_norm": 2.331146546975688, + "learning_rate": 1.5513766715931743e-07, + "loss": 0.3876, + "step": 10508 + }, + { + "epoch": 0.92, + "grad_norm": 8.003239831175756, + "learning_rate": 1.5478631332523142e-07, + "loss": 0.6838, + "step": 10509 + }, + { + "epoch": 0.92, + "grad_norm": 38.681116104278125, + "learning_rate": 1.544353515577779e-07, + "loss": 0.6084, + "step": 10510 + }, + { + "epoch": 0.92, + "grad_norm": 6.182498564799644, + "learning_rate": 1.5408478188535525e-07, + "loss": 0.7028, + "step": 10511 + }, + { + "epoch": 0.92, + "grad_norm": 9.702308421437085, + "learning_rate": 1.5373460433633191e-07, + "loss": 0.7622, + "step": 10512 + }, + { + "epoch": 0.92, + "grad_norm": 3.8715041008218316, + "learning_rate": 1.5338481893904245e-07, + "loss": 0.502, + "step": 10513 + }, + { + "epoch": 0.92, + "grad_norm": 7.557476834875102, + "learning_rate": 1.5303542572179087e-07, + "loss": 0.7538, + "step": 10514 + }, + { + "epoch": 0.92, + "grad_norm": 10.02151841840321, + "learning_rate": 1.5268642471285123e-07, + "loss": 0.7974, + "step": 10515 + }, + { + "epoch": 0.92, + "grad_norm": 12.454719503775522, + "learning_rate": 1.523378159404626e-07, + "loss": 0.6946, + "step": 10516 + }, + { + "epoch": 0.92, + "grad_norm": 6.602642310296248, + "learning_rate": 1.5198959943283466e-07, + "loss": 0.7007, + "step": 10517 + }, + { + "epoch": 0.92, + "grad_norm": 7.801997952630932, + "learning_rate": 1.516417752181437e-07, + "loss": 0.7014, + "step": 10518 + }, + { + "epoch": 0.92, + "grad_norm": 2.4175177895090156, + "learning_rate": 1.5129434332453562e-07, + "loss": 0.5071, + "step": 10519 + }, + { + "epoch": 0.92, + "grad_norm": 5.550471672914078, + "learning_rate": 1.509473037801229e-07, + "loss": 0.6792, + "step": 10520 + }, + { + "epoch": 0.92, + "grad_norm": 12.537674461433237, + "learning_rate": 1.5060065661298918e-07, + "loss": 0.874, + "step": 10521 + }, + { + "epoch": 0.92, + "grad_norm": 6.196223728219703, + "learning_rate": 1.5025440185118422e-07, + "loss": 0.7437, + "step": 10522 + }, + { + "epoch": 0.92, + "grad_norm": 11.381416454095278, + "learning_rate": 1.499085395227251e-07, + "loss": 0.7465, + "step": 10523 + }, + { + "epoch": 0.92, + "grad_norm": 12.619604259115274, + "learning_rate": 1.4956306965559997e-07, + "loss": 0.7562, + "step": 10524 + }, + { + "epoch": 0.92, + "grad_norm": 6.80609578341301, + "learning_rate": 1.4921799227776257e-07, + "loss": 0.7131, + "step": 10525 + }, + { + "epoch": 0.92, + "grad_norm": 11.212257264266023, + "learning_rate": 1.4887330741713613e-07, + "loss": 0.6883, + "step": 10526 + }, + { + "epoch": 0.92, + "grad_norm": 10.160911201923053, + "learning_rate": 1.485290151016122e-07, + "loss": 0.7328, + "step": 10527 + }, + { + "epoch": 0.92, + "grad_norm": 11.258386661095004, + "learning_rate": 1.4818511535905077e-07, + "loss": 0.7806, + "step": 10528 + }, + { + "epoch": 0.92, + "grad_norm": 7.367946950585082, + 
"learning_rate": 1.4784160821727956e-07, + "loss": 0.7773, + "step": 10529 + }, + { + "epoch": 0.92, + "grad_norm": 7.887881871419911, + "learning_rate": 1.4749849370409352e-07, + "loss": 0.6103, + "step": 10530 + }, + { + "epoch": 0.92, + "grad_norm": 7.338692756517284, + "learning_rate": 1.4715577184725772e-07, + "loss": 0.7067, + "step": 10531 + }, + { + "epoch": 0.92, + "grad_norm": 7.580899255322712, + "learning_rate": 1.4681344267450493e-07, + "loss": 0.8791, + "step": 10532 + }, + { + "epoch": 0.93, + "grad_norm": 2.880712731871448, + "learning_rate": 1.4647150621353523e-07, + "loss": 0.4922, + "step": 10533 + }, + { + "epoch": 0.93, + "grad_norm": 16.64316301674742, + "learning_rate": 1.4612996249201817e-07, + "loss": 0.8714, + "step": 10534 + }, + { + "epoch": 0.93, + "grad_norm": 12.184707166473336, + "learning_rate": 1.4578881153759052e-07, + "loss": 0.792, + "step": 10535 + }, + { + "epoch": 0.93, + "grad_norm": 5.367201453024053, + "learning_rate": 1.454480533778574e-07, + "loss": 0.7131, + "step": 10536 + }, + { + "epoch": 0.93, + "grad_norm": 14.967438449935038, + "learning_rate": 1.451076880403923e-07, + "loss": 0.7467, + "step": 10537 + }, + { + "epoch": 0.93, + "grad_norm": 6.663573399561994, + "learning_rate": 1.4476771555273772e-07, + "loss": 0.597, + "step": 10538 + }, + { + "epoch": 0.93, + "grad_norm": 7.014622943612181, + "learning_rate": 1.444281359424038e-07, + "loss": 0.6577, + "step": 10539 + }, + { + "epoch": 0.93, + "grad_norm": 4.99683132022661, + "learning_rate": 1.4408894923686801e-07, + "loss": 0.6118, + "step": 10540 + }, + { + "epoch": 0.93, + "grad_norm": 2.348309436620904, + "learning_rate": 1.4375015546357674e-07, + "loss": 0.4283, + "step": 10541 + }, + { + "epoch": 0.93, + "grad_norm": 11.041591418787045, + "learning_rate": 1.434117546499453e-07, + "loss": 1.0186, + "step": 10542 + }, + { + "epoch": 0.93, + "grad_norm": 2.7385392071604513, + "learning_rate": 1.4307374682335563e-07, + "loss": 0.5504, + "step": 10543 + }, + { + "epoch": 0.93, + "grad_norm": 9.635418555732876, + "learning_rate": 1.4273613201115976e-07, + "loss": 0.6661, + "step": 10544 + }, + { + "epoch": 0.93, + "grad_norm": 5.528087093021954, + "learning_rate": 1.423989102406764e-07, + "loss": 0.6704, + "step": 10545 + }, + { + "epoch": 0.93, + "grad_norm": 12.7739479632217, + "learning_rate": 1.4206208153919322e-07, + "loss": 1.0005, + "step": 10546 + }, + { + "epoch": 0.93, + "grad_norm": 7.760666848122658, + "learning_rate": 1.4172564593396554e-07, + "loss": 0.6193, + "step": 10547 + }, + { + "epoch": 0.93, + "grad_norm": 7.451994145196896, + "learning_rate": 1.413896034522172e-07, + "loss": 0.7586, + "step": 10548 + }, + { + "epoch": 0.93, + "grad_norm": 6.415105992534846, + "learning_rate": 1.4105395412113975e-07, + "loss": 0.7929, + "step": 10549 + }, + { + "epoch": 0.93, + "grad_norm": 5.724106537403392, + "learning_rate": 1.4071869796789427e-07, + "loss": 0.5631, + "step": 10550 + }, + { + "epoch": 0.93, + "grad_norm": 8.069935868548852, + "learning_rate": 1.4038383501960906e-07, + "loss": 0.7073, + "step": 10551 + }, + { + "epoch": 0.93, + "grad_norm": 10.031015767448089, + "learning_rate": 1.4004936530338022e-07, + "loss": 0.8321, + "step": 10552 + }, + { + "epoch": 0.93, + "grad_norm": 5.597637519462284, + "learning_rate": 1.3971528884627217e-07, + "loss": 0.6155, + "step": 10553 + }, + { + "epoch": 0.93, + "grad_norm": 7.781652519790096, + "learning_rate": 1.3938160567531835e-07, + "loss": 0.6422, + "step": 10554 + }, + { + "epoch": 0.93, + "grad_norm": 6.690162512477687, 
+ "learning_rate": 1.3904831581751988e-07, + "loss": 0.8026, + "step": 10555 + }, + { + "epoch": 0.93, + "grad_norm": 6.940662625973935, + "learning_rate": 1.3871541929984632e-07, + "loss": 0.667, + "step": 10556 + }, + { + "epoch": 0.93, + "grad_norm": 7.270631823035516, + "learning_rate": 1.383829161492345e-07, + "loss": 0.7223, + "step": 10557 + }, + { + "epoch": 0.93, + "grad_norm": 6.451039559098174, + "learning_rate": 1.3805080639259006e-07, + "loss": 0.6695, + "step": 10558 + }, + { + "epoch": 0.93, + "grad_norm": 9.796131644990988, + "learning_rate": 1.3771909005678763e-07, + "loss": 0.6486, + "step": 10559 + }, + { + "epoch": 0.93, + "grad_norm": 8.97644681844403, + "learning_rate": 1.373877671686674e-07, + "loss": 0.776, + "step": 10560 + }, + { + "epoch": 0.93, + "grad_norm": 10.28962120235502, + "learning_rate": 1.3705683775504075e-07, + "loss": 0.7822, + "step": 10561 + }, + { + "epoch": 0.93, + "grad_norm": 10.224923435602648, + "learning_rate": 1.367263018426862e-07, + "loss": 0.7505, + "step": 10562 + }, + { + "epoch": 0.93, + "grad_norm": 7.485305379055425, + "learning_rate": 1.3639615945834906e-07, + "loss": 0.6956, + "step": 10563 + }, + { + "epoch": 0.93, + "grad_norm": 10.781061601016024, + "learning_rate": 1.3606641062874515e-07, + "loss": 0.7368, + "step": 10564 + }, + { + "epoch": 0.93, + "grad_norm": 6.49862128410943, + "learning_rate": 1.3573705538055648e-07, + "loss": 0.6423, + "step": 10565 + }, + { + "epoch": 0.93, + "grad_norm": 15.190274063458096, + "learning_rate": 1.354080937404334e-07, + "loss": 0.6906, + "step": 10566 + }, + { + "epoch": 0.93, + "grad_norm": 5.930741553724949, + "learning_rate": 1.3507952573499573e-07, + "loss": 0.6203, + "step": 10567 + }, + { + "epoch": 0.93, + "grad_norm": 12.771220474897811, + "learning_rate": 1.3475135139083052e-07, + "loss": 0.9461, + "step": 10568 + }, + { + "epoch": 0.93, + "grad_norm": 10.995852913747683, + "learning_rate": 1.344235707344932e-07, + "loss": 0.8576, + "step": 10569 + }, + { + "epoch": 0.93, + "grad_norm": 6.328006342739519, + "learning_rate": 1.34096183792507e-07, + "loss": 0.7754, + "step": 10570 + }, + { + "epoch": 0.93, + "grad_norm": 6.773918835302609, + "learning_rate": 1.337691905913635e-07, + "loss": 0.6597, + "step": 10571 + }, + { + "epoch": 0.93, + "grad_norm": 12.10022593028731, + "learning_rate": 1.3344259115752268e-07, + "loss": 0.7968, + "step": 10572 + }, + { + "epoch": 0.93, + "grad_norm": 12.708555677477076, + "learning_rate": 1.3311638551741223e-07, + "loss": 0.7421, + "step": 10573 + }, + { + "epoch": 0.93, + "grad_norm": 6.596762997738009, + "learning_rate": 1.3279057369742832e-07, + "loss": 0.7507, + "step": 10574 + }, + { + "epoch": 0.93, + "grad_norm": 8.768432691368263, + "learning_rate": 1.324651557239348e-07, + "loss": 0.6345, + "step": 10575 + }, + { + "epoch": 0.93, + "grad_norm": 9.740132353227692, + "learning_rate": 1.3214013162326457e-07, + "loss": 0.6899, + "step": 10576 + }, + { + "epoch": 0.93, + "grad_norm": 2.695723298048868, + "learning_rate": 1.318155014217165e-07, + "loss": 0.4653, + "step": 10577 + }, + { + "epoch": 0.93, + "grad_norm": 6.271545525451573, + "learning_rate": 1.3149126514556132e-07, + "loss": 0.6224, + "step": 10578 + }, + { + "epoch": 0.93, + "grad_norm": 16.42086782575894, + "learning_rate": 1.311674228210341e-07, + "loss": 0.5835, + "step": 10579 + }, + { + "epoch": 0.93, + "grad_norm": 10.259352266345457, + "learning_rate": 1.308439744743406e-07, + "loss": 0.7888, + "step": 10580 + }, + { + "epoch": 0.93, + "grad_norm": 10.489888199749128, 
+ "learning_rate": 1.3052092013165318e-07, + "loss": 0.8206, + "step": 10581 + }, + { + "epoch": 0.93, + "grad_norm": 12.112027562891864, + "learning_rate": 1.3019825981911315e-07, + "loss": 0.823, + "step": 10582 + }, + { + "epoch": 0.93, + "grad_norm": 5.964750203411296, + "learning_rate": 1.2987599356282853e-07, + "loss": 0.6169, + "step": 10583 + }, + { + "epoch": 0.93, + "grad_norm": 5.236206382655901, + "learning_rate": 1.295541213888779e-07, + "loss": 0.5865, + "step": 10584 + }, + { + "epoch": 0.93, + "grad_norm": 9.884050375587615, + "learning_rate": 1.2923264332330655e-07, + "loss": 0.6996, + "step": 10585 + }, + { + "epoch": 0.93, + "grad_norm": 23.257091338819933, + "learning_rate": 1.2891155939212752e-07, + "loss": 0.6614, + "step": 10586 + }, + { + "epoch": 0.93, + "grad_norm": 17.542174240897054, + "learning_rate": 1.2859086962132227e-07, + "loss": 0.6567, + "step": 10587 + }, + { + "epoch": 0.93, + "grad_norm": 6.651981696376349, + "learning_rate": 1.2827057403684063e-07, + "loss": 0.5375, + "step": 10588 + }, + { + "epoch": 0.93, + "grad_norm": 6.520939766295001, + "learning_rate": 1.2795067266460016e-07, + "loss": 0.6303, + "step": 10589 + }, + { + "epoch": 0.93, + "grad_norm": 8.791065023667565, + "learning_rate": 1.2763116553048737e-07, + "loss": 0.7477, + "step": 10590 + }, + { + "epoch": 0.93, + "grad_norm": 8.062010917877078, + "learning_rate": 1.2731205266035607e-07, + "loss": 0.8037, + "step": 10591 + }, + { + "epoch": 0.93, + "grad_norm": 5.989730453256678, + "learning_rate": 1.2699333408002778e-07, + "loss": 0.7291, + "step": 10592 + }, + { + "epoch": 0.93, + "grad_norm": 6.165436989036653, + "learning_rate": 1.2667500981529356e-07, + "loss": 0.6624, + "step": 10593 + }, + { + "epoch": 0.93, + "grad_norm": 9.336250961908098, + "learning_rate": 1.263570798919106e-07, + "loss": 0.5933, + "step": 10594 + }, + { + "epoch": 0.93, + "grad_norm": 7.1189865737052, + "learning_rate": 1.2603954433560604e-07, + "loss": 0.6859, + "step": 10595 + }, + { + "epoch": 0.93, + "grad_norm": 8.836159781311807, + "learning_rate": 1.257224031720744e-07, + "loss": 0.7227, + "step": 10596 + }, + { + "epoch": 0.93, + "grad_norm": 8.087387476032772, + "learning_rate": 1.2540565642697787e-07, + "loss": 0.8358, + "step": 10597 + }, + { + "epoch": 0.93, + "grad_norm": 9.384859291536877, + "learning_rate": 1.250893041259471e-07, + "loss": 0.8447, + "step": 10598 + }, + { + "epoch": 0.93, + "grad_norm": 8.040407241262724, + "learning_rate": 1.2477334629458105e-07, + "loss": 0.5643, + "step": 10599 + }, + { + "epoch": 0.93, + "grad_norm": 8.434549493905978, + "learning_rate": 1.2445778295844535e-07, + "loss": 0.723, + "step": 10600 + }, + { + "epoch": 0.93, + "grad_norm": 8.493422551236879, + "learning_rate": 1.241426141430768e-07, + "loss": 0.6443, + "step": 10601 + }, + { + "epoch": 0.93, + "grad_norm": 13.41923353665427, + "learning_rate": 1.238278398739773e-07, + "loss": 0.8537, + "step": 10602 + }, + { + "epoch": 0.93, + "grad_norm": 6.571080304923897, + "learning_rate": 1.2351346017661746e-07, + "loss": 0.7024, + "step": 10603 + }, + { + "epoch": 0.93, + "grad_norm": 12.350227574151253, + "learning_rate": 1.2319947507643704e-07, + "loss": 0.6801, + "step": 10604 + }, + { + "epoch": 0.93, + "grad_norm": 2.421033930349017, + "learning_rate": 1.2288588459884344e-07, + "loss": 0.5034, + "step": 10605 + }, + { + "epoch": 0.93, + "grad_norm": 6.359841540363276, + "learning_rate": 1.2257268876921024e-07, + "loss": 0.6695, + "step": 10606 + }, + { + "epoch": 0.93, + "grad_norm": 
7.914046684246423, + "learning_rate": 1.2225988761288276e-07, + "loss": 0.6372, + "step": 10607 + }, + { + "epoch": 0.93, + "grad_norm": 6.485480837597803, + "learning_rate": 1.219474811551713e-07, + "loss": 0.7329, + "step": 10608 + }, + { + "epoch": 0.93, + "grad_norm": 4.902865818661345, + "learning_rate": 1.2163546942135505e-07, + "loss": 0.6676, + "step": 10609 + }, + { + "epoch": 0.93, + "grad_norm": 10.263103576473986, + "learning_rate": 1.213238524366822e-07, + "loss": 0.7793, + "step": 10610 + }, + { + "epoch": 0.93, + "grad_norm": 5.658375573393596, + "learning_rate": 1.2101263022636755e-07, + "loss": 0.616, + "step": 10611 + }, + { + "epoch": 0.93, + "grad_norm": 12.750488775616404, + "learning_rate": 1.2070180281559597e-07, + "loss": 0.6005, + "step": 10612 + }, + { + "epoch": 0.93, + "grad_norm": 5.954745746348128, + "learning_rate": 1.203913702295173e-07, + "loss": 0.7324, + "step": 10613 + }, + { + "epoch": 0.93, + "grad_norm": 7.645483468888335, + "learning_rate": 1.200813324932526e-07, + "loss": 0.6658, + "step": 10614 + }, + { + "epoch": 0.93, + "grad_norm": 45.00928029894176, + "learning_rate": 1.1977168963188958e-07, + "loss": 0.7095, + "step": 10615 + }, + { + "epoch": 0.93, + "grad_norm": 7.1182366440690545, + "learning_rate": 1.1946244167048314e-07, + "loss": 0.6741, + "step": 10616 + }, + { + "epoch": 0.93, + "grad_norm": 7.041789228845999, + "learning_rate": 1.1915358863405724e-07, + "loss": 0.6848, + "step": 10617 + }, + { + "epoch": 0.93, + "grad_norm": 5.933696543189474, + "learning_rate": 1.1884513054760405e-07, + "loss": 0.729, + "step": 10618 + }, + { + "epoch": 0.93, + "grad_norm": 5.4754934762387855, + "learning_rate": 1.1853706743608417e-07, + "loss": 0.7216, + "step": 10619 + }, + { + "epoch": 0.93, + "grad_norm": 6.712010734020538, + "learning_rate": 1.1822939932442434e-07, + "loss": 0.7121, + "step": 10620 + }, + { + "epoch": 0.93, + "grad_norm": 6.988344071233752, + "learning_rate": 1.1792212623752131e-07, + "loss": 0.6971, + "step": 10621 + }, + { + "epoch": 0.93, + "grad_norm": 9.355224498755097, + "learning_rate": 1.1761524820023906e-07, + "loss": 0.8959, + "step": 10622 + }, + { + "epoch": 0.93, + "grad_norm": 11.548715768063175, + "learning_rate": 1.1730876523740886e-07, + "loss": 0.7803, + "step": 10623 + }, + { + "epoch": 0.93, + "grad_norm": 5.818429534402912, + "learning_rate": 1.1700267737383197e-07, + "loss": 0.7478, + "step": 10624 + }, + { + "epoch": 0.93, + "grad_norm": 11.371207798504907, + "learning_rate": 1.1669698463427636e-07, + "loss": 0.596, + "step": 10625 + }, + { + "epoch": 0.93, + "grad_norm": 8.776012493351532, + "learning_rate": 1.1639168704347725e-07, + "loss": 0.6835, + "step": 10626 + }, + { + "epoch": 0.93, + "grad_norm": 6.076288326367578, + "learning_rate": 1.1608678462613987e-07, + "loss": 0.5589, + "step": 10627 + }, + { + "epoch": 0.93, + "grad_norm": 2.500338183678909, + "learning_rate": 1.1578227740693559e-07, + "loss": 0.4613, + "step": 10628 + }, + { + "epoch": 0.93, + "grad_norm": 7.889325409762863, + "learning_rate": 1.1547816541050417e-07, + "loss": 0.8978, + "step": 10629 + }, + { + "epoch": 0.93, + "grad_norm": 6.320579461652145, + "learning_rate": 1.1517444866145533e-07, + "loss": 0.6446, + "step": 10630 + }, + { + "epoch": 0.93, + "grad_norm": 5.270366133776691, + "learning_rate": 1.14871127184365e-07, + "loss": 0.772, + "step": 10631 + }, + { + "epoch": 0.93, + "grad_norm": 8.58187876603399, + "learning_rate": 1.145682010037763e-07, + "loss": 0.793, + "step": 10632 + }, + { + "epoch": 0.93, + "grad_norm": 
2.4640927063904856, + "learning_rate": 1.1426567014420297e-07, + "loss": 0.4445, + "step": 10633 + }, + { + "epoch": 0.93, + "grad_norm": 5.127542150160804, + "learning_rate": 1.1396353463012378e-07, + "loss": 0.6347, + "step": 10634 + }, + { + "epoch": 0.93, + "grad_norm": 5.81384679141104, + "learning_rate": 1.1366179448598858e-07, + "loss": 0.7673, + "step": 10635 + }, + { + "epoch": 0.93, + "grad_norm": 2.563263581790874, + "learning_rate": 1.1336044973621285e-07, + "loss": 0.5294, + "step": 10636 + }, + { + "epoch": 0.93, + "grad_norm": 8.352947872877223, + "learning_rate": 1.1305950040518098e-07, + "loss": 0.7159, + "step": 10637 + }, + { + "epoch": 0.93, + "grad_norm": 12.401876978042585, + "learning_rate": 1.1275894651724517e-07, + "loss": 0.5664, + "step": 10638 + }, + { + "epoch": 0.93, + "grad_norm": 8.42411546748784, + "learning_rate": 1.1245878809672595e-07, + "loss": 0.8043, + "step": 10639 + }, + { + "epoch": 0.93, + "grad_norm": 8.091957739979964, + "learning_rate": 1.121590251679111e-07, + "loss": 0.7282, + "step": 10640 + }, + { + "epoch": 0.93, + "grad_norm": 3.091295217399527, + "learning_rate": 1.1185965775505847e-07, + "loss": 0.535, + "step": 10641 + }, + { + "epoch": 0.93, + "grad_norm": 2.5338019050209515, + "learning_rate": 1.1156068588239088e-07, + "loss": 0.4342, + "step": 10642 + }, + { + "epoch": 0.93, + "grad_norm": 7.508349620005279, + "learning_rate": 1.1126210957410066e-07, + "loss": 0.6816, + "step": 10643 + }, + { + "epoch": 0.93, + "grad_norm": 4.380407703807599, + "learning_rate": 1.1096392885434904e-07, + "loss": 0.7244, + "step": 10644 + }, + { + "epoch": 0.93, + "grad_norm": 6.09650335935954, + "learning_rate": 1.1066614374726392e-07, + "loss": 0.5819, + "step": 10645 + }, + { + "epoch": 0.93, + "grad_norm": 7.460449436014874, + "learning_rate": 1.1036875427694049e-07, + "loss": 0.6126, + "step": 10646 + }, + { + "epoch": 0.94, + "grad_norm": 6.229346813781876, + "learning_rate": 1.1007176046744506e-07, + "loss": 0.6343, + "step": 10647 + }, + { + "epoch": 0.94, + "grad_norm": 6.918297983314238, + "learning_rate": 1.0977516234280839e-07, + "loss": 0.6529, + "step": 10648 + }, + { + "epoch": 0.94, + "grad_norm": 6.562559866317543, + "learning_rate": 1.0947895992703129e-07, + "loss": 0.6035, + "step": 10649 + }, + { + "epoch": 0.94, + "grad_norm": 4.920246699611343, + "learning_rate": 1.091831532440818e-07, + "loss": 0.7219, + "step": 10650 + }, + { + "epoch": 0.94, + "grad_norm": 13.055338848121632, + "learning_rate": 1.0888774231789523e-07, + "loss": 0.7448, + "step": 10651 + }, + { + "epoch": 0.94, + "grad_norm": 8.122016622709214, + "learning_rate": 1.0859272717237745e-07, + "loss": 0.7335, + "step": 10652 + }, + { + "epoch": 0.94, + "grad_norm": 6.971214702258431, + "learning_rate": 1.0829810783139993e-07, + "loss": 0.5439, + "step": 10653 + }, + { + "epoch": 0.94, + "grad_norm": 7.859037723466648, + "learning_rate": 1.0800388431880193e-07, + "loss": 0.6602, + "step": 10654 + }, + { + "epoch": 0.94, + "grad_norm": 17.16237985300054, + "learning_rate": 1.0771005665839274e-07, + "loss": 0.7166, + "step": 10655 + }, + { + "epoch": 0.94, + "grad_norm": 13.098646508359945, + "learning_rate": 1.0741662487394778e-07, + "loss": 0.6947, + "step": 10656 + }, + { + "epoch": 0.94, + "grad_norm": 8.910616964883118, + "learning_rate": 1.0712358898921083e-07, + "loss": 0.7436, + "step": 10657 + }, + { + "epoch": 0.94, + "grad_norm": 11.132650525514949, + "learning_rate": 1.0683094902789404e-07, + "loss": 0.7382, + "step": 10658 + }, + { + "epoch": 0.94, + 
"grad_norm": 13.323000756328055, + "learning_rate": 1.0653870501367792e-07, + "loss": 0.7088, + "step": 10659 + }, + { + "epoch": 0.94, + "grad_norm": 7.155037969501402, + "learning_rate": 1.062468569702102e-07, + "loss": 0.7199, + "step": 10660 + }, + { + "epoch": 0.94, + "grad_norm": 6.374171430132762, + "learning_rate": 1.059554049211059e-07, + "loss": 0.6959, + "step": 10661 + }, + { + "epoch": 0.94, + "grad_norm": 7.3121567087449115, + "learning_rate": 1.0566434888995003e-07, + "loss": 0.6839, + "step": 10662 + }, + { + "epoch": 0.94, + "grad_norm": 5.432709975828398, + "learning_rate": 1.0537368890029265e-07, + "loss": 0.7065, + "step": 10663 + }, + { + "epoch": 0.94, + "grad_norm": 8.15700800038405, + "learning_rate": 1.0508342497565549e-07, + "loss": 0.6187, + "step": 10664 + }, + { + "epoch": 0.94, + "grad_norm": 3.22982652736421, + "learning_rate": 1.0479355713952478e-07, + "loss": 0.483, + "step": 10665 + }, + { + "epoch": 0.94, + "grad_norm": 14.483920198353625, + "learning_rate": 1.0450408541535728e-07, + "loss": 0.6818, + "step": 10666 + }, + { + "epoch": 0.94, + "grad_norm": 8.858152314465645, + "learning_rate": 1.042150098265754e-07, + "loss": 0.7646, + "step": 10667 + }, + { + "epoch": 0.94, + "grad_norm": 19.906897146756798, + "learning_rate": 1.0392633039657151e-07, + "loss": 0.6533, + "step": 10668 + }, + { + "epoch": 0.94, + "grad_norm": 8.061887994109853, + "learning_rate": 1.0363804714870418e-07, + "loss": 0.6855, + "step": 10669 + }, + { + "epoch": 0.94, + "grad_norm": 6.96141806201965, + "learning_rate": 1.0335016010630194e-07, + "loss": 0.6039, + "step": 10670 + }, + { + "epoch": 0.94, + "grad_norm": 6.552379841058063, + "learning_rate": 1.0306266929265951e-07, + "loss": 0.6903, + "step": 10671 + }, + { + "epoch": 0.94, + "grad_norm": 14.903332039831673, + "learning_rate": 1.0277557473104049e-07, + "loss": 0.7592, + "step": 10672 + }, + { + "epoch": 0.94, + "grad_norm": 7.79882511651113, + "learning_rate": 1.0248887644467575e-07, + "loss": 0.657, + "step": 10673 + }, + { + "epoch": 0.94, + "grad_norm": 7.99517407153259, + "learning_rate": 1.0220257445676395e-07, + "loss": 0.8667, + "step": 10674 + }, + { + "epoch": 0.94, + "grad_norm": 6.454449820936493, + "learning_rate": 1.0191666879047323e-07, + "loss": 0.7521, + "step": 10675 + }, + { + "epoch": 0.94, + "grad_norm": 5.92846502473007, + "learning_rate": 1.0163115946893842e-07, + "loss": 0.6527, + "step": 10676 + }, + { + "epoch": 0.94, + "grad_norm": 6.306521650619101, + "learning_rate": 1.0134604651526158e-07, + "loss": 0.7613, + "step": 10677 + }, + { + "epoch": 0.94, + "grad_norm": 5.380954081890428, + "learning_rate": 1.0106132995251483e-07, + "loss": 0.6521, + "step": 10678 + }, + { + "epoch": 0.94, + "grad_norm": 2.363946059842102, + "learning_rate": 1.0077700980373584e-07, + "loss": 0.4383, + "step": 10679 + }, + { + "epoch": 0.94, + "grad_norm": 10.54925585345759, + "learning_rate": 1.004930860919312e-07, + "loss": 0.8693, + "step": 10680 + }, + { + "epoch": 0.94, + "grad_norm": 7.925124467314804, + "learning_rate": 1.0020955884007699e-07, + "loss": 0.8401, + "step": 10681 + }, + { + "epoch": 0.94, + "grad_norm": 8.32393292259091, + "learning_rate": 9.992642807111486e-08, + "loss": 0.5357, + "step": 10682 + }, + { + "epoch": 0.94, + "grad_norm": 7.999883381331501, + "learning_rate": 9.964369380795536e-08, + "loss": 0.728, + "step": 10683 + }, + { + "epoch": 0.94, + "grad_norm": 10.99720597334235, + "learning_rate": 9.93613560734763e-08, + "loss": 0.9091, + "step": 10684 + }, + { + "epoch": 0.94, + 
"grad_norm": 9.672357605714533, + "learning_rate": 9.907941489052497e-08, + "loss": 0.6523, + "step": 10685 + }, + { + "epoch": 0.94, + "grad_norm": 6.548210166856567, + "learning_rate": 9.879787028191479e-08, + "loss": 0.7684, + "step": 10686 + }, + { + "epoch": 0.94, + "grad_norm": 6.450872969429546, + "learning_rate": 9.851672227042807e-08, + "loss": 0.8329, + "step": 10687 + }, + { + "epoch": 0.94, + "grad_norm": 11.367332473413818, + "learning_rate": 9.823597087881553e-08, + "loss": 0.7736, + "step": 10688 + }, + { + "epoch": 0.94, + "grad_norm": 13.109573213405215, + "learning_rate": 9.795561612979454e-08, + "loss": 0.8146, + "step": 10689 + }, + { + "epoch": 0.94, + "grad_norm": 10.77422653862773, + "learning_rate": 9.767565804605084e-08, + "loss": 0.6083, + "step": 10690 + }, + { + "epoch": 0.94, + "grad_norm": 9.588822289945984, + "learning_rate": 9.739609665023741e-08, + "loss": 0.6977, + "step": 10691 + }, + { + "epoch": 0.94, + "grad_norm": 3.2042582714880345, + "learning_rate": 9.711693196497785e-08, + "loss": 0.6271, + "step": 10692 + }, + { + "epoch": 0.94, + "grad_norm": 9.708085296954264, + "learning_rate": 9.683816401286017e-08, + "loss": 0.7581, + "step": 10693 + }, + { + "epoch": 0.94, + "grad_norm": 8.066769036491923, + "learning_rate": 9.655979281644246e-08, + "loss": 0.7787, + "step": 10694 + }, + { + "epoch": 0.94, + "grad_norm": 8.809448462554846, + "learning_rate": 9.628181839825001e-08, + "loss": 0.5913, + "step": 10695 + }, + { + "epoch": 0.94, + "grad_norm": 2.5078945914707074, + "learning_rate": 9.600424078077541e-08, + "loss": 0.4483, + "step": 10696 + }, + { + "epoch": 0.94, + "grad_norm": 10.388887790846464, + "learning_rate": 9.572705998648069e-08, + "loss": 0.7787, + "step": 10697 + }, + { + "epoch": 0.94, + "grad_norm": 9.759634516434245, + "learning_rate": 9.5450276037794e-08, + "loss": 0.785, + "step": 10698 + }, + { + "epoch": 0.94, + "grad_norm": 8.537989030522537, + "learning_rate": 9.5173888957113e-08, + "loss": 0.7688, + "step": 10699 + }, + { + "epoch": 0.94, + "grad_norm": 9.839877277372185, + "learning_rate": 9.4897898766802e-08, + "loss": 0.6323, + "step": 10700 + }, + { + "epoch": 0.94, + "grad_norm": 3.169033328862792, + "learning_rate": 9.462230548919426e-08, + "loss": 0.4793, + "step": 10701 + }, + { + "epoch": 0.94, + "grad_norm": 8.311255636363054, + "learning_rate": 9.434710914658973e-08, + "loss": 0.6383, + "step": 10702 + }, + { + "epoch": 0.94, + "grad_norm": 6.650024538016993, + "learning_rate": 9.407230976125669e-08, + "loss": 0.6629, + "step": 10703 + }, + { + "epoch": 0.94, + "grad_norm": 14.604850137170404, + "learning_rate": 9.379790735543182e-08, + "loss": 0.7287, + "step": 10704 + }, + { + "epoch": 0.94, + "grad_norm": 8.341900773017537, + "learning_rate": 9.352390195131899e-08, + "loss": 0.8953, + "step": 10705 + }, + { + "epoch": 0.94, + "grad_norm": 7.530622920169007, + "learning_rate": 9.325029357109105e-08, + "loss": 0.7759, + "step": 10706 + }, + { + "epoch": 0.94, + "grad_norm": 11.214760794680528, + "learning_rate": 9.29770822368875e-08, + "loss": 0.7932, + "step": 10707 + }, + { + "epoch": 0.94, + "grad_norm": 6.302795740253628, + "learning_rate": 9.270426797081567e-08, + "loss": 0.8245, + "step": 10708 + }, + { + "epoch": 0.94, + "grad_norm": 36.30337384923203, + "learning_rate": 9.243185079495177e-08, + "loss": 0.7229, + "step": 10709 + }, + { + "epoch": 0.94, + "grad_norm": 12.014992167421743, + "learning_rate": 9.215983073133872e-08, + "loss": 0.8121, + "step": 10710 + }, + { + "epoch": 0.94, + "grad_norm": 
10.948099041544125, + "learning_rate": 9.188820780198893e-08, + "loss": 0.7177, + "step": 10711 + }, + { + "epoch": 0.94, + "grad_norm": 26.18276522149082, + "learning_rate": 9.161698202888092e-08, + "loss": 0.6937, + "step": 10712 + }, + { + "epoch": 0.94, + "grad_norm": 7.898856706688678, + "learning_rate": 9.134615343396214e-08, + "loss": 0.7329, + "step": 10713 + }, + { + "epoch": 0.94, + "grad_norm": 8.2084862528557, + "learning_rate": 9.107572203914728e-08, + "loss": 0.6836, + "step": 10714 + }, + { + "epoch": 0.94, + "grad_norm": 5.843054486206257, + "learning_rate": 9.080568786631939e-08, + "loss": 0.6542, + "step": 10715 + }, + { + "epoch": 0.94, + "grad_norm": 13.997913213468147, + "learning_rate": 9.053605093732931e-08, + "loss": 0.7207, + "step": 10716 + }, + { + "epoch": 0.94, + "grad_norm": 7.513183960247778, + "learning_rate": 9.026681127399573e-08, + "loss": 0.6978, + "step": 10717 + }, + { + "epoch": 0.94, + "grad_norm": 3.2711325761849386, + "learning_rate": 8.999796889810508e-08, + "loss": 0.5141, + "step": 10718 + }, + { + "epoch": 0.94, + "grad_norm": 7.216106852535387, + "learning_rate": 8.972952383141109e-08, + "loss": 0.702, + "step": 10719 + }, + { + "epoch": 0.94, + "grad_norm": 14.23496210195915, + "learning_rate": 8.946147609563637e-08, + "loss": 0.8217, + "step": 10720 + }, + { + "epoch": 0.94, + "grad_norm": 6.837620739334205, + "learning_rate": 8.91938257124697e-08, + "loss": 0.6815, + "step": 10721 + }, + { + "epoch": 0.94, + "grad_norm": 27.78514243810042, + "learning_rate": 8.892657270357097e-08, + "loss": 0.7745, + "step": 10722 + }, + { + "epoch": 0.94, + "grad_norm": 2.9039943779538993, + "learning_rate": 8.865971709056509e-08, + "loss": 0.4839, + "step": 10723 + }, + { + "epoch": 0.94, + "grad_norm": 10.193534113259346, + "learning_rate": 8.83932588950448e-08, + "loss": 0.9197, + "step": 10724 + }, + { + "epoch": 0.94, + "grad_norm": 2.903664477165216, + "learning_rate": 8.812719813857228e-08, + "loss": 0.5479, + "step": 10725 + }, + { + "epoch": 0.94, + "grad_norm": 7.880637656691129, + "learning_rate": 8.78615348426759e-08, + "loss": 0.7294, + "step": 10726 + }, + { + "epoch": 0.94, + "grad_norm": 7.032680245590444, + "learning_rate": 8.759626902885343e-08, + "loss": 0.6757, + "step": 10727 + }, + { + "epoch": 0.94, + "grad_norm": 5.770544806275132, + "learning_rate": 8.733140071856939e-08, + "loss": 0.5115, + "step": 10728 + }, + { + "epoch": 0.94, + "grad_norm": 6.47049847295932, + "learning_rate": 8.70669299332566e-08, + "loss": 0.7806, + "step": 10729 + }, + { + "epoch": 0.94, + "grad_norm": 8.650649064427762, + "learning_rate": 8.680285669431632e-08, + "loss": 0.5898, + "step": 10730 + }, + { + "epoch": 0.94, + "grad_norm": 13.948700439629855, + "learning_rate": 8.653918102311532e-08, + "loss": 0.8773, + "step": 10731 + }, + { + "epoch": 0.94, + "grad_norm": 2.4672759275763942, + "learning_rate": 8.627590294099098e-08, + "loss": 0.492, + "step": 10732 + }, + { + "epoch": 0.94, + "grad_norm": 8.423517011170853, + "learning_rate": 8.601302246924681e-08, + "loss": 0.6742, + "step": 10733 + }, + { + "epoch": 0.94, + "grad_norm": 7.809183181280782, + "learning_rate": 8.575053962915525e-08, + "loss": 0.7771, + "step": 10734 + }, + { + "epoch": 0.94, + "grad_norm": 8.619808858293757, + "learning_rate": 8.548845444195541e-08, + "loss": 0.6882, + "step": 10735 + }, + { + "epoch": 0.94, + "grad_norm": 6.949371811451625, + "learning_rate": 8.522676692885479e-08, + "loss": 0.7291, + "step": 10736 + }, + { + "epoch": 0.94, + "grad_norm": 10.49350118667982, + 
"learning_rate": 8.49654771110292e-08, + "loss": 0.5874, + "step": 10737 + }, + { + "epoch": 0.94, + "grad_norm": 7.21838604956297, + "learning_rate": 8.47045850096212e-08, + "loss": 0.8055, + "step": 10738 + }, + { + "epoch": 0.94, + "grad_norm": 7.672793427890066, + "learning_rate": 8.444409064574222e-08, + "loss": 0.7287, + "step": 10739 + }, + { + "epoch": 0.94, + "grad_norm": 9.438905712517036, + "learning_rate": 8.41839940404704e-08, + "loss": 0.7349, + "step": 10740 + }, + { + "epoch": 0.94, + "grad_norm": 7.953689972415157, + "learning_rate": 8.392429521485335e-08, + "loss": 0.6412, + "step": 10741 + }, + { + "epoch": 0.94, + "grad_norm": 15.059291308724172, + "learning_rate": 8.366499418990482e-08, + "loss": 0.7166, + "step": 10742 + }, + { + "epoch": 0.94, + "grad_norm": 12.405227370341567, + "learning_rate": 8.34060909866069e-08, + "loss": 0.6998, + "step": 10743 + }, + { + "epoch": 0.94, + "grad_norm": 11.315002289533592, + "learning_rate": 8.314758562590896e-08, + "loss": 0.5811, + "step": 10744 + }, + { + "epoch": 0.94, + "grad_norm": 4.993645487856086, + "learning_rate": 8.288947812873038e-08, + "loss": 0.6495, + "step": 10745 + }, + { + "epoch": 0.94, + "grad_norm": 9.931598196313708, + "learning_rate": 8.263176851595612e-08, + "loss": 0.7433, + "step": 10746 + }, + { + "epoch": 0.94, + "grad_norm": 11.75823216994801, + "learning_rate": 8.237445680843947e-08, + "loss": 0.7531, + "step": 10747 + }, + { + "epoch": 0.94, + "grad_norm": 5.937748173129991, + "learning_rate": 8.211754302700159e-08, + "loss": 0.6438, + "step": 10748 + }, + { + "epoch": 0.94, + "grad_norm": 5.989694891559891, + "learning_rate": 8.186102719243194e-08, + "loss": 0.7425, + "step": 10749 + }, + { + "epoch": 0.94, + "grad_norm": 16.185367806854995, + "learning_rate": 8.16049093254867e-08, + "loss": 0.8014, + "step": 10750 + }, + { + "epoch": 0.94, + "grad_norm": 2.6409431897888176, + "learning_rate": 8.134918944689096e-08, + "loss": 0.5161, + "step": 10751 + }, + { + "epoch": 0.94, + "grad_norm": 5.980809276968603, + "learning_rate": 8.109386757733705e-08, + "loss": 0.5313, + "step": 10752 + }, + { + "epoch": 0.94, + "grad_norm": 6.589433756569764, + "learning_rate": 8.083894373748514e-08, + "loss": 0.7467, + "step": 10753 + }, + { + "epoch": 0.94, + "grad_norm": 6.272055254110136, + "learning_rate": 8.058441794796368e-08, + "loss": 0.7779, + "step": 10754 + }, + { + "epoch": 0.94, + "grad_norm": 7.23225032857959, + "learning_rate": 8.033029022936734e-08, + "loss": 0.5822, + "step": 10755 + }, + { + "epoch": 0.94, + "grad_norm": 6.6766476144386395, + "learning_rate": 8.007656060226133e-08, + "loss": 0.6994, + "step": 10756 + }, + { + "epoch": 0.94, + "grad_norm": 8.985128629076312, + "learning_rate": 7.982322908717533e-08, + "loss": 0.7078, + "step": 10757 + }, + { + "epoch": 0.94, + "grad_norm": 6.997482138913426, + "learning_rate": 7.957029570461017e-08, + "loss": 0.7788, + "step": 10758 + }, + { + "epoch": 0.94, + "grad_norm": 5.830970047960535, + "learning_rate": 7.93177604750317e-08, + "loss": 0.6354, + "step": 10759 + }, + { + "epoch": 0.94, + "grad_norm": 13.638438053166878, + "learning_rate": 7.906562341887469e-08, + "loss": 0.7891, + "step": 10760 + }, + { + "epoch": 0.95, + "grad_norm": 8.3643946956025, + "learning_rate": 7.881388455654171e-08, + "loss": 0.7893, + "step": 10761 + }, + { + "epoch": 0.95, + "grad_norm": 7.192385343034529, + "learning_rate": 7.856254390840367e-08, + "loss": 0.5713, + "step": 10762 + }, + { + "epoch": 0.95, + "grad_norm": 6.963072451489396, + "learning_rate": 
7.831160149479766e-08, + "loss": 0.6627, + "step": 10763 + }, + { + "epoch": 0.95, + "grad_norm": 8.048792330025423, + "learning_rate": 7.806105733603075e-08, + "loss": 0.7466, + "step": 10764 + }, + { + "epoch": 0.95, + "grad_norm": 11.053304352453948, + "learning_rate": 7.781091145237563e-08, + "loss": 0.8034, + "step": 10765 + }, + { + "epoch": 0.95, + "grad_norm": 6.8265694778883725, + "learning_rate": 7.756116386407386e-08, + "loss": 0.6903, + "step": 10766 + }, + { + "epoch": 0.95, + "grad_norm": 8.921060521760534, + "learning_rate": 7.731181459133375e-08, + "loss": 0.6509, + "step": 10767 + }, + { + "epoch": 0.95, + "grad_norm": 7.372363258222512, + "learning_rate": 7.706286365433413e-08, + "loss": 0.6582, + "step": 10768 + }, + { + "epoch": 0.95, + "grad_norm": 7.627733565309569, + "learning_rate": 7.681431107321891e-08, + "loss": 0.7323, + "step": 10769 + }, + { + "epoch": 0.95, + "grad_norm": 5.32017671758721, + "learning_rate": 7.656615686809976e-08, + "loss": 0.6145, + "step": 10770 + }, + { + "epoch": 0.95, + "grad_norm": 20.15001131166634, + "learning_rate": 7.631840105905786e-08, + "loss": 0.6526, + "step": 10771 + }, + { + "epoch": 0.95, + "grad_norm": 6.410213424446668, + "learning_rate": 7.607104366614049e-08, + "loss": 0.6265, + "step": 10772 + }, + { + "epoch": 0.95, + "grad_norm": 8.685410338108074, + "learning_rate": 7.582408470936387e-08, + "loss": 0.6115, + "step": 10773 + }, + { + "epoch": 0.95, + "grad_norm": 11.096673140588598, + "learning_rate": 7.557752420871146e-08, + "loss": 0.7254, + "step": 10774 + }, + { + "epoch": 0.95, + "grad_norm": 11.637951617894123, + "learning_rate": 7.533136218413396e-08, + "loss": 0.6547, + "step": 10775 + }, + { + "epoch": 0.95, + "grad_norm": 7.84809517574351, + "learning_rate": 7.508559865555154e-08, + "loss": 0.6883, + "step": 10776 + }, + { + "epoch": 0.95, + "grad_norm": 7.235910676328692, + "learning_rate": 7.484023364284998e-08, + "loss": 0.7171, + "step": 10777 + }, + { + "epoch": 0.95, + "grad_norm": 9.734572044770768, + "learning_rate": 7.459526716588394e-08, + "loss": 0.6397, + "step": 10778 + }, + { + "epoch": 0.95, + "grad_norm": 8.664620083661912, + "learning_rate": 7.435069924447591e-08, + "loss": 0.6685, + "step": 10779 + }, + { + "epoch": 0.95, + "grad_norm": 9.044007738540207, + "learning_rate": 7.410652989841615e-08, + "loss": 0.7721, + "step": 10780 + }, + { + "epoch": 0.95, + "grad_norm": 12.500563372410845, + "learning_rate": 7.386275914746222e-08, + "loss": 0.5882, + "step": 10781 + }, + { + "epoch": 0.95, + "grad_norm": 3.8428257430348736, + "learning_rate": 7.361938701133996e-08, + "loss": 0.5334, + "step": 10782 + }, + { + "epoch": 0.95, + "grad_norm": 15.462808960845512, + "learning_rate": 7.3376413509742e-08, + "loss": 0.8127, + "step": 10783 + }, + { + "epoch": 0.95, + "grad_norm": 5.356825741548106, + "learning_rate": 7.313383866232925e-08, + "loss": 0.5976, + "step": 10784 + }, + { + "epoch": 0.95, + "grad_norm": 7.776613017920412, + "learning_rate": 7.289166248873103e-08, + "loss": 0.6396, + "step": 10785 + }, + { + "epoch": 0.95, + "grad_norm": 11.972076308303222, + "learning_rate": 7.264988500854386e-08, + "loss": 0.6231, + "step": 10786 + }, + { + "epoch": 0.95, + "grad_norm": 10.032356466174452, + "learning_rate": 7.240850624133155e-08, + "loss": 0.8443, + "step": 10787 + }, + { + "epoch": 0.95, + "grad_norm": 8.914947676534966, + "learning_rate": 7.216752620662627e-08, + "loss": 0.5957, + "step": 10788 + }, + { + "epoch": 0.95, + "grad_norm": 7.187852780743257, + "learning_rate": 
7.192694492392794e-08, + "loss": 0.7456, + "step": 10789 + }, + { + "epoch": 0.95, + "grad_norm": 6.564226916777111, + "learning_rate": 7.168676241270322e-08, + "loss": 0.6132, + "step": 10790 + }, + { + "epoch": 0.95, + "grad_norm": 4.801507366562395, + "learning_rate": 7.144697869238881e-08, + "loss": 0.7308, + "step": 10791 + }, + { + "epoch": 0.95, + "grad_norm": 12.836059850012242, + "learning_rate": 7.120759378238585e-08, + "loss": 0.7487, + "step": 10792 + }, + { + "epoch": 0.95, + "grad_norm": 10.726684986770605, + "learning_rate": 7.096860770206659e-08, + "loss": 0.5897, + "step": 10793 + }, + { + "epoch": 0.95, + "grad_norm": 7.672382497905008, + "learning_rate": 7.073002047076783e-08, + "loss": 0.8094, + "step": 10794 + }, + { + "epoch": 0.95, + "grad_norm": 7.6662296000696655, + "learning_rate": 7.04918321077963e-08, + "loss": 0.7692, + "step": 10795 + }, + { + "epoch": 0.95, + "grad_norm": 7.258858186001823, + "learning_rate": 7.025404263242664e-08, + "loss": 0.6447, + "step": 10796 + }, + { + "epoch": 0.95, + "grad_norm": 13.607379279518467, + "learning_rate": 7.001665206389895e-08, + "loss": 0.7032, + "step": 10797 + }, + { + "epoch": 0.95, + "grad_norm": 5.18069942971439, + "learning_rate": 6.977966042142403e-08, + "loss": 0.6044, + "step": 10798 + }, + { + "epoch": 0.95, + "grad_norm": 6.754263429012323, + "learning_rate": 6.954306772417707e-08, + "loss": 0.7831, + "step": 10799 + }, + { + "epoch": 0.95, + "grad_norm": 2.588325651084336, + "learning_rate": 6.930687399130442e-08, + "loss": 0.509, + "step": 10800 + }, + { + "epoch": 0.95, + "grad_norm": 8.09606932783011, + "learning_rate": 6.907107924191692e-08, + "loss": 0.7059, + "step": 10801 + }, + { + "epoch": 0.95, + "grad_norm": 5.92677780347473, + "learning_rate": 6.883568349509596e-08, + "loss": 0.7015, + "step": 10802 + }, + { + "epoch": 0.95, + "grad_norm": 13.926974419475545, + "learning_rate": 6.860068676988907e-08, + "loss": 0.6989, + "step": 10803 + }, + { + "epoch": 0.95, + "grad_norm": 5.669379863089404, + "learning_rate": 6.836608908531162e-08, + "loss": 0.6799, + "step": 10804 + }, + { + "epoch": 0.95, + "grad_norm": 5.927731254098361, + "learning_rate": 6.813189046034674e-08, + "loss": 0.6109, + "step": 10805 + }, + { + "epoch": 0.95, + "grad_norm": 13.031136919523384, + "learning_rate": 6.789809091394539e-08, + "loss": 0.7425, + "step": 10806 + }, + { + "epoch": 0.95, + "grad_norm": 6.436357234176299, + "learning_rate": 6.766469046502633e-08, + "loss": 0.8381, + "step": 10807 + }, + { + "epoch": 0.95, + "grad_norm": 2.6235878300435473, + "learning_rate": 6.743168913247611e-08, + "loss": 0.5709, + "step": 10808 + }, + { + "epoch": 0.95, + "grad_norm": 8.174882477392355, + "learning_rate": 6.71990869351491e-08, + "loss": 0.8232, + "step": 10809 + }, + { + "epoch": 0.95, + "grad_norm": 10.832775614929078, + "learning_rate": 6.696688389186634e-08, + "loss": 0.6669, + "step": 10810 + }, + { + "epoch": 0.95, + "grad_norm": 6.2631057270309825, + "learning_rate": 6.673508002141782e-08, + "loss": 0.7451, + "step": 10811 + }, + { + "epoch": 0.95, + "grad_norm": 7.7935571998476085, + "learning_rate": 6.650367534256075e-08, + "loss": 0.7325, + "step": 10812 + }, + { + "epoch": 0.95, + "grad_norm": 8.980546125259767, + "learning_rate": 6.627266987401904e-08, + "loss": 0.6765, + "step": 10813 + }, + { + "epoch": 0.95, + "grad_norm": 3.656344085996391, + "learning_rate": 6.604206363448662e-08, + "loss": 0.5648, + "step": 10814 + }, + { + "epoch": 0.95, + "grad_norm": 2.6386210480224364, + "learning_rate": 
6.581185664262301e-08, + "loss": 0.5019, + "step": 10815 + }, + { + "epoch": 0.95, + "grad_norm": 5.954249036499854, + "learning_rate": 6.558204891705666e-08, + "loss": 0.6097, + "step": 10816 + }, + { + "epoch": 0.95, + "grad_norm": 16.16288931020793, + "learning_rate": 6.535264047638268e-08, + "loss": 0.6924, + "step": 10817 + }, + { + "epoch": 0.95, + "grad_norm": 8.515200114842527, + "learning_rate": 6.512363133916454e-08, + "loss": 0.7861, + "step": 10818 + }, + { + "epoch": 0.95, + "grad_norm": 11.24583150495423, + "learning_rate": 6.489502152393356e-08, + "loss": 0.7054, + "step": 10819 + }, + { + "epoch": 0.95, + "grad_norm": 8.344975439732481, + "learning_rate": 6.466681104918826e-08, + "loss": 0.702, + "step": 10820 + }, + { + "epoch": 0.95, + "grad_norm": 7.588492704880469, + "learning_rate": 6.443899993339554e-08, + "loss": 0.7313, + "step": 10821 + }, + { + "epoch": 0.95, + "grad_norm": 6.713798909715843, + "learning_rate": 6.421158819498841e-08, + "loss": 0.6362, + "step": 10822 + }, + { + "epoch": 0.95, + "grad_norm": 10.373709934830716, + "learning_rate": 6.398457585236995e-08, + "loss": 0.711, + "step": 10823 + }, + { + "epoch": 0.95, + "grad_norm": 9.745048248417755, + "learning_rate": 6.375796292390823e-08, + "loss": 0.6945, + "step": 10824 + }, + { + "epoch": 0.95, + "grad_norm": 6.286383698475693, + "learning_rate": 6.353174942794138e-08, + "loss": 0.7995, + "step": 10825 + }, + { + "epoch": 0.95, + "grad_norm": 11.065057886489106, + "learning_rate": 6.330593538277419e-08, + "loss": 0.712, + "step": 10826 + }, + { + "epoch": 0.95, + "grad_norm": 2.896676711531836, + "learning_rate": 6.30805208066787e-08, + "loss": 0.506, + "step": 10827 + }, + { + "epoch": 0.95, + "grad_norm": 3.4847223490383423, + "learning_rate": 6.285550571789589e-08, + "loss": 0.5332, + "step": 10828 + }, + { + "epoch": 0.95, + "grad_norm": 11.67972971536469, + "learning_rate": 6.26308901346323e-08, + "loss": 0.8096, + "step": 10829 + }, + { + "epoch": 0.95, + "grad_norm": 7.908599301558263, + "learning_rate": 6.240667407506452e-08, + "loss": 0.6154, + "step": 10830 + }, + { + "epoch": 0.95, + "grad_norm": 8.324312716096829, + "learning_rate": 6.218285755733522e-08, + "loss": 0.6946, + "step": 10831 + }, + { + "epoch": 0.95, + "grad_norm": 3.6642981565345916, + "learning_rate": 6.195944059955606e-08, + "loss": 0.479, + "step": 10832 + }, + { + "epoch": 0.95, + "grad_norm": 9.505628087966075, + "learning_rate": 6.173642321980422e-08, + "loss": 0.5611, + "step": 10833 + }, + { + "epoch": 0.95, + "grad_norm": 13.973616281747441, + "learning_rate": 6.151380543612695e-08, + "loss": 0.767, + "step": 10834 + }, + { + "epoch": 0.95, + "grad_norm": 9.011191964708074, + "learning_rate": 6.12915872665376e-08, + "loss": 0.6568, + "step": 10835 + }, + { + "epoch": 0.95, + "grad_norm": 9.833941902300957, + "learning_rate": 6.106976872901793e-08, + "loss": 0.7117, + "step": 10836 + }, + { + "epoch": 0.95, + "grad_norm": 6.579985829479544, + "learning_rate": 6.084834984151744e-08, + "loss": 0.6347, + "step": 10837 + }, + { + "epoch": 0.95, + "grad_norm": 6.778226769210727, + "learning_rate": 6.062733062195236e-08, + "loss": 0.7219, + "step": 10838 + }, + { + "epoch": 0.95, + "grad_norm": 12.783853728171986, + "learning_rate": 6.040671108820783e-08, + "loss": 0.6655, + "step": 10839 + }, + { + "epoch": 0.95, + "grad_norm": 5.683524542401596, + "learning_rate": 6.018649125813513e-08, + "loss": 0.6621, + "step": 10840 + }, + { + "epoch": 0.95, + "grad_norm": 5.288034789100126, + "learning_rate": 
5.996667114955446e-08, + "loss": 0.6498, + "step": 10841 + }, + { + "epoch": 0.95, + "grad_norm": 4.5933444832219505, + "learning_rate": 5.974725078025379e-08, + "loss": 0.5292, + "step": 10842 + }, + { + "epoch": 0.95, + "grad_norm": 11.800647223405214, + "learning_rate": 5.952823016798837e-08, + "loss": 0.6974, + "step": 10843 + }, + { + "epoch": 0.95, + "grad_norm": 9.632773951899253, + "learning_rate": 5.9309609330479577e-08, + "loss": 0.7387, + "step": 10844 + }, + { + "epoch": 0.95, + "grad_norm": 53.98603925995194, + "learning_rate": 5.909138828541938e-08, + "loss": 0.6778, + "step": 10845 + }, + { + "epoch": 0.95, + "grad_norm": 9.105747696893527, + "learning_rate": 5.8873567050465316e-08, + "loss": 0.6823, + "step": 10846 + }, + { + "epoch": 0.95, + "grad_norm": 3.2321074042744686, + "learning_rate": 5.865614564324273e-08, + "loss": 0.5084, + "step": 10847 + }, + { + "epoch": 0.95, + "grad_norm": 10.820733172712062, + "learning_rate": 5.8439124081345336e-08, + "loss": 0.6448, + "step": 10848 + }, + { + "epoch": 0.95, + "grad_norm": 5.975564403434506, + "learning_rate": 5.822250238233407e-08, + "loss": 0.6776, + "step": 10849 + }, + { + "epoch": 0.95, + "grad_norm": 9.09019274764254, + "learning_rate": 5.800628056373825e-08, + "loss": 0.5846, + "step": 10850 + }, + { + "epoch": 0.95, + "grad_norm": 8.72589812696497, + "learning_rate": 5.779045864305333e-08, + "loss": 0.606, + "step": 10851 + }, + { + "epoch": 0.95, + "grad_norm": 6.456770499676617, + "learning_rate": 5.7575036637743106e-08, + "loss": 0.5113, + "step": 10852 + }, + { + "epoch": 0.95, + "grad_norm": 18.049959547294797, + "learning_rate": 5.7360014565240316e-08, + "loss": 0.6562, + "step": 10853 + }, + { + "epoch": 0.95, + "grad_norm": 7.210765399629086, + "learning_rate": 5.7145392442942704e-08, + "loss": 0.6363, + "step": 10854 + }, + { + "epoch": 0.95, + "grad_norm": 8.869758624113839, + "learning_rate": 5.6931170288218594e-08, + "loss": 0.7285, + "step": 10855 + }, + { + "epoch": 0.95, + "grad_norm": 2.297065195036521, + "learning_rate": 5.671734811840191e-08, + "loss": 0.4492, + "step": 10856 + }, + { + "epoch": 0.95, + "grad_norm": 10.348998427598573, + "learning_rate": 5.6503925950794344e-08, + "loss": 0.5851, + "step": 10857 + }, + { + "epoch": 0.95, + "grad_norm": 8.457778592820532, + "learning_rate": 5.6290903802665444e-08, + "loss": 0.8005, + "step": 10858 + }, + { + "epoch": 0.95, + "grad_norm": 10.943726265013002, + "learning_rate": 5.607828169125418e-08, + "loss": 0.6598, + "step": 10859 + }, + { + "epoch": 0.95, + "grad_norm": 6.565179091812432, + "learning_rate": 5.5866059633764015e-08, + "loss": 0.8769, + "step": 10860 + }, + { + "epoch": 0.95, + "grad_norm": 5.514205165322344, + "learning_rate": 5.565423764736844e-08, + "loss": 0.639, + "step": 10861 + }, + { + "epoch": 0.95, + "grad_norm": 3.7579502859113805, + "learning_rate": 5.544281574920707e-08, + "loss": 0.5142, + "step": 10862 + }, + { + "epoch": 0.95, + "grad_norm": 8.196176909205931, + "learning_rate": 5.5231793956389e-08, + "loss": 0.7687, + "step": 10863 + }, + { + "epoch": 0.95, + "grad_norm": 9.761624549544061, + "learning_rate": 5.50211722859878e-08, + "loss": 0.7008, + "step": 10864 + }, + { + "epoch": 0.95, + "grad_norm": 6.240392441808166, + "learning_rate": 5.481095075504872e-08, + "loss": 0.7924, + "step": 10865 + }, + { + "epoch": 0.95, + "grad_norm": 7.370319106087157, + "learning_rate": 5.460112938058093e-08, + "loss": 0.7981, + "step": 10866 + }, + { + "epoch": 0.95, + "grad_norm": 13.10732555778027, + "learning_rate": 
5.439170817956418e-08, + "loss": 0.6986, + "step": 10867 + }, + { + "epoch": 0.95, + "grad_norm": 2.6863970370319294, + "learning_rate": 5.418268716894326e-08, + "loss": 0.5058, + "step": 10868 + }, + { + "epoch": 0.95, + "grad_norm": 6.230834537283179, + "learning_rate": 5.397406636563296e-08, + "loss": 0.7071, + "step": 10869 + }, + { + "epoch": 0.95, + "grad_norm": 2.5988585499110592, + "learning_rate": 5.376584578651312e-08, + "loss": 0.4611, + "step": 10870 + }, + { + "epoch": 0.95, + "grad_norm": 7.639501403519943, + "learning_rate": 5.355802544843358e-08, + "loss": 0.761, + "step": 10871 + }, + { + "epoch": 0.95, + "grad_norm": 18.50853353617992, + "learning_rate": 5.335060536821091e-08, + "loss": 0.6083, + "step": 10872 + }, + { + "epoch": 0.95, + "grad_norm": 12.642852709652837, + "learning_rate": 5.314358556262944e-08, + "loss": 0.731, + "step": 10873 + }, + { + "epoch": 0.95, + "grad_norm": 8.893642051689985, + "learning_rate": 5.293696604843967e-08, + "loss": 0.7099, + "step": 10874 + }, + { + "epoch": 0.96, + "grad_norm": 7.320782344058031, + "learning_rate": 5.273074684236157e-08, + "loss": 0.675, + "step": 10875 + }, + { + "epoch": 0.96, + "grad_norm": 6.651867061311952, + "learning_rate": 5.252492796108233e-08, + "loss": 0.7272, + "step": 10876 + }, + { + "epoch": 0.96, + "grad_norm": 5.761019295584899, + "learning_rate": 5.2319509421255855e-08, + "loss": 0.706, + "step": 10877 + }, + { + "epoch": 0.96, + "grad_norm": 7.358072769965866, + "learning_rate": 5.211449123950496e-08, + "loss": 0.6693, + "step": 10878 + }, + { + "epoch": 0.96, + "grad_norm": 3.216935434121855, + "learning_rate": 5.1909873432419156e-08, + "loss": 0.5191, + "step": 10879 + }, + { + "epoch": 0.96, + "grad_norm": 7.431679710151156, + "learning_rate": 5.1705656016555196e-08, + "loss": 0.6684, + "step": 10880 + }, + { + "epoch": 0.96, + "grad_norm": 17.73439606745751, + "learning_rate": 5.150183900843875e-08, + "loss": 0.8107, + "step": 10881 + }, + { + "epoch": 0.96, + "grad_norm": 6.499538800573447, + "learning_rate": 5.129842242456218e-08, + "loss": 0.8168, + "step": 10882 + }, + { + "epoch": 0.96, + "grad_norm": 10.725530901731856, + "learning_rate": 5.1095406281385654e-08, + "loss": 0.7629, + "step": 10883 + }, + { + "epoch": 0.96, + "grad_norm": 8.831281208517984, + "learning_rate": 5.089279059533658e-08, + "loss": 0.6297, + "step": 10884 + }, + { + "epoch": 0.96, + "grad_norm": 11.133365711956614, + "learning_rate": 5.0690575382810195e-08, + "loss": 0.7209, + "step": 10885 + }, + { + "epoch": 0.96, + "grad_norm": 6.180757803739445, + "learning_rate": 5.0488760660170054e-08, + "loss": 0.6124, + "step": 10886 + }, + { + "epoch": 0.96, + "grad_norm": 7.333853084502503, + "learning_rate": 5.028734644374589e-08, + "loss": 0.783, + "step": 10887 + }, + { + "epoch": 0.96, + "grad_norm": 13.313090857508577, + "learning_rate": 5.008633274983687e-08, + "loss": 0.6133, + "step": 10888 + }, + { + "epoch": 0.96, + "grad_norm": 5.547007119736265, + "learning_rate": 4.988571959470778e-08, + "loss": 0.7015, + "step": 10889 + }, + { + "epoch": 0.96, + "grad_norm": 6.8683268233037555, + "learning_rate": 4.9685506994592294e-08, + "loss": 0.6646, + "step": 10890 + }, + { + "epoch": 0.96, + "grad_norm": 5.534442217618795, + "learning_rate": 4.948569496569078e-08, + "loss": 0.5922, + "step": 10891 + }, + { + "epoch": 0.96, + "grad_norm": 10.677483207538721, + "learning_rate": 4.9286283524171976e-08, + "loss": 0.6559, + "step": 10892 + }, + { + "epoch": 0.96, + "grad_norm": 7.571249388993312, + "learning_rate": 
4.908727268617241e-08, + "loss": 0.7001, + "step": 10893 + }, + { + "epoch": 0.96, + "grad_norm": 6.181106325248573, + "learning_rate": 4.888866246779533e-08, + "loss": 0.6272, + "step": 10894 + }, + { + "epoch": 0.96, + "grad_norm": 5.238101627082039, + "learning_rate": 4.869045288511176e-08, + "loss": 0.7741, + "step": 10895 + }, + { + "epoch": 0.96, + "grad_norm": 2.307854603829609, + "learning_rate": 4.849264395416054e-08, + "loss": 0.3916, + "step": 10896 + }, + { + "epoch": 0.96, + "grad_norm": 2.2324546521223456, + "learning_rate": 4.829523569094885e-08, + "loss": 0.5257, + "step": 10897 + }, + { + "epoch": 0.96, + "grad_norm": 6.543261573795957, + "learning_rate": 4.809822811144893e-08, + "loss": 0.6503, + "step": 10898 + }, + { + "epoch": 0.96, + "grad_norm": 5.503831665910411, + "learning_rate": 4.7901621231604135e-08, + "loss": 0.6927, + "step": 10899 + }, + { + "epoch": 0.96, + "grad_norm": 7.96786218073751, + "learning_rate": 4.770541506732229e-08, + "loss": 0.7794, + "step": 10900 + }, + { + "epoch": 0.96, + "grad_norm": 8.86312926220104, + "learning_rate": 4.75096096344807e-08, + "loss": 0.695, + "step": 10901 + }, + { + "epoch": 0.96, + "grad_norm": 8.707406403801432, + "learning_rate": 4.7314204948923356e-08, + "loss": 0.7124, + "step": 10902 + }, + { + "epoch": 0.96, + "grad_norm": 7.0042873213002395, + "learning_rate": 4.7119201026462614e-08, + "loss": 0.7744, + "step": 10903 + }, + { + "epoch": 0.96, + "grad_norm": 11.420787312475658, + "learning_rate": 4.692459788287695e-08, + "loss": 0.8353, + "step": 10904 + }, + { + "epoch": 0.96, + "grad_norm": 8.613471712712125, + "learning_rate": 4.6730395533913784e-08, + "loss": 0.6773, + "step": 10905 + }, + { + "epoch": 0.96, + "grad_norm": 2.949900171496503, + "learning_rate": 4.653659399528776e-08, + "loss": 0.492, + "step": 10906 + }, + { + "epoch": 0.96, + "grad_norm": 5.513829133416891, + "learning_rate": 4.634319328268133e-08, + "loss": 0.6275, + "step": 10907 + }, + { + "epoch": 0.96, + "grad_norm": 2.329704596746755, + "learning_rate": 4.615019341174309e-08, + "loss": 0.4326, + "step": 10908 + }, + { + "epoch": 0.96, + "grad_norm": 6.463230811078824, + "learning_rate": 4.5957594398091665e-08, + "loss": 0.7134, + "step": 10909 + }, + { + "epoch": 0.96, + "grad_norm": 15.23901805635469, + "learning_rate": 4.5765396257310134e-08, + "loss": 0.835, + "step": 10910 + }, + { + "epoch": 0.96, + "grad_norm": 6.452435732693069, + "learning_rate": 4.5573599004952174e-08, + "loss": 0.7149, + "step": 10911 + }, + { + "epoch": 0.96, + "grad_norm": 9.341335212237041, + "learning_rate": 4.538220265653759e-08, + "loss": 0.7877, + "step": 10912 + }, + { + "epoch": 0.96, + "grad_norm": 6.709145654105608, + "learning_rate": 4.5191207227553437e-08, + "loss": 0.8296, + "step": 10913 + }, + { + "epoch": 0.96, + "grad_norm": 8.564820050318058, + "learning_rate": 4.500061273345457e-08, + "loss": 0.7998, + "step": 10914 + }, + { + "epoch": 0.96, + "grad_norm": 11.498009308575705, + "learning_rate": 4.481041918966422e-08, + "loss": 0.7127, + "step": 10915 + }, + { + "epoch": 0.96, + "grad_norm": 10.326456138212375, + "learning_rate": 4.462062661157174e-08, + "loss": 0.6694, + "step": 10916 + }, + { + "epoch": 0.96, + "grad_norm": 7.097693586140491, + "learning_rate": 4.443123501453595e-08, + "loss": 0.6866, + "step": 10917 + }, + { + "epoch": 0.96, + "grad_norm": 6.0725596224623, + "learning_rate": 4.4242244413881254e-08, + "loss": 0.6712, + "step": 10918 + }, + { + "epoch": 0.96, + "grad_norm": 7.041943202323781, + "learning_rate": 
4.405365482490043e-08, + "loss": 0.7105, + "step": 10919 + }, + { + "epoch": 0.96, + "grad_norm": 6.982688804504921, + "learning_rate": 4.386546626285459e-08, + "loss": 0.7554, + "step": 10920 + }, + { + "epoch": 0.96, + "grad_norm": 3.732498308658768, + "learning_rate": 4.367767874297046e-08, + "loss": 0.5899, + "step": 10921 + }, + { + "epoch": 0.96, + "grad_norm": 8.963775545800868, + "learning_rate": 4.349029228044477e-08, + "loss": 0.7477, + "step": 10922 + }, + { + "epoch": 0.96, + "grad_norm": 8.5700702106329, + "learning_rate": 4.330330689043927e-08, + "loss": 0.6269, + "step": 10923 + }, + { + "epoch": 0.96, + "grad_norm": 2.5272425011869277, + "learning_rate": 4.311672258808575e-08, + "loss": 0.4336, + "step": 10924 + }, + { + "epoch": 0.96, + "grad_norm": 7.563011286470218, + "learning_rate": 4.2930539388481574e-08, + "loss": 0.6392, + "step": 10925 + }, + { + "epoch": 0.96, + "grad_norm": 7.027422193707875, + "learning_rate": 4.274475730669192e-08, + "loss": 0.7077, + "step": 10926 + }, + { + "epoch": 0.96, + "grad_norm": 10.909168814935574, + "learning_rate": 4.255937635775087e-08, + "loss": 0.7562, + "step": 10927 + }, + { + "epoch": 0.96, + "grad_norm": 22.616806251355754, + "learning_rate": 4.237439655665865e-08, + "loss": 0.5684, + "step": 10928 + }, + { + "epoch": 0.96, + "grad_norm": 9.433047220176222, + "learning_rate": 4.218981791838439e-08, + "loss": 0.8037, + "step": 10929 + }, + { + "epoch": 0.96, + "grad_norm": 17.653960676752448, + "learning_rate": 4.200564045786226e-08, + "loss": 0.7928, + "step": 10930 + }, + { + "epoch": 0.96, + "grad_norm": 8.65518681150698, + "learning_rate": 4.1821864189997006e-08, + "loss": 0.6746, + "step": 10931 + }, + { + "epoch": 0.96, + "grad_norm": 2.950462721650302, + "learning_rate": 4.163848912965895e-08, + "loss": 0.5433, + "step": 10932 + }, + { + "epoch": 0.96, + "grad_norm": 7.550816599654516, + "learning_rate": 4.145551529168623e-08, + "loss": 0.685, + "step": 10933 + }, + { + "epoch": 0.96, + "grad_norm": 9.954148840290086, + "learning_rate": 4.127294269088533e-08, + "loss": 0.6745, + "step": 10934 + }, + { + "epoch": 0.96, + "grad_norm": 11.779649968480367, + "learning_rate": 4.109077134202999e-08, + "loss": 0.7305, + "step": 10935 + }, + { + "epoch": 0.96, + "grad_norm": 11.675852159329212, + "learning_rate": 4.090900125986008e-08, + "loss": 0.7777, + "step": 10936 + }, + { + "epoch": 0.96, + "grad_norm": 10.191949762039545, + "learning_rate": 4.0727632459084956e-08, + "loss": 0.7203, + "step": 10937 + }, + { + "epoch": 0.96, + "grad_norm": 9.724803770240534, + "learning_rate": 4.054666495438009e-08, + "loss": 0.7812, + "step": 10938 + }, + { + "epoch": 0.96, + "grad_norm": 19.722194937418973, + "learning_rate": 4.0366098760389885e-08, + "loss": 0.6732, + "step": 10939 + }, + { + "epoch": 0.96, + "grad_norm": 13.224423075084614, + "learning_rate": 4.018593389172487e-08, + "loss": 0.6925, + "step": 10940 + }, + { + "epoch": 0.96, + "grad_norm": 7.755969294231305, + "learning_rate": 4.000617036296395e-08, + "loss": 0.7504, + "step": 10941 + }, + { + "epoch": 0.96, + "grad_norm": 5.693953296402202, + "learning_rate": 3.982680818865326e-08, + "loss": 0.626, + "step": 10942 + }, + { + "epoch": 0.96, + "grad_norm": 5.9902255531769235, + "learning_rate": 3.9647847383306734e-08, + "loss": 0.5657, + "step": 10943 + }, + { + "epoch": 0.96, + "grad_norm": 8.385741673777458, + "learning_rate": 3.946928796140448e-08, + "loss": 0.6794, + "step": 10944 + }, + { + "epoch": 0.96, + "grad_norm": 6.740058276405326, + "learning_rate": 
3.929112993739659e-08, + "loss": 0.6781, + "step": 10945 + }, + { + "epoch": 0.96, + "grad_norm": 10.717259731493675, + "learning_rate": 3.911337332569876e-08, + "loss": 0.6973, + "step": 10946 + }, + { + "epoch": 0.96, + "grad_norm": 7.54324796939971, + "learning_rate": 3.893601814069447e-08, + "loss": 0.6086, + "step": 10947 + }, + { + "epoch": 0.96, + "grad_norm": 8.34669017599431, + "learning_rate": 3.8759064396735605e-08, + "loss": 0.6908, + "step": 10948 + }, + { + "epoch": 0.96, + "grad_norm": 6.833762894260416, + "learning_rate": 3.858251210814068e-08, + "loss": 0.7161, + "step": 10949 + }, + { + "epoch": 0.96, + "grad_norm": 4.384060594719937, + "learning_rate": 3.8406361289195503e-08, + "loss": 0.7321, + "step": 10950 + }, + { + "epoch": 0.96, + "grad_norm": 10.278954699155266, + "learning_rate": 3.823061195415423e-08, + "loss": 0.6498, + "step": 10951 + }, + { + "epoch": 0.96, + "grad_norm": 7.306722927257607, + "learning_rate": 3.8055264117238813e-08, + "loss": 0.6905, + "step": 10952 + }, + { + "epoch": 0.96, + "grad_norm": 2.598282329152803, + "learning_rate": 3.78803177926379e-08, + "loss": 0.4602, + "step": 10953 + }, + { + "epoch": 0.96, + "grad_norm": 19.141965767383926, + "learning_rate": 3.770577299450739e-08, + "loss": 0.5938, + "step": 10954 + }, + { + "epoch": 0.96, + "grad_norm": 6.527629751809623, + "learning_rate": 3.7531629736971e-08, + "loss": 0.7207, + "step": 10955 + }, + { + "epoch": 0.96, + "grad_norm": 11.879183108658403, + "learning_rate": 3.735788803412077e-08, + "loss": 0.6035, + "step": 10956 + }, + { + "epoch": 0.96, + "grad_norm": 9.391769574491674, + "learning_rate": 3.718454790001546e-08, + "loss": 0.7101, + "step": 10957 + }, + { + "epoch": 0.96, + "grad_norm": 13.802386408413199, + "learning_rate": 3.7011609348681085e-08, + "loss": 0.6414, + "step": 10958 + }, + { + "epoch": 0.96, + "grad_norm": 10.564862446076011, + "learning_rate": 3.6839072394111996e-08, + "loss": 0.726, + "step": 10959 + }, + { + "epoch": 0.96, + "grad_norm": 8.342272830438318, + "learning_rate": 3.6666937050269245e-08, + "loss": 0.779, + "step": 10960 + }, + { + "epoch": 0.96, + "grad_norm": 11.284910909461143, + "learning_rate": 3.64952033310817e-08, + "loss": 0.8789, + "step": 10961 + }, + { + "epoch": 0.96, + "grad_norm": 9.87235541817267, + "learning_rate": 3.632387125044601e-08, + "loss": 0.607, + "step": 10962 + }, + { + "epoch": 0.96, + "grad_norm": 10.483359369523727, + "learning_rate": 3.615294082222665e-08, + "loss": 0.7033, + "step": 10963 + }, + { + "epoch": 0.96, + "grad_norm": 11.952336810119624, + "learning_rate": 3.598241206025366e-08, + "loss": 0.5121, + "step": 10964 + }, + { + "epoch": 0.96, + "grad_norm": 9.680000266666495, + "learning_rate": 3.581228497832712e-08, + "loss": 0.7327, + "step": 10965 + }, + { + "epoch": 0.96, + "grad_norm": 7.363862019176481, + "learning_rate": 3.5642559590212675e-08, + "loss": 0.6854, + "step": 10966 + }, + { + "epoch": 0.96, + "grad_norm": 9.175171775485502, + "learning_rate": 3.5473235909644354e-08, + "loss": 0.6816, + "step": 10967 + }, + { + "epoch": 0.96, + "grad_norm": 9.16303430122419, + "learning_rate": 3.530431395032396e-08, + "loss": 0.7556, + "step": 10968 + }, + { + "epoch": 0.96, + "grad_norm": 8.061823114127426, + "learning_rate": 3.513579372592002e-08, + "loss": 0.5559, + "step": 10969 + }, + { + "epoch": 0.96, + "grad_norm": 7.085261676138725, + "learning_rate": 3.4967675250068834e-08, + "loss": 0.6691, + "step": 10970 + }, + { + "epoch": 0.96, + "grad_norm": 13.412706742434398, + "learning_rate": 
3.479995853637508e-08, + "loss": 0.7688, + "step": 10971 + }, + { + "epoch": 0.96, + "grad_norm": 9.284570264468268, + "learning_rate": 3.463264359840901e-08, + "loss": 0.7902, + "step": 10972 + }, + { + "epoch": 0.96, + "grad_norm": 6.48234654772879, + "learning_rate": 3.44657304497098e-08, + "loss": 0.698, + "step": 10973 + }, + { + "epoch": 0.96, + "grad_norm": 6.683016762369225, + "learning_rate": 3.429921910378442e-08, + "loss": 0.802, + "step": 10974 + }, + { + "epoch": 0.96, + "grad_norm": 7.811477999019375, + "learning_rate": 3.4133109574105985e-08, + "loss": 0.7094, + "step": 10975 + }, + { + "epoch": 0.96, + "grad_norm": 13.28280278539615, + "learning_rate": 3.396740187411596e-08, + "loss": 0.8245, + "step": 10976 + }, + { + "epoch": 0.96, + "grad_norm": 7.111456780140573, + "learning_rate": 3.380209601722362e-08, + "loss": 0.8609, + "step": 10977 + }, + { + "epoch": 0.96, + "grad_norm": 12.069960320846048, + "learning_rate": 3.363719201680438e-08, + "loss": 0.6904, + "step": 10978 + }, + { + "epoch": 0.96, + "grad_norm": 20.76057864724712, + "learning_rate": 3.347268988620256e-08, + "loss": 0.6651, + "step": 10979 + }, + { + "epoch": 0.96, + "grad_norm": 8.287771159683269, + "learning_rate": 3.3308589638729184e-08, + "loss": 0.6069, + "step": 10980 + }, + { + "epoch": 0.96, + "grad_norm": 2.655966584511517, + "learning_rate": 3.3144891287663074e-08, + "loss": 0.4328, + "step": 10981 + }, + { + "epoch": 0.96, + "grad_norm": 5.845867207989849, + "learning_rate": 3.298159484625085e-08, + "loss": 0.8433, + "step": 10982 + }, + { + "epoch": 0.96, + "grad_norm": 9.491015635656288, + "learning_rate": 3.281870032770529e-08, + "loss": 0.7904, + "step": 10983 + }, + { + "epoch": 0.96, + "grad_norm": 9.308255231059281, + "learning_rate": 3.2656207745208054e-08, + "loss": 0.7533, + "step": 10984 + }, + { + "epoch": 0.96, + "grad_norm": 10.15315007604956, + "learning_rate": 3.249411711190753e-08, + "loss": 0.7253, + "step": 10985 + }, + { + "epoch": 0.96, + "grad_norm": 9.493817344744174, + "learning_rate": 3.233242844092044e-08, + "loss": 0.6187, + "step": 10986 + }, + { + "epoch": 0.96, + "grad_norm": 10.10118764181124, + "learning_rate": 3.217114174532965e-08, + "loss": 0.7695, + "step": 10987 + }, + { + "epoch": 0.96, + "grad_norm": 6.971509065799354, + "learning_rate": 3.2010257038186944e-08, + "loss": 0.6016, + "step": 10988 + }, + { + "epoch": 0.97, + "grad_norm": 7.829715328621397, + "learning_rate": 3.184977433251024e-08, + "loss": 0.8791, + "step": 10989 + }, + { + "epoch": 0.97, + "grad_norm": 9.025532990199963, + "learning_rate": 3.168969364128527e-08, + "loss": 0.686, + "step": 10990 + }, + { + "epoch": 0.97, + "grad_norm": 8.701952393407838, + "learning_rate": 3.153001497746666e-08, + "loss": 0.6362, + "step": 10991 + }, + { + "epoch": 0.97, + "grad_norm": 2.596693935635211, + "learning_rate": 3.137073835397408e-08, + "loss": 0.5396, + "step": 10992 + }, + { + "epoch": 0.97, + "grad_norm": 10.956850355845887, + "learning_rate": 3.1211863783696675e-08, + "loss": 0.7843, + "step": 10993 + }, + { + "epoch": 0.97, + "grad_norm": 29.687289177638085, + "learning_rate": 3.105339127949025e-08, + "loss": 0.6818, + "step": 10994 + }, + { + "epoch": 0.97, + "grad_norm": 7.527637661384299, + "learning_rate": 3.089532085417735e-08, + "loss": 0.8286, + "step": 10995 + }, + { + "epoch": 0.97, + "grad_norm": 6.532447780040471, + "learning_rate": 3.073765252054994e-08, + "loss": 0.7614, + "step": 10996 + }, + { + "epoch": 0.97, + "grad_norm": 9.674227181504182, + "learning_rate": 
3.058038629136561e-08, + "loss": 0.6863, + "step": 10997 + }, + { + "epoch": 0.97, + "grad_norm": 7.397402663705885, + "learning_rate": 3.0423522179350295e-08, + "loss": 0.746, + "step": 10998 + }, + { + "epoch": 0.97, + "grad_norm": 6.124299212794277, + "learning_rate": 3.026706019719716e-08, + "loss": 0.5415, + "step": 10999 + }, + { + "epoch": 0.97, + "grad_norm": 6.736507135856749, + "learning_rate": 3.0111000357566645e-08, + "loss": 0.7549, + "step": 11000 + }, + { + "epoch": 0.97, + "grad_norm": 7.260518663644551, + "learning_rate": 2.995534267308697e-08, + "loss": 0.6453, + "step": 11001 + }, + { + "epoch": 0.97, + "grad_norm": 9.446887241535247, + "learning_rate": 2.980008715635363e-08, + "loss": 0.8175, + "step": 11002 + }, + { + "epoch": 0.97, + "grad_norm": 5.97642132927486, + "learning_rate": 2.9645233819929898e-08, + "loss": 0.7732, + "step": 11003 + }, + { + "epoch": 0.97, + "grad_norm": 3.445378437271794, + "learning_rate": 2.9490782676346308e-08, + "loss": 0.6289, + "step": 11004 + }, + { + "epoch": 0.97, + "grad_norm": 17.160288951287537, + "learning_rate": 2.9336733738100087e-08, + "loss": 0.8852, + "step": 11005 + }, + { + "epoch": 0.97, + "grad_norm": 11.622406604078806, + "learning_rate": 2.9183087017657373e-08, + "loss": 0.7429, + "step": 11006 + }, + { + "epoch": 0.97, + "grad_norm": 12.098679781831615, + "learning_rate": 2.9029842527450446e-08, + "loss": 0.7659, + "step": 11007 + }, + { + "epoch": 0.97, + "grad_norm": 7.9605072035775875, + "learning_rate": 2.887700027987994e-08, + "loss": 0.6642, + "step": 11008 + }, + { + "epoch": 0.97, + "grad_norm": 11.34263766014208, + "learning_rate": 2.872456028731374e-08, + "loss": 0.6987, + "step": 11009 + }, + { + "epoch": 0.97, + "grad_norm": 6.670444699389442, + "learning_rate": 2.857252256208698e-08, + "loss": 0.5639, + "step": 11010 + }, + { + "epoch": 0.97, + "grad_norm": 2.2300161146582798, + "learning_rate": 2.8420887116502037e-08, + "loss": 0.4854, + "step": 11011 + }, + { + "epoch": 0.97, + "grad_norm": 9.489129946623468, + "learning_rate": 2.8269653962829104e-08, + "loss": 0.7037, + "step": 11012 + }, + { + "epoch": 0.97, + "grad_norm": 6.515445295416336, + "learning_rate": 2.8118823113305604e-08, + "loss": 0.7076, + "step": 11013 + }, + { + "epoch": 0.97, + "grad_norm": 7.039158274690303, + "learning_rate": 2.7968394580136782e-08, + "loss": 0.9023, + "step": 11014 + }, + { + "epoch": 0.97, + "grad_norm": 8.937131838207543, + "learning_rate": 2.7818368375494566e-08, + "loss": 0.6553, + "step": 11015 + }, + { + "epoch": 0.97, + "grad_norm": 9.284024424500892, + "learning_rate": 2.7668744511519798e-08, + "loss": 0.7387, + "step": 11016 + }, + { + "epoch": 0.97, + "grad_norm": 7.166101648559174, + "learning_rate": 2.7519523000318902e-08, + "loss": 0.5379, + "step": 11017 + }, + { + "epoch": 0.97, + "grad_norm": 18.705660752086676, + "learning_rate": 2.737070385396612e-08, + "loss": 0.8559, + "step": 11018 + }, + { + "epoch": 0.97, + "grad_norm": 10.550304206228049, + "learning_rate": 2.7222287084505694e-08, + "loss": 0.7582, + "step": 11019 + }, + { + "epoch": 0.97, + "grad_norm": 13.327599926069444, + "learning_rate": 2.707427270394525e-08, + "loss": 0.6621, + "step": 11020 + }, + { + "epoch": 0.97, + "grad_norm": 8.634100901119439, + "learning_rate": 2.6926660724262977e-08, + "loss": 0.6499, + "step": 11021 + }, + { + "epoch": 0.97, + "grad_norm": 7.41169753520759, + "learning_rate": 2.677945115740266e-08, + "loss": 0.7456, + "step": 11022 + }, + { + "epoch": 0.97, + "grad_norm": 7.688711465030795, + 
"learning_rate": 2.6632644015276987e-08, + "loss": 0.6547, + "step": 11023 + }, + { + "epoch": 0.97, + "grad_norm": 8.428938974340776, + "learning_rate": 2.6486239309764793e-08, + "loss": 0.8003, + "step": 11024 + }, + { + "epoch": 0.97, + "grad_norm": 7.569849235484368, + "learning_rate": 2.6340237052713268e-08, + "loss": 0.8305, + "step": 11025 + }, + { + "epoch": 0.97, + "grad_norm": 12.299681238381373, + "learning_rate": 2.6194637255936296e-08, + "loss": 0.7188, + "step": 11026 + }, + { + "epoch": 0.97, + "grad_norm": 12.296505009982596, + "learning_rate": 2.604943993121556e-08, + "loss": 0.8067, + "step": 11027 + }, + { + "epoch": 0.97, + "grad_norm": 19.486392800428746, + "learning_rate": 2.5904645090301106e-08, + "loss": 0.6778, + "step": 11028 + }, + { + "epoch": 0.97, + "grad_norm": 6.459102626686446, + "learning_rate": 2.5760252744908565e-08, + "loss": 0.7422, + "step": 11029 + }, + { + "epoch": 0.97, + "grad_norm": 3.3361398696947666, + "learning_rate": 2.5616262906721367e-08, + "loss": 0.5017, + "step": 11030 + }, + { + "epoch": 0.97, + "grad_norm": 9.48478398180405, + "learning_rate": 2.5472675587392416e-08, + "loss": 0.6234, + "step": 11031 + }, + { + "epoch": 0.97, + "grad_norm": 10.750819471672541, + "learning_rate": 2.532949079854019e-08, + "loss": 0.7984, + "step": 11032 + }, + { + "epoch": 0.97, + "grad_norm": 10.324117706819418, + "learning_rate": 2.5186708551749872e-08, + "loss": 0.6902, + "step": 11033 + }, + { + "epoch": 0.97, + "grad_norm": 8.97946312940412, + "learning_rate": 2.5044328858576105e-08, + "loss": 0.7644, + "step": 11034 + }, + { + "epoch": 0.97, + "grad_norm": 13.778550528156675, + "learning_rate": 2.490235173054023e-08, + "loss": 0.7097, + "step": 11035 + }, + { + "epoch": 0.97, + "grad_norm": 5.872596474535029, + "learning_rate": 2.476077717913028e-08, + "loss": 0.6375, + "step": 11036 + }, + { + "epoch": 0.97, + "grad_norm": 8.958085713556057, + "learning_rate": 2.461960521580209e-08, + "loss": 0.7085, + "step": 11037 + }, + { + "epoch": 0.97, + "grad_norm": 7.976021502745036, + "learning_rate": 2.44788358519793e-08, + "loss": 0.5054, + "step": 11038 + }, + { + "epoch": 0.97, + "grad_norm": 9.372961343965269, + "learning_rate": 2.4338469099052797e-08, + "loss": 0.6437, + "step": 11039 + }, + { + "epoch": 0.97, + "grad_norm": 12.413427921452456, + "learning_rate": 2.4198504968381274e-08, + "loss": 0.7853, + "step": 11040 + }, + { + "epoch": 0.97, + "grad_norm": 9.546042966270393, + "learning_rate": 2.405894347128901e-08, + "loss": 0.7064, + "step": 11041 + }, + { + "epoch": 0.97, + "grad_norm": 32.094755777772235, + "learning_rate": 2.391978461907085e-08, + "loss": 0.8574, + "step": 11042 + }, + { + "epoch": 0.97, + "grad_norm": 4.512281495308019, + "learning_rate": 2.378102842298613e-08, + "loss": 0.5974, + "step": 11043 + }, + { + "epoch": 0.97, + "grad_norm": 9.543357437693224, + "learning_rate": 2.3642674894263085e-08, + "loss": 0.6468, + "step": 11044 + }, + { + "epoch": 0.97, + "grad_norm": 6.7727292015374045, + "learning_rate": 2.3504724044097206e-08, + "loss": 0.6501, + "step": 11045 + }, + { + "epoch": 0.97, + "grad_norm": 8.677590412820729, + "learning_rate": 2.336717588365067e-08, + "loss": 0.6925, + "step": 11046 + }, + { + "epoch": 0.97, + "grad_norm": 8.884677489142438, + "learning_rate": 2.323003042405403e-08, + "loss": 0.5827, + "step": 11047 + }, + { + "epoch": 0.97, + "grad_norm": 8.750692133410347, + "learning_rate": 2.309328767640506e-08, + "loss": 0.8781, + "step": 11048 + }, + { + "epoch": 0.97, + "grad_norm": 5.773717871461782, 
+ "learning_rate": 2.2956947651768812e-08, + "loss": 0.6897, + "step": 11049 + }, + { + "epoch": 0.97, + "grad_norm": 1.8558010907254303, + "learning_rate": 2.2821010361177565e-08, + "loss": 0.436, + "step": 11050 + }, + { + "epoch": 0.97, + "grad_norm": 6.034675794711245, + "learning_rate": 2.2685475815630852e-08, + "loss": 0.6499, + "step": 11051 + }, + { + "epoch": 0.97, + "grad_norm": 10.519378362416234, + "learning_rate": 2.255034402609546e-08, + "loss": 0.7822, + "step": 11052 + }, + { + "epoch": 0.97, + "grad_norm": 7.242803349685205, + "learning_rate": 2.241561500350764e-08, + "loss": 0.6176, + "step": 11053 + }, + { + "epoch": 0.97, + "grad_norm": 11.82627882252235, + "learning_rate": 2.2281288758768116e-08, + "loss": 0.7736, + "step": 11054 + }, + { + "epoch": 0.97, + "grad_norm": 7.622301717563792, + "learning_rate": 2.2147365302746526e-08, + "loss": 0.5806, + "step": 11055 + }, + { + "epoch": 0.97, + "grad_norm": 5.872118781012248, + "learning_rate": 2.2013844646280313e-08, + "loss": 0.6854, + "step": 11056 + }, + { + "epoch": 0.97, + "grad_norm": 7.328062690086716, + "learning_rate": 2.1880726800173058e-08, + "loss": 0.7997, + "step": 11057 + }, + { + "epoch": 0.97, + "grad_norm": 11.190585053757195, + "learning_rate": 2.1748011775196143e-08, + "loss": 0.5555, + "step": 11058 + }, + { + "epoch": 0.97, + "grad_norm": 11.637306636902895, + "learning_rate": 2.1615699582089866e-08, + "loss": 0.8126, + "step": 11059 + }, + { + "epoch": 0.97, + "grad_norm": 6.430424354183671, + "learning_rate": 2.148379023156011e-08, + "loss": 0.7957, + "step": 11060 + }, + { + "epoch": 0.97, + "grad_norm": 11.944582912030338, + "learning_rate": 2.1352283734280556e-08, + "loss": 0.7165, + "step": 11061 + }, + { + "epoch": 0.97, + "grad_norm": 7.348344063635032, + "learning_rate": 2.122118010089269e-08, + "loss": 0.711, + "step": 11062 + }, + { + "epoch": 0.97, + "grad_norm": 6.587320612369737, + "learning_rate": 2.1090479342005255e-08, + "loss": 0.5291, + "step": 11063 + }, + { + "epoch": 0.97, + "grad_norm": 8.575554487266823, + "learning_rate": 2.096018146819423e-08, + "loss": 0.7044, + "step": 11064 + }, + { + "epoch": 0.97, + "grad_norm": 8.07185483158484, + "learning_rate": 2.083028649000285e-08, + "loss": 0.5972, + "step": 11065 + }, + { + "epoch": 0.97, + "grad_norm": 7.696163068439728, + "learning_rate": 2.0700794417942704e-08, + "loss": 0.7014, + "step": 11066 + }, + { + "epoch": 0.97, + "grad_norm": 9.33372814166593, + "learning_rate": 2.057170526249097e-08, + "loss": 0.7325, + "step": 11067 + }, + { + "epoch": 0.97, + "grad_norm": 7.620812502915841, + "learning_rate": 2.0443019034094846e-08, + "loss": 0.7994, + "step": 11068 + }, + { + "epoch": 0.97, + "grad_norm": 16.903146232018837, + "learning_rate": 2.0314735743166004e-08, + "loss": 0.72, + "step": 11069 + }, + { + "epoch": 0.97, + "grad_norm": 2.7825763028349204, + "learning_rate": 2.0186855400085025e-08, + "loss": 0.5273, + "step": 11070 + }, + { + "epoch": 0.97, + "grad_norm": 10.343286582356566, + "learning_rate": 2.005937801520086e-08, + "loss": 0.7601, + "step": 11071 + }, + { + "epoch": 0.97, + "grad_norm": 11.581447879911622, + "learning_rate": 1.9932303598828028e-08, + "loss": 0.8164, + "step": 11072 + }, + { + "epoch": 0.97, + "grad_norm": 16.025550320316672, + "learning_rate": 1.980563216124942e-08, + "loss": 0.7965, + "step": 11073 + }, + { + "epoch": 0.97, + "grad_norm": 7.377495927812763, + "learning_rate": 1.9679363712714615e-08, + "loss": 0.8398, + "step": 11074 + }, + { + "epoch": 0.97, + "grad_norm": 
8.448200419531931, + "learning_rate": 1.955349826344155e-08, + "loss": 0.5357, + "step": 11075 + }, + { + "epoch": 0.97, + "grad_norm": 19.42939212630744, + "learning_rate": 1.9428035823614854e-08, + "loss": 0.6576, + "step": 11076 + }, + { + "epoch": 0.97, + "grad_norm": 6.930824133755689, + "learning_rate": 1.9302976403386408e-08, + "loss": 0.7058, + "step": 11077 + }, + { + "epoch": 0.97, + "grad_norm": 10.874013390324363, + "learning_rate": 1.917832001287645e-08, + "loss": 0.8606, + "step": 11078 + }, + { + "epoch": 0.97, + "grad_norm": 8.883447015750377, + "learning_rate": 1.9054066662171912e-08, + "loss": 0.6503, + "step": 11079 + }, + { + "epoch": 0.97, + "grad_norm": 10.596913910911699, + "learning_rate": 1.8930216361326415e-08, + "loss": 0.7996, + "step": 11080 + }, + { + "epoch": 0.97, + "grad_norm": 2.6606360741447714, + "learning_rate": 1.8806769120362502e-08, + "loss": 0.5204, + "step": 11081 + }, + { + "epoch": 0.97, + "grad_norm": 18.56791361500216, + "learning_rate": 1.8683724949268843e-08, + "loss": 0.7265, + "step": 11082 + }, + { + "epoch": 0.97, + "grad_norm": 8.735784516149165, + "learning_rate": 1.8561083858002484e-08, + "loss": 0.7525, + "step": 11083 + }, + { + "epoch": 0.97, + "grad_norm": 6.339935239206998, + "learning_rate": 1.8438845856487143e-08, + "loss": 0.9685, + "step": 11084 + }, + { + "epoch": 0.97, + "grad_norm": 6.217794680760645, + "learning_rate": 1.8317010954613246e-08, + "loss": 0.7646, + "step": 11085 + }, + { + "epoch": 0.97, + "grad_norm": 8.319346630363041, + "learning_rate": 1.8195579162241238e-08, + "loss": 0.7099, + "step": 11086 + }, + { + "epoch": 0.97, + "grad_norm": 11.213776665487211, + "learning_rate": 1.8074550489194932e-08, + "loss": 0.701, + "step": 11087 + }, + { + "epoch": 0.97, + "grad_norm": 10.971153153520369, + "learning_rate": 1.795392494526982e-08, + "loss": 0.8642, + "step": 11088 + }, + { + "epoch": 0.97, + "grad_norm": 10.874561013717122, + "learning_rate": 1.783370254022587e-08, + "loss": 0.806, + "step": 11089 + }, + { + "epoch": 0.97, + "grad_norm": 6.480108447106537, + "learning_rate": 1.7713883283791422e-08, + "loss": 0.6163, + "step": 11090 + }, + { + "epoch": 0.97, + "grad_norm": 5.823995638745566, + "learning_rate": 1.759446718566149e-08, + "loss": 0.583, + "step": 11091 + }, + { + "epoch": 0.97, + "grad_norm": 11.562426325638901, + "learning_rate": 1.7475454255499457e-08, + "loss": 0.7403, + "step": 11092 + }, + { + "epoch": 0.97, + "grad_norm": 7.41935397090497, + "learning_rate": 1.7356844502935955e-08, + "loss": 0.7388, + "step": 11093 + }, + { + "epoch": 0.97, + "grad_norm": 8.346229548571786, + "learning_rate": 1.7238637937568302e-08, + "loss": 0.902, + "step": 11094 + }, + { + "epoch": 0.97, + "grad_norm": 9.766024602553513, + "learning_rate": 1.712083456896163e-08, + "loss": 0.6971, + "step": 11095 + }, + { + "epoch": 0.97, + "grad_norm": 11.46144094109519, + "learning_rate": 1.700343440664831e-08, + "loss": 0.8027, + "step": 11096 + }, + { + "epoch": 0.97, + "grad_norm": 7.101445045100526, + "learning_rate": 1.6886437460128523e-08, + "loss": 0.6558, + "step": 11097 + }, + { + "epoch": 0.97, + "grad_norm": 13.730760530072414, + "learning_rate": 1.676984373886914e-08, + "loss": 0.8153, + "step": 11098 + }, + { + "epoch": 0.97, + "grad_norm": 7.470415194234403, + "learning_rate": 1.6653653252304837e-08, + "loss": 0.5961, + "step": 11099 + }, + { + "epoch": 0.97, + "grad_norm": 8.803921896408216, + "learning_rate": 1.6537866009837533e-08, + "loss": 0.8453, + "step": 11100 + }, + { + "epoch": 0.97, + 
"grad_norm": 2.6173826338087407, + "learning_rate": 1.6422482020836406e-08, + "loss": 0.4557, + "step": 11101 + }, + { + "epoch": 0.98, + "grad_norm": 6.053847729528716, + "learning_rate": 1.6307501294638428e-08, + "loss": 0.5921, + "step": 11102 + }, + { + "epoch": 0.98, + "grad_norm": 8.518475295451509, + "learning_rate": 1.6192923840547825e-08, + "loss": 0.6498, + "step": 11103 + }, + { + "epoch": 0.98, + "grad_norm": 10.816385832194829, + "learning_rate": 1.607874966783496e-08, + "loss": 0.7719, + "step": 11104 + }, + { + "epoch": 0.98, + "grad_norm": 7.345248024506817, + "learning_rate": 1.5964978785740214e-08, + "loss": 0.8289, + "step": 11105 + }, + { + "epoch": 0.98, + "grad_norm": 9.533663428267156, + "learning_rate": 1.58516112034679e-08, + "loss": 0.8045, + "step": 11106 + }, + { + "epoch": 0.98, + "grad_norm": 2.174797894411663, + "learning_rate": 1.5738646930193447e-08, + "loss": 0.4895, + "step": 11107 + }, + { + "epoch": 0.98, + "grad_norm": 4.988483996808396, + "learning_rate": 1.5626085975056214e-08, + "loss": 0.7641, + "step": 11108 + }, + { + "epoch": 0.98, + "grad_norm": 7.6197890129852555, + "learning_rate": 1.551392834716503e-08, + "loss": 0.5875, + "step": 11109 + }, + { + "epoch": 0.98, + "grad_norm": 11.862943221686455, + "learning_rate": 1.5402174055595408e-08, + "loss": 0.7819, + "step": 11110 + }, + { + "epoch": 0.98, + "grad_norm": 9.430295293801201, + "learning_rate": 1.5290823109390673e-08, + "loss": 0.7878, + "step": 11111 + }, + { + "epoch": 0.98, + "grad_norm": 8.004565372380506, + "learning_rate": 1.517987551756084e-08, + "loss": 0.7109, + "step": 11112 + }, + { + "epoch": 0.98, + "grad_norm": 7.503259270986527, + "learning_rate": 1.5069331289083723e-08, + "loss": 0.7071, + "step": 11113 + }, + { + "epoch": 0.98, + "grad_norm": 11.604854122759154, + "learning_rate": 1.495919043290439e-08, + "loss": 0.7549, + "step": 11114 + }, + { + "epoch": 0.98, + "grad_norm": 8.83907595376207, + "learning_rate": 1.4849452957935163e-08, + "loss": 0.7451, + "step": 11115 + }, + { + "epoch": 0.98, + "grad_norm": 5.780001881810305, + "learning_rate": 1.4740118873055597e-08, + "loss": 0.7308, + "step": 11116 + }, + { + "epoch": 0.98, + "grad_norm": 8.344193534164356, + "learning_rate": 1.4631188187113066e-08, + "loss": 0.6336, + "step": 11117 + }, + { + "epoch": 0.98, + "grad_norm": 2.6459274133615036, + "learning_rate": 1.4522660908922182e-08, + "loss": 0.5361, + "step": 11118 + }, + { + "epoch": 0.98, + "grad_norm": 9.30426555949188, + "learning_rate": 1.4414537047264809e-08, + "loss": 0.6709, + "step": 11119 + }, + { + "epoch": 0.98, + "grad_norm": 6.41425201664416, + "learning_rate": 1.4306816610889507e-08, + "loss": 0.7778, + "step": 11120 + }, + { + "epoch": 0.98, + "grad_norm": 7.308584998212174, + "learning_rate": 1.4199499608513746e-08, + "loss": 0.6832, + "step": 11121 + }, + { + "epoch": 0.98, + "grad_norm": 7.21994963828291, + "learning_rate": 1.4092586048820578e-08, + "loss": 0.5995, + "step": 11122 + }, + { + "epoch": 0.98, + "grad_norm": 11.156991957016237, + "learning_rate": 1.3986075940462529e-08, + "loss": 0.6314, + "step": 11123 + }, + { + "epoch": 0.98, + "grad_norm": 7.242449845053131, + "learning_rate": 1.3879969292056594e-08, + "loss": 0.7511, + "step": 11124 + }, + { + "epoch": 0.98, + "grad_norm": 2.9333486891826257, + "learning_rate": 1.3774266112189794e-08, + "loss": 0.5135, + "step": 11125 + }, + { + "epoch": 0.98, + "grad_norm": 5.1640968091662005, + "learning_rate": 1.3668966409415285e-08, + "loss": 0.6311, + "step": 11126 + }, + { + "epoch": 
0.98, + "grad_norm": 16.532350920919768, + "learning_rate": 1.356407019225403e-08, + "loss": 0.7518, + "step": 11127 + }, + { + "epoch": 0.98, + "grad_norm": 11.249124514677193, + "learning_rate": 1.3459577469193131e-08, + "loss": 0.7275, + "step": 11128 + }, + { + "epoch": 0.98, + "grad_norm": 5.419458683435206, + "learning_rate": 1.3355488248689152e-08, + "loss": 0.7924, + "step": 11129 + }, + { + "epoch": 0.98, + "grad_norm": 8.207529607582142, + "learning_rate": 1.3251802539163693e-08, + "loss": 0.6922, + "step": 11130 + }, + { + "epoch": 0.98, + "grad_norm": 2.902164341114565, + "learning_rate": 1.3148520349007265e-08, + "loss": 0.4483, + "step": 11131 + }, + { + "epoch": 0.98, + "grad_norm": 10.024584448901564, + "learning_rate": 1.3045641686577626e-08, + "loss": 0.7685, + "step": 11132 + }, + { + "epoch": 0.98, + "grad_norm": 6.7534377168999455, + "learning_rate": 1.2943166560199228e-08, + "loss": 0.574, + "step": 11133 + }, + { + "epoch": 0.98, + "grad_norm": 8.4612740460002, + "learning_rate": 1.2841094978164326e-08, + "loss": 0.6958, + "step": 11134 + }, + { + "epoch": 0.98, + "grad_norm": 6.411149268934704, + "learning_rate": 1.2739426948732426e-08, + "loss": 0.6293, + "step": 11135 + }, + { + "epoch": 0.98, + "grad_norm": 8.958399155863948, + "learning_rate": 1.2638162480129723e-08, + "loss": 0.7049, + "step": 11136 + }, + { + "epoch": 0.98, + "grad_norm": 7.321285676452062, + "learning_rate": 1.2537301580551331e-08, + "loss": 0.6923, + "step": 11137 + }, + { + "epoch": 0.98, + "grad_norm": 9.630178918265727, + "learning_rate": 1.2436844258157943e-08, + "loss": 0.6549, + "step": 11138 + }, + { + "epoch": 0.98, + "grad_norm": 8.367149095505527, + "learning_rate": 1.2336790521079167e-08, + "loss": 0.6893, + "step": 11139 + }, + { + "epoch": 0.98, + "grad_norm": 9.207472693701009, + "learning_rate": 1.223714037741075e-08, + "loss": 0.7, + "step": 11140 + }, + { + "epoch": 0.98, + "grad_norm": 8.235169812526216, + "learning_rate": 1.2137893835215686e-08, + "loss": 0.7195, + "step": 11141 + }, + { + "epoch": 0.98, + "grad_norm": 7.012069477497209, + "learning_rate": 1.2039050902525884e-08, + "loss": 0.6924, + "step": 11142 + }, + { + "epoch": 0.98, + "grad_norm": 8.176060938934233, + "learning_rate": 1.1940611587338836e-08, + "loss": 0.6802, + "step": 11143 + }, + { + "epoch": 0.98, + "grad_norm": 9.51768563591055, + "learning_rate": 1.1842575897619835e-08, + "loss": 0.714, + "step": 11144 + }, + { + "epoch": 0.98, + "grad_norm": 8.608451340628344, + "learning_rate": 1.1744943841302536e-08, + "loss": 0.6548, + "step": 11145 + }, + { + "epoch": 0.98, + "grad_norm": 9.166039272652437, + "learning_rate": 1.1647715426287287e-08, + "loss": 0.7144, + "step": 11146 + }, + { + "epoch": 0.98, + "grad_norm": 7.768199549518655, + "learning_rate": 1.1550890660440572e-08, + "loss": 0.7628, + "step": 11147 + }, + { + "epoch": 0.98, + "grad_norm": 7.296525253330547, + "learning_rate": 1.1454469551598346e-08, + "loss": 0.7167, + "step": 11148 + }, + { + "epoch": 0.98, + "grad_norm": 13.864838838720223, + "learning_rate": 1.1358452107562146e-08, + "loss": 0.6815, + "step": 11149 + }, + { + "epoch": 0.98, + "grad_norm": 3.8994119307998822, + "learning_rate": 1.1262838336101867e-08, + "loss": 0.5293, + "step": 11150 + }, + { + "epoch": 0.98, + "grad_norm": 5.665533606593245, + "learning_rate": 1.1167628244954654e-08, + "loss": 0.7263, + "step": 11151 + }, + { + "epoch": 0.98, + "grad_norm": 7.441291286707685, + "learning_rate": 1.1072821841824343e-08, + "loss": 0.6372, + "step": 11152 + }, + { + 
"epoch": 0.98, + "grad_norm": 8.967125470345907, + "learning_rate": 1.0978419134382578e-08, + "loss": 0.7834, + "step": 11153 + }, + { + "epoch": 0.98, + "grad_norm": 7.898394049141486, + "learning_rate": 1.0884420130268248e-08, + "loss": 0.6926, + "step": 11154 + }, + { + "epoch": 0.98, + "grad_norm": 12.869805856918383, + "learning_rate": 1.0790824837088043e-08, + "loss": 0.8806, + "step": 11155 + }, + { + "epoch": 0.98, + "grad_norm": 6.67263090583456, + "learning_rate": 1.0697633262414797e-08, + "loss": 0.4709, + "step": 11156 + }, + { + "epoch": 0.98, + "grad_norm": 7.777276245535932, + "learning_rate": 1.06048454137897e-08, + "loss": 0.6205, + "step": 11157 + }, + { + "epoch": 0.98, + "grad_norm": 7.970205277542463, + "learning_rate": 1.0512461298721189e-08, + "loss": 0.7392, + "step": 11158 + }, + { + "epoch": 0.98, + "grad_norm": 9.208491812831724, + "learning_rate": 1.042048092468495e-08, + "loss": 0.6603, + "step": 11159 + }, + { + "epoch": 0.98, + "grad_norm": 7.838235711743001, + "learning_rate": 1.0328904299123366e-08, + "loss": 0.7198, + "step": 11160 + }, + { + "epoch": 0.98, + "grad_norm": 7.101224921628164, + "learning_rate": 1.0237731429447172e-08, + "loss": 0.7352, + "step": 11161 + }, + { + "epoch": 0.98, + "grad_norm": 8.299058201614299, + "learning_rate": 1.0146962323033805e-08, + "loss": 0.5867, + "step": 11162 + }, + { + "epoch": 0.98, + "grad_norm": 7.34124528192001, + "learning_rate": 1.0056596987227384e-08, + "loss": 0.673, + "step": 11163 + }, + { + "epoch": 0.98, + "grad_norm": 10.945910242101698, + "learning_rate": 9.96663542934151e-09, + "loss": 0.5738, + "step": 11164 + }, + { + "epoch": 0.98, + "grad_norm": 7.494521232060231, + "learning_rate": 9.8770776566548e-09, + "loss": 0.792, + "step": 11165 + }, + { + "epoch": 0.98, + "grad_norm": 5.739379639962634, + "learning_rate": 9.787923676414235e-09, + "loss": 0.6478, + "step": 11166 + }, + { + "epoch": 0.98, + "grad_norm": 9.512610031476644, + "learning_rate": 9.699173495834046e-09, + "loss": 0.7233, + "step": 11167 + }, + { + "epoch": 0.98, + "grad_norm": 7.291738427858549, + "learning_rate": 9.610827122095712e-09, + "loss": 0.7575, + "step": 11168 + }, + { + "epoch": 0.98, + "grad_norm": 6.758557487957195, + "learning_rate": 9.522884562348512e-09, + "loss": 0.6585, + "step": 11169 + }, + { + "epoch": 0.98, + "grad_norm": 12.427149727953582, + "learning_rate": 9.43534582370842e-09, + "loss": 0.6575, + "step": 11170 + }, + { + "epoch": 0.98, + "grad_norm": 7.664339183568163, + "learning_rate": 9.348210913258105e-09, + "loss": 0.8331, + "step": 11171 + }, + { + "epoch": 0.98, + "grad_norm": 6.1060462916071145, + "learning_rate": 9.261479838049149e-09, + "loss": 0.6436, + "step": 11172 + }, + { + "epoch": 0.98, + "grad_norm": 2.1866571991657295, + "learning_rate": 9.175152605099823e-09, + "loss": 0.4484, + "step": 11173 + }, + { + "epoch": 0.98, + "grad_norm": 10.33367144450366, + "learning_rate": 9.089229221395657e-09, + "loss": 0.6549, + "step": 11174 + }, + { + "epoch": 0.98, + "grad_norm": 23.165962446420096, + "learning_rate": 9.003709693888862e-09, + "loss": 0.6554, + "step": 11175 + }, + { + "epoch": 0.98, + "grad_norm": 7.582757854156313, + "learning_rate": 8.918594029499461e-09, + "loss": 0.7339, + "step": 11176 + }, + { + "epoch": 0.98, + "grad_norm": 14.519383681037775, + "learning_rate": 8.833882235115277e-09, + "loss": 0.8287, + "step": 11177 + }, + { + "epoch": 0.98, + "grad_norm": 10.529152965671944, + "learning_rate": 8.749574317591381e-09, + "loss": 0.675, + "step": 11178 + }, + { + "epoch": 
0.98, + "grad_norm": 6.992467306823472, + "learning_rate": 8.665670283748983e-09, + "loss": 0.7661, + "step": 11179 + }, + { + "epoch": 0.98, + "grad_norm": 7.044597393133887, + "learning_rate": 8.58217014037821e-09, + "loss": 0.7159, + "step": 11180 + }, + { + "epoch": 0.98, + "grad_norm": 12.869459943817072, + "learning_rate": 8.499073894234765e-09, + "loss": 0.9904, + "step": 11181 + }, + { + "epoch": 0.98, + "grad_norm": 3.0780143389475505, + "learning_rate": 8.416381552043828e-09, + "loss": 0.4855, + "step": 11182 + }, + { + "epoch": 0.98, + "grad_norm": 13.952021879616975, + "learning_rate": 8.334093120496712e-09, + "loss": 0.6823, + "step": 11183 + }, + { + "epoch": 0.98, + "grad_norm": 8.481117289183464, + "learning_rate": 8.252208606250867e-09, + "loss": 0.7384, + "step": 11184 + }, + { + "epoch": 0.98, + "grad_norm": 8.222215113046808, + "learning_rate": 8.170728015933216e-09, + "loss": 0.6015, + "step": 11185 + }, + { + "epoch": 0.98, + "grad_norm": 10.923991213868671, + "learning_rate": 8.089651356136263e-09, + "loss": 0.7144, + "step": 11186 + }, + { + "epoch": 0.98, + "grad_norm": 11.190917310044647, + "learning_rate": 8.008978633421982e-09, + "loss": 0.7712, + "step": 11187 + }, + { + "epoch": 0.98, + "grad_norm": 6.231833056207752, + "learning_rate": 7.928709854316818e-09, + "loss": 0.619, + "step": 11188 + }, + { + "epoch": 0.98, + "grad_norm": 10.90771748376982, + "learning_rate": 7.848845025316686e-09, + "loss": 0.6982, + "step": 11189 + }, + { + "epoch": 0.98, + "grad_norm": 11.351153883884919, + "learning_rate": 7.769384152884196e-09, + "loss": 0.7901, + "step": 11190 + }, + { + "epoch": 0.98, + "grad_norm": 23.28800748822282, + "learning_rate": 7.690327243448647e-09, + "loss": 0.7625, + "step": 11191 + }, + { + "epoch": 0.98, + "grad_norm": 10.714378973826587, + "learning_rate": 7.6116743034077e-09, + "loss": 0.8, + "step": 11192 + }, + { + "epoch": 0.98, + "grad_norm": 6.425750178590315, + "learning_rate": 7.533425339125711e-09, + "loss": 0.7003, + "step": 11193 + }, + { + "epoch": 0.98, + "grad_norm": 10.577659128126726, + "learning_rate": 7.455580356934278e-09, + "loss": 0.7266, + "step": 11194 + }, + { + "epoch": 0.98, + "grad_norm": 7.91854901078657, + "learning_rate": 7.378139363132808e-09, + "loss": 0.7637, + "step": 11195 + }, + { + "epoch": 0.98, + "grad_norm": 7.872414948201355, + "learning_rate": 7.3011023639874e-09, + "loss": 0.747, + "step": 11196 + }, + { + "epoch": 0.98, + "grad_norm": 11.211742032371635, + "learning_rate": 7.224469365731957e-09, + "loss": 0.6981, + "step": 11197 + }, + { + "epoch": 0.98, + "grad_norm": 8.50391918270389, + "learning_rate": 7.1482403745670725e-09, + "loss": 0.5777, + "step": 11198 + }, + { + "epoch": 0.98, + "grad_norm": 9.629118481416253, + "learning_rate": 7.072415396661703e-09, + "loss": 0.7007, + "step": 11199 + }, + { + "epoch": 0.98, + "grad_norm": 11.217826414426284, + "learning_rate": 6.996994438150939e-09, + "loss": 0.8026, + "step": 11200 + }, + { + "epoch": 0.98, + "grad_norm": 20.04840042113556, + "learning_rate": 6.9219775051382335e-09, + "loss": 0.7517, + "step": 11201 + }, + { + "epoch": 0.98, + "grad_norm": 13.343536106382883, + "learning_rate": 6.847364603693174e-09, + "loss": 0.6731, + "step": 11202 + }, + { + "epoch": 0.98, + "grad_norm": 8.076192903167932, + "learning_rate": 6.77315573985371e-09, + "loss": 0.5759, + "step": 11203 + }, + { + "epoch": 0.98, + "grad_norm": 7.811276056810483, + "learning_rate": 6.6993509196250365e-09, + "loss": 0.588, + "step": 11204 + }, + { + "epoch": 0.98, + 
"grad_norm": 5.32888289888209, + "learning_rate": 6.625950148979043e-09, + "loss": 0.6813, + "step": 11205 + }, + { + "epoch": 0.98, + "grad_norm": 6.259100649695529, + "learning_rate": 6.55295343385487e-09, + "loss": 0.5569, + "step": 11206 + }, + { + "epoch": 0.98, + "grad_norm": 8.887643256808222, + "learning_rate": 6.480360780159456e-09, + "loss": 0.7746, + "step": 11207 + }, + { + "epoch": 0.98, + "grad_norm": 2.7354534448191603, + "learning_rate": 6.408172193766993e-09, + "loss": 0.5184, + "step": 11208 + }, + { + "epoch": 0.98, + "grad_norm": 9.47940050424657, + "learning_rate": 6.336387680518918e-09, + "loss": 0.6705, + "step": 11209 + }, + { + "epoch": 0.98, + "grad_norm": 5.117053199627094, + "learning_rate": 6.265007246223365e-09, + "loss": 0.7503, + "step": 11210 + }, + { + "epoch": 0.98, + "grad_norm": 6.240398313951424, + "learning_rate": 6.194030896657377e-09, + "loss": 0.6782, + "step": 11211 + }, + { + "epoch": 0.98, + "grad_norm": 8.311204495471667, + "learning_rate": 6.123458637563584e-09, + "loss": 0.7094, + "step": 11212 + }, + { + "epoch": 0.98, + "grad_norm": 12.556188311800502, + "learning_rate": 6.053290474652418e-09, + "loss": 0.5714, + "step": 11213 + }, + { + "epoch": 0.98, + "grad_norm": 5.8873963793079875, + "learning_rate": 5.9835264136021146e-09, + "loss": 0.8994, + "step": 11214 + }, + { + "epoch": 0.98, + "grad_norm": 21.85165461859206, + "learning_rate": 5.914166460057602e-09, + "loss": 0.6576, + "step": 11215 + }, + { + "epoch": 0.99, + "grad_norm": 9.680079164267342, + "learning_rate": 5.845210619631614e-09, + "loss": 0.6813, + "step": 11216 + }, + { + "epoch": 0.99, + "grad_norm": 6.564230461124187, + "learning_rate": 5.776658897903575e-09, + "loss": 0.661, + "step": 11217 + }, + { + "epoch": 0.99, + "grad_norm": 12.484195280111146, + "learning_rate": 5.708511300421271e-09, + "loss": 0.8449, + "step": 11218 + }, + { + "epoch": 0.99, + "grad_norm": 10.846040371304296, + "learning_rate": 5.6407678326980686e-09, + "loss": 0.8218, + "step": 11219 + }, + { + "epoch": 0.99, + "grad_norm": 12.783929231010564, + "learning_rate": 5.573428500216805e-09, + "loss": 0.6961, + "step": 11220 + }, + { + "epoch": 0.99, + "grad_norm": 5.983937428997436, + "learning_rate": 5.506493308425342e-09, + "loss": 0.7768, + "step": 11221 + }, + { + "epoch": 0.99, + "grad_norm": 14.263532506786298, + "learning_rate": 5.4399622627410164e-09, + "loss": 0.8541, + "step": 11222 + }, + { + "epoch": 0.99, + "grad_norm": 9.575005328082609, + "learning_rate": 5.373835368546743e-09, + "loss": 0.6594, + "step": 11223 + }, + { + "epoch": 0.99, + "grad_norm": 7.351712027343101, + "learning_rate": 5.3081126311937955e-09, + "loss": 0.8215, + "step": 11224 + }, + { + "epoch": 0.99, + "grad_norm": 18.30644820229641, + "learning_rate": 5.242794056000145e-09, + "loss": 0.6699, + "step": 11225 + }, + { + "epoch": 0.99, + "grad_norm": 9.198055832390617, + "learning_rate": 5.177879648251005e-09, + "loss": 0.6664, + "step": 11226 + }, + { + "epoch": 0.99, + "grad_norm": 10.716767253295354, + "learning_rate": 5.113369413199398e-09, + "loss": 0.6963, + "step": 11227 + }, + { + "epoch": 0.99, + "grad_norm": 8.741096761511441, + "learning_rate": 5.049263356065037e-09, + "loss": 0.6624, + "step": 11228 + }, + { + "epoch": 0.99, + "grad_norm": 6.697316752456402, + "learning_rate": 4.9855614820359945e-09, + "loss": 0.6356, + "step": 11229 + }, + { + "epoch": 0.99, + "grad_norm": 12.23174030343673, + "learning_rate": 4.92226379626648e-09, + "loss": 0.8249, + "step": 11230 + }, + { + "epoch": 0.99, + 
"grad_norm": 7.043200020738678, + "learning_rate": 4.859370303878508e-09, + "loss": 0.7823, + "step": 11231 + }, + { + "epoch": 0.99, + "grad_norm": 14.095334422274988, + "learning_rate": 4.796881009961341e-09, + "loss": 0.6762, + "step": 11232 + }, + { + "epoch": 0.99, + "grad_norm": 9.337809508968848, + "learning_rate": 4.734795919571489e-09, + "loss": 0.6245, + "step": 11233 + }, + { + "epoch": 0.99, + "grad_norm": 9.57261811953032, + "learning_rate": 4.6731150377327115e-09, + "loss": 0.6102, + "step": 11234 + }, + { + "epoch": 0.99, + "grad_norm": 6.465788600403207, + "learning_rate": 4.611838369436017e-09, + "loss": 0.6484, + "step": 11235 + }, + { + "epoch": 0.99, + "grad_norm": 7.779072317084926, + "learning_rate": 4.550965919640215e-09, + "loss": 0.6955, + "step": 11236 + }, + { + "epoch": 0.99, + "grad_norm": 12.673182066856265, + "learning_rate": 4.490497693270812e-09, + "loss": 0.7445, + "step": 11237 + }, + { + "epoch": 0.99, + "grad_norm": 5.273116811767655, + "learning_rate": 4.4304336952205594e-09, + "loss": 0.6042, + "step": 11238 + }, + { + "epoch": 0.99, + "grad_norm": 5.993902168335021, + "learning_rate": 4.370773930350014e-09, + "loss": 0.5873, + "step": 11239 + }, + { + "epoch": 0.99, + "grad_norm": 11.175648582855958, + "learning_rate": 4.311518403486425e-09, + "loss": 0.6031, + "step": 11240 + }, + { + "epoch": 0.99, + "grad_norm": 6.890549281041674, + "learning_rate": 4.252667119424847e-09, + "loss": 0.6549, + "step": 11241 + }, + { + "epoch": 0.99, + "grad_norm": 10.166630047742958, + "learning_rate": 4.19422008292758e-09, + "loss": 0.7696, + "step": 11242 + }, + { + "epoch": 0.99, + "grad_norm": 7.68272516711729, + "learning_rate": 4.136177298724176e-09, + "loss": 0.7347, + "step": 11243 + }, + { + "epoch": 0.99, + "grad_norm": 7.060806411374999, + "learning_rate": 4.078538771510876e-09, + "loss": 0.6758, + "step": 11244 + }, + { + "epoch": 0.99, + "grad_norm": 7.5206319507270365, + "learning_rate": 4.021304505951729e-09, + "loss": 0.752, + "step": 11245 + }, + { + "epoch": 0.99, + "grad_norm": 7.60615840160054, + "learning_rate": 3.964474506678584e-09, + "loss": 0.7087, + "step": 11246 + }, + { + "epoch": 0.99, + "grad_norm": 15.724607120334875, + "learning_rate": 3.9080487782894305e-09, + "loss": 0.6953, + "step": 11247 + }, + { + "epoch": 0.99, + "grad_norm": 7.23218271262123, + "learning_rate": 3.852027325350616e-09, + "loss": 0.7295, + "step": 11248 + }, + { + "epoch": 0.99, + "grad_norm": 10.75465993459358, + "learning_rate": 3.796410152394625e-09, + "loss": 0.8366, + "step": 11249 + }, + { + "epoch": 0.99, + "grad_norm": 7.248153800586697, + "learning_rate": 3.741197263922858e-09, + "loss": 0.5993, + "step": 11250 + }, + { + "epoch": 0.99, + "grad_norm": 9.587037605234094, + "learning_rate": 3.686388664402296e-09, + "loss": 0.718, + "step": 11251 + }, + { + "epoch": 0.99, + "grad_norm": 7.426540683944996, + "learning_rate": 3.6319843582682813e-09, + "loss": 0.6456, + "step": 11252 + }, + { + "epoch": 0.99, + "grad_norm": 16.44565400659475, + "learning_rate": 3.5779843499234025e-09, + "loss": 0.6545, + "step": 11253 + }, + { + "epoch": 0.99, + "grad_norm": 14.008088200329798, + "learning_rate": 3.524388643736387e-09, + "loss": 0.8526, + "step": 11254 + }, + { + "epoch": 0.99, + "grad_norm": 7.02693489713632, + "learning_rate": 3.471197244044877e-09, + "loss": 0.6367, + "step": 11255 + }, + { + "epoch": 0.99, + "grad_norm": 7.199177218583193, + "learning_rate": 3.4184101551526516e-09, + "loss": 0.705, + "step": 11256 + }, + { + "epoch": 0.99, + "grad_norm": 
10.648356153126958, + "learning_rate": 3.366027381331294e-09, + "loss": 0.641, + "step": 11257 + }, + { + "epoch": 0.99, + "grad_norm": 5.92558834592672, + "learning_rate": 3.314048926819635e-09, + "loss": 0.6372, + "step": 11258 + }, + { + "epoch": 0.99, + "grad_norm": 7.39496776378115, + "learning_rate": 3.262474795823756e-09, + "loss": 0.7113, + "step": 11259 + }, + { + "epoch": 0.99, + "grad_norm": 10.004349129500515, + "learning_rate": 3.2113049925164286e-09, + "loss": 0.6165, + "step": 11260 + }, + { + "epoch": 0.99, + "grad_norm": 12.807482232319204, + "learning_rate": 3.1605395210387857e-09, + "loss": 0.7386, + "step": 11261 + }, + { + "epoch": 0.99, + "grad_norm": 7.344569912830076, + "learning_rate": 3.1101783854986524e-09, + "loss": 0.7172, + "step": 11262 + }, + { + "epoch": 0.99, + "grad_norm": 1.98800066590485, + "learning_rate": 3.060221589970547e-09, + "loss": 0.5065, + "step": 11263 + }, + { + "epoch": 0.99, + "grad_norm": 8.856419302611704, + "learning_rate": 3.0106691384973464e-09, + "loss": 0.7101, + "step": 11264 + }, + { + "epoch": 0.99, + "grad_norm": 10.743211725550177, + "learning_rate": 2.9615210350891764e-09, + "loss": 0.8013, + "step": 11265 + }, + { + "epoch": 0.99, + "grad_norm": 11.263574874844965, + "learning_rate": 2.912777283722301e-09, + "loss": 0.8385, + "step": 11266 + }, + { + "epoch": 0.99, + "grad_norm": 6.227769368916837, + "learning_rate": 2.864437888341343e-09, + "loss": 0.7907, + "step": 11267 + }, + { + "epoch": 0.99, + "grad_norm": 8.066111422976135, + "learning_rate": 2.8165028528576166e-09, + "loss": 0.7061, + "step": 11268 + }, + { + "epoch": 0.99, + "grad_norm": 12.569865785370059, + "learning_rate": 2.768972181150242e-09, + "loss": 0.6214, + "step": 11269 + }, + { + "epoch": 0.99, + "grad_norm": 2.4009498328540544, + "learning_rate": 2.7218458770650324e-09, + "loss": 0.4803, + "step": 11270 + }, + { + "epoch": 0.99, + "grad_norm": 8.947078359647211, + "learning_rate": 2.675123944415603e-09, + "loss": 0.782, + "step": 11271 + }, + { + "epoch": 0.99, + "grad_norm": 10.577848697993884, + "learning_rate": 2.628806386982263e-09, + "loss": 0.5661, + "step": 11272 + }, + { + "epoch": 0.99, + "grad_norm": 8.439587529942088, + "learning_rate": 2.5828932085136813e-09, + "loss": 0.7601, + "step": 11273 + }, + { + "epoch": 0.99, + "grad_norm": 7.3817144087685485, + "learning_rate": 2.5373844127241086e-09, + "loss": 0.6625, + "step": 11274 + }, + { + "epoch": 0.99, + "grad_norm": 8.735536083325009, + "learning_rate": 2.4922800032967097e-09, + "loss": 0.9194, + "step": 11275 + }, + { + "epoch": 0.99, + "grad_norm": 6.736390091881008, + "learning_rate": 2.447579983881343e-09, + "loss": 0.7369, + "step": 11276 + }, + { + "epoch": 0.99, + "grad_norm": 9.73364757433462, + "learning_rate": 2.4032843580945597e-09, + "loss": 0.5786, + "step": 11277 + }, + { + "epoch": 0.99, + "grad_norm": 7.6447126540015535, + "learning_rate": 2.35939312952127e-09, + "loss": 0.7053, + "step": 11278 + }, + { + "epoch": 0.99, + "grad_norm": 8.578075058781367, + "learning_rate": 2.3159063017119675e-09, + "loss": 0.6426, + "step": 11279 + }, + { + "epoch": 0.99, + "grad_norm": 12.050117400291567, + "learning_rate": 2.272823878187169e-09, + "loss": 0.5956, + "step": 11280 + }, + { + "epoch": 0.99, + "grad_norm": 12.549312106393897, + "learning_rate": 2.230145862431865e-09, + "loss": 0.7755, + "step": 11281 + }, + { + "epoch": 0.99, + "grad_norm": 11.75811714908447, + "learning_rate": 2.1878722578994037e-09, + "loss": 0.7715, + "step": 11282 + }, + { + "epoch": 0.99, + "grad_norm": 
2.4315566488531353, + "learning_rate": 2.146003068011493e-09, + "loss": 0.503, + "step": 11283 + }, + { + "epoch": 0.99, + "grad_norm": 6.971458832629815, + "learning_rate": 2.1045382961548676e-09, + "loss": 0.7089, + "step": 11284 + }, + { + "epoch": 0.99, + "grad_norm": 5.835638120168877, + "learning_rate": 2.063477945685732e-09, + "loss": 0.7133, + "step": 11285 + }, + { + "epoch": 0.99, + "grad_norm": 7.080530188626402, + "learning_rate": 2.022822019926429e-09, + "loss": 0.81, + "step": 11286 + }, + { + "epoch": 0.99, + "grad_norm": 8.340796613106592, + "learning_rate": 1.9825705221665493e-09, + "loss": 0.7673, + "step": 11287 + }, + { + "epoch": 0.99, + "grad_norm": 10.910889562074837, + "learning_rate": 1.942723455663487e-09, + "loss": 0.8763, + "step": 11288 + }, + { + "epoch": 0.99, + "grad_norm": 8.136062785162997, + "learning_rate": 1.9032808236413293e-09, + "loss": 0.7026, + "step": 11289 + }, + { + "epoch": 0.99, + "grad_norm": 3.1459604785872246, + "learning_rate": 1.8642426292914128e-09, + "loss": 0.4837, + "step": 11290 + }, + { + "epoch": 0.99, + "grad_norm": 8.49637140838315, + "learning_rate": 1.8256088757734324e-09, + "loss": 0.6089, + "step": 11291 + }, + { + "epoch": 0.99, + "grad_norm": 12.06691659970757, + "learning_rate": 1.787379566213221e-09, + "loss": 0.7134, + "step": 11292 + }, + { + "epoch": 0.99, + "grad_norm": 20.262257515674005, + "learning_rate": 1.7495547037038597e-09, + "loss": 0.8228, + "step": 11293 + }, + { + "epoch": 0.99, + "grad_norm": 6.658237286252517, + "learning_rate": 1.7121342913062334e-09, + "loss": 0.5821, + "step": 11294 + }, + { + "epoch": 0.99, + "grad_norm": 2.0722455249744707, + "learning_rate": 1.6751183320490305e-09, + "loss": 0.4488, + "step": 11295 + }, + { + "epoch": 0.99, + "grad_norm": 9.187233866564322, + "learning_rate": 1.6385068289265227e-09, + "loss": 0.8929, + "step": 11296 + }, + { + "epoch": 0.99, + "grad_norm": 6.920678082588645, + "learning_rate": 1.602299784901895e-09, + "loss": 0.5843, + "step": 11297 + }, + { + "epoch": 0.99, + "grad_norm": 7.163679469779699, + "learning_rate": 1.566497202904471e-09, + "loss": 0.7897, + "step": 11298 + }, + { + "epoch": 0.99, + "grad_norm": 7.577746987563259, + "learning_rate": 1.531099085831933e-09, + "loss": 0.705, + "step": 11299 + }, + { + "epoch": 0.99, + "grad_norm": 78.62294748974391, + "learning_rate": 1.496105436548101e-09, + "loss": 0.7834, + "step": 11300 + }, + { + "epoch": 0.99, + "grad_norm": 3.321977322394703, + "learning_rate": 1.4615162578851538e-09, + "loss": 0.5265, + "step": 11301 + }, + { + "epoch": 0.99, + "grad_norm": 7.617888304567917, + "learning_rate": 1.427331552641409e-09, + "loss": 0.8144, + "step": 11302 + }, + { + "epoch": 0.99, + "grad_norm": 8.61405614977529, + "learning_rate": 1.393551323583542e-09, + "loss": 0.5883, + "step": 11303 + }, + { + "epoch": 0.99, + "grad_norm": 6.911367242203922, + "learning_rate": 1.3601755734443667e-09, + "loss": 0.8488, + "step": 11304 + }, + { + "epoch": 0.99, + "grad_norm": 6.26570934585888, + "learning_rate": 1.327204304925056e-09, + "loss": 0.6214, + "step": 11305 + }, + { + "epoch": 0.99, + "grad_norm": 15.884943543410229, + "learning_rate": 1.2946375206934759e-09, + "loss": 0.6716, + "step": 11306 + }, + { + "epoch": 0.99, + "grad_norm": 10.706264741405557, + "learning_rate": 1.2624752233852956e-09, + "loss": 0.823, + "step": 11307 + }, + { + "epoch": 0.99, + "grad_norm": 8.488686696915462, + "learning_rate": 1.2307174156023228e-09, + "loss": 0.8003, + "step": 11308 + }, + { + "epoch": 0.99, + "grad_norm": 
9.25125120667783, + "learning_rate": 1.1993640999147238e-09, + "loss": 0.7401, + "step": 11309 + }, + { + "epoch": 0.99, + "grad_norm": 6.7493090057240135, + "learning_rate": 1.1684152788593583e-09, + "loss": 0.5225, + "step": 11310 + }, + { + "epoch": 0.99, + "grad_norm": 10.591339042428682, + "learning_rate": 1.1378709549408896e-09, + "loss": 0.5922, + "step": 11311 + }, + { + "epoch": 0.99, + "grad_norm": 7.372477665160945, + "learning_rate": 1.1077311306306737e-09, + "loss": 0.6785, + "step": 11312 + }, + { + "epoch": 0.99, + "grad_norm": 10.494678924716176, + "learning_rate": 1.077995808367871e-09, + "loss": 0.7435, + "step": 11313 + }, + { + "epoch": 0.99, + "grad_norm": 8.363489937912565, + "learning_rate": 1.0486649905583346e-09, + "loss": 0.6595, + "step": 11314 + }, + { + "epoch": 0.99, + "grad_norm": 6.592255152882613, + "learning_rate": 1.0197386795751662e-09, + "loss": 0.6746, + "step": 11315 + }, + { + "epoch": 0.99, + "grad_norm": 15.470121092991299, + "learning_rate": 9.912168777592713e-10, + "loss": 0.5638, + "step": 11316 + }, + { + "epoch": 0.99, + "grad_norm": 7.081801728510612, + "learning_rate": 9.630995874193584e-10, + "loss": 0.6451, + "step": 11317 + }, + { + "epoch": 0.99, + "grad_norm": 7.462146209985988, + "learning_rate": 9.353868108291643e-10, + "loss": 0.6731, + "step": 11318 + }, + { + "epoch": 0.99, + "grad_norm": 10.187931036843993, + "learning_rate": 9.080785502324496e-10, + "loss": 0.7294, + "step": 11319 + }, + { + "epoch": 0.99, + "grad_norm": 7.775005848931595, + "learning_rate": 8.811748078385584e-10, + "loss": 0.7996, + "step": 11320 + }, + { + "epoch": 0.99, + "grad_norm": 9.623494319688211, + "learning_rate": 8.546755858240829e-10, + "loss": 0.7578, + "step": 11321 + }, + { + "epoch": 0.99, + "grad_norm": 5.665131560430747, + "learning_rate": 8.285808863334188e-10, + "loss": 0.6612, + "step": 11322 + }, + { + "epoch": 0.99, + "grad_norm": 9.223865319404847, + "learning_rate": 8.028907114787654e-10, + "loss": 0.7777, + "step": 11323 + }, + { + "epoch": 0.99, + "grad_norm": 10.101560283955815, + "learning_rate": 7.776050633379051e-10, + "loss": 0.8078, + "step": 11324 + }, + { + "epoch": 0.99, + "grad_norm": 10.259592339936862, + "learning_rate": 7.527239439580892e-10, + "loss": 0.698, + "step": 11325 + }, + { + "epoch": 0.99, + "grad_norm": 18.37026517035548, + "learning_rate": 7.282473553515968e-10, + "loss": 0.575, + "step": 11326 + }, + { + "epoch": 0.99, + "grad_norm": 2.9808797354428163, + "learning_rate": 7.04175299499621e-10, + "loss": 0.4535, + "step": 11327 + }, + { + "epoch": 0.99, + "grad_norm": 14.808132953060998, + "learning_rate": 6.80507778350048e-10, + "loss": 0.8298, + "step": 11328 + }, + { + "epoch": 0.99, + "grad_norm": 8.796511663121336, + "learning_rate": 6.572447938174576e-10, + "loss": 0.5397, + "step": 11329 + }, + { + "epoch": 1.0, + "grad_norm": 2.748902168264421, + "learning_rate": 6.343863477847878e-10, + "loss": 0.5407, + "step": 11330 + }, + { + "epoch": 1.0, + "grad_norm": 7.873906389458754, + "learning_rate": 6.119324421016704e-10, + "loss": 0.7839, + "step": 11331 + }, + { + "epoch": 1.0, + "grad_norm": 11.71226145610752, + "learning_rate": 5.898830785849851e-10, + "loss": 0.6158, + "step": 11332 + }, + { + "epoch": 1.0, + "grad_norm": 6.673763110881576, + "learning_rate": 5.682382590188606e-10, + "loss": 0.6855, + "step": 11333 + }, + { + "epoch": 1.0, + "grad_norm": 9.128132596876497, + "learning_rate": 5.469979851546737e-10, + "loss": 0.7489, + "step": 11334 + }, + { + "epoch": 1.0, + "grad_norm": 
6.320225986661812, + "learning_rate": 5.261622587110493e-10, + "loss": 0.6726, + "step": 11335 + }, + { + "epoch": 1.0, + "grad_norm": 8.098607804398124, + "learning_rate": 5.057310813744165e-10, + "loss": 0.7914, + "step": 11336 + }, + { + "epoch": 1.0, + "grad_norm": 5.485346856952726, + "learning_rate": 4.857044547978973e-10, + "loss": 0.6835, + "step": 11337 + }, + { + "epoch": 1.0, + "grad_norm": 8.289428884415855, + "learning_rate": 4.66082380601307e-10, + "loss": 0.8678, + "step": 11338 + }, + { + "epoch": 1.0, + "grad_norm": 2.148977486141544, + "learning_rate": 4.4686486037337494e-10, + "loss": 0.47, + "step": 11339 + }, + { + "epoch": 1.0, + "grad_norm": 8.392439090648786, + "learning_rate": 4.280518956689683e-10, + "loss": 0.8094, + "step": 11340 + }, + { + "epoch": 1.0, + "grad_norm": 10.550596856961793, + "learning_rate": 4.096434880102029e-10, + "loss": 0.7271, + "step": 11341 + }, + { + "epoch": 1.0, + "grad_norm": 7.19341149942172, + "learning_rate": 3.916396388869981e-10, + "loss": 0.5399, + "step": 11342 + }, + { + "epoch": 1.0, + "grad_norm": 8.891729568999795, + "learning_rate": 3.7404034975541127e-10, + "loss": 0.7465, + "step": 11343 + }, + { + "epoch": 1.0, + "grad_norm": 8.245072698247114, + "learning_rate": 3.5684562204041375e-10, + "loss": 0.7657, + "step": 11344 + }, + { + "epoch": 1.0, + "grad_norm": 6.948931309441005, + "learning_rate": 3.4005545713255985e-10, + "loss": 0.9463, + "step": 11345 + }, + { + "epoch": 1.0, + "grad_norm": 7.377173380128651, + "learning_rate": 3.2366985639076255e-10, + "loss": 0.6399, + "step": 11346 + }, + { + "epoch": 1.0, + "grad_norm": 17.218463773315136, + "learning_rate": 3.076888211417384e-10, + "loss": 0.6744, + "step": 11347 + }, + { + "epoch": 1.0, + "grad_norm": 2.5880040969325417, + "learning_rate": 2.9211235267778693e-10, + "loss": 0.4314, + "step": 11348 + }, + { + "epoch": 1.0, + "grad_norm": 7.832437091057278, + "learning_rate": 2.7694045225901135e-10, + "loss": 0.6626, + "step": 11349 + }, + { + "epoch": 1.0, + "grad_norm": 5.835263402022734, + "learning_rate": 2.621731211138734e-10, + "loss": 0.7631, + "step": 11350 + }, + { + "epoch": 1.0, + "grad_norm": 8.597503120304903, + "learning_rate": 2.4781036043752813e-10, + "loss": 0.7763, + "step": 11351 + }, + { + "epoch": 1.0, + "grad_norm": 11.131950910546374, + "learning_rate": 2.338521713912689e-10, + "loss": 0.6863, + "step": 11352 + }, + { + "epoch": 1.0, + "grad_norm": 18.126606106739803, + "learning_rate": 2.2029855510474762e-10, + "loss": 0.761, + "step": 11353 + }, + { + "epoch": 1.0, + "grad_norm": 9.895845512294938, + "learning_rate": 2.071495126754197e-10, + "loss": 0.7081, + "step": 11354 + }, + { + "epoch": 1.0, + "grad_norm": 8.479427268324843, + "learning_rate": 1.9440504516687886e-10, + "loss": 0.7145, + "step": 11355 + }, + { + "epoch": 1.0, + "grad_norm": 6.465994560696873, + "learning_rate": 1.8206515360996713e-10, + "loss": 0.6857, + "step": 11356 + }, + { + "epoch": 1.0, + "grad_norm": 7.912518958110133, + "learning_rate": 1.7012983900388524e-10, + "loss": 0.5668, + "step": 11357 + }, + { + "epoch": 1.0, + "grad_norm": 2.614086228477766, + "learning_rate": 1.5859910231397213e-10, + "loss": 0.4626, + "step": 11358 + }, + { + "epoch": 1.0, + "grad_norm": 7.34747312858572, + "learning_rate": 1.4747294447337024e-10, + "loss": 0.7239, + "step": 11359 + }, + { + "epoch": 1.0, + "grad_norm": 11.746404168318461, + "learning_rate": 1.3675136638247045e-10, + "loss": 0.6753, + "step": 11360 + }, + { + "epoch": 1.0, + "grad_norm": 9.219713970011282, + 
"learning_rate": 1.2643436890835692e-10, + "loss": 0.6795, + "step": 11361 + }, + { + "epoch": 1.0, + "grad_norm": 1.9727970479503676, + "learning_rate": 1.165219528870276e-10, + "loss": 0.426, + "step": 11362 + }, + { + "epoch": 1.0, + "grad_norm": 6.658476450435527, + "learning_rate": 1.0701411911950842e-10, + "loss": 0.6689, + "step": 11363 + }, + { + "epoch": 1.0, + "grad_norm": 8.88243055552283, + "learning_rate": 9.791086837573905e-11, + "loss": 0.7331, + "step": 11364 + }, + { + "epoch": 1.0, + "grad_norm": 9.734215151513899, + "learning_rate": 8.921220139179732e-11, + "loss": 0.6404, + "step": 11365 + }, + { + "epoch": 1.0, + "grad_norm": 3.554677026636676, + "learning_rate": 8.091811887211976e-11, + "loss": 0.5305, + "step": 11366 + }, + { + "epoch": 1.0, + "grad_norm": 15.229492773838636, + "learning_rate": 7.302862148728107e-11, + "loss": 0.6944, + "step": 11367 + }, + { + "epoch": 1.0, + "grad_norm": 6.614406692362396, + "learning_rate": 6.554370987621462e-11, + "loss": 0.7062, + "step": 11368 + }, + { + "epoch": 1.0, + "grad_norm": 9.217694600695335, + "learning_rate": 5.846338464454704e-11, + "loss": 0.7081, + "step": 11369 + }, + { + "epoch": 1.0, + "grad_norm": 9.509769256035046, + "learning_rate": 5.1787646365153435e-11, + "loss": 0.7792, + "step": 11370 + }, + { + "epoch": 1.0, + "grad_norm": 18.371230802116557, + "learning_rate": 4.551649557760218e-11, + "loss": 0.6301, + "step": 11371 + }, + { + "epoch": 1.0, + "grad_norm": 6.918864516870986, + "learning_rate": 3.964993279037544e-11, + "loss": 0.7148, + "step": 11372 + }, + { + "epoch": 1.0, + "grad_norm": 8.199912675293056, + "learning_rate": 3.4187958476983306e-11, + "loss": 0.7076, + "step": 11373 + }, + { + "epoch": 1.0, + "grad_norm": 9.755054898031743, + "learning_rate": 2.9130573080959903e-11, + "loss": 0.7145, + "step": 11374 + }, + { + "epoch": 1.0, + "grad_norm": 5.779518481961051, + "learning_rate": 2.4477777010312175e-11, + "loss": 0.5838, + "step": 11375 + }, + { + "epoch": 1.0, + "grad_norm": 7.399708165361747, + "learning_rate": 2.022957064140574e-11, + "loss": 0.6191, + "step": 11376 + }, + { + "epoch": 1.0, + "grad_norm": 7.429104802644474, + "learning_rate": 1.6385954318964836e-11, + "loss": 0.7057, + "step": 11377 + }, + { + "epoch": 1.0, + "grad_norm": 7.002639278187008, + "learning_rate": 1.2946928353296806e-11, + "loss": 0.6094, + "step": 11378 + }, + { + "epoch": 1.0, + "grad_norm": 16.15553569329751, + "learning_rate": 9.912493023067627e-12, + "loss": 0.7514, + "step": 11379 + }, + { + "epoch": 1.0, + "grad_norm": 8.614808724777802, + "learning_rate": 7.282648573636586e-12, + "loss": 0.8341, + "step": 11380 + }, + { + "epoch": 1.0, + "grad_norm": 8.79739561215265, + "learning_rate": 5.057395218166505e-12, + "loss": 0.7123, + "step": 11381 + }, + { + "epoch": 1.0, + "grad_norm": 8.552974911901655, + "learning_rate": 3.236733135958403e-12, + "loss": 0.6307, + "step": 11382 + }, + { + "epoch": 1.0, + "grad_norm": 9.404130630342472, + "learning_rate": 1.820662475227053e-12, + "loss": 0.7303, + "step": 11383 + }, + { + "epoch": 1.0, + "grad_norm": 7.333714855114904, + "learning_rate": 8.091833492152035e-13, + "loss": 0.6619, + "step": 11384 + }, + { + "epoch": 1.0, + "grad_norm": 10.62656347518754, + "learning_rate": 2.0229584118958144e-13, + "loss": 0.7469, + "step": 11385 + }, + { + "epoch": 1.0, + "grad_norm": 12.968826925099858, + "learning_rate": 0.0, + "loss": 0.6043, + "step": 11386 + }, + { + "epoch": 1.0, + "step": 11386, + "total_flos": 1351424401776640.0, + "train_loss": 0.7622169985851588, 
+ "train_runtime": 96533.5199, + "train_samples_per_second": 7.549, + "train_steps_per_second": 0.118 + } + ], + "logging_steps": 1.0, + "max_steps": 11386, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "total_flos": 1351424401776640.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}