diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,45761 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6533, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 383.1503991853064, + "learning_rate": 1.0204081632653061e-07, + "loss": 3.16, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1115.042586838023, + "learning_rate": 2.0408163265306121e-07, + "loss": 3.4515, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 641.1581033180005, + "learning_rate": 3.0612244897959183e-07, + "loss": 3.4183, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 543.9001930338954, + "learning_rate": 4.0816326530612243e-07, + "loss": 3.1405, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 415.6700527460039, + "learning_rate": 5.102040816326531e-07, + "loss": 2.9573, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 560.8177502149401, + "learning_rate": 6.122448979591837e-07, + "loss": 3.0618, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 549.9535790740002, + "learning_rate": 7.142857142857143e-07, + "loss": 2.787, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 135.6880950548416, + "learning_rate": 8.163265306122449e-07, + "loss": 2.1078, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 304.45317074621744, + "learning_rate": 9.183673469387756e-07, + "loss": 2.0253, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 129.6075967509114, + "learning_rate": 1.0204081632653063e-06, + "loss": 1.987, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 116.64503619601689, + "learning_rate": 1.122448979591837e-06, + "loss": 1.7659, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 82.57909679030715, + "learning_rate": 1.2244897959183673e-06, + "loss": 1.7313, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 51.7873071107413, + "learning_rate": 1.3265306122448982e-06, + "loss": 1.6111, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 21.55146603646816, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.6476, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 52.146128170413405, + "learning_rate": 1.5306122448979593e-06, + "loss": 1.4587, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 25.018684442954445, + "learning_rate": 1.6326530612244897e-06, + "loss": 1.5048, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 10.633909350728741, + "learning_rate": 1.7346938775510206e-06, + "loss": 0.58, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 31.3525327981405, + "learning_rate": 1.8367346938775512e-06, + "loss": 1.257, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 43.10854106994972, + "learning_rate": 1.938775510204082e-06, + "loss": 1.2033, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 19.144610314079856, + "learning_rate": 2.0408163265306125e-06, + "loss": 1.1375, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 24.783699466031027, + "learning_rate": 2.1428571428571427e-06, + "loss": 1.1003, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 11.60475253722536, + "learning_rate": 2.244897959183674e-06, + "loss": 1.2054, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 10.340451063129958, + "learning_rate": 2.3469387755102044e-06, + "loss": 1.055, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 10.793621550631057, + "learning_rate": 2.4489795918367347e-06, + "loss": 1.1943, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 22.91654748783573, + "learning_rate": 2.5510204081632657e-06, + "loss": 1.1807, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 11.665266982661313, + "learning_rate": 2.6530612244897964e-06, + "loss": 1.1996, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 10.279207516291493, + "learning_rate": 2.7551020408163266e-06, + "loss": 1.1656, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 17.967120212081813, + "learning_rate": 2.8571428571428573e-06, + "loss": 1.1532, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 10.383928985415782, + "learning_rate": 2.959183673469388e-06, + "loss": 0.9899, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 18.203716579957373, + "learning_rate": 3.0612244897959185e-06, + "loss": 1.0944, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 10.071417491040497, + "learning_rate": 3.1632653061224496e-06, + "loss": 1.1039, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 11.075535174989573, + "learning_rate": 3.2653061224489794e-06, + "loss": 1.0426, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 8.406595669779419, + "learning_rate": 3.3673469387755105e-06, + "loss": 1.0862, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 5.618904672536877, + "learning_rate": 3.469387755102041e-06, + "loss": 0.4901, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 22.979117055215557, + "learning_rate": 3.5714285714285718e-06, + "loss": 1.1253, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 98.03089037716222, + "learning_rate": 3.6734693877551024e-06, + "loss": 1.1556, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 11.287036563173132, + "learning_rate": 3.7755102040816327e-06, + "loss": 1.0596, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 13.815097934554741, + "learning_rate": 3.877551020408164e-06, + "loss": 1.0359, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 13.600743070021583, + "learning_rate": 3.979591836734694e-06, + "loss": 1.0887, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 9.078085686869478, + "learning_rate": 4.081632653061225e-06, + "loss": 1.0267, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 8.085579534504056, + "learning_rate": 4.183673469387755e-06, + "loss": 1.0953, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 9.251690714050342, + "learning_rate": 4.2857142857142855e-06, + "loss": 1.0291, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 10.157487900257816, + "learning_rate": 4.3877551020408165e-06, + "loss": 1.1289, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 15.394745194540516, + "learning_rate": 4.489795918367348e-06, + "loss": 1.1724, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 8.206250385388731, + "learning_rate": 4.591836734693878e-06, + "loss": 1.0897, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 11.385889193455325, + "learning_rate": 4.693877551020409e-06, + "loss": 1.0748, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 8.013036055651966, + "learning_rate": 4.795918367346939e-06, + "loss": 1.0592, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 6.540522265960379, + "learning_rate": 4.897959183673469e-06, + "loss": 0.9947, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 5.128507543319763, + "learning_rate": 5e-06, + "loss": 0.4851, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 7.456741645011198, + "learning_rate": 5.1020408163265315e-06, + "loss": 1.1242, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 9.073951090190663, + "learning_rate": 5.204081632653062e-06, + "loss": 1.1371, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 7.848944210654246, + "learning_rate": 5.306122448979593e-06, + "loss": 0.9847, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 6.814113508474519, + "learning_rate": 5.408163265306123e-06, + "loss": 0.965, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 6.075740772611151, + "learning_rate": 5.510204081632653e-06, + "loss": 0.4762, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 8.253826117056938, + "learning_rate": 5.6122448979591834e-06, + "loss": 1.0542, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 8.609978272010776, + "learning_rate": 5.7142857142857145e-06, + "loss": 1.0704, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 9.397718068371473, + "learning_rate": 5.816326530612246e-06, + "loss": 1.1626, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 7.260133946689537, + "learning_rate": 5.918367346938776e-06, + "loss": 1.0113, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 3.852944108730934, + "learning_rate": 6.020408163265307e-06, + "loss": 0.4874, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 6.178769332745159, + "learning_rate": 6.122448979591837e-06, + "loss": 1.0644, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 7.103892360369785, + "learning_rate": 6.224489795918368e-06, + "loss": 1.0928, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 7.494779873667326, + "learning_rate": 6.326530612244899e-06, + "loss": 1.0097, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 6.629387087808804, + "learning_rate": 6.4285714285714295e-06, + "loss": 1.0211, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 7.464978091176936, + "learning_rate": 6.530612244897959e-06, + "loss": 1.0977, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 7.660183593852527, + "learning_rate": 6.63265306122449e-06, + "loss": 1.0661, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 7.831455627108557, + "learning_rate": 6.734693877551021e-06, + "loss": 1.0992, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 6.722994865015469, + "learning_rate": 6.836734693877551e-06, + "loss": 1.1489, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 7.8110902501706425, + "learning_rate": 6.938775510204082e-06, + "loss": 1.0966, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 7.14628522025491, + "learning_rate": 7.0408163265306125e-06, + "loss": 0.9751, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 6.698256800772039, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.9841, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 6.727266458576524, + "learning_rate": 7.244897959183675e-06, + "loss": 1.078, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 9.712483103175598, + "learning_rate": 7.346938775510205e-06, + "loss": 0.4774, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 7.267739903860731, + "learning_rate": 7.448979591836736e-06, + "loss": 1.0613, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 11.970987660453837, + "learning_rate": 7.551020408163265e-06, + "loss": 1.0343, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 6.088617564760351, + "learning_rate": 7.653061224489796e-06, + "loss": 0.9876, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 6.702281289345013, + "learning_rate": 7.755102040816327e-06, + "loss": 1.055, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 6.672033119575265, + "learning_rate": 7.857142857142858e-06, + "loss": 0.9966, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 5.069957763879359, + "learning_rate": 7.959183673469388e-06, + "loss": 0.5214, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 10.247445520922536, + "learning_rate": 8.06122448979592e-06, + "loss": 1.0514, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 11.516511852647083, + "learning_rate": 8.16326530612245e-06, + "loss": 1.0861, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 8.052749314346663, + "learning_rate": 8.26530612244898e-06, + "loss": 1.016, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 6.750995746740932, + "learning_rate": 8.36734693877551e-06, + "loss": 1.1089, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 8.713321176065724, + "learning_rate": 8.469387755102042e-06, + "loss": 1.0066, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 7.794509522449217, + "learning_rate": 8.571428571428571e-06, + "loss": 1.012, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 6.664116358377716, + "learning_rate": 8.673469387755103e-06, + "loss": 1.0805, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 6.320697850800005, + "learning_rate": 8.775510204081633e-06, + "loss": 1.0494, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 7.006680080855596, + "learning_rate": 8.877551020408163e-06, + "loss": 1.0981, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 6.213977754985643, + "learning_rate": 8.979591836734695e-06, + "loss": 1.0488, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 5.977104454473886, + "learning_rate": 9.081632653061225e-06, + "loss": 0.9923, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 26.93170641637084, + "learning_rate": 9.183673469387756e-06, + "loss": 1.1516, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 6.731873248521301, + "learning_rate": 9.285714285714288e-06, + "loss": 1.0429, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 6.4993308961508784, + "learning_rate": 9.387755102040818e-06, + "loss": 1.0329, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 6.509109125779879, + "learning_rate": 9.489795918367348e-06, + "loss": 1.028, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 7.2065413570334895, + "learning_rate": 9.591836734693878e-06, + "loss": 1.0646, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 9.809366591371024, + "learning_rate": 9.693877551020408e-06, + "loss": 1.1289, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 7.476657106326629, + "learning_rate": 9.795918367346939e-06, + "loss": 1.0345, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 6.927969816535037, + "learning_rate": 9.89795918367347e-06, + "loss": 0.996, + "step": 97 + }, + { + "epoch": 0.02, + "grad_norm": 59.64526210339528, + "learning_rate": 1e-05, + "loss": 1.0362, + "step": 98 + }, + { + "epoch": 0.02, + "grad_norm": 6.401085627447702, + "learning_rate": 1.0102040816326531e-05, + "loss": 1.0387, + "step": 99 + }, + { + "epoch": 0.02, + "grad_norm": 6.258010610505343, + "learning_rate": 1.0204081632653063e-05, + "loss": 1.145, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 5.661261424729024, + "learning_rate": 1.0306122448979591e-05, + "loss": 1.1126, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 6.257632267247965, + "learning_rate": 1.0408163265306123e-05, + "loss": 0.9783, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 5.410703256358511, + "learning_rate": 1.0510204081632654e-05, + "loss": 1.0303, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 6.848851190925594, + "learning_rate": 1.0612244897959186e-05, + "loss": 1.1671, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 6.5957945532873214, + "learning_rate": 1.0714285714285714e-05, + "loss": 1.1229, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 7.606270565711428, + "learning_rate": 1.0816326530612246e-05, + "loss": 1.0775, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 6.7030893221224215, + "learning_rate": 1.0918367346938776e-05, + "loss": 1.1389, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 6.739502719203268, + "learning_rate": 1.1020408163265306e-05, + "loss": 1.0237, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 5.301768842909257, + "learning_rate": 1.1122448979591838e-05, + "loss": 0.9889, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 7.18146117465339, + "learning_rate": 1.1224489795918367e-05, + "loss": 1.0543, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 6.047654607041304, + "learning_rate": 1.1326530612244899e-05, + "loss": 1.09, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 7.637823055554012, + "learning_rate": 1.1428571428571429e-05, + "loss": 1.0977, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 6.281759210863109, + "learning_rate": 1.1530612244897961e-05, + "loss": 1.1304, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 5.552251083233848, + "learning_rate": 1.1632653061224491e-05, + "loss": 1.0337, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 6.73550542774755, + "learning_rate": 1.1734693877551021e-05, + "loss": 1.0381, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 5.767902308250887, + "learning_rate": 1.1836734693877552e-05, + "loss": 1.0879, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 5.813147914121843, + "learning_rate": 1.1938775510204084e-05, + "loss": 1.0883, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 6.756174166724238, + "learning_rate": 1.2040816326530614e-05, + "loss": 0.9809, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 5.362500089300721, + "learning_rate": 1.2142857142857142e-05, + "loss": 1.0745, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 6.699334275929911, + "learning_rate": 1.2244897959183674e-05, + "loss": 1.0208, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 5.953344832033511, + "learning_rate": 1.2346938775510204e-05, + "loss": 0.9662, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 6.090119536112503, + "learning_rate": 1.2448979591836736e-05, + "loss": 1.1166, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 6.724578786244247, + "learning_rate": 1.2551020408163267e-05, + "loss": 1.0081, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 6.82491736656003, + "learning_rate": 1.2653061224489798e-05, + "loss": 1.1457, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 6.253776774271056, + "learning_rate": 1.2755102040816327e-05, + "loss": 1.1222, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 6.271747960540945, + "learning_rate": 1.2857142857142859e-05, + "loss": 1.1013, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 5.69741315980313, + "learning_rate": 1.2959183673469389e-05, + "loss": 1.0754, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 5.4875349314231565, + "learning_rate": 1.3061224489795918e-05, + "loss": 1.102, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 6.038528782577277, + "learning_rate": 1.316326530612245e-05, + "loss": 1.1194, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 5.606252307014983, + "learning_rate": 1.326530612244898e-05, + "loss": 1.1034, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 5.327163657525908, + "learning_rate": 1.3367346938775512e-05, + "loss": 1.2087, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 6.280116871322719, + "learning_rate": 1.3469387755102042e-05, + "loss": 1.0586, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 6.6456091927763365, + "learning_rate": 1.3571428571428574e-05, + "loss": 1.0209, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 6.558331129509311, + "learning_rate": 1.3673469387755102e-05, + "loss": 1.1321, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 5.498934702458065, + "learning_rate": 1.3775510204081634e-05, + "loss": 1.0218, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 5.339930806582817, + "learning_rate": 1.3877551020408165e-05, + "loss": 1.0357, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 5.454047705726223, + "learning_rate": 1.3979591836734696e-05, + "loss": 0.9997, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 5.979588983206454, + "learning_rate": 1.4081632653061225e-05, + "loss": 1.1083, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 5.50575669433855, + "learning_rate": 1.4183673469387755e-05, + "loss": 1.0733, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 7.352196459546808, + "learning_rate": 1.4285714285714287e-05, + "loss": 1.0983, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 5.932749572950468, + "learning_rate": 1.4387755102040817e-05, + "loss": 1.1568, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 6.83481653533095, + "learning_rate": 1.448979591836735e-05, + "loss": 1.1989, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 6.2113642259074915, + "learning_rate": 1.4591836734693878e-05, + "loss": 1.0765, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 5.559171013530669, + "learning_rate": 1.469387755102041e-05, + "loss": 1.2268, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 6.313659971812374, + "learning_rate": 1.479591836734694e-05, + "loss": 1.0467, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 5.643371072712547, + "learning_rate": 1.4897959183673472e-05, + "loss": 1.1269, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 6.021005714047092, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.1134, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 6.373077262357572, + "learning_rate": 1.510204081632653e-05, + "loss": 1.0595, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 6.5019027656338215, + "learning_rate": 1.5204081632653063e-05, + "loss": 1.0591, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 5.834636688292023, + "learning_rate": 1.530612244897959e-05, + "loss": 1.0933, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 5.277045405401534, + "learning_rate": 1.5408163265306123e-05, + "loss": 1.0802, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 6.249863285521951, + "learning_rate": 1.5510204081632655e-05, + "loss": 1.0813, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 5.709586843024497, + "learning_rate": 1.5612244897959187e-05, + "loss": 1.0674, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 5.445331032824768, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.9501, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 5.36900789371636, + "learning_rate": 1.5816326530612247e-05, + "loss": 1.1602, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 5.647764431722952, + "learning_rate": 1.5918367346938776e-05, + "loss": 1.1474, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 5.312425415130779, + "learning_rate": 1.6020408163265308e-05, + "loss": 1.0233, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 5.383622524455289, + "learning_rate": 1.612244897959184e-05, + "loss": 1.1282, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 5.665115899124785, + "learning_rate": 1.6224489795918368e-05, + "loss": 1.0337, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 5.249438793574112, + "learning_rate": 1.63265306122449e-05, + "loss": 1.131, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 5.182023365734747, + "learning_rate": 1.642857142857143e-05, + "loss": 1.1596, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 5.233978778860636, + "learning_rate": 1.653061224489796e-05, + "loss": 1.0368, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 5.211250025821531, + "learning_rate": 1.6632653061224492e-05, + "loss": 1.1777, + "step": 163 + }, + { + "epoch": 0.03, + "grad_norm": 5.193660311427835, + "learning_rate": 1.673469387755102e-05, + "loss": 1.0773, + "step": 164 + }, + { + "epoch": 0.03, + "grad_norm": 5.667021464427709, + "learning_rate": 1.6836734693877553e-05, + "loss": 1.1726, + "step": 165 + }, + { + "epoch": 0.03, + "grad_norm": 6.144632567111517, + "learning_rate": 1.6938775510204085e-05, + "loss": 1.028, + "step": 166 + }, + { + "epoch": 0.03, + "grad_norm": 5.419125839731695, + "learning_rate": 1.7040816326530613e-05, + "loss": 1.0666, + "step": 167 + }, + { + "epoch": 0.03, + "grad_norm": 5.5162703152902735, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.9476, + "step": 168 + }, + { + "epoch": 0.03, + "grad_norm": 5.534476185280114, + "learning_rate": 1.7244897959183674e-05, + "loss": 1.0293, + "step": 169 + }, + { + "epoch": 0.03, + "grad_norm": 5.2943079469894, + "learning_rate": 1.7346938775510206e-05, + "loss": 1.0758, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 6.076390118325341, + "learning_rate": 1.7448979591836738e-05, + "loss": 1.204, + "step": 171 + }, + { + "epoch": 0.03, + "grad_norm": 5.087148129774895, + "learning_rate": 1.7551020408163266e-05, + "loss": 1.1198, + "step": 172 + }, + { + "epoch": 0.03, + "grad_norm": 5.071546224881873, + "learning_rate": 1.7653061224489798e-05, + "loss": 1.0729, + "step": 173 + }, + { + "epoch": 0.03, + "grad_norm": 4.934479929849503, + "learning_rate": 1.7755102040816327e-05, + "loss": 1.1263, + "step": 174 + }, + { + "epoch": 0.03, + "grad_norm": 5.5097148641155105, + "learning_rate": 1.785714285714286e-05, + "loss": 1.0998, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 4.933561500487126, + "learning_rate": 1.795918367346939e-05, + "loss": 1.0783, + "step": 176 + }, + { + "epoch": 0.03, + "grad_norm": 5.1969610822139884, + "learning_rate": 1.806122448979592e-05, + "loss": 1.0884, + "step": 177 + }, + { + "epoch": 0.03, + "grad_norm": 5.219171188346219, + "learning_rate": 1.816326530612245e-05, + "loss": 1.139, + "step": 178 + }, + { + "epoch": 0.03, + "grad_norm": 5.238974571818712, + "learning_rate": 1.826530612244898e-05, + "loss": 1.096, + "step": 179 + }, + { + "epoch": 0.03, + "grad_norm": 5.321624515588689, + "learning_rate": 1.836734693877551e-05, + "loss": 1.1538, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 6.287719028369076, + "learning_rate": 1.8469387755102043e-05, + "loss": 1.1504, + "step": 181 + }, + { + "epoch": 0.03, + "grad_norm": 4.816101595539008, + "learning_rate": 1.8571428571428575e-05, + "loss": 1.0856, + "step": 182 + }, + { + "epoch": 0.03, + "grad_norm": 5.421835257556857, + "learning_rate": 1.8673469387755104e-05, + "loss": 1.115, + "step": 183 + }, + { + "epoch": 0.03, + "grad_norm": 5.226520103966431, + "learning_rate": 1.8775510204081636e-05, + "loss": 1.1927, + "step": 184 + }, + { + "epoch": 0.03, + "grad_norm": 4.957358233989192, + "learning_rate": 1.8877551020408164e-05, + "loss": 1.067, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 4.884140241314185, + "learning_rate": 1.8979591836734696e-05, + "loss": 1.0874, + "step": 186 + }, + { + "epoch": 0.03, + "grad_norm": 4.744041077689365, + "learning_rate": 1.9081632653061225e-05, + "loss": 1.0818, + "step": 187 + }, + { + "epoch": 0.03, + "grad_norm": 5.5741498454057234, + "learning_rate": 1.9183673469387756e-05, + "loss": 1.0791, + "step": 188 + }, + { + "epoch": 0.03, + "grad_norm": 6.252343995041189, + "learning_rate": 1.928571428571429e-05, + "loss": 1.1573, + "step": 189 + }, + { + "epoch": 0.03, + "grad_norm": 5.436600844797034, + "learning_rate": 1.9387755102040817e-05, + "loss": 1.0886, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 5.686800742042656, + "learning_rate": 1.948979591836735e-05, + "loss": 1.0898, + "step": 191 + }, + { + "epoch": 0.03, + "grad_norm": 5.3941981721298164, + "learning_rate": 1.9591836734693877e-05, + "loss": 1.1444, + "step": 192 + }, + { + "epoch": 0.03, + "grad_norm": 5.043454999129311, + "learning_rate": 1.969387755102041e-05, + "loss": 1.0928, + "step": 193 + }, + { + "epoch": 0.03, + "grad_norm": 5.371089443369653, + "learning_rate": 1.979591836734694e-05, + "loss": 1.1281, + "step": 194 + }, + { + "epoch": 0.03, + "grad_norm": 5.203990082279683, + "learning_rate": 1.9897959183673473e-05, + "loss": 1.1808, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 4.997545641343556, + "learning_rate": 2e-05, + "loss": 1.144, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 5.456843142113312, + "learning_rate": 1.999999877114023e-05, + "loss": 1.0797, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 5.453312121763317, + "learning_rate": 1.9999995084561225e-05, + "loss": 1.0321, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 10.309253158294316, + "learning_rate": 1.9999988940263887e-05, + "loss": 0.7624, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 5.513880161766579, + "learning_rate": 1.9999980338249726e-05, + "loss": 1.1025, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 5.668581887524097, + "learning_rate": 1.999996927852086e-05, + "loss": 1.1191, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 5.402701531746498, + "learning_rate": 1.9999955761080003e-05, + "loss": 1.0652, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 4.9881038307362, + "learning_rate": 1.999993978593048e-05, + "loss": 1.1601, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 5.586499941037915, + "learning_rate": 1.9999921353076216e-05, + "loss": 1.1691, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 5.46705629801981, + "learning_rate": 1.9999900462521743e-05, + "loss": 1.1723, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 59.17034445169865, + "learning_rate": 1.9999877114272194e-05, + "loss": 0.9822, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 6.176551551254107, + "learning_rate": 1.9999851308333305e-05, + "loss": 1.1339, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 6.182693806023538, + "learning_rate": 1.9999823044711424e-05, + "loss": 0.9911, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 4.95969238154806, + "learning_rate": 1.9999792323413492e-05, + "loss": 1.1375, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 4.304652344445716, + "learning_rate": 1.9999759144447062e-05, + "loss": 1.0383, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 4.736874041587823, + "learning_rate": 1.9999723507820288e-05, + "loss": 1.0764, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 4.647646408322613, + "learning_rate": 1.999968541354193e-05, + "loss": 1.2264, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 5.084198319034713, + "learning_rate": 1.9999644861621346e-05, + "loss": 1.1324, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 4.938224249926442, + "learning_rate": 1.9999601852068507e-05, + "loss": 1.102, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 6.063465444477739, + "learning_rate": 1.9999556384893984e-05, + "loss": 1.0721, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 5.564512063889825, + "learning_rate": 1.999950846010895e-05, + "loss": 1.1503, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 5.414783073934225, + "learning_rate": 1.999945807772518e-05, + "loss": 1.061, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 5.011468038105515, + "learning_rate": 1.999940523775506e-05, + "loss": 1.1042, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 4.833874683418772, + "learning_rate": 1.999934994021158e-05, + "loss": 0.9945, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 4.799836412952987, + "learning_rate": 1.9999292185108322e-05, + "loss": 1.0803, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 4.554987452781492, + "learning_rate": 1.999923197245949e-05, + "loss": 1.0829, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 5.372674266383977, + "learning_rate": 1.9999169302279874e-05, + "loss": 1.1556, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 5.657197518806744, + "learning_rate": 1.9999104174584885e-05, + "loss": 1.0696, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 4.728835803379367, + "learning_rate": 1.999903658939052e-05, + "loss": 1.0844, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 5.828075849629188, + "learning_rate": 1.99989665467134e-05, + "loss": 1.0699, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 5.769915650459503, + "learning_rate": 1.9998894046570735e-05, + "loss": 1.1587, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 6.069781347033537, + "learning_rate": 1.9998819088980338e-05, + "loss": 1.211, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 5.27532701759076, + "learning_rate": 1.999874167396064e-05, + "loss": 1.1225, + "step": 228 + }, + { + "epoch": 0.04, + "grad_norm": 5.972816909113727, + "learning_rate": 1.999866180153066e-05, + "loss": 1.2299, + "step": 229 + }, + { + "epoch": 0.04, + "grad_norm": 12.328508278998827, + "learning_rate": 1.9998579471710037e-05, + "loss": 1.1446, + "step": 230 + }, + { + "epoch": 0.04, + "grad_norm": 4.761237265785631, + "learning_rate": 1.9998494684519e-05, + "loss": 1.0745, + "step": 231 + }, + { + "epoch": 0.04, + "grad_norm": 6.0094751510628495, + "learning_rate": 1.9998407439978383e-05, + "loss": 1.1848, + "step": 232 + }, + { + "epoch": 0.04, + "grad_norm": 4.687867135519146, + "learning_rate": 1.9998317738109638e-05, + "loss": 1.1307, + "step": 233 + }, + { + "epoch": 0.04, + "grad_norm": 5.373549487869426, + "learning_rate": 1.99982255789348e-05, + "loss": 1.1336, + "step": 234 + }, + { + "epoch": 0.04, + "grad_norm": 4.491520583566882, + "learning_rate": 1.999813096247653e-05, + "loss": 1.1269, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 4.5109900214204846, + "learning_rate": 1.999803388875808e-05, + "loss": 1.1576, + "step": 236 + }, + { + "epoch": 0.04, + "grad_norm": 4.897322807826128, + "learning_rate": 1.9997934357803302e-05, + "loss": 1.1426, + "step": 237 + }, + { + "epoch": 0.04, + "grad_norm": 4.737081073192212, + "learning_rate": 1.999783236963666e-05, + "loss": 0.9822, + "step": 238 + }, + { + "epoch": 0.04, + "grad_norm": 507.51763352048624, + "learning_rate": 1.9997727924283227e-05, + "loss": 2.0016, + "step": 239 + }, + { + "epoch": 0.04, + "grad_norm": 6.720695674654614, + "learning_rate": 1.9997621021768663e-05, + "loss": 1.0005, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 5.22021102440788, + "learning_rate": 1.9997511662119248e-05, + "loss": 1.2467, + "step": 241 + }, + { + "epoch": 0.04, + "grad_norm": 4.925981261258009, + "learning_rate": 1.9997399845361853e-05, + "loss": 1.1494, + "step": 242 + }, + { + "epoch": 0.04, + "grad_norm": 6.326969030434556, + "learning_rate": 1.9997285571523966e-05, + "loss": 1.2098, + "step": 243 + }, + { + "epoch": 0.04, + "grad_norm": 4.852488413814456, + "learning_rate": 1.999716884063367e-05, + "loss": 1.1044, + "step": 244 + }, + { + "epoch": 0.04, + "grad_norm": 4.981439821590262, + "learning_rate": 1.9997049652719655e-05, + "loss": 1.1347, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 298.8321545901624, + "learning_rate": 1.999692800781121e-05, + "loss": 0.9508, + "step": 246 + }, + { + "epoch": 0.04, + "grad_norm": 5.030060046557593, + "learning_rate": 1.9996803905938237e-05, + "loss": 1.1103, + "step": 247 + }, + { + "epoch": 0.04, + "grad_norm": 4.209425242514626, + "learning_rate": 1.9996677347131237e-05, + "loss": 1.0899, + "step": 248 + }, + { + "epoch": 0.04, + "grad_norm": 4.786140168586079, + "learning_rate": 1.999654833142131e-05, + "loss": 1.2291, + "step": 249 + }, + { + "epoch": 0.04, + "grad_norm": 4.675732962915219, + "learning_rate": 1.9996416858840167e-05, + "loss": 1.1652, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 5.486052375807091, + "learning_rate": 1.999628292942012e-05, + "loss": 1.1669, + "step": 251 + }, + { + "epoch": 0.04, + "grad_norm": 5.513586744808445, + "learning_rate": 1.9996146543194086e-05, + "loss": 1.2232, + "step": 252 + }, + { + "epoch": 0.04, + "grad_norm": 5.482304872859721, + "learning_rate": 1.9996007700195583e-05, + "loss": 1.0857, + "step": 253 + }, + { + "epoch": 0.04, + "grad_norm": 7.219060691629565, + "learning_rate": 1.9995866400458736e-05, + "loss": 1.0267, + "step": 254 + }, + { + "epoch": 0.04, + "grad_norm": 4.629086694340683, + "learning_rate": 1.9995722644018275e-05, + "loss": 1.1984, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 4.554770506006259, + "learning_rate": 1.9995576430909526e-05, + "loss": 1.1749, + "step": 256 + }, + { + "epoch": 0.04, + "grad_norm": 4.486611373898943, + "learning_rate": 1.9995427761168427e-05, + "loss": 1.0657, + "step": 257 + }, + { + "epoch": 0.04, + "grad_norm": 5.086051833263698, + "learning_rate": 1.9995276634831518e-05, + "loss": 1.1945, + "step": 258 + }, + { + "epoch": 0.04, + "grad_norm": 4.784755615304702, + "learning_rate": 1.9995123051935938e-05, + "loss": 0.9962, + "step": 259 + }, + { + "epoch": 0.04, + "grad_norm": 5.1527510227553535, + "learning_rate": 1.9994967012519436e-05, + "loss": 1.0401, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 4.979116067663447, + "learning_rate": 1.9994808516620362e-05, + "loss": 0.9774, + "step": 261 + }, + { + "epoch": 0.04, + "grad_norm": 4.845044200480525, + "learning_rate": 1.999464756427767e-05, + "loss": 1.0955, + "step": 262 + }, + { + "epoch": 0.04, + "grad_norm": 4.388519751080092, + "learning_rate": 1.999448415553092e-05, + "loss": 1.0714, + "step": 263 + }, + { + "epoch": 0.04, + "grad_norm": 4.667854781851994, + "learning_rate": 1.9994318290420264e-05, + "loss": 0.9685, + "step": 264 + }, + { + "epoch": 0.04, + "grad_norm": 4.912765838925781, + "learning_rate": 1.9994149968986477e-05, + "loss": 1.2169, + "step": 265 + }, + { + "epoch": 0.04, + "grad_norm": 4.323589806398268, + "learning_rate": 1.9993979191270923e-05, + "loss": 1.0595, + "step": 266 + }, + { + "epoch": 0.04, + "grad_norm": 4.482357421422183, + "learning_rate": 1.9993805957315575e-05, + "loss": 1.1664, + "step": 267 + }, + { + "epoch": 0.04, + "grad_norm": 5.221027555569462, + "learning_rate": 1.999363026716301e-05, + "loss": 1.1076, + "step": 268 + }, + { + "epoch": 0.04, + "grad_norm": 16.703437262233603, + "learning_rate": 1.9993452120856412e-05, + "loss": 0.88, + "step": 269 + }, + { + "epoch": 0.04, + "grad_norm": 7.941943116606574, + "learning_rate": 1.9993271518439554e-05, + "loss": 1.0686, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 4.695548346288317, + "learning_rate": 1.999308845995683e-05, + "loss": 1.0999, + "step": 271 + }, + { + "epoch": 0.04, + "grad_norm": 4.326582127278831, + "learning_rate": 1.9992902945453227e-05, + "loss": 1.1071, + "step": 272 + }, + { + "epoch": 0.04, + "grad_norm": 4.5673615588610215, + "learning_rate": 1.9992714974974344e-05, + "loss": 1.1184, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 5.379449041896806, + "learning_rate": 1.9992524548566378e-05, + "loss": 1.1919, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 5.356848963709712, + "learning_rate": 1.9992331666276126e-05, + "loss": 1.0153, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 5.7495735533750665, + "learning_rate": 1.9992136328151e-05, + "loss": 1.1424, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 4.708712119538405, + "learning_rate": 1.9991938534238996e-05, + "loss": 1.0777, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 4.623074644106911, + "learning_rate": 1.9991738284588743e-05, + "loss": 1.1632, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 5.038994081163376, + "learning_rate": 1.9991535579249443e-05, + "loss": 1.1002, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 4.4993011643736915, + "learning_rate": 1.9991330418270926e-05, + "loss": 1.1014, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 4.312602735744396, + "learning_rate": 1.9991122801703606e-05, + "loss": 1.087, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 281.9379233611627, + "learning_rate": 1.9990912729598512e-05, + "loss": 1.3675, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 4.403901398769606, + "learning_rate": 1.9990700202007276e-05, + "loss": 0.9703, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 5.718875253576995, + "learning_rate": 1.999048521898213e-05, + "loss": 1.0062, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 4.383041046256885, + "learning_rate": 1.999026778057591e-05, + "loss": 1.0738, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 4.662485194619379, + "learning_rate": 1.999004788684206e-05, + "loss": 1.1438, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 4.353269823598559, + "learning_rate": 1.9989825537834623e-05, + "loss": 1.0382, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 4.508331270093196, + "learning_rate": 1.9989600733608235e-05, + "loss": 1.0688, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 15.046375086111272, + "learning_rate": 1.9989373474218163e-05, + "loss": 1.1265, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 4.7962186931442705, + "learning_rate": 1.998914375972025e-05, + "loss": 1.146, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 4.70566502248541, + "learning_rate": 1.998891159017096e-05, + "loss": 1.068, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 80.8458009620087, + "learning_rate": 1.998867696562735e-05, + "loss": 1.1027, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 4.554679405892336, + "learning_rate": 1.9988439886147082e-05, + "loss": 1.0465, + "step": 293 + }, + { + "epoch": 0.05, + "grad_norm": 4.717994795659879, + "learning_rate": 1.9988200351788425e-05, + "loss": 1.125, + "step": 294 + }, + { + "epoch": 0.05, + "grad_norm": 4.380389098649303, + "learning_rate": 1.9987958362610255e-05, + "loss": 0.9189, + "step": 295 + }, + { + "epoch": 0.05, + "grad_norm": 5.012117873767208, + "learning_rate": 1.9987713918672044e-05, + "loss": 1.2106, + "step": 296 + }, + { + "epoch": 0.05, + "grad_norm": 5.827671238106491, + "learning_rate": 1.9987467020033865e-05, + "loss": 1.1088, + "step": 297 + }, + { + "epoch": 0.05, + "grad_norm": 4.102478908570774, + "learning_rate": 1.99872176667564e-05, + "loss": 1.0611, + "step": 298 + }, + { + "epoch": 0.05, + "grad_norm": 4.919138180496703, + "learning_rate": 1.9986965858900934e-05, + "loss": 1.0139, + "step": 299 + }, + { + "epoch": 0.05, + "grad_norm": 4.634896848034215, + "learning_rate": 1.9986711596529356e-05, + "loss": 1.198, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 10.76351079430654, + "learning_rate": 1.9986454879704157e-05, + "loss": 1.208, + "step": 301 + }, + { + "epoch": 0.05, + "grad_norm": 5.003244093064284, + "learning_rate": 1.9986195708488428e-05, + "loss": 1.1318, + "step": 302 + }, + { + "epoch": 0.05, + "grad_norm": 4.3217012747656565, + "learning_rate": 1.9985934082945868e-05, + "loss": 1.0452, + "step": 303 + }, + { + "epoch": 0.05, + "grad_norm": 4.800641119661177, + "learning_rate": 1.9985670003140777e-05, + "loss": 1.1481, + "step": 304 + }, + { + "epoch": 0.05, + "grad_norm": 4.248394762455576, + "learning_rate": 1.9985403469138056e-05, + "loss": 1.098, + "step": 305 + }, + { + "epoch": 0.05, + "grad_norm": 4.557215159093832, + "learning_rate": 1.9985134481003213e-05, + "loss": 1.1635, + "step": 306 + }, + { + "epoch": 0.05, + "grad_norm": 4.411297001088896, + "learning_rate": 1.9984863038802364e-05, + "loss": 1.0967, + "step": 307 + }, + { + "epoch": 0.05, + "grad_norm": 4.619947266168186, + "learning_rate": 1.998458914260221e-05, + "loss": 1.1012, + "step": 308 + }, + { + "epoch": 0.05, + "grad_norm": 4.7941902957102895, + "learning_rate": 1.9984312792470074e-05, + "loss": 1.1977, + "step": 309 + }, + { + "epoch": 0.05, + "grad_norm": 4.78262219409935, + "learning_rate": 1.9984033988473874e-05, + "loss": 1.1142, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 4.6109737402679, + "learning_rate": 1.9983752730682132e-05, + "loss": 1.1662, + "step": 311 + }, + { + "epoch": 0.05, + "grad_norm": 12.660705540197382, + "learning_rate": 1.9983469019163976e-05, + "loss": 1.1834, + "step": 312 + }, + { + "epoch": 0.05, + "grad_norm": 4.477088184986276, + "learning_rate": 1.998318285398913e-05, + "loss": 1.1383, + "step": 313 + }, + { + "epoch": 0.05, + "grad_norm": 4.494691763849153, + "learning_rate": 1.998289423522793e-05, + "loss": 1.1171, + "step": 314 + }, + { + "epoch": 0.05, + "grad_norm": 4.583589314263017, + "learning_rate": 1.9982603162951303e-05, + "loss": 1.0342, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 13.239258176680355, + "learning_rate": 1.9982309637230796e-05, + "loss": 1.0779, + "step": 316 + }, + { + "epoch": 0.05, + "grad_norm": 4.533301393135521, + "learning_rate": 1.9982013658138544e-05, + "loss": 1.1758, + "step": 317 + }, + { + "epoch": 0.05, + "grad_norm": 4.160943400606343, + "learning_rate": 1.9981715225747287e-05, + "loss": 0.993, + "step": 318 + }, + { + "epoch": 0.05, + "grad_norm": 4.611711397681244, + "learning_rate": 1.998141434013038e-05, + "loss": 1.061, + "step": 319 + }, + { + "epoch": 0.05, + "grad_norm": 4.98935758205428, + "learning_rate": 1.9981111001361762e-05, + "loss": 1.1703, + "step": 320 + }, + { + "epoch": 0.05, + "grad_norm": 4.475805740319163, + "learning_rate": 1.9980805209515995e-05, + "loss": 1.1512, + "step": 321 + }, + { + "epoch": 0.05, + "grad_norm": 4.976352036289246, + "learning_rate": 1.9980496964668228e-05, + "loss": 1.059, + "step": 322 + }, + { + "epoch": 0.05, + "grad_norm": 4.779970748595774, + "learning_rate": 1.998018626689422e-05, + "loss": 1.1983, + "step": 323 + }, + { + "epoch": 0.05, + "grad_norm": 15.610805708531752, + "learning_rate": 1.9979873116270333e-05, + "loss": 1.1178, + "step": 324 + }, + { + "epoch": 0.05, + "grad_norm": 4.4942074564889785, + "learning_rate": 1.997955751287353e-05, + "loss": 1.0686, + "step": 325 + }, + { + "epoch": 0.05, + "grad_norm": 4.349619273852159, + "learning_rate": 1.997923945678138e-05, + "loss": 1.1128, + "step": 326 + }, + { + "epoch": 0.05, + "grad_norm": 102.78795406026858, + "learning_rate": 1.9978918948072047e-05, + "loss": 1.5781, + "step": 327 + }, + { + "epoch": 0.05, + "grad_norm": 5.167255102561554, + "learning_rate": 1.997859598682431e-05, + "loss": 1.1255, + "step": 328 + }, + { + "epoch": 0.05, + "grad_norm": 4.462766578218212, + "learning_rate": 1.997827057311753e-05, + "loss": 1.1399, + "step": 329 + }, + { + "epoch": 0.05, + "grad_norm": 4.196999251460958, + "learning_rate": 1.99779427070317e-05, + "loss": 1.0407, + "step": 330 + }, + { + "epoch": 0.05, + "grad_norm": 4.672166926366032, + "learning_rate": 1.99776123886474e-05, + "loss": 1.1284, + "step": 331 + }, + { + "epoch": 0.05, + "grad_norm": 4.394708768001454, + "learning_rate": 1.9977279618045798e-05, + "loss": 1.1958, + "step": 332 + }, + { + "epoch": 0.05, + "grad_norm": 4.585192132933774, + "learning_rate": 1.9976944395308696e-05, + "loss": 1.0985, + "step": 333 + }, + { + "epoch": 0.05, + "grad_norm": 4.732639403301168, + "learning_rate": 1.9976606720518474e-05, + "loss": 0.9398, + "step": 334 + }, + { + "epoch": 0.05, + "grad_norm": 4.385645024123697, + "learning_rate": 1.9976266593758123e-05, + "loss": 1.1773, + "step": 335 + }, + { + "epoch": 0.05, + "grad_norm": 3.883495168249728, + "learning_rate": 1.9975924015111243e-05, + "loss": 0.9511, + "step": 336 + }, + { + "epoch": 0.05, + "grad_norm": 4.378627617287336, + "learning_rate": 1.9975578984662017e-05, + "loss": 0.9816, + "step": 337 + }, + { + "epoch": 0.05, + "grad_norm": 4.272296105323741, + "learning_rate": 1.9975231502495255e-05, + "loss": 1.0512, + "step": 338 + }, + { + "epoch": 0.05, + "grad_norm": 5.1752536572488035, + "learning_rate": 1.9974881568696358e-05, + "loss": 1.1703, + "step": 339 + }, + { + "epoch": 0.05, + "grad_norm": 4.280722914541011, + "learning_rate": 1.997452918335133e-05, + "loss": 1.1305, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 5.659806010735759, + "learning_rate": 1.9974174346546767e-05, + "loss": 1.0883, + "step": 341 + }, + { + "epoch": 0.05, + "grad_norm": 5.021467650646564, + "learning_rate": 1.9973817058369892e-05, + "loss": 1.21, + "step": 342 + }, + { + "epoch": 0.05, + "grad_norm": 4.73234270501322, + "learning_rate": 1.9973457318908508e-05, + "loss": 0.982, + "step": 343 + }, + { + "epoch": 0.05, + "grad_norm": 4.808261971142038, + "learning_rate": 1.997309512825103e-05, + "loss": 1.1985, + "step": 344 + }, + { + "epoch": 0.05, + "grad_norm": 4.862801194088539, + "learning_rate": 1.9972730486486476e-05, + "loss": 1.158, + "step": 345 + }, + { + "epoch": 0.05, + "grad_norm": 4.679447242215395, + "learning_rate": 1.9972363393704466e-05, + "loss": 1.1748, + "step": 346 + }, + { + "epoch": 0.05, + "grad_norm": 4.0787881510678945, + "learning_rate": 1.997199384999522e-05, + "loss": 1.0843, + "step": 347 + }, + { + "epoch": 0.05, + "grad_norm": 4.801558853286235, + "learning_rate": 1.997162185544956e-05, + "loss": 1.0041, + "step": 348 + }, + { + "epoch": 0.05, + "grad_norm": 4.679054095707912, + "learning_rate": 1.9971247410158908e-05, + "loss": 0.972, + "step": 349 + }, + { + "epoch": 0.05, + "grad_norm": 4.383388745457075, + "learning_rate": 1.99708705142153e-05, + "loss": 1.0952, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 4.590760880880462, + "learning_rate": 1.9970491167711365e-05, + "loss": 1.0535, + "step": 351 + }, + { + "epoch": 0.05, + "grad_norm": 4.253862279071871, + "learning_rate": 1.9970109370740333e-05, + "loss": 1.1829, + "step": 352 + }, + { + "epoch": 0.05, + "grad_norm": 4.440583224131077, + "learning_rate": 1.996972512339604e-05, + "loss": 1.0307, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 3.708308805584549, + "learning_rate": 1.9969338425772918e-05, + "loss": 1.1596, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 3.982991930831451, + "learning_rate": 1.996894927796602e-05, + "loss": 1.0943, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 4.5351066336760235, + "learning_rate": 1.9968557680070972e-05, + "loss": 1.2282, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 4.088719223079842, + "learning_rate": 1.996816363218403e-05, + "loss": 1.018, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 4.403197360776069, + "learning_rate": 1.9967767134402033e-05, + "loss": 1.0434, + "step": 358 + }, + { + "epoch": 0.05, + "grad_norm": 4.3813635695825655, + "learning_rate": 1.9967368186822428e-05, + "loss": 0.9905, + "step": 359 + }, + { + "epoch": 0.06, + "grad_norm": 4.229914743790464, + "learning_rate": 1.996696678954327e-05, + "loss": 1.1482, + "step": 360 + }, + { + "epoch": 0.06, + "grad_norm": 4.770387275611607, + "learning_rate": 1.996656294266321e-05, + "loss": 1.1022, + "step": 361 + }, + { + "epoch": 0.06, + "grad_norm": 4.874988333761614, + "learning_rate": 1.9966156646281502e-05, + "loss": 1.1424, + "step": 362 + }, + { + "epoch": 0.06, + "grad_norm": 3.9995638240159233, + "learning_rate": 1.9965747900498002e-05, + "loss": 0.9644, + "step": 363 + }, + { + "epoch": 0.06, + "grad_norm": 3.9612943598465606, + "learning_rate": 1.9965336705413167e-05, + "loss": 1.1147, + "step": 364 + }, + { + "epoch": 0.06, + "grad_norm": 8.956813036420114, + "learning_rate": 1.996492306112806e-05, + "loss": 1.0985, + "step": 365 + }, + { + "epoch": 0.06, + "grad_norm": 4.552523678043079, + "learning_rate": 1.9964506967744336e-05, + "loss": 1.0996, + "step": 366 + }, + { + "epoch": 0.06, + "grad_norm": 4.438890757180122, + "learning_rate": 1.9964088425364267e-05, + "loss": 1.1752, + "step": 367 + }, + { + "epoch": 0.06, + "grad_norm": 4.816508118861078, + "learning_rate": 1.9963667434090717e-05, + "loss": 1.104, + "step": 368 + }, + { + "epoch": 0.06, + "grad_norm": 3.858878038488043, + "learning_rate": 1.9963243994027157e-05, + "loss": 1.1146, + "step": 369 + }, + { + "epoch": 0.06, + "grad_norm": 8.368493036325697, + "learning_rate": 1.9962818105277648e-05, + "loss": 1.1827, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 4.1246671758312585, + "learning_rate": 1.9962389767946867e-05, + "loss": 1.0241, + "step": 371 + }, + { + "epoch": 0.06, + "grad_norm": 3.9237526131257954, + "learning_rate": 1.9961958982140088e-05, + "loss": 1.035, + "step": 372 + }, + { + "epoch": 0.06, + "grad_norm": 3.9771041368400177, + "learning_rate": 1.996152574796318e-05, + "loss": 1.004, + "step": 373 + }, + { + "epoch": 0.06, + "grad_norm": 5.265715900162166, + "learning_rate": 1.996109006552263e-05, + "loss": 1.1111, + "step": 374 + }, + { + "epoch": 0.06, + "grad_norm": 4.32195360794682, + "learning_rate": 1.9960651934925514e-05, + "loss": 1.0701, + "step": 375 + }, + { + "epoch": 0.06, + "grad_norm": 3.977376902921775, + "learning_rate": 1.9960211356279502e-05, + "loss": 1.0774, + "step": 376 + }, + { + "epoch": 0.06, + "grad_norm": 4.4335843341897005, + "learning_rate": 1.9959768329692884e-05, + "loss": 1.1743, + "step": 377 + }, + { + "epoch": 0.06, + "grad_norm": 3.873209849255563, + "learning_rate": 1.995932285527455e-05, + "loss": 1.0233, + "step": 378 + }, + { + "epoch": 0.06, + "grad_norm": 4.401801021300256, + "learning_rate": 1.995887493313397e-05, + "loss": 1.1243, + "step": 379 + }, + { + "epoch": 0.06, + "grad_norm": 4.386100362770559, + "learning_rate": 1.9958424563381245e-05, + "loss": 1.1078, + "step": 380 + }, + { + "epoch": 0.06, + "grad_norm": 4.403668567877387, + "learning_rate": 1.9957971746127052e-05, + "loss": 1.0611, + "step": 381 + }, + { + "epoch": 0.06, + "grad_norm": 4.496062950052071, + "learning_rate": 1.9957516481482686e-05, + "loss": 1.0897, + "step": 382 + }, + { + "epoch": 0.06, + "grad_norm": 5.1495625285379605, + "learning_rate": 1.9957058769560042e-05, + "loss": 1.1743, + "step": 383 + }, + { + "epoch": 0.06, + "grad_norm": 4.460766501417975, + "learning_rate": 1.9956598610471603e-05, + "loss": 1.1613, + "step": 384 + }, + { + "epoch": 0.06, + "grad_norm": 4.389722614887968, + "learning_rate": 1.995613600433047e-05, + "loss": 1.0859, + "step": 385 + }, + { + "epoch": 0.06, + "grad_norm": 4.576442531385308, + "learning_rate": 1.995567095125034e-05, + "loss": 1.0435, + "step": 386 + }, + { + "epoch": 0.06, + "grad_norm": 4.237018164006678, + "learning_rate": 1.9955203451345507e-05, + "loss": 1.0716, + "step": 387 + }, + { + "epoch": 0.06, + "grad_norm": 5.028493746861455, + "learning_rate": 1.9954733504730868e-05, + "loss": 1.0211, + "step": 388 + }, + { + "epoch": 0.06, + "grad_norm": 4.362387612595332, + "learning_rate": 1.9954261111521926e-05, + "loss": 1.0603, + "step": 389 + }, + { + "epoch": 0.06, + "grad_norm": 4.280742489164971, + "learning_rate": 1.9953786271834777e-05, + "loss": 1.0198, + "step": 390 + }, + { + "epoch": 0.06, + "grad_norm": 4.343266350378013, + "learning_rate": 1.9953308985786132e-05, + "loss": 1.1851, + "step": 391 + }, + { + "epoch": 0.06, + "grad_norm": 5.382126821153117, + "learning_rate": 1.9952829253493287e-05, + "loss": 1.1069, + "step": 392 + }, + { + "epoch": 0.06, + "grad_norm": 4.4645047350214515, + "learning_rate": 1.995234707507415e-05, + "loss": 1.0897, + "step": 393 + }, + { + "epoch": 0.06, + "grad_norm": 4.347303636479825, + "learning_rate": 1.9951862450647227e-05, + "loss": 1.0755, + "step": 394 + }, + { + "epoch": 0.06, + "grad_norm": 6.687625133699968, + "learning_rate": 1.9951375380331622e-05, + "loss": 1.0315, + "step": 395 + }, + { + "epoch": 0.06, + "grad_norm": 4.909428928532073, + "learning_rate": 1.9950885864247043e-05, + "loss": 1.1643, + "step": 396 + }, + { + "epoch": 0.06, + "grad_norm": 4.578689500603283, + "learning_rate": 1.9950393902513804e-05, + "loss": 1.0898, + "step": 397 + }, + { + "epoch": 0.06, + "grad_norm": 10.609108223587599, + "learning_rate": 1.9949899495252816e-05, + "loss": 1.0755, + "step": 398 + }, + { + "epoch": 0.06, + "grad_norm": 4.841143079264003, + "learning_rate": 1.9949402642585585e-05, + "loss": 1.1456, + "step": 399 + }, + { + "epoch": 0.06, + "grad_norm": 4.629729376883061, + "learning_rate": 1.9948903344634223e-05, + "loss": 1.065, + "step": 400 + }, + { + "epoch": 0.06, + "grad_norm": 55.86395081501506, + "learning_rate": 1.994840160152145e-05, + "loss": 1.9193, + "step": 401 + }, + { + "epoch": 0.06, + "grad_norm": 3.6236513738541767, + "learning_rate": 1.9947897413370575e-05, + "loss": 1.0813, + "step": 402 + }, + { + "epoch": 0.06, + "grad_norm": 5.37260215681647, + "learning_rate": 1.9947390780305515e-05, + "loss": 1.2356, + "step": 403 + }, + { + "epoch": 0.06, + "grad_norm": 4.394105492668787, + "learning_rate": 1.9946881702450788e-05, + "loss": 1.0607, + "step": 404 + }, + { + "epoch": 0.06, + "grad_norm": 89.50489669809592, + "learning_rate": 1.994637017993151e-05, + "loss": 1.5628, + "step": 405 + }, + { + "epoch": 0.06, + "grad_norm": 5.597609858089008, + "learning_rate": 1.9945856212873394e-05, + "loss": 1.0847, + "step": 406 + }, + { + "epoch": 0.06, + "grad_norm": 4.459506128727871, + "learning_rate": 1.9945339801402768e-05, + "loss": 1.0432, + "step": 407 + }, + { + "epoch": 0.06, + "grad_norm": 4.071685345913883, + "learning_rate": 1.9944820945646543e-05, + "loss": 1.0675, + "step": 408 + }, + { + "epoch": 0.06, + "grad_norm": 4.231922265254759, + "learning_rate": 1.9944299645732242e-05, + "loss": 1.0479, + "step": 409 + }, + { + "epoch": 0.06, + "grad_norm": 4.389508171925526, + "learning_rate": 1.9943775901787988e-05, + "loss": 1.0723, + "step": 410 + }, + { + "epoch": 0.06, + "grad_norm": 4.503983891850023, + "learning_rate": 1.99432497139425e-05, + "loss": 1.0749, + "step": 411 + }, + { + "epoch": 0.06, + "grad_norm": 4.152011570105329, + "learning_rate": 1.9942721082325103e-05, + "loss": 1.0135, + "step": 412 + }, + { + "epoch": 0.06, + "grad_norm": 50.68316350833893, + "learning_rate": 1.9942190007065715e-05, + "loss": 1.1019, + "step": 413 + }, + { + "epoch": 0.06, + "grad_norm": 5.500780906339276, + "learning_rate": 1.9941656488294865e-05, + "loss": 1.0915, + "step": 414 + }, + { + "epoch": 0.06, + "grad_norm": 4.191607605165063, + "learning_rate": 1.9941120526143673e-05, + "loss": 1.0326, + "step": 415 + }, + { + "epoch": 0.06, + "grad_norm": 4.330054099494727, + "learning_rate": 1.9940582120743867e-05, + "loss": 0.9911, + "step": 416 + }, + { + "epoch": 0.06, + "grad_norm": 4.511469571369914, + "learning_rate": 1.9940041272227767e-05, + "loss": 1.1471, + "step": 417 + }, + { + "epoch": 0.06, + "grad_norm": 4.329974316823594, + "learning_rate": 1.9939497980728305e-05, + "loss": 1.0064, + "step": 418 + }, + { + "epoch": 0.06, + "grad_norm": 3.7911891744849937, + "learning_rate": 1.9938952246378996e-05, + "loss": 1.0258, + "step": 419 + }, + { + "epoch": 0.06, + "grad_norm": 4.619262957851557, + "learning_rate": 1.993840406931398e-05, + "loss": 0.9322, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 4.866106813938292, + "learning_rate": 1.9937853449667975e-05, + "loss": 1.0814, + "step": 421 + }, + { + "epoch": 0.06, + "grad_norm": 4.454545061711105, + "learning_rate": 1.993730038757631e-05, + "loss": 1.1405, + "step": 422 + }, + { + "epoch": 0.06, + "grad_norm": 4.344621327807707, + "learning_rate": 1.993674488317491e-05, + "loss": 1.202, + "step": 423 + }, + { + "epoch": 0.06, + "grad_norm": 8.97729689010236, + "learning_rate": 1.9936186936600307e-05, + "loss": 1.1599, + "step": 424 + }, + { + "epoch": 0.07, + "grad_norm": 15.863469621733676, + "learning_rate": 1.9935626547989623e-05, + "loss": 1.0807, + "step": 425 + }, + { + "epoch": 0.07, + "grad_norm": 4.242651785752871, + "learning_rate": 1.9935063717480587e-05, + "loss": 1.0514, + "step": 426 + }, + { + "epoch": 0.07, + "grad_norm": 6.455541990923886, + "learning_rate": 1.9934498445211533e-05, + "loss": 1.0635, + "step": 427 + }, + { + "epoch": 0.07, + "grad_norm": 52.6839859691502, + "learning_rate": 1.993393073132138e-05, + "loss": 1.2474, + "step": 428 + }, + { + "epoch": 0.07, + "grad_norm": 4.836687324373342, + "learning_rate": 1.9933360575949666e-05, + "loss": 0.9968, + "step": 429 + }, + { + "epoch": 0.07, + "grad_norm": 4.629865348213475, + "learning_rate": 1.993278797923651e-05, + "loss": 1.1198, + "step": 430 + }, + { + "epoch": 0.07, + "grad_norm": 4.358351299783933, + "learning_rate": 1.9932212941322647e-05, + "loss": 1.0978, + "step": 431 + }, + { + "epoch": 0.07, + "grad_norm": 3.9377679799485334, + "learning_rate": 1.99316354623494e-05, + "loss": 1.0485, + "step": 432 + }, + { + "epoch": 0.07, + "grad_norm": 4.6018038205612255, + "learning_rate": 1.99310555424587e-05, + "loss": 1.0514, + "step": 433 + }, + { + "epoch": 0.07, + "grad_norm": 4.0440373728530785, + "learning_rate": 1.9930473181793072e-05, + "loss": 0.9412, + "step": 434 + }, + { + "epoch": 0.07, + "grad_norm": 4.2822041977740986, + "learning_rate": 1.992988838049565e-05, + "loss": 0.9845, + "step": 435 + }, + { + "epoch": 0.07, + "grad_norm": 4.3024294123951226, + "learning_rate": 1.992930113871016e-05, + "loss": 1.0778, + "step": 436 + }, + { + "epoch": 0.07, + "grad_norm": 4.426499951361489, + "learning_rate": 1.9928711456580924e-05, + "loss": 1.0527, + "step": 437 + }, + { + "epoch": 0.07, + "grad_norm": 3.97926425301124, + "learning_rate": 1.9928119334252874e-05, + "loss": 1.0938, + "step": 438 + }, + { + "epoch": 0.07, + "grad_norm": 4.494487628691639, + "learning_rate": 1.9927524771871537e-05, + "loss": 1.0805, + "step": 439 + }, + { + "epoch": 0.07, + "grad_norm": 4.1492409727708885, + "learning_rate": 1.992692776958304e-05, + "loss": 1.1888, + "step": 440 + }, + { + "epoch": 0.07, + "grad_norm": 3.7536887124074765, + "learning_rate": 1.9926328327534108e-05, + "loss": 1.1237, + "step": 441 + }, + { + "epoch": 0.07, + "grad_norm": 4.17207580577035, + "learning_rate": 1.9925726445872064e-05, + "loss": 1.117, + "step": 442 + }, + { + "epoch": 0.07, + "grad_norm": 4.034264466789217, + "learning_rate": 1.992512212474484e-05, + "loss": 1.038, + "step": 443 + }, + { + "epoch": 0.07, + "grad_norm": 5.935357931179911, + "learning_rate": 1.992451536430096e-05, + "loss": 0.9145, + "step": 444 + }, + { + "epoch": 0.07, + "grad_norm": 4.188855247069045, + "learning_rate": 1.9923906164689545e-05, + "loss": 1.1691, + "step": 445 + }, + { + "epoch": 0.07, + "grad_norm": 3.872423420867507, + "learning_rate": 1.9923294526060318e-05, + "loss": 0.9843, + "step": 446 + }, + { + "epoch": 0.07, + "grad_norm": 4.043213930905514, + "learning_rate": 1.992268044856361e-05, + "loss": 1.2413, + "step": 447 + }, + { + "epoch": 0.07, + "grad_norm": 3.8760649774844254, + "learning_rate": 1.992206393235034e-05, + "loss": 1.0825, + "step": 448 + }, + { + "epoch": 0.07, + "grad_norm": 3.574510132496088, + "learning_rate": 1.9921444977572026e-05, + "loss": 1.0488, + "step": 449 + }, + { + "epoch": 0.07, + "grad_norm": 9.872733544881923, + "learning_rate": 1.9920823584380797e-05, + "loss": 1.212, + "step": 450 + }, + { + "epoch": 0.07, + "grad_norm": 15.738193513731524, + "learning_rate": 1.992019975292937e-05, + "loss": 1.2698, + "step": 451 + }, + { + "epoch": 0.07, + "grad_norm": 4.1666626790638075, + "learning_rate": 1.9919573483371065e-05, + "loss": 1.0719, + "step": 452 + }, + { + "epoch": 0.07, + "grad_norm": 4.152054735117916, + "learning_rate": 1.9918944775859805e-05, + "loss": 1.0218, + "step": 453 + }, + { + "epoch": 0.07, + "grad_norm": 4.0873913265028285, + "learning_rate": 1.9918313630550107e-05, + "loss": 1.1434, + "step": 454 + }, + { + "epoch": 0.07, + "grad_norm": 3.9934330107747416, + "learning_rate": 1.9917680047597085e-05, + "loss": 1.1273, + "step": 455 + }, + { + "epoch": 0.07, + "grad_norm": 4.376873182361851, + "learning_rate": 1.9917044027156462e-05, + "loss": 1.06, + "step": 456 + }, + { + "epoch": 0.07, + "grad_norm": 4.715172465931275, + "learning_rate": 1.9916405569384548e-05, + "loss": 1.0255, + "step": 457 + }, + { + "epoch": 0.07, + "grad_norm": 68.34701161214348, + "learning_rate": 1.9915764674438263e-05, + "loss": 1.2339, + "step": 458 + }, + { + "epoch": 0.07, + "grad_norm": 4.665587219676493, + "learning_rate": 1.991512134247512e-05, + "loss": 1.0084, + "step": 459 + }, + { + "epoch": 0.07, + "grad_norm": 3.8415418746312766, + "learning_rate": 1.9914475573653228e-05, + "loss": 1.0346, + "step": 460 + }, + { + "epoch": 0.07, + "grad_norm": 3.8246643408106213, + "learning_rate": 1.9913827368131303e-05, + "loss": 1.0004, + "step": 461 + }, + { + "epoch": 0.07, + "grad_norm": 4.114748947891342, + "learning_rate": 1.991317672606866e-05, + "loss": 1.1898, + "step": 462 + }, + { + "epoch": 0.07, + "grad_norm": 3.6777417567463417, + "learning_rate": 1.9912523647625195e-05, + "loss": 1.1035, + "step": 463 + }, + { + "epoch": 0.07, + "grad_norm": 4.9174622373220895, + "learning_rate": 1.991186813296143e-05, + "loss": 0.9963, + "step": 464 + }, + { + "epoch": 0.07, + "grad_norm": 4.6798659957889805, + "learning_rate": 1.9911210182238462e-05, + "loss": 1.1222, + "step": 465 + }, + { + "epoch": 0.07, + "grad_norm": 3.879278324637952, + "learning_rate": 1.9910549795618003e-05, + "loss": 1.0055, + "step": 466 + }, + { + "epoch": 0.07, + "grad_norm": 4.0401268558658145, + "learning_rate": 1.9909886973262356e-05, + "loss": 1.1659, + "step": 467 + }, + { + "epoch": 0.07, + "grad_norm": 20.71395567573766, + "learning_rate": 1.9909221715334428e-05, + "loss": 1.1785, + "step": 468 + }, + { + "epoch": 0.07, + "grad_norm": 3.9734760731086074, + "learning_rate": 1.9908554021997715e-05, + "loss": 1.0212, + "step": 469 + }, + { + "epoch": 0.07, + "grad_norm": 5.3755313994964915, + "learning_rate": 1.990788389341632e-05, + "loss": 0.9178, + "step": 470 + }, + { + "epoch": 0.07, + "grad_norm": 3.9667867009878366, + "learning_rate": 1.990721132975494e-05, + "loss": 1.139, + "step": 471 + }, + { + "epoch": 0.07, + "grad_norm": 4.996757965018768, + "learning_rate": 1.9906536331178873e-05, + "loss": 1.0152, + "step": 472 + }, + { + "epoch": 0.07, + "grad_norm": 4.065537265184598, + "learning_rate": 1.9905858897854013e-05, + "loss": 1.0876, + "step": 473 + }, + { + "epoch": 0.07, + "grad_norm": 3.853734590158613, + "learning_rate": 1.990517902994686e-05, + "loss": 1.0335, + "step": 474 + }, + { + "epoch": 0.07, + "grad_norm": 3.7530342271361223, + "learning_rate": 1.9904496727624498e-05, + "loss": 1.1245, + "step": 475 + }, + { + "epoch": 0.07, + "grad_norm": 3.770253015681632, + "learning_rate": 1.9903811991054628e-05, + "loss": 1.0104, + "step": 476 + }, + { + "epoch": 0.07, + "grad_norm": 3.957710365597872, + "learning_rate": 1.990312482040553e-05, + "loss": 1.0127, + "step": 477 + }, + { + "epoch": 0.07, + "grad_norm": 3.9657426975966645, + "learning_rate": 1.9902435215846096e-05, + "loss": 1.1692, + "step": 478 + }, + { + "epoch": 0.07, + "grad_norm": 3.962760636201044, + "learning_rate": 1.9901743177545807e-05, + "loss": 1.0969, + "step": 479 + }, + { + "epoch": 0.07, + "grad_norm": 4.0883879467739215, + "learning_rate": 1.9901048705674752e-05, + "loss": 1.0422, + "step": 480 + }, + { + "epoch": 0.07, + "grad_norm": 4.118434571260297, + "learning_rate": 1.990035180040361e-05, + "loss": 1.0369, + "step": 481 + }, + { + "epoch": 0.07, + "grad_norm": 25.230164702697778, + "learning_rate": 1.9899652461903662e-05, + "loss": 1.0999, + "step": 482 + }, + { + "epoch": 0.07, + "grad_norm": 11.068930128392779, + "learning_rate": 1.9898950690346784e-05, + "loss": 1.0602, + "step": 483 + }, + { + "epoch": 0.07, + "grad_norm": 5.64858207036738, + "learning_rate": 1.9898246485905456e-05, + "loss": 1.238, + "step": 484 + }, + { + "epoch": 0.07, + "grad_norm": 5.167629424192774, + "learning_rate": 1.9897539848752743e-05, + "loss": 0.9315, + "step": 485 + }, + { + "epoch": 0.07, + "grad_norm": 4.325684939850164, + "learning_rate": 1.9896830779062325e-05, + "loss": 1.0613, + "step": 486 + }, + { + "epoch": 0.07, + "grad_norm": 4.591801602541418, + "learning_rate": 1.989611927700847e-05, + "loss": 1.1534, + "step": 487 + }, + { + "epoch": 0.07, + "grad_norm": 3.8731127863301777, + "learning_rate": 1.9895405342766044e-05, + "loss": 1.0965, + "step": 488 + }, + { + "epoch": 0.07, + "grad_norm": 3.9256611935348227, + "learning_rate": 1.989468897651051e-05, + "loss": 1.0757, + "step": 489 + }, + { + "epoch": 0.08, + "grad_norm": 3.842769298855828, + "learning_rate": 1.9893970178417933e-05, + "loss": 1.0648, + "step": 490 + }, + { + "epoch": 0.08, + "grad_norm": 4.188248922725501, + "learning_rate": 1.989324894866497e-05, + "loss": 1.0819, + "step": 491 + }, + { + "epoch": 0.08, + "grad_norm": 4.498270743406448, + "learning_rate": 1.9892525287428885e-05, + "loss": 1.0548, + "step": 492 + }, + { + "epoch": 0.08, + "grad_norm": 4.064522194976723, + "learning_rate": 1.989179919488753e-05, + "loss": 1.177, + "step": 493 + }, + { + "epoch": 0.08, + "grad_norm": 4.192200466678487, + "learning_rate": 1.9891070671219358e-05, + "loss": 1.0633, + "step": 494 + }, + { + "epoch": 0.08, + "grad_norm": 3.6610634108607893, + "learning_rate": 1.9890339716603424e-05, + "loss": 1.0271, + "step": 495 + }, + { + "epoch": 0.08, + "grad_norm": 4.19383464605963, + "learning_rate": 1.988960633121937e-05, + "loss": 1.1628, + "step": 496 + }, + { + "epoch": 0.08, + "grad_norm": 4.324148856756533, + "learning_rate": 1.9888870515247445e-05, + "loss": 1.1725, + "step": 497 + }, + { + "epoch": 0.08, + "grad_norm": 4.094678732832611, + "learning_rate": 1.988813226886849e-05, + "loss": 1.1152, + "step": 498 + }, + { + "epoch": 0.08, + "grad_norm": 100.17236461049514, + "learning_rate": 1.9887391592263947e-05, + "loss": 1.713, + "step": 499 + }, + { + "epoch": 0.08, + "grad_norm": 4.200236194076793, + "learning_rate": 1.9886648485615852e-05, + "loss": 1.0963, + "step": 500 + }, + { + "epoch": 0.08, + "grad_norm": 4.654556285087075, + "learning_rate": 1.9885902949106842e-05, + "loss": 1.1179, + "step": 501 + }, + { + "epoch": 0.08, + "grad_norm": 3.559411065207602, + "learning_rate": 1.988515498292015e-05, + "loss": 1.0149, + "step": 502 + }, + { + "epoch": 0.08, + "grad_norm": 11.07254697409706, + "learning_rate": 1.9884404587239597e-05, + "loss": 1.0651, + "step": 503 + }, + { + "epoch": 0.08, + "grad_norm": 3.8097851753577676, + "learning_rate": 1.9883651762249618e-05, + "loss": 0.968, + "step": 504 + }, + { + "epoch": 0.08, + "grad_norm": 4.229524204117181, + "learning_rate": 1.9882896508135236e-05, + "loss": 1.102, + "step": 505 + }, + { + "epoch": 0.08, + "grad_norm": 3.8740072573172974, + "learning_rate": 1.9882138825082066e-05, + "loss": 1.1369, + "step": 506 + }, + { + "epoch": 0.08, + "grad_norm": 3.7317673445914985, + "learning_rate": 1.9881378713276323e-05, + "loss": 0.9885, + "step": 507 + }, + { + "epoch": 0.08, + "grad_norm": 10.295999670061802, + "learning_rate": 1.9880616172904833e-05, + "loss": 1.0094, + "step": 508 + }, + { + "epoch": 0.08, + "grad_norm": 4.281729889433247, + "learning_rate": 1.9879851204154996e-05, + "loss": 0.9946, + "step": 509 + }, + { + "epoch": 0.08, + "grad_norm": 4.396280320607497, + "learning_rate": 1.9879083807214827e-05, + "loss": 1.0109, + "step": 510 + }, + { + "epoch": 0.08, + "grad_norm": 4.20457733433576, + "learning_rate": 1.9878313982272926e-05, + "loss": 1.179, + "step": 511 + }, + { + "epoch": 0.08, + "grad_norm": 4.0518090897332595, + "learning_rate": 1.9877541729518496e-05, + "loss": 1.1367, + "step": 512 + }, + { + "epoch": 0.08, + "grad_norm": 3.8944019034517012, + "learning_rate": 1.9876767049141334e-05, + "loss": 1.0746, + "step": 513 + }, + { + "epoch": 0.08, + "grad_norm": 3.7920745114516183, + "learning_rate": 1.987598994133184e-05, + "loss": 1.1664, + "step": 514 + }, + { + "epoch": 0.08, + "grad_norm": 3.9722247307359373, + "learning_rate": 1.9875210406280993e-05, + "loss": 1.1303, + "step": 515 + }, + { + "epoch": 0.08, + "grad_norm": 4.182258325736007, + "learning_rate": 1.9874428444180395e-05, + "loss": 0.9902, + "step": 516 + }, + { + "epoch": 0.08, + "grad_norm": 5.592378225884718, + "learning_rate": 1.987364405522222e-05, + "loss": 1.1012, + "step": 517 + }, + { + "epoch": 0.08, + "grad_norm": 4.105204210476816, + "learning_rate": 1.9872857239599254e-05, + "loss": 1.1137, + "step": 518 + }, + { + "epoch": 0.08, + "grad_norm": 4.167332270422541, + "learning_rate": 1.9872067997504873e-05, + "loss": 1.1159, + "step": 519 + }, + { + "epoch": 0.08, + "grad_norm": 13.08947117736944, + "learning_rate": 1.9871276329133054e-05, + "loss": 1.0862, + "step": 520 + }, + { + "epoch": 0.08, + "grad_norm": 4.049584156774763, + "learning_rate": 1.9870482234678362e-05, + "loss": 1.023, + "step": 521 + }, + { + "epoch": 0.08, + "grad_norm": 4.232270233273242, + "learning_rate": 1.9869685714335966e-05, + "loss": 1.0735, + "step": 522 + }, + { + "epoch": 0.08, + "grad_norm": 4.2805838206307625, + "learning_rate": 1.986888676830162e-05, + "loss": 1.0401, + "step": 523 + }, + { + "epoch": 0.08, + "grad_norm": 4.004363644777837, + "learning_rate": 1.9868085396771696e-05, + "loss": 1.0112, + "step": 524 + }, + { + "epoch": 0.08, + "grad_norm": 4.1293441807832805, + "learning_rate": 1.9867281599943143e-05, + "loss": 1.0584, + "step": 525 + }, + { + "epoch": 0.08, + "grad_norm": 4.060908520373464, + "learning_rate": 1.986647537801351e-05, + "loss": 0.9488, + "step": 526 + }, + { + "epoch": 0.08, + "grad_norm": 5.247746303908252, + "learning_rate": 1.9865666731180946e-05, + "loss": 1.0041, + "step": 527 + }, + { + "epoch": 0.08, + "grad_norm": 5.175237819303534, + "learning_rate": 1.9864855659644188e-05, + "loss": 1.0653, + "step": 528 + }, + { + "epoch": 0.08, + "grad_norm": 3.7167967522781247, + "learning_rate": 1.986404216360258e-05, + "loss": 1.0688, + "step": 529 + }, + { + "epoch": 0.08, + "grad_norm": 3.819542500669309, + "learning_rate": 1.986322624325606e-05, + "loss": 1.1037, + "step": 530 + }, + { + "epoch": 0.08, + "grad_norm": 4.298779141385325, + "learning_rate": 1.986240789880515e-05, + "loss": 1.1797, + "step": 531 + }, + { + "epoch": 0.08, + "grad_norm": 3.7973224850258624, + "learning_rate": 1.9861587130450983e-05, + "loss": 1.0882, + "step": 532 + }, + { + "epoch": 0.08, + "grad_norm": 14.693961767865298, + "learning_rate": 1.9860763938395276e-05, + "loss": 1.0669, + "step": 533 + }, + { + "epoch": 0.08, + "grad_norm": 4.07838118819138, + "learning_rate": 1.985993832284035e-05, + "loss": 1.1385, + "step": 534 + }, + { + "epoch": 0.08, + "grad_norm": 4.39069818425914, + "learning_rate": 1.9859110283989115e-05, + "loss": 1.0543, + "step": 535 + }, + { + "epoch": 0.08, + "grad_norm": 4.034497990877742, + "learning_rate": 1.985827982204508e-05, + "loss": 1.096, + "step": 536 + }, + { + "epoch": 0.08, + "grad_norm": 3.960533593233808, + "learning_rate": 1.9857446937212354e-05, + "loss": 1.0496, + "step": 537 + }, + { + "epoch": 0.08, + "grad_norm": 4.309808184319671, + "learning_rate": 1.9856611629695632e-05, + "loss": 0.996, + "step": 538 + }, + { + "epoch": 0.08, + "grad_norm": 3.5147331155631987, + "learning_rate": 1.985577389970021e-05, + "loss": 1.0178, + "step": 539 + }, + { + "epoch": 0.08, + "grad_norm": 3.7971287131584917, + "learning_rate": 1.9854933747431978e-05, + "loss": 1.0094, + "step": 540 + }, + { + "epoch": 0.08, + "grad_norm": 4.386234386949602, + "learning_rate": 1.9854091173097423e-05, + "loss": 1.1386, + "step": 541 + }, + { + "epoch": 0.08, + "grad_norm": 4.000079644424622, + "learning_rate": 1.9853246176903628e-05, + "loss": 0.9622, + "step": 542 + }, + { + "epoch": 0.08, + "grad_norm": 4.339334686890175, + "learning_rate": 1.9852398759058267e-05, + "loss": 1.0296, + "step": 543 + }, + { + "epoch": 0.08, + "grad_norm": 3.7739468015723268, + "learning_rate": 1.985154891976961e-05, + "loss": 1.0136, + "step": 544 + }, + { + "epoch": 0.08, + "grad_norm": 4.0151575509057436, + "learning_rate": 1.9850696659246527e-05, + "loss": 1.1141, + "step": 545 + }, + { + "epoch": 0.08, + "grad_norm": 4.124140334325582, + "learning_rate": 1.984984197769848e-05, + "loss": 1.068, + "step": 546 + }, + { + "epoch": 0.08, + "grad_norm": 6.798960767229643, + "learning_rate": 1.984898487533552e-05, + "loss": 1.042, + "step": 547 + }, + { + "epoch": 0.08, + "grad_norm": 3.88888530591446, + "learning_rate": 1.9848125352368304e-05, + "loss": 1.0092, + "step": 548 + }, + { + "epoch": 0.08, + "grad_norm": 5.590651116963613, + "learning_rate": 1.984726340900808e-05, + "loss": 1.0699, + "step": 549 + }, + { + "epoch": 0.08, + "grad_norm": 13.111484667737962, + "learning_rate": 1.9846399045466683e-05, + "loss": 1.1293, + "step": 550 + }, + { + "epoch": 0.08, + "grad_norm": 7.1949741389531106, + "learning_rate": 1.9845532261956556e-05, + "loss": 0.9913, + "step": 551 + }, + { + "epoch": 0.08, + "grad_norm": 3.577889355072513, + "learning_rate": 1.984466305869073e-05, + "loss": 0.9844, + "step": 552 + }, + { + "epoch": 0.08, + "grad_norm": 3.9399427915413283, + "learning_rate": 1.9843791435882823e-05, + "loss": 1.1503, + "step": 553 + }, + { + "epoch": 0.08, + "grad_norm": 4.8808408631804285, + "learning_rate": 1.9842917393747063e-05, + "loss": 1.0585, + "step": 554 + }, + { + "epoch": 0.08, + "grad_norm": 8.148362947766365, + "learning_rate": 1.984204093249826e-05, + "loss": 1.0008, + "step": 555 + }, + { + "epoch": 0.09, + "grad_norm": 4.022762411797265, + "learning_rate": 1.984116205235183e-05, + "loss": 1.0576, + "step": 556 + }, + { + "epoch": 0.09, + "grad_norm": 3.7848552941502684, + "learning_rate": 1.984028075352377e-05, + "loss": 1.1343, + "step": 557 + }, + { + "epoch": 0.09, + "grad_norm": 4.194440701612392, + "learning_rate": 1.9839397036230683e-05, + "loss": 1.1125, + "step": 558 + }, + { + "epoch": 0.09, + "grad_norm": 3.8417025422559186, + "learning_rate": 1.983851090068976e-05, + "loss": 1.0576, + "step": 559 + }, + { + "epoch": 0.09, + "grad_norm": 3.6424172482294517, + "learning_rate": 1.983762234711879e-05, + "loss": 1.0682, + "step": 560 + }, + { + "epoch": 0.09, + "grad_norm": 3.8586575805191674, + "learning_rate": 1.9836731375736152e-05, + "loss": 1.14, + "step": 561 + }, + { + "epoch": 0.09, + "grad_norm": 3.7510893528582527, + "learning_rate": 1.9835837986760826e-05, + "loss": 1.0344, + "step": 562 + }, + { + "epoch": 0.09, + "grad_norm": 3.8841637756939287, + "learning_rate": 1.983494218041238e-05, + "loss": 1.0781, + "step": 563 + }, + { + "epoch": 0.09, + "grad_norm": 3.638573793002325, + "learning_rate": 1.9834043956910977e-05, + "loss": 1.0021, + "step": 564 + }, + { + "epoch": 0.09, + "grad_norm": 3.84261917367866, + "learning_rate": 1.9833143316477373e-05, + "loss": 1.0403, + "step": 565 + }, + { + "epoch": 0.09, + "grad_norm": 3.99726497722348, + "learning_rate": 1.9832240259332926e-05, + "loss": 1.129, + "step": 566 + }, + { + "epoch": 0.09, + "grad_norm": 4.287995225308431, + "learning_rate": 1.9831334785699573e-05, + "loss": 1.0588, + "step": 567 + }, + { + "epoch": 0.09, + "grad_norm": 4.517804901348297, + "learning_rate": 1.9830426895799863e-05, + "loss": 1.0125, + "step": 568 + }, + { + "epoch": 0.09, + "grad_norm": 4.016138201292742, + "learning_rate": 1.9829516589856927e-05, + "loss": 1.1118, + "step": 569 + }, + { + "epoch": 0.09, + "grad_norm": 3.7553954379546117, + "learning_rate": 1.9828603868094493e-05, + "loss": 1.0246, + "step": 570 + }, + { + "epoch": 0.09, + "grad_norm": 27.1244431360396, + "learning_rate": 1.982768873073688e-05, + "loss": 1.1834, + "step": 571 + }, + { + "epoch": 0.09, + "grad_norm": 3.972960301159113, + "learning_rate": 1.9826771178009004e-05, + "loss": 0.9803, + "step": 572 + }, + { + "epoch": 0.09, + "grad_norm": 4.135770025339726, + "learning_rate": 1.9825851210136377e-05, + "loss": 1.0974, + "step": 573 + }, + { + "epoch": 0.09, + "grad_norm": 3.661648194074585, + "learning_rate": 1.98249288273451e-05, + "loss": 1.0345, + "step": 574 + }, + { + "epoch": 0.09, + "grad_norm": 4.2506882343689805, + "learning_rate": 1.9824004029861865e-05, + "loss": 1.0352, + "step": 575 + }, + { + "epoch": 0.09, + "grad_norm": 3.918479068734851, + "learning_rate": 1.9823076817913965e-05, + "loss": 1.0562, + "step": 576 + }, + { + "epoch": 0.09, + "grad_norm": 3.8531954074705776, + "learning_rate": 1.9822147191729282e-05, + "loss": 1.1355, + "step": 577 + }, + { + "epoch": 0.09, + "grad_norm": 5.093074093090599, + "learning_rate": 1.9821215151536292e-05, + "loss": 1.0082, + "step": 578 + }, + { + "epoch": 0.09, + "grad_norm": 3.74784655442073, + "learning_rate": 1.9820280697564062e-05, + "loss": 1.0291, + "step": 579 + }, + { + "epoch": 0.09, + "grad_norm": 3.8133856210226313, + "learning_rate": 1.981934383004226e-05, + "loss": 1.0908, + "step": 580 + }, + { + "epoch": 0.09, + "grad_norm": 4.275051791622801, + "learning_rate": 1.9818404549201134e-05, + "loss": 1.116, + "step": 581 + }, + { + "epoch": 0.09, + "grad_norm": 3.717147775719318, + "learning_rate": 1.981746285527154e-05, + "loss": 1.0581, + "step": 582 + }, + { + "epoch": 0.09, + "grad_norm": 4.179106753877097, + "learning_rate": 1.9816518748484918e-05, + "loss": 1.1118, + "step": 583 + }, + { + "epoch": 0.09, + "grad_norm": 3.4321594846881496, + "learning_rate": 1.9815572229073302e-05, + "loss": 1.1244, + "step": 584 + }, + { + "epoch": 0.09, + "grad_norm": 44.024519397743994, + "learning_rate": 1.9814623297269318e-05, + "loss": 1.1965, + "step": 585 + }, + { + "epoch": 0.09, + "grad_norm": 3.688970794697736, + "learning_rate": 1.981367195330619e-05, + "loss": 0.9591, + "step": 586 + }, + { + "epoch": 0.09, + "grad_norm": 4.613809556224252, + "learning_rate": 1.9812718197417732e-05, + "loss": 1.0524, + "step": 587 + }, + { + "epoch": 0.09, + "grad_norm": 19.4274900048794, + "learning_rate": 1.981176202983835e-05, + "loss": 1.0762, + "step": 588 + }, + { + "epoch": 0.09, + "grad_norm": 5.2882461921580655, + "learning_rate": 1.981080345080304e-05, + "loss": 1.1226, + "step": 589 + }, + { + "epoch": 0.09, + "grad_norm": 4.470081133562788, + "learning_rate": 1.980984246054739e-05, + "loss": 1.0775, + "step": 590 + }, + { + "epoch": 0.09, + "grad_norm": 4.236591608309034, + "learning_rate": 1.98088790593076e-05, + "loss": 1.1254, + "step": 591 + }, + { + "epoch": 0.09, + "grad_norm": 3.943827833626578, + "learning_rate": 1.9807913247320437e-05, + "loss": 1.105, + "step": 592 + }, + { + "epoch": 0.09, + "grad_norm": 5.051124059877304, + "learning_rate": 1.9806945024823268e-05, + "loss": 1.0184, + "step": 593 + }, + { + "epoch": 0.09, + "grad_norm": 3.8137043251275897, + "learning_rate": 1.9805974392054057e-05, + "loss": 1.0157, + "step": 594 + }, + { + "epoch": 0.09, + "grad_norm": 3.8810926289689256, + "learning_rate": 1.980500134925136e-05, + "loss": 1.0248, + "step": 595 + }, + { + "epoch": 0.09, + "grad_norm": 4.249114811226602, + "learning_rate": 1.9804025896654323e-05, + "loss": 1.0927, + "step": 596 + }, + { + "epoch": 0.09, + "grad_norm": 3.937037416045328, + "learning_rate": 1.9803048034502686e-05, + "loss": 0.9526, + "step": 597 + }, + { + "epoch": 0.09, + "grad_norm": 3.785040156544341, + "learning_rate": 1.9802067763036777e-05, + "loss": 1.0215, + "step": 598 + }, + { + "epoch": 0.09, + "grad_norm": 3.8696537498290473, + "learning_rate": 1.9801085082497524e-05, + "loss": 1.0136, + "step": 599 + }, + { + "epoch": 0.09, + "grad_norm": 4.042449573541756, + "learning_rate": 1.980009999312644e-05, + "loss": 0.9528, + "step": 600 + }, + { + "epoch": 0.09, + "grad_norm": 3.901999977345569, + "learning_rate": 1.979911249516563e-05, + "loss": 1.0932, + "step": 601 + }, + { + "epoch": 0.09, + "grad_norm": 3.9792433955931603, + "learning_rate": 1.9798122588857792e-05, + "loss": 1.0753, + "step": 602 + }, + { + "epoch": 0.09, + "grad_norm": 3.855871910802278, + "learning_rate": 1.9797130274446226e-05, + "loss": 0.9931, + "step": 603 + }, + { + "epoch": 0.09, + "grad_norm": 3.8407523106332966, + "learning_rate": 1.9796135552174806e-05, + "loss": 1.052, + "step": 604 + }, + { + "epoch": 0.09, + "grad_norm": 3.6097273941283787, + "learning_rate": 1.979513842228801e-05, + "loss": 1.0838, + "step": 605 + }, + { + "epoch": 0.09, + "grad_norm": 3.7398132484603246, + "learning_rate": 1.9794138885030904e-05, + "loss": 1.0803, + "step": 606 + }, + { + "epoch": 0.09, + "grad_norm": 3.9933304582799956, + "learning_rate": 1.979313694064915e-05, + "loss": 1.0301, + "step": 607 + }, + { + "epoch": 0.09, + "grad_norm": 3.8702263071544802, + "learning_rate": 1.979213258938899e-05, + "loss": 1.091, + "step": 608 + }, + { + "epoch": 0.09, + "grad_norm": 4.532117228434824, + "learning_rate": 1.9791125831497275e-05, + "loss": 1.0777, + "step": 609 + }, + { + "epoch": 0.09, + "grad_norm": 3.6877373194692176, + "learning_rate": 1.979011666722143e-05, + "loss": 1.0882, + "step": 610 + }, + { + "epoch": 0.09, + "grad_norm": 3.6714087433404483, + "learning_rate": 1.9789105096809486e-05, + "loss": 1.0265, + "step": 611 + }, + { + "epoch": 0.09, + "grad_norm": 4.253830373999391, + "learning_rate": 1.9788091120510054e-05, + "loss": 1.0392, + "step": 612 + }, + { + "epoch": 0.09, + "grad_norm": 4.383486203412253, + "learning_rate": 1.978707473857234e-05, + "loss": 0.9853, + "step": 613 + }, + { + "epoch": 0.09, + "grad_norm": 3.9207782444723676, + "learning_rate": 1.9786055951246145e-05, + "loss": 1.0359, + "step": 614 + }, + { + "epoch": 0.09, + "grad_norm": 4.27299707690402, + "learning_rate": 1.978503475878186e-05, + "loss": 1.0598, + "step": 615 + }, + { + "epoch": 0.09, + "grad_norm": 3.611938859351543, + "learning_rate": 1.978401116143046e-05, + "loss": 1.0947, + "step": 616 + }, + { + "epoch": 0.09, + "grad_norm": 4.110523917481957, + "learning_rate": 1.9782985159443516e-05, + "loss": 1.0986, + "step": 617 + }, + { + "epoch": 0.09, + "grad_norm": 3.9721702902162335, + "learning_rate": 1.97819567530732e-05, + "loss": 0.9685, + "step": 618 + }, + { + "epoch": 0.09, + "grad_norm": 3.6766477960601947, + "learning_rate": 1.9780925942572254e-05, + "loss": 0.9494, + "step": 619 + }, + { + "epoch": 0.09, + "grad_norm": 4.257434563692485, + "learning_rate": 1.9779892728194033e-05, + "loss": 1.1736, + "step": 620 + }, + { + "epoch": 0.1, + "grad_norm": 3.7810009161622755, + "learning_rate": 1.977885711019246e-05, + "loss": 0.9732, + "step": 621 + }, + { + "epoch": 0.1, + "grad_norm": 4.154657757370879, + "learning_rate": 1.977781908882207e-05, + "loss": 1.0125, + "step": 622 + }, + { + "epoch": 0.1, + "grad_norm": 12.90341959750237, + "learning_rate": 1.9776778664337983e-05, + "loss": 1.0735, + "step": 623 + }, + { + "epoch": 0.1, + "grad_norm": 4.029745128260756, + "learning_rate": 1.97757358369959e-05, + "loss": 1.0611, + "step": 624 + }, + { + "epoch": 0.1, + "grad_norm": 3.6947993759190734, + "learning_rate": 1.9774690607052113e-05, + "loss": 0.9721, + "step": 625 + }, + { + "epoch": 0.1, + "grad_norm": 3.5784447673937834, + "learning_rate": 1.9773642974763518e-05, + "loss": 1.0404, + "step": 626 + }, + { + "epoch": 0.1, + "grad_norm": 4.071737235973688, + "learning_rate": 1.9772592940387592e-05, + "loss": 0.997, + "step": 627 + }, + { + "epoch": 0.1, + "grad_norm": 3.618782283976266, + "learning_rate": 1.9771540504182404e-05, + "loss": 1.148, + "step": 628 + }, + { + "epoch": 0.1, + "grad_norm": 3.3777647657574272, + "learning_rate": 1.9770485666406612e-05, + "loss": 1.0647, + "step": 629 + }, + { + "epoch": 0.1, + "grad_norm": 3.811698216066833, + "learning_rate": 1.976942842731947e-05, + "loss": 1.1163, + "step": 630 + }, + { + "epoch": 0.1, + "grad_norm": 3.8554878867521087, + "learning_rate": 1.976836878718081e-05, + "loss": 1.0056, + "step": 631 + }, + { + "epoch": 0.1, + "grad_norm": 3.5595599483392077, + "learning_rate": 1.9767306746251073e-05, + "loss": 1.0362, + "step": 632 + }, + { + "epoch": 0.1, + "grad_norm": 4.355698503630512, + "learning_rate": 1.976624230479127e-05, + "loss": 1.0318, + "step": 633 + }, + { + "epoch": 0.1, + "grad_norm": 3.5800019812910033, + "learning_rate": 1.976517546306301e-05, + "loss": 1.143, + "step": 634 + }, + { + "epoch": 0.1, + "grad_norm": 4.109303971927999, + "learning_rate": 1.9764106221328495e-05, + "loss": 1.1507, + "step": 635 + }, + { + "epoch": 0.1, + "grad_norm": 11.341830505198706, + "learning_rate": 1.976303457985052e-05, + "loss": 0.9271, + "step": 636 + }, + { + "epoch": 0.1, + "grad_norm": 6.158302554726382, + "learning_rate": 1.9761960538892456e-05, + "loss": 1.0866, + "step": 637 + }, + { + "epoch": 0.1, + "grad_norm": 3.6323990269274447, + "learning_rate": 1.9760884098718277e-05, + "loss": 1.0624, + "step": 638 + }, + { + "epoch": 0.1, + "grad_norm": 3.593221906283679, + "learning_rate": 1.9759805259592543e-05, + "loss": 0.9801, + "step": 639 + }, + { + "epoch": 0.1, + "grad_norm": 3.8310072853709722, + "learning_rate": 1.97587240217804e-05, + "loss": 1.1142, + "step": 640 + }, + { + "epoch": 0.1, + "grad_norm": 3.708340807762302, + "learning_rate": 1.975764038554758e-05, + "loss": 1.065, + "step": 641 + }, + { + "epoch": 0.1, + "grad_norm": 3.602682523381259, + "learning_rate": 1.9756554351160423e-05, + "loss": 1.0641, + "step": 642 + }, + { + "epoch": 0.1, + "grad_norm": 3.677081539155567, + "learning_rate": 1.9755465918885836e-05, + "loss": 1.0378, + "step": 643 + }, + { + "epoch": 0.1, + "grad_norm": 3.7668871395570487, + "learning_rate": 1.975437508899133e-05, + "loss": 1.1344, + "step": 644 + }, + { + "epoch": 0.1, + "grad_norm": 3.8359506126491563, + "learning_rate": 1.9753281861745e-05, + "loss": 0.9694, + "step": 645 + }, + { + "epoch": 0.1, + "grad_norm": 4.206408482603931, + "learning_rate": 1.9752186237415528e-05, + "loss": 1.0533, + "step": 646 + }, + { + "epoch": 0.1, + "grad_norm": 3.7597389829258656, + "learning_rate": 1.9751088216272186e-05, + "loss": 1.1411, + "step": 647 + }, + { + "epoch": 0.1, + "grad_norm": 3.7120014142519246, + "learning_rate": 1.9749987798584845e-05, + "loss": 1.0662, + "step": 648 + }, + { + "epoch": 0.1, + "grad_norm": 3.9987562421524383, + "learning_rate": 1.9748884984623952e-05, + "loss": 1.0139, + "step": 649 + }, + { + "epoch": 0.1, + "grad_norm": 4.282466973540785, + "learning_rate": 1.974777977466054e-05, + "loss": 1.0623, + "step": 650 + }, + { + "epoch": 0.1, + "grad_norm": 4.193144935671022, + "learning_rate": 1.9746672168966253e-05, + "loss": 1.0129, + "step": 651 + }, + { + "epoch": 0.1, + "grad_norm": 3.481908768091186, + "learning_rate": 1.9745562167813302e-05, + "loss": 0.9982, + "step": 652 + }, + { + "epoch": 0.1, + "grad_norm": 3.7044461584994797, + "learning_rate": 1.9744449771474496e-05, + "loss": 1.0352, + "step": 653 + }, + { + "epoch": 0.1, + "grad_norm": 4.21153552187076, + "learning_rate": 1.9743334980223224e-05, + "loss": 0.9555, + "step": 654 + }, + { + "epoch": 0.1, + "grad_norm": 3.597834834338504, + "learning_rate": 1.9742217794333483e-05, + "loss": 1.0555, + "step": 655 + }, + { + "epoch": 0.1, + "grad_norm": 4.509560249870177, + "learning_rate": 1.9741098214079833e-05, + "loss": 1.122, + "step": 656 + }, + { + "epoch": 0.1, + "grad_norm": 3.634147646970652, + "learning_rate": 1.9739976239737444e-05, + "loss": 1.1282, + "step": 657 + }, + { + "epoch": 0.1, + "grad_norm": 3.9355859624871243, + "learning_rate": 1.973885187158206e-05, + "loss": 1.0385, + "step": 658 + }, + { + "epoch": 0.1, + "grad_norm": 3.9223226174844936, + "learning_rate": 1.9737725109890027e-05, + "loss": 1.155, + "step": 659 + }, + { + "epoch": 0.1, + "grad_norm": 3.839110455653833, + "learning_rate": 1.9736595954938263e-05, + "loss": 0.9182, + "step": 660 + }, + { + "epoch": 0.1, + "grad_norm": 3.3843046936219263, + "learning_rate": 1.973546440700429e-05, + "loss": 1.0475, + "step": 661 + }, + { + "epoch": 0.1, + "grad_norm": 3.95335162441481, + "learning_rate": 1.9734330466366204e-05, + "loss": 1.1071, + "step": 662 + }, + { + "epoch": 0.1, + "grad_norm": 3.5083022418894405, + "learning_rate": 1.97331941333027e-05, + "loss": 1.0379, + "step": 663 + }, + { + "epoch": 0.1, + "grad_norm": 12.503320132172616, + "learning_rate": 1.9732055408093055e-05, + "loss": 1.095, + "step": 664 + }, + { + "epoch": 0.1, + "grad_norm": 3.8602096098013314, + "learning_rate": 1.973091429101714e-05, + "loss": 0.9738, + "step": 665 + }, + { + "epoch": 0.1, + "grad_norm": 3.567515287128138, + "learning_rate": 1.9729770782355402e-05, + "loss": 1.0115, + "step": 666 + }, + { + "epoch": 0.1, + "grad_norm": 4.1619498585102885, + "learning_rate": 1.9728624882388887e-05, + "loss": 1.113, + "step": 667 + }, + { + "epoch": 0.1, + "grad_norm": 3.8403129128241567, + "learning_rate": 1.972747659139923e-05, + "loss": 1.0106, + "step": 668 + }, + { + "epoch": 0.1, + "grad_norm": 3.6732666376119303, + "learning_rate": 1.972632590966864e-05, + "loss": 1.0863, + "step": 669 + }, + { + "epoch": 0.1, + "grad_norm": 8.63342630749796, + "learning_rate": 1.9725172837479926e-05, + "loss": 0.9528, + "step": 670 + }, + { + "epoch": 0.1, + "grad_norm": 27.87691370932522, + "learning_rate": 1.9724017375116482e-05, + "loss": 1.176, + "step": 671 + }, + { + "epoch": 0.1, + "grad_norm": 3.683702726110297, + "learning_rate": 1.972285952286229e-05, + "loss": 1.1573, + "step": 672 + }, + { + "epoch": 0.1, + "grad_norm": 3.617695052292551, + "learning_rate": 1.972169928100191e-05, + "loss": 1.0399, + "step": 673 + }, + { + "epoch": 0.1, + "grad_norm": 4.297740360065807, + "learning_rate": 1.9720536649820506e-05, + "loss": 0.9832, + "step": 674 + }, + { + "epoch": 0.1, + "grad_norm": 3.9857184734105227, + "learning_rate": 1.9719371629603815e-05, + "loss": 1.0523, + "step": 675 + }, + { + "epoch": 0.1, + "grad_norm": 3.5057614162854858, + "learning_rate": 1.9718204220638172e-05, + "loss": 1.0747, + "step": 676 + }, + { + "epoch": 0.1, + "grad_norm": 3.682742988164712, + "learning_rate": 1.9717034423210482e-05, + "loss": 1.1584, + "step": 677 + }, + { + "epoch": 0.1, + "grad_norm": 3.903662120563867, + "learning_rate": 1.9715862237608257e-05, + "loss": 1.0778, + "step": 678 + }, + { + "epoch": 0.1, + "grad_norm": 4.647491392542745, + "learning_rate": 1.9714687664119588e-05, + "loss": 1.0308, + "step": 679 + }, + { + "epoch": 0.1, + "grad_norm": 3.981060186015297, + "learning_rate": 1.9713510703033146e-05, + "loss": 0.9786, + "step": 680 + }, + { + "epoch": 0.1, + "grad_norm": 3.9494055759818574, + "learning_rate": 1.97123313546382e-05, + "loss": 1.1111, + "step": 681 + }, + { + "epoch": 0.1, + "grad_norm": 3.7899296812945624, + "learning_rate": 1.9711149619224604e-05, + "loss": 1.0374, + "step": 682 + }, + { + "epoch": 0.1, + "grad_norm": 3.871215861412619, + "learning_rate": 1.9709965497082786e-05, + "loss": 1.111, + "step": 683 + }, + { + "epoch": 0.1, + "grad_norm": 4.67068255262732, + "learning_rate": 1.9708778988503777e-05, + "loss": 1.2062, + "step": 684 + }, + { + "epoch": 0.1, + "grad_norm": 3.657131507652763, + "learning_rate": 1.9707590093779185e-05, + "loss": 1.0664, + "step": 685 + }, + { + "epoch": 0.11, + "grad_norm": 3.8816693045054795, + "learning_rate": 1.9706398813201207e-05, + "loss": 0.9103, + "step": 686 + }, + { + "epoch": 0.11, + "grad_norm": 3.9452131690989654, + "learning_rate": 1.970520514706263e-05, + "loss": 1.0652, + "step": 687 + }, + { + "epoch": 0.11, + "grad_norm": 3.8074508879557953, + "learning_rate": 1.9704009095656818e-05, + "loss": 1.0803, + "step": 688 + }, + { + "epoch": 0.11, + "grad_norm": 3.6545433112322137, + "learning_rate": 1.9702810659277728e-05, + "loss": 1.0225, + "step": 689 + }, + { + "epoch": 0.11, + "grad_norm": 3.981129458125146, + "learning_rate": 1.9701609838219907e-05, + "loss": 0.9947, + "step": 690 + }, + { + "epoch": 0.11, + "grad_norm": 3.9045146363435643, + "learning_rate": 1.9700406632778476e-05, + "loss": 1.0019, + "step": 691 + }, + { + "epoch": 0.11, + "grad_norm": 4.32347733310266, + "learning_rate": 1.9699201043249155e-05, + "loss": 1.0846, + "step": 692 + }, + { + "epoch": 0.11, + "grad_norm": 3.686360328821075, + "learning_rate": 1.969799306992824e-05, + "loss": 1.0297, + "step": 693 + }, + { + "epoch": 0.11, + "grad_norm": 4.01988978049729, + "learning_rate": 1.9696782713112622e-05, + "loss": 1.0701, + "step": 694 + }, + { + "epoch": 0.11, + "grad_norm": 3.8563763177269594, + "learning_rate": 1.9695569973099765e-05, + "loss": 1.0882, + "step": 695 + }, + { + "epoch": 0.11, + "grad_norm": 3.937134848854293, + "learning_rate": 1.9694354850187735e-05, + "loss": 1.1373, + "step": 696 + }, + { + "epoch": 0.11, + "grad_norm": 3.8809806630437658, + "learning_rate": 1.969313734467517e-05, + "loss": 1.1474, + "step": 697 + }, + { + "epoch": 0.11, + "grad_norm": 3.6916313779988994, + "learning_rate": 1.96919174568613e-05, + "loss": 0.9033, + "step": 698 + }, + { + "epoch": 0.11, + "grad_norm": 3.508260916496033, + "learning_rate": 1.969069518704594e-05, + "loss": 1.0931, + "step": 699 + }, + { + "epoch": 0.11, + "grad_norm": 4.763611485514735, + "learning_rate": 1.9689470535529486e-05, + "loss": 0.9209, + "step": 700 + }, + { + "epoch": 0.11, + "grad_norm": 4.03664208026575, + "learning_rate": 1.968824350261293e-05, + "loss": 0.9931, + "step": 701 + }, + { + "epoch": 0.11, + "grad_norm": 3.4671068542228993, + "learning_rate": 1.9687014088597835e-05, + "loss": 1.1389, + "step": 702 + }, + { + "epoch": 0.11, + "grad_norm": 3.6583966015737657, + "learning_rate": 1.968578229378636e-05, + "loss": 1.1128, + "step": 703 + }, + { + "epoch": 0.11, + "grad_norm": 4.10644984836611, + "learning_rate": 1.9684548118481243e-05, + "loss": 1.0528, + "step": 704 + }, + { + "epoch": 0.11, + "grad_norm": 3.777346025347817, + "learning_rate": 1.9683311562985814e-05, + "loss": 1.1407, + "step": 705 + }, + { + "epoch": 0.11, + "grad_norm": 4.164046825464703, + "learning_rate": 1.9682072627603983e-05, + "loss": 1.0261, + "step": 706 + }, + { + "epoch": 0.11, + "grad_norm": 3.8758635415084726, + "learning_rate": 1.968083131264024e-05, + "loss": 1.1707, + "step": 707 + }, + { + "epoch": 0.11, + "grad_norm": 3.12795773688994, + "learning_rate": 1.9679587618399674e-05, + "loss": 0.9405, + "step": 708 + }, + { + "epoch": 0.11, + "grad_norm": 4.441454785504363, + "learning_rate": 1.9678341545187942e-05, + "loss": 0.953, + "step": 709 + }, + { + "epoch": 0.11, + "grad_norm": 3.834429816258187, + "learning_rate": 1.96770930933113e-05, + "loss": 1.0836, + "step": 710 + }, + { + "epoch": 0.11, + "grad_norm": 3.631866186382932, + "learning_rate": 1.9675842263076582e-05, + "loss": 1.0692, + "step": 711 + }, + { + "epoch": 0.11, + "grad_norm": 3.6349093277364863, + "learning_rate": 1.96745890547912e-05, + "loss": 1.0568, + "step": 712 + }, + { + "epoch": 0.11, + "grad_norm": 3.989233135030243, + "learning_rate": 1.967333346876316e-05, + "loss": 0.9956, + "step": 713 + }, + { + "epoch": 0.11, + "grad_norm": 3.5976223876570796, + "learning_rate": 1.967207550530106e-05, + "loss": 1.1034, + "step": 714 + }, + { + "epoch": 0.11, + "grad_norm": 3.5445659972537817, + "learning_rate": 1.967081516471406e-05, + "loss": 0.9946, + "step": 715 + }, + { + "epoch": 0.11, + "grad_norm": 3.962320605519689, + "learning_rate": 1.9669552447311924e-05, + "loss": 1.0553, + "step": 716 + }, + { + "epoch": 0.11, + "grad_norm": 3.617516912860955, + "learning_rate": 1.9668287353404985e-05, + "loss": 1.0419, + "step": 717 + }, + { + "epoch": 0.11, + "grad_norm": 3.9464756935533716, + "learning_rate": 1.9667019883304174e-05, + "loss": 1.0305, + "step": 718 + }, + { + "epoch": 0.11, + "grad_norm": 4.000106682444183, + "learning_rate": 1.9665750037320997e-05, + "loss": 1.0838, + "step": 719 + }, + { + "epoch": 0.11, + "grad_norm": 3.6435817212177857, + "learning_rate": 1.9664477815767547e-05, + "loss": 1.0185, + "step": 720 + }, + { + "epoch": 0.11, + "grad_norm": 4.0570616965153805, + "learning_rate": 1.96632032189565e-05, + "loss": 1.1407, + "step": 721 + }, + { + "epoch": 0.11, + "grad_norm": 4.486195508863955, + "learning_rate": 1.9661926247201114e-05, + "loss": 1.1585, + "step": 722 + }, + { + "epoch": 0.11, + "grad_norm": 4.229993019271557, + "learning_rate": 1.966064690081524e-05, + "loss": 1.0752, + "step": 723 + }, + { + "epoch": 0.11, + "grad_norm": 4.189231501099848, + "learning_rate": 1.9659365180113297e-05, + "loss": 1.0476, + "step": 724 + }, + { + "epoch": 0.11, + "grad_norm": 3.7017992871809446, + "learning_rate": 1.96580810854103e-05, + "loss": 1.0926, + "step": 725 + }, + { + "epoch": 0.11, + "grad_norm": 3.44021198429482, + "learning_rate": 1.965679461702185e-05, + "loss": 0.9926, + "step": 726 + }, + { + "epoch": 0.11, + "grad_norm": 3.803164724153642, + "learning_rate": 1.9655505775264112e-05, + "loss": 1.1952, + "step": 727 + }, + { + "epoch": 0.11, + "grad_norm": 3.7636211547252727, + "learning_rate": 1.965421456045385e-05, + "loss": 1.1308, + "step": 728 + }, + { + "epoch": 0.11, + "grad_norm": 14.558419284935113, + "learning_rate": 1.9652920972908417e-05, + "loss": 1.2105, + "step": 729 + }, + { + "epoch": 0.11, + "grad_norm": 3.620390642949841, + "learning_rate": 1.9651625012945735e-05, + "loss": 1.0653, + "step": 730 + }, + { + "epoch": 0.11, + "grad_norm": 3.8772827825120344, + "learning_rate": 1.9650326680884312e-05, + "loss": 0.9776, + "step": 731 + }, + { + "epoch": 0.11, + "grad_norm": 3.7144554851138603, + "learning_rate": 1.9649025977043248e-05, + "loss": 0.992, + "step": 732 + }, + { + "epoch": 0.11, + "grad_norm": 3.7390114991851795, + "learning_rate": 1.9647722901742214e-05, + "loss": 0.9189, + "step": 733 + }, + { + "epoch": 0.11, + "grad_norm": 4.052214849078998, + "learning_rate": 1.964641745530147e-05, + "loss": 1.033, + "step": 734 + }, + { + "epoch": 0.11, + "grad_norm": 3.806759472316589, + "learning_rate": 1.9645109638041863e-05, + "loss": 1.0422, + "step": 735 + }, + { + "epoch": 0.11, + "grad_norm": 38.73670037211717, + "learning_rate": 1.964379945028481e-05, + "loss": 1.2231, + "step": 736 + }, + { + "epoch": 0.11, + "grad_norm": 3.5072756289465525, + "learning_rate": 1.9642486892352326e-05, + "loss": 1.0772, + "step": 737 + }, + { + "epoch": 0.11, + "grad_norm": 3.7031172727863413, + "learning_rate": 1.9641171964566993e-05, + "loss": 1.1608, + "step": 738 + }, + { + "epoch": 0.11, + "grad_norm": 3.8411867594705162, + "learning_rate": 1.9639854667251995e-05, + "loss": 1.0666, + "step": 739 + }, + { + "epoch": 0.11, + "grad_norm": 3.8576674197679814, + "learning_rate": 1.9638535000731075e-05, + "loss": 1.1506, + "step": 740 + }, + { + "epoch": 0.11, + "grad_norm": 4.2951006484863985, + "learning_rate": 1.9637212965328574e-05, + "loss": 1.0101, + "step": 741 + }, + { + "epoch": 0.11, + "grad_norm": 3.3545177240029, + "learning_rate": 1.9635888561369415e-05, + "loss": 1.0713, + "step": 742 + }, + { + "epoch": 0.11, + "grad_norm": 3.4714756904233024, + "learning_rate": 1.9634561789179093e-05, + "loss": 1.0043, + "step": 743 + }, + { + "epoch": 0.11, + "grad_norm": 3.3900929988252564, + "learning_rate": 1.9633232649083697e-05, + "loss": 0.9865, + "step": 744 + }, + { + "epoch": 0.11, + "grad_norm": 3.8653969256444016, + "learning_rate": 1.9631901141409888e-05, + "loss": 1.0011, + "step": 745 + }, + { + "epoch": 0.11, + "grad_norm": 3.5304687792541385, + "learning_rate": 1.9630567266484918e-05, + "loss": 1.0675, + "step": 746 + }, + { + "epoch": 0.11, + "grad_norm": 3.41936470088297, + "learning_rate": 1.962923102463661e-05, + "loss": 1.0268, + "step": 747 + }, + { + "epoch": 0.11, + "grad_norm": 3.9386474133604734, + "learning_rate": 1.962789241619338e-05, + "loss": 0.9598, + "step": 748 + }, + { + "epoch": 0.11, + "grad_norm": 3.7646981028998376, + "learning_rate": 1.962655144148422e-05, + "loss": 1.0279, + "step": 749 + }, + { + "epoch": 0.11, + "grad_norm": 3.60321616972536, + "learning_rate": 1.9625208100838698e-05, + "loss": 1.0293, + "step": 750 + }, + { + "epoch": 0.11, + "grad_norm": 3.524009445596224, + "learning_rate": 1.9623862394586978e-05, + "loss": 1.043, + "step": 751 + }, + { + "epoch": 0.12, + "grad_norm": 3.749208823784576, + "learning_rate": 1.9622514323059788e-05, + "loss": 1.0363, + "step": 752 + }, + { + "epoch": 0.12, + "grad_norm": 3.5714432297536622, + "learning_rate": 1.9621163886588457e-05, + "loss": 0.9095, + "step": 753 + }, + { + "epoch": 0.12, + "grad_norm": 3.644928681020688, + "learning_rate": 1.9619811085504877e-05, + "loss": 1.1088, + "step": 754 + }, + { + "epoch": 0.12, + "grad_norm": 3.5243190898327037, + "learning_rate": 1.961845592014153e-05, + "loss": 1.08, + "step": 755 + }, + { + "epoch": 0.12, + "grad_norm": 3.5845755162216784, + "learning_rate": 1.961709839083147e-05, + "loss": 1.0601, + "step": 756 + }, + { + "epoch": 0.12, + "grad_norm": 3.8386738684758286, + "learning_rate": 1.9615738497908355e-05, + "loss": 1.109, + "step": 757 + }, + { + "epoch": 0.12, + "grad_norm": 3.4281897633822194, + "learning_rate": 1.96143762417064e-05, + "loss": 1.0573, + "step": 758 + }, + { + "epoch": 0.12, + "grad_norm": 3.9133448707630483, + "learning_rate": 1.961301162256041e-05, + "loss": 1.0018, + "step": 759 + }, + { + "epoch": 0.12, + "grad_norm": 3.623091036318239, + "learning_rate": 1.961164464080577e-05, + "loss": 1.0683, + "step": 760 + }, + { + "epoch": 0.12, + "grad_norm": 3.801994531879226, + "learning_rate": 1.9610275296778443e-05, + "loss": 1.0179, + "step": 761 + }, + { + "epoch": 0.12, + "grad_norm": 3.742948802159206, + "learning_rate": 1.960890359081498e-05, + "loss": 0.9554, + "step": 762 + }, + { + "epoch": 0.12, + "grad_norm": 3.6508770514039557, + "learning_rate": 1.9607529523252507e-05, + "loss": 0.9559, + "step": 763 + }, + { + "epoch": 0.12, + "grad_norm": 4.021960305430146, + "learning_rate": 1.960615309442873e-05, + "loss": 1.0517, + "step": 764 + }, + { + "epoch": 0.12, + "grad_norm": 3.593832974448563, + "learning_rate": 1.9604774304681937e-05, + "loss": 1.1218, + "step": 765 + }, + { + "epoch": 0.12, + "grad_norm": 3.624834274865465, + "learning_rate": 1.9603393154350996e-05, + "loss": 0.9769, + "step": 766 + }, + { + "epoch": 0.12, + "grad_norm": 6.1752990912282915, + "learning_rate": 1.960200964377535e-05, + "loss": 1.1958, + "step": 767 + }, + { + "epoch": 0.12, + "grad_norm": 3.8413333660780666, + "learning_rate": 1.960062377329504e-05, + "loss": 0.9992, + "step": 768 + }, + { + "epoch": 0.12, + "grad_norm": 4.261820789196503, + "learning_rate": 1.9599235543250662e-05, + "loss": 1.0441, + "step": 769 + }, + { + "epoch": 0.12, + "grad_norm": 4.1863418239141374, + "learning_rate": 1.959784495398341e-05, + "loss": 1.1747, + "step": 770 + }, + { + "epoch": 0.12, + "grad_norm": 3.6590672543008886, + "learning_rate": 1.959645200583505e-05, + "loss": 1.0349, + "step": 771 + }, + { + "epoch": 0.12, + "grad_norm": 3.838377478601075, + "learning_rate": 1.959505669914793e-05, + "loss": 1.1176, + "step": 772 + }, + { + "epoch": 0.12, + "grad_norm": 3.7519830934354257, + "learning_rate": 1.9593659034264973e-05, + "loss": 1.0497, + "step": 773 + }, + { + "epoch": 0.12, + "grad_norm": 3.980442885458292, + "learning_rate": 1.9592259011529694e-05, + "loss": 1.0741, + "step": 774 + }, + { + "epoch": 0.12, + "grad_norm": 4.041357306903657, + "learning_rate": 1.9590856631286173e-05, + "loss": 0.9991, + "step": 775 + }, + { + "epoch": 0.12, + "grad_norm": 3.719310496292942, + "learning_rate": 1.9589451893879075e-05, + "loss": 1.0681, + "step": 776 + }, + { + "epoch": 0.12, + "grad_norm": 3.648060644724543, + "learning_rate": 1.958804479965365e-05, + "loss": 1.0148, + "step": 777 + }, + { + "epoch": 0.12, + "grad_norm": 4.385002892701598, + "learning_rate": 1.9586635348955726e-05, + "loss": 1.122, + "step": 778 + }, + { + "epoch": 0.12, + "grad_norm": 3.288261943500257, + "learning_rate": 1.9585223542131694e-05, + "loss": 1.0396, + "step": 779 + }, + { + "epoch": 0.12, + "grad_norm": 3.462958609822878, + "learning_rate": 1.9583809379528543e-05, + "loss": 0.9797, + "step": 780 + }, + { + "epoch": 0.12, + "grad_norm": 3.405964285120037, + "learning_rate": 1.9582392861493835e-05, + "loss": 1.0695, + "step": 781 + }, + { + "epoch": 0.12, + "grad_norm": 3.6432920238058637, + "learning_rate": 1.958097398837571e-05, + "loss": 0.9781, + "step": 782 + }, + { + "epoch": 0.12, + "grad_norm": 3.481847514892652, + "learning_rate": 1.9579552760522887e-05, + "loss": 1.0203, + "step": 783 + }, + { + "epoch": 0.12, + "grad_norm": 4.578362002892191, + "learning_rate": 1.9578129178284664e-05, + "loss": 1.0154, + "step": 784 + }, + { + "epoch": 0.12, + "grad_norm": 4.002478093346479, + "learning_rate": 1.9576703242010923e-05, + "loss": 1.1206, + "step": 785 + }, + { + "epoch": 0.12, + "grad_norm": 3.9881379135003465, + "learning_rate": 1.9575274952052105e-05, + "loss": 1.0538, + "step": 786 + }, + { + "epoch": 0.12, + "grad_norm": 4.081683690181119, + "learning_rate": 1.957384430875926e-05, + "loss": 0.9941, + "step": 787 + }, + { + "epoch": 0.12, + "grad_norm": 3.7175468212827343, + "learning_rate": 1.957241131248399e-05, + "loss": 1.0754, + "step": 788 + }, + { + "epoch": 0.12, + "grad_norm": 5.534881287893714, + "learning_rate": 1.9570975963578485e-05, + "loss": 0.9715, + "step": 789 + }, + { + "epoch": 0.12, + "grad_norm": 3.432903947303448, + "learning_rate": 1.9569538262395517e-05, + "loss": 0.9574, + "step": 790 + }, + { + "epoch": 0.12, + "grad_norm": 9.796886352729334, + "learning_rate": 1.9568098209288436e-05, + "loss": 1.1519, + "step": 791 + }, + { + "epoch": 0.12, + "grad_norm": 3.906061714461183, + "learning_rate": 1.9566655804611156e-05, + "loss": 1.0291, + "step": 792 + }, + { + "epoch": 0.12, + "grad_norm": 4.770470886347632, + "learning_rate": 1.956521104871819e-05, + "loss": 1.2086, + "step": 793 + }, + { + "epoch": 0.12, + "grad_norm": 4.223319458578048, + "learning_rate": 1.9563763941964615e-05, + "loss": 1.0458, + "step": 794 + }, + { + "epoch": 0.12, + "grad_norm": 3.9765422553961254, + "learning_rate": 1.956231448470609e-05, + "loss": 1.0013, + "step": 795 + }, + { + "epoch": 0.12, + "grad_norm": 3.7688427996518112, + "learning_rate": 1.9560862677298848e-05, + "loss": 1.0523, + "step": 796 + }, + { + "epoch": 0.12, + "grad_norm": 3.944295892478544, + "learning_rate": 1.9559408520099703e-05, + "loss": 0.9943, + "step": 797 + }, + { + "epoch": 0.12, + "grad_norm": 10.285120031612841, + "learning_rate": 1.955795201346605e-05, + "loss": 1.07, + "step": 798 + }, + { + "epoch": 0.12, + "grad_norm": 3.360886095298879, + "learning_rate": 1.955649315775585e-05, + "loss": 1.0184, + "step": 799 + }, + { + "epoch": 0.12, + "grad_norm": 3.453504735902985, + "learning_rate": 1.955503195332766e-05, + "loss": 1.0194, + "step": 800 + }, + { + "epoch": 0.12, + "grad_norm": 3.7720666704980377, + "learning_rate": 1.9553568400540594e-05, + "loss": 1.1322, + "step": 801 + }, + { + "epoch": 0.12, + "grad_norm": 4.250064309135509, + "learning_rate": 1.9552102499754356e-05, + "loss": 1.048, + "step": 802 + }, + { + "epoch": 0.12, + "grad_norm": 4.151007308172151, + "learning_rate": 1.955063425132922e-05, + "loss": 1.0671, + "step": 803 + }, + { + "epoch": 0.12, + "grad_norm": 4.504712656197753, + "learning_rate": 1.9549163655626044e-05, + "loss": 0.9555, + "step": 804 + }, + { + "epoch": 0.12, + "grad_norm": 3.9652043005001696, + "learning_rate": 1.954769071300626e-05, + "loss": 1.1281, + "step": 805 + }, + { + "epoch": 0.12, + "grad_norm": 3.417589770846741, + "learning_rate": 1.954621542383187e-05, + "loss": 0.9591, + "step": 806 + }, + { + "epoch": 0.12, + "grad_norm": 3.1455662258661916, + "learning_rate": 1.9544737788465465e-05, + "loss": 0.9198, + "step": 807 + }, + { + "epoch": 0.12, + "grad_norm": 3.371944981568343, + "learning_rate": 1.9543257807270204e-05, + "loss": 1.0036, + "step": 808 + }, + { + "epoch": 0.12, + "grad_norm": 3.6233229149796298, + "learning_rate": 1.9541775480609823e-05, + "loss": 1.0705, + "step": 809 + }, + { + "epoch": 0.12, + "grad_norm": 3.7455642763851267, + "learning_rate": 1.9540290808848637e-05, + "loss": 1.0605, + "step": 810 + }, + { + "epoch": 0.12, + "grad_norm": 3.67169620521609, + "learning_rate": 1.953880379235154e-05, + "loss": 1.0644, + "step": 811 + }, + { + "epoch": 0.12, + "grad_norm": 3.563725907642809, + "learning_rate": 1.9537314431483996e-05, + "loss": 1.0724, + "step": 812 + }, + { + "epoch": 0.12, + "grad_norm": 3.6587065592888997, + "learning_rate": 1.953582272661205e-05, + "loss": 1.0925, + "step": 813 + }, + { + "epoch": 0.12, + "grad_norm": 4.1180071058625725, + "learning_rate": 1.9534328678102316e-05, + "loss": 1.1036, + "step": 814 + }, + { + "epoch": 0.12, + "grad_norm": 4.189957751724557, + "learning_rate": 1.9532832286321996e-05, + "loss": 1.0334, + "step": 815 + }, + { + "epoch": 0.12, + "grad_norm": 3.4550391618359204, + "learning_rate": 1.9531333551638858e-05, + "loss": 1.035, + "step": 816 + }, + { + "epoch": 0.13, + "grad_norm": 3.4523104004446936, + "learning_rate": 1.952983247442125e-05, + "loss": 1.1495, + "step": 817 + }, + { + "epoch": 0.13, + "grad_norm": 3.324773915920645, + "learning_rate": 1.9528329055038094e-05, + "loss": 1.0231, + "step": 818 + }, + { + "epoch": 0.13, + "grad_norm": 3.779739514408404, + "learning_rate": 1.9526823293858888e-05, + "loss": 1.0396, + "step": 819 + }, + { + "epoch": 0.13, + "grad_norm": 3.5490924567379984, + "learning_rate": 1.9525315191253706e-05, + "loss": 1.0161, + "step": 820 + }, + { + "epoch": 0.13, + "grad_norm": 3.455144063503647, + "learning_rate": 1.95238047475932e-05, + "loss": 1.0069, + "step": 821 + }, + { + "epoch": 0.13, + "grad_norm": 3.790835644184474, + "learning_rate": 1.9522291963248588e-05, + "loss": 1.0302, + "step": 822 + }, + { + "epoch": 0.13, + "grad_norm": 3.411888232632968, + "learning_rate": 1.9520776838591673e-05, + "loss": 0.993, + "step": 823 + }, + { + "epoch": 0.13, + "grad_norm": 3.606694666212379, + "learning_rate": 1.9519259373994834e-05, + "loss": 0.9957, + "step": 824 + }, + { + "epoch": 0.13, + "grad_norm": 10.792601477958348, + "learning_rate": 1.951773956983102e-05, + "loss": 1.1405, + "step": 825 + }, + { + "epoch": 0.13, + "grad_norm": 3.8188365519437704, + "learning_rate": 1.951621742647375e-05, + "loss": 1.2124, + "step": 826 + }, + { + "epoch": 0.13, + "grad_norm": 3.6597773196242165, + "learning_rate": 1.9514692944297137e-05, + "loss": 1.1435, + "step": 827 + }, + { + "epoch": 0.13, + "grad_norm": 3.194410296347501, + "learning_rate": 1.9513166123675838e-05, + "loss": 1.0048, + "step": 828 + }, + { + "epoch": 0.13, + "grad_norm": 3.582744555272188, + "learning_rate": 1.951163696498512e-05, + "loss": 0.9583, + "step": 829 + }, + { + "epoch": 0.13, + "grad_norm": 3.6207705927160245, + "learning_rate": 1.9510105468600797e-05, + "loss": 1.0335, + "step": 830 + }, + { + "epoch": 0.13, + "grad_norm": 3.6278228482113617, + "learning_rate": 1.950857163489927e-05, + "loss": 1.1164, + "step": 831 + }, + { + "epoch": 0.13, + "grad_norm": 3.5307035817039605, + "learning_rate": 1.9507035464257515e-05, + "loss": 0.9872, + "step": 832 + }, + { + "epoch": 0.13, + "grad_norm": 3.824472225302144, + "learning_rate": 1.9505496957053078e-05, + "loss": 1.0227, + "step": 833 + }, + { + "epoch": 0.13, + "grad_norm": 3.9818563480117914, + "learning_rate": 1.9503956113664078e-05, + "loss": 1.1242, + "step": 834 + }, + { + "epoch": 0.13, + "grad_norm": 3.869625239367482, + "learning_rate": 1.9502412934469215e-05, + "loss": 1.0929, + "step": 835 + }, + { + "epoch": 0.13, + "grad_norm": 3.2976612309343882, + "learning_rate": 1.950086741984776e-05, + "loss": 1.0272, + "step": 836 + }, + { + "epoch": 0.13, + "grad_norm": 12.769326972793142, + "learning_rate": 1.949931957017955e-05, + "loss": 1.1102, + "step": 837 + }, + { + "epoch": 0.13, + "grad_norm": 3.8481939837470174, + "learning_rate": 1.9497769385845012e-05, + "loss": 1.025, + "step": 838 + }, + { + "epoch": 0.13, + "grad_norm": 3.3352628206516677, + "learning_rate": 1.9496216867225134e-05, + "loss": 1.0924, + "step": 839 + }, + { + "epoch": 0.13, + "grad_norm": 3.3533347409436707, + "learning_rate": 1.949466201470148e-05, + "loss": 1.0003, + "step": 840 + }, + { + "epoch": 0.13, + "grad_norm": 3.542021472164938, + "learning_rate": 1.9493104828656187e-05, + "loss": 1.1117, + "step": 841 + }, + { + "epoch": 0.13, + "grad_norm": 3.6301160736140052, + "learning_rate": 1.949154530947198e-05, + "loss": 1.0294, + "step": 842 + }, + { + "epoch": 0.13, + "grad_norm": 3.3460425083645564, + "learning_rate": 1.948998345753213e-05, + "loss": 1.0436, + "step": 843 + }, + { + "epoch": 0.13, + "grad_norm": 3.2446259615251285, + "learning_rate": 1.9488419273220503e-05, + "loss": 0.9428, + "step": 844 + }, + { + "epoch": 0.13, + "grad_norm": 3.310632804542017, + "learning_rate": 1.9486852756921534e-05, + "loss": 1.0236, + "step": 845 + }, + { + "epoch": 0.13, + "grad_norm": 4.364801744366941, + "learning_rate": 1.9485283909020226e-05, + "loss": 1.0437, + "step": 846 + }, + { + "epoch": 0.13, + "grad_norm": 3.2934837675991124, + "learning_rate": 1.9483712729902152e-05, + "loss": 1.0043, + "step": 847 + }, + { + "epoch": 0.13, + "grad_norm": 3.6795888850025262, + "learning_rate": 1.9482139219953478e-05, + "loss": 1.0745, + "step": 848 + }, + { + "epoch": 0.13, + "grad_norm": 3.411422407071628, + "learning_rate": 1.9480563379560915e-05, + "loss": 0.934, + "step": 849 + }, + { + "epoch": 0.13, + "grad_norm": 3.460071140538643, + "learning_rate": 1.9478985209111767e-05, + "loss": 1.0779, + "step": 850 + }, + { + "epoch": 0.13, + "grad_norm": 3.380162168990996, + "learning_rate": 1.9477404708993904e-05, + "loss": 0.9893, + "step": 851 + }, + { + "epoch": 0.13, + "grad_norm": 3.2651632117157217, + "learning_rate": 1.9475821879595765e-05, + "loss": 0.9183, + "step": 852 + }, + { + "epoch": 0.13, + "grad_norm": 3.2452134078511325, + "learning_rate": 1.947423672130637e-05, + "loss": 1.0649, + "step": 853 + }, + { + "epoch": 0.13, + "grad_norm": 3.754485555340307, + "learning_rate": 1.9472649234515303e-05, + "loss": 1.2328, + "step": 854 + }, + { + "epoch": 0.13, + "grad_norm": 3.3907456912187404, + "learning_rate": 1.9471059419612724e-05, + "loss": 1.1444, + "step": 855 + }, + { + "epoch": 0.13, + "grad_norm": 4.104580673184325, + "learning_rate": 1.9469467276989366e-05, + "loss": 0.8636, + "step": 856 + }, + { + "epoch": 0.13, + "grad_norm": 3.5082479856079214, + "learning_rate": 1.946787280703653e-05, + "loss": 0.9407, + "step": 857 + }, + { + "epoch": 0.13, + "grad_norm": 4.222298799756904, + "learning_rate": 1.9466276010146097e-05, + "loss": 1.1906, + "step": 858 + }, + { + "epoch": 0.13, + "grad_norm": 3.4676251777168177, + "learning_rate": 1.9464676886710513e-05, + "loss": 0.9284, + "step": 859 + }, + { + "epoch": 0.13, + "grad_norm": 3.748250993913235, + "learning_rate": 1.9463075437122792e-05, + "loss": 1.0374, + "step": 860 + }, + { + "epoch": 0.13, + "grad_norm": 3.5471276859661747, + "learning_rate": 1.9461471661776536e-05, + "loss": 1.105, + "step": 861 + }, + { + "epoch": 0.13, + "grad_norm": 3.8740533564053607, + "learning_rate": 1.94598655610659e-05, + "loss": 0.9877, + "step": 862 + }, + { + "epoch": 0.13, + "grad_norm": 3.680125228295212, + "learning_rate": 1.945825713538562e-05, + "loss": 1.1043, + "step": 863 + }, + { + "epoch": 0.13, + "grad_norm": 4.148577769562295, + "learning_rate": 1.9456646385131006e-05, + "loss": 1.0213, + "step": 864 + }, + { + "epoch": 0.13, + "grad_norm": 9.574910256683985, + "learning_rate": 1.945503331069793e-05, + "loss": 1.1023, + "step": 865 + }, + { + "epoch": 0.13, + "grad_norm": 3.8228224803141773, + "learning_rate": 1.9453417912482843e-05, + "loss": 1.0985, + "step": 866 + }, + { + "epoch": 0.13, + "grad_norm": 3.7000581035892046, + "learning_rate": 1.9451800190882767e-05, + "loss": 1.0697, + "step": 867 + }, + { + "epoch": 0.13, + "grad_norm": 3.529672076487252, + "learning_rate": 1.9450180146295286e-05, + "loss": 0.8486, + "step": 868 + }, + { + "epoch": 0.13, + "grad_norm": 3.5757251881192853, + "learning_rate": 1.944855777911857e-05, + "loss": 1.1062, + "step": 869 + }, + { + "epoch": 0.13, + "grad_norm": 3.895084514836089, + "learning_rate": 1.9446933089751342e-05, + "loss": 1.0017, + "step": 870 + }, + { + "epoch": 0.13, + "grad_norm": 3.4793166789795213, + "learning_rate": 1.944530607859291e-05, + "loss": 1.0879, + "step": 871 + }, + { + "epoch": 0.13, + "grad_norm": 3.9300137550777734, + "learning_rate": 1.9443676746043146e-05, + "loss": 0.9493, + "step": 872 + }, + { + "epoch": 0.13, + "grad_norm": 3.730736121130802, + "learning_rate": 1.94420450925025e-05, + "loss": 1.0333, + "step": 873 + }, + { + "epoch": 0.13, + "grad_norm": 3.9223348859910074, + "learning_rate": 1.944041111837198e-05, + "loss": 1.1305, + "step": 874 + }, + { + "epoch": 0.13, + "grad_norm": 3.418566499173944, + "learning_rate": 1.9438774824053174e-05, + "loss": 1.0247, + "step": 875 + }, + { + "epoch": 0.13, + "grad_norm": 5.710512251678389, + "learning_rate": 1.9437136209948235e-05, + "loss": 1.0405, + "step": 876 + }, + { + "epoch": 0.13, + "grad_norm": 3.6291136655677816, + "learning_rate": 1.9435495276459892e-05, + "loss": 1.0759, + "step": 877 + }, + { + "epoch": 0.13, + "grad_norm": 3.6660336507415625, + "learning_rate": 1.9433852023991438e-05, + "loss": 1.051, + "step": 878 + }, + { + "epoch": 0.13, + "grad_norm": 3.7405109288915126, + "learning_rate": 1.9432206452946738e-05, + "loss": 0.9667, + "step": 879 + }, + { + "epoch": 0.13, + "grad_norm": 3.795044548099862, + "learning_rate": 1.943055856373023e-05, + "loss": 0.9896, + "step": 880 + }, + { + "epoch": 0.13, + "grad_norm": 3.226216202872285, + "learning_rate": 1.9428908356746914e-05, + "loss": 1.0357, + "step": 881 + }, + { + "epoch": 0.14, + "grad_norm": 3.6773692597552903, + "learning_rate": 1.942725583240237e-05, + "loss": 0.9852, + "step": 882 + }, + { + "epoch": 0.14, + "grad_norm": 3.5055818432672696, + "learning_rate": 1.9425600991102737e-05, + "loss": 1.0631, + "step": 883 + }, + { + "epoch": 0.14, + "grad_norm": 3.7887724499734183, + "learning_rate": 1.9423943833254736e-05, + "loss": 1.0482, + "step": 884 + }, + { + "epoch": 0.14, + "grad_norm": 3.497326903888074, + "learning_rate": 1.9422284359265638e-05, + "loss": 0.9832, + "step": 885 + }, + { + "epoch": 0.14, + "grad_norm": 3.5198908403193294, + "learning_rate": 1.942062256954331e-05, + "loss": 0.9818, + "step": 886 + }, + { + "epoch": 0.14, + "grad_norm": 3.6349791779928973, + "learning_rate": 1.941895846449616e-05, + "loss": 0.9803, + "step": 887 + }, + { + "epoch": 0.14, + "grad_norm": 4.139055135898923, + "learning_rate": 1.9417292044533187e-05, + "loss": 1.1132, + "step": 888 + }, + { + "epoch": 0.14, + "grad_norm": 3.3110527123846234, + "learning_rate": 1.9415623310063946e-05, + "loss": 0.99, + "step": 889 + }, + { + "epoch": 0.14, + "grad_norm": 3.4959525311625033, + "learning_rate": 1.941395226149857e-05, + "loss": 1.0202, + "step": 890 + }, + { + "epoch": 0.14, + "grad_norm": 3.1293930744554226, + "learning_rate": 1.9412278899247748e-05, + "loss": 0.9667, + "step": 891 + }, + { + "epoch": 0.14, + "grad_norm": 3.2870879037590712, + "learning_rate": 1.941060322372275e-05, + "loss": 1.0469, + "step": 892 + }, + { + "epoch": 0.14, + "grad_norm": 3.432365023023609, + "learning_rate": 1.940892523533541e-05, + "loss": 0.9081, + "step": 893 + }, + { + "epoch": 0.14, + "grad_norm": 3.349988942640587, + "learning_rate": 1.940724493449813e-05, + "loss": 0.9457, + "step": 894 + }, + { + "epoch": 0.14, + "grad_norm": 3.060076170295506, + "learning_rate": 1.9405562321623884e-05, + "loss": 1.0294, + "step": 895 + }, + { + "epoch": 0.14, + "grad_norm": 3.5164199715881743, + "learning_rate": 1.940387739712621e-05, + "loss": 0.9969, + "step": 896 + }, + { + "epoch": 0.14, + "grad_norm": 3.4851977251380837, + "learning_rate": 1.940219016141921e-05, + "loss": 0.9124, + "step": 897 + }, + { + "epoch": 0.14, + "grad_norm": 3.8061545721513124, + "learning_rate": 1.9400500614917563e-05, + "loss": 1.0424, + "step": 898 + }, + { + "epoch": 0.14, + "grad_norm": 3.424299477384805, + "learning_rate": 1.939880875803651e-05, + "loss": 1.0319, + "step": 899 + }, + { + "epoch": 0.14, + "grad_norm": 3.744122164546516, + "learning_rate": 1.9397114591191866e-05, + "loss": 1.0477, + "step": 900 + }, + { + "epoch": 0.14, + "grad_norm": 3.445323224862705, + "learning_rate": 1.9395418114800005e-05, + "loss": 1.0916, + "step": 901 + }, + { + "epoch": 0.14, + "grad_norm": 3.93929967636297, + "learning_rate": 1.939371932927788e-05, + "loss": 0.9989, + "step": 902 + }, + { + "epoch": 0.14, + "grad_norm": 7.544785983942014, + "learning_rate": 1.9392018235043e-05, + "loss": 1.152, + "step": 903 + }, + { + "epoch": 0.14, + "grad_norm": 3.892545070640911, + "learning_rate": 1.9390314832513442e-05, + "loss": 1.1214, + "step": 904 + }, + { + "epoch": 0.14, + "grad_norm": 21.987263122887104, + "learning_rate": 1.9388609122107864e-05, + "loss": 1.1062, + "step": 905 + }, + { + "epoch": 0.14, + "grad_norm": 3.6562630231187017, + "learning_rate": 1.9386901104245475e-05, + "loss": 0.9848, + "step": 906 + }, + { + "epoch": 0.14, + "grad_norm": 3.518496911988627, + "learning_rate": 1.9385190779346058e-05, + "loss": 1.0192, + "step": 907 + }, + { + "epoch": 0.14, + "grad_norm": 3.5625079045270605, + "learning_rate": 1.938347814782997e-05, + "loss": 1.0707, + "step": 908 + }, + { + "epoch": 0.14, + "grad_norm": 3.528834299569891, + "learning_rate": 1.938176321011812e-05, + "loss": 0.9774, + "step": 909 + }, + { + "epoch": 0.14, + "grad_norm": 3.6018264100036257, + "learning_rate": 1.9380045966631994e-05, + "loss": 1.1302, + "step": 910 + }, + { + "epoch": 0.14, + "grad_norm": 3.312438165651826, + "learning_rate": 1.9378326417793646e-05, + "loss": 1.0168, + "step": 911 + }, + { + "epoch": 0.14, + "grad_norm": 3.2024102793678733, + "learning_rate": 1.9376604564025685e-05, + "loss": 1.0516, + "step": 912 + }, + { + "epoch": 0.14, + "grad_norm": 3.5675605420399723, + "learning_rate": 1.93748804057513e-05, + "loss": 1.1061, + "step": 913 + }, + { + "epoch": 0.14, + "grad_norm": 3.826640315336116, + "learning_rate": 1.9373153943394242e-05, + "loss": 1.0564, + "step": 914 + }, + { + "epoch": 0.14, + "grad_norm": 3.624789270347231, + "learning_rate": 1.9371425177378824e-05, + "loss": 1.0781, + "step": 915 + }, + { + "epoch": 0.14, + "grad_norm": 3.5070096506335084, + "learning_rate": 1.9369694108129928e-05, + "loss": 1.0496, + "step": 916 + }, + { + "epoch": 0.14, + "grad_norm": 3.6300723896660076, + "learning_rate": 1.9367960736073002e-05, + "loss": 1.0479, + "step": 917 + }, + { + "epoch": 0.14, + "grad_norm": 3.578842278567324, + "learning_rate": 1.9366225061634064e-05, + "loss": 0.9765, + "step": 918 + }, + { + "epoch": 0.14, + "grad_norm": 3.581842751711827, + "learning_rate": 1.936448708523969e-05, + "loss": 1.1024, + "step": 919 + }, + { + "epoch": 0.14, + "grad_norm": 3.1951935078674176, + "learning_rate": 1.936274680731703e-05, + "loss": 0.9714, + "step": 920 + }, + { + "epoch": 0.14, + "grad_norm": 3.331940461952663, + "learning_rate": 1.9361004228293788e-05, + "loss": 0.94, + "step": 921 + }, + { + "epoch": 0.14, + "grad_norm": 3.8236395933927243, + "learning_rate": 1.935925934859825e-05, + "loss": 1.0488, + "step": 922 + }, + { + "epoch": 0.14, + "grad_norm": 3.363276927878964, + "learning_rate": 1.935751216865925e-05, + "loss": 1.1099, + "step": 923 + }, + { + "epoch": 0.14, + "grad_norm": 3.596516227911937, + "learning_rate": 1.9355762688906202e-05, + "loss": 1.116, + "step": 924 + }, + { + "epoch": 0.14, + "grad_norm": 3.348121383472041, + "learning_rate": 1.935401090976908e-05, + "loss": 1.0218, + "step": 925 + }, + { + "epoch": 0.14, + "grad_norm": 3.733343930017785, + "learning_rate": 1.935225683167842e-05, + "loss": 1.0043, + "step": 926 + }, + { + "epoch": 0.14, + "grad_norm": 3.758966389591936, + "learning_rate": 1.935050045506532e-05, + "loss": 0.9766, + "step": 927 + }, + { + "epoch": 0.14, + "grad_norm": 3.580226561244626, + "learning_rate": 1.9348741780361455e-05, + "loss": 1.0552, + "step": 928 + }, + { + "epoch": 0.14, + "grad_norm": 4.206751124430905, + "learning_rate": 1.9346980807999055e-05, + "loss": 1.0489, + "step": 929 + }, + { + "epoch": 0.14, + "grad_norm": 30.23771096559672, + "learning_rate": 1.934521753841092e-05, + "loss": 1.2118, + "step": 930 + }, + { + "epoch": 0.14, + "grad_norm": 3.322973754926158, + "learning_rate": 1.9343451972030407e-05, + "loss": 1.0232, + "step": 931 + }, + { + "epoch": 0.14, + "grad_norm": 3.6437034126980783, + "learning_rate": 1.934168410929145e-05, + "loss": 1.02, + "step": 932 + }, + { + "epoch": 0.14, + "grad_norm": 3.9613253906670383, + "learning_rate": 1.9339913950628536e-05, + "loss": 0.9895, + "step": 933 + }, + { + "epoch": 0.14, + "grad_norm": 3.7446020566588074, + "learning_rate": 1.933814149647672e-05, + "loss": 0.999, + "step": 934 + }, + { + "epoch": 0.14, + "grad_norm": 3.666157074758006, + "learning_rate": 1.933636674727162e-05, + "loss": 1.0742, + "step": 935 + }, + { + "epoch": 0.14, + "grad_norm": 3.5990163747656454, + "learning_rate": 1.9334589703449424e-05, + "loss": 0.9915, + "step": 936 + }, + { + "epoch": 0.14, + "grad_norm": 3.679207043975646, + "learning_rate": 1.9332810365446876e-05, + "loss": 1.1157, + "step": 937 + }, + { + "epoch": 0.14, + "grad_norm": 3.2096157271834107, + "learning_rate": 1.9331028733701292e-05, + "loss": 0.9847, + "step": 938 + }, + { + "epoch": 0.14, + "grad_norm": 3.7360702314930085, + "learning_rate": 1.932924480865054e-05, + "loss": 1.0495, + "step": 939 + }, + { + "epoch": 0.14, + "grad_norm": 12.163737309143626, + "learning_rate": 1.9327458590733062e-05, + "loss": 1.1496, + "step": 940 + }, + { + "epoch": 0.14, + "grad_norm": 3.2904270215911993, + "learning_rate": 1.9325670080387863e-05, + "loss": 0.9515, + "step": 941 + }, + { + "epoch": 0.14, + "grad_norm": 3.6522178641809253, + "learning_rate": 1.932387927805451e-05, + "loss": 1.0595, + "step": 942 + }, + { + "epoch": 0.14, + "grad_norm": 3.46856880025099, + "learning_rate": 1.9322086184173122e-05, + "loss": 1.0325, + "step": 943 + }, + { + "epoch": 0.14, + "grad_norm": 3.287603494668713, + "learning_rate": 1.93202907991844e-05, + "loss": 0.9141, + "step": 944 + }, + { + "epoch": 0.14, + "grad_norm": 3.6251112696061867, + "learning_rate": 1.9318493123529597e-05, + "loss": 0.9155, + "step": 945 + }, + { + "epoch": 0.14, + "grad_norm": 2.987137986210219, + "learning_rate": 1.931669315765053e-05, + "loss": 1.0145, + "step": 946 + }, + { + "epoch": 0.14, + "grad_norm": 3.3424632848737277, + "learning_rate": 1.9314890901989583e-05, + "loss": 1.0415, + "step": 947 + }, + { + "epoch": 0.15, + "grad_norm": 3.2353948522445077, + "learning_rate": 1.9313086356989698e-05, + "loss": 0.9595, + "step": 948 + }, + { + "epoch": 0.15, + "grad_norm": 3.576702121389513, + "learning_rate": 1.931127952309438e-05, + "loss": 1.1594, + "step": 949 + }, + { + "epoch": 0.15, + "grad_norm": 3.3757575820385246, + "learning_rate": 1.93094704007477e-05, + "loss": 1.0258, + "step": 950 + }, + { + "epoch": 0.15, + "grad_norm": 3.293818922537095, + "learning_rate": 1.9307658990394293e-05, + "loss": 1.0243, + "step": 951 + }, + { + "epoch": 0.15, + "grad_norm": 3.3688391700609315, + "learning_rate": 1.9305845292479346e-05, + "loss": 1.009, + "step": 952 + }, + { + "epoch": 0.15, + "grad_norm": 9.80828691267798, + "learning_rate": 1.930402930744862e-05, + "loss": 1.1966, + "step": 953 + }, + { + "epoch": 0.15, + "grad_norm": 3.35247059313853, + "learning_rate": 1.930221103574843e-05, + "loss": 0.9744, + "step": 954 + }, + { + "epoch": 0.15, + "grad_norm": 3.4461099267973307, + "learning_rate": 1.930039047782566e-05, + "loss": 1.0224, + "step": 955 + }, + { + "epoch": 0.15, + "grad_norm": 3.611440236247767, + "learning_rate": 1.9298567634127748e-05, + "loss": 1.0344, + "step": 956 + }, + { + "epoch": 0.15, + "grad_norm": 3.713992871427326, + "learning_rate": 1.92967425051027e-05, + "loss": 1.0251, + "step": 957 + }, + { + "epoch": 0.15, + "grad_norm": 3.449908187883703, + "learning_rate": 1.929491509119908e-05, + "loss": 0.9121, + "step": 958 + }, + { + "epoch": 0.15, + "grad_norm": 4.062615991041237, + "learning_rate": 1.9293085392866016e-05, + "loss": 0.9828, + "step": 959 + }, + { + "epoch": 0.15, + "grad_norm": 3.42483551399482, + "learning_rate": 1.9291253410553198e-05, + "loss": 0.9705, + "step": 960 + }, + { + "epoch": 0.15, + "grad_norm": 2.945041099579405, + "learning_rate": 1.9289419144710874e-05, + "loss": 1.0338, + "step": 961 + }, + { + "epoch": 0.15, + "grad_norm": 19.364824164234715, + "learning_rate": 1.928758259578986e-05, + "loss": 1.1394, + "step": 962 + }, + { + "epoch": 0.15, + "grad_norm": 3.7583163155418124, + "learning_rate": 1.928574376424152e-05, + "loss": 1.0891, + "step": 963 + }, + { + "epoch": 0.15, + "grad_norm": 3.6452998336249296, + "learning_rate": 1.9283902650517792e-05, + "loss": 1.0854, + "step": 964 + }, + { + "epoch": 0.15, + "grad_norm": 3.4468051684722307, + "learning_rate": 1.9282059255071166e-05, + "loss": 0.9391, + "step": 965 + }, + { + "epoch": 0.15, + "grad_norm": 3.7334849226378406, + "learning_rate": 1.9280213578354704e-05, + "loss": 1.0808, + "step": 966 + }, + { + "epoch": 0.15, + "grad_norm": 3.623689601351252, + "learning_rate": 1.927836562082202e-05, + "loss": 1.0664, + "step": 967 + }, + { + "epoch": 0.15, + "grad_norm": 3.1724322437995816, + "learning_rate": 1.9276515382927284e-05, + "loss": 0.9325, + "step": 968 + }, + { + "epoch": 0.15, + "grad_norm": 3.5921713171702434, + "learning_rate": 1.9274662865125235e-05, + "loss": 0.9363, + "step": 969 + }, + { + "epoch": 0.15, + "grad_norm": 3.2800275079762544, + "learning_rate": 1.9272808067871173e-05, + "loss": 0.9843, + "step": 970 + }, + { + "epoch": 0.15, + "grad_norm": 3.5548791781879294, + "learning_rate": 1.9270950991620955e-05, + "loss": 1.0932, + "step": 971 + }, + { + "epoch": 0.15, + "grad_norm": 3.4175167933953956, + "learning_rate": 1.9269091636830998e-05, + "loss": 1.0733, + "step": 972 + }, + { + "epoch": 0.15, + "grad_norm": 3.3628941721131165, + "learning_rate": 1.9267230003958276e-05, + "loss": 0.9672, + "step": 973 + }, + { + "epoch": 0.15, + "grad_norm": 3.6295281626924045, + "learning_rate": 1.926536609346033e-05, + "loss": 1.0338, + "step": 974 + }, + { + "epoch": 0.15, + "grad_norm": 3.648118265875055, + "learning_rate": 1.926349990579525e-05, + "loss": 0.8895, + "step": 975 + }, + { + "epoch": 0.15, + "grad_norm": 3.1202643710316105, + "learning_rate": 1.9261631441421703e-05, + "loss": 0.9469, + "step": 976 + }, + { + "epoch": 0.15, + "grad_norm": 3.649872049484259, + "learning_rate": 1.92597607007989e-05, + "loss": 1.0917, + "step": 977 + }, + { + "epoch": 0.15, + "grad_norm": 4.175536441933523, + "learning_rate": 1.9257887684386614e-05, + "loss": 1.112, + "step": 978 + }, + { + "epoch": 0.15, + "grad_norm": 3.1752679166085684, + "learning_rate": 1.925601239264518e-05, + "loss": 1.0251, + "step": 979 + }, + { + "epoch": 0.15, + "grad_norm": 3.681899019040447, + "learning_rate": 1.92541348260355e-05, + "loss": 1.039, + "step": 980 + }, + { + "epoch": 0.15, + "grad_norm": 3.4234714701595217, + "learning_rate": 1.925225498501902e-05, + "loss": 0.964, + "step": 981 + }, + { + "epoch": 0.15, + "grad_norm": 3.71223071364033, + "learning_rate": 1.9250372870057754e-05, + "loss": 1.0858, + "step": 982 + }, + { + "epoch": 0.15, + "grad_norm": 4.212599435406759, + "learning_rate": 1.924848848161427e-05, + "loss": 0.948, + "step": 983 + }, + { + "epoch": 0.15, + "grad_norm": 3.5192934057012577, + "learning_rate": 1.9246601820151705e-05, + "loss": 1.0132, + "step": 984 + }, + { + "epoch": 0.15, + "grad_norm": 3.928109835134174, + "learning_rate": 1.924471288613374e-05, + "loss": 1.0298, + "step": 985 + }, + { + "epoch": 0.15, + "grad_norm": 3.7549381559022263, + "learning_rate": 1.9242821680024625e-05, + "loss": 1.0001, + "step": 986 + }, + { + "epoch": 0.15, + "grad_norm": 4.239151421371474, + "learning_rate": 1.9240928202289168e-05, + "loss": 1.061, + "step": 987 + }, + { + "epoch": 0.15, + "grad_norm": 3.54700933413855, + "learning_rate": 1.9239032453392733e-05, + "loss": 1.0863, + "step": 988 + }, + { + "epoch": 0.15, + "grad_norm": 3.670780179665936, + "learning_rate": 1.9237134433801235e-05, + "loss": 1.1231, + "step": 989 + }, + { + "epoch": 0.15, + "grad_norm": 3.289598309932846, + "learning_rate": 1.923523414398116e-05, + "loss": 1.0588, + "step": 990 + }, + { + "epoch": 0.15, + "grad_norm": 3.489414422570557, + "learning_rate": 1.9233331584399542e-05, + "loss": 0.8921, + "step": 991 + }, + { + "epoch": 0.15, + "grad_norm": 3.4955835415910568, + "learning_rate": 1.923142675552398e-05, + "loss": 0.9683, + "step": 992 + }, + { + "epoch": 0.15, + "grad_norm": 11.967525951762857, + "learning_rate": 1.9229519657822627e-05, + "loss": 1.1301, + "step": 993 + }, + { + "epoch": 0.15, + "grad_norm": 3.6364302200755745, + "learning_rate": 1.9227610291764194e-05, + "loss": 1.0683, + "step": 994 + }, + { + "epoch": 0.15, + "grad_norm": 3.6486532832454164, + "learning_rate": 1.922569865781795e-05, + "loss": 1.005, + "step": 995 + }, + { + "epoch": 0.15, + "grad_norm": 4.081910338426013, + "learning_rate": 1.922378475645372e-05, + "loss": 1.0145, + "step": 996 + }, + { + "epoch": 0.15, + "grad_norm": 3.4099413617680425, + "learning_rate": 1.9221868588141886e-05, + "loss": 0.943, + "step": 997 + }, + { + "epoch": 0.15, + "grad_norm": 3.570163953758169, + "learning_rate": 1.9219950153353393e-05, + "loss": 1.159, + "step": 998 + }, + { + "epoch": 0.15, + "grad_norm": 3.3798061968174875, + "learning_rate": 1.9218029452559733e-05, + "loss": 1.043, + "step": 999 + }, + { + "epoch": 0.15, + "grad_norm": 3.4073060967897386, + "learning_rate": 1.9216106486232965e-05, + "loss": 1.0475, + "step": 1000 + }, + { + "epoch": 0.15, + "grad_norm": 3.943847825429316, + "learning_rate": 1.9214181254845696e-05, + "loss": 1.0153, + "step": 1001 + }, + { + "epoch": 0.15, + "grad_norm": 3.7346551319913384, + "learning_rate": 1.92122537588711e-05, + "loss": 0.957, + "step": 1002 + }, + { + "epoch": 0.15, + "grad_norm": 3.4143581157133607, + "learning_rate": 1.9210323998782892e-05, + "loss": 1.0708, + "step": 1003 + }, + { + "epoch": 0.15, + "grad_norm": 3.551479921614783, + "learning_rate": 1.920839197505536e-05, + "loss": 1.0025, + "step": 1004 + }, + { + "epoch": 0.15, + "grad_norm": 3.542686643829843, + "learning_rate": 1.9206457688163338e-05, + "loss": 0.9833, + "step": 1005 + }, + { + "epoch": 0.15, + "grad_norm": 3.5015473236779293, + "learning_rate": 1.9204521138582227e-05, + "loss": 1.0198, + "step": 1006 + }, + { + "epoch": 0.15, + "grad_norm": 3.563350601802636, + "learning_rate": 1.9202582326787963e-05, + "loss": 1.0396, + "step": 1007 + }, + { + "epoch": 0.15, + "grad_norm": 3.292978792419636, + "learning_rate": 1.9200641253257064e-05, + "loss": 1.0761, + "step": 1008 + }, + { + "epoch": 0.15, + "grad_norm": 3.555083711210735, + "learning_rate": 1.9198697918466585e-05, + "loss": 0.9949, + "step": 1009 + }, + { + "epoch": 0.15, + "grad_norm": 3.5581102126972795, + "learning_rate": 1.9196752322894144e-05, + "loss": 0.9737, + "step": 1010 + }, + { + "epoch": 0.15, + "grad_norm": 3.4863765960327533, + "learning_rate": 1.9194804467017916e-05, + "loss": 1.0135, + "step": 1011 + }, + { + "epoch": 0.15, + "grad_norm": 3.583370538658214, + "learning_rate": 1.9192854351316627e-05, + "loss": 0.952, + "step": 1012 + }, + { + "epoch": 0.16, + "grad_norm": 3.8863410802663925, + "learning_rate": 1.9190901976269564e-05, + "loss": 0.9946, + "step": 1013 + }, + { + "epoch": 0.16, + "grad_norm": 3.169190485865015, + "learning_rate": 1.9188947342356562e-05, + "loss": 0.9519, + "step": 1014 + }, + { + "epoch": 0.16, + "grad_norm": 3.697746603380063, + "learning_rate": 1.9186990450058018e-05, + "loss": 1.1704, + "step": 1015 + }, + { + "epoch": 0.16, + "grad_norm": 3.8443610504164885, + "learning_rate": 1.918503129985488e-05, + "loss": 0.9317, + "step": 1016 + }, + { + "epoch": 0.16, + "grad_norm": 9.613406671233491, + "learning_rate": 1.9183069892228648e-05, + "loss": 1.0153, + "step": 1017 + }, + { + "epoch": 0.16, + "grad_norm": 3.750460562880933, + "learning_rate": 1.9181106227661394e-05, + "loss": 1.0646, + "step": 1018 + }, + { + "epoch": 0.16, + "grad_norm": 3.543565576301301, + "learning_rate": 1.9179140306635715e-05, + "loss": 1.0755, + "step": 1019 + }, + { + "epoch": 0.16, + "grad_norm": 3.1179341803007268, + "learning_rate": 1.917717212963479e-05, + "loss": 1.0341, + "step": 1020 + }, + { + "epoch": 0.16, + "grad_norm": 3.693231646113712, + "learning_rate": 1.917520169714234e-05, + "loss": 1.0512, + "step": 1021 + }, + { + "epoch": 0.16, + "grad_norm": 3.6424860961845242, + "learning_rate": 1.917322900964264e-05, + "loss": 0.9632, + "step": 1022 + }, + { + "epoch": 0.16, + "grad_norm": 3.143360794092663, + "learning_rate": 1.9171254067620525e-05, + "loss": 0.9626, + "step": 1023 + }, + { + "epoch": 0.16, + "grad_norm": 6.019061378499708, + "learning_rate": 1.9169276871561372e-05, + "loss": 1.0185, + "step": 1024 + }, + { + "epoch": 0.16, + "grad_norm": 3.7413802411315658, + "learning_rate": 1.9167297421951133e-05, + "loss": 0.9387, + "step": 1025 + }, + { + "epoch": 0.16, + "grad_norm": 3.4166342976005435, + "learning_rate": 1.916531571927629e-05, + "loss": 0.9611, + "step": 1026 + }, + { + "epoch": 0.16, + "grad_norm": 3.4654811937421526, + "learning_rate": 1.9163331764023893e-05, + "loss": 0.9905, + "step": 1027 + }, + { + "epoch": 0.16, + "grad_norm": 3.3771874199837653, + "learning_rate": 1.9161345556681548e-05, + "loss": 0.9824, + "step": 1028 + }, + { + "epoch": 0.16, + "grad_norm": 3.9825758010399266, + "learning_rate": 1.91593570977374e-05, + "loss": 0.9843, + "step": 1029 + }, + { + "epoch": 0.16, + "grad_norm": 3.500299091233586, + "learning_rate": 1.9157366387680164e-05, + "loss": 1.0668, + "step": 1030 + }, + { + "epoch": 0.16, + "grad_norm": 4.002409073404688, + "learning_rate": 1.9155373426999096e-05, + "loss": 1.0126, + "step": 1031 + }, + { + "epoch": 0.16, + "grad_norm": 3.1178497772698615, + "learning_rate": 1.9153378216184013e-05, + "loss": 0.912, + "step": 1032 + }, + { + "epoch": 0.16, + "grad_norm": 8.875971732474303, + "learning_rate": 1.9151380755725282e-05, + "loss": 1.1375, + "step": 1033 + }, + { + "epoch": 0.16, + "grad_norm": 4.194747601119983, + "learning_rate": 1.9149381046113817e-05, + "loss": 0.9627, + "step": 1034 + }, + { + "epoch": 0.16, + "grad_norm": 3.566551938326764, + "learning_rate": 1.91473790878411e-05, + "loss": 0.9576, + "step": 1035 + }, + { + "epoch": 0.16, + "grad_norm": 3.7111810907560225, + "learning_rate": 1.9145374881399144e-05, + "loss": 0.8816, + "step": 1036 + }, + { + "epoch": 0.16, + "grad_norm": 3.279089276894769, + "learning_rate": 1.9143368427280542e-05, + "loss": 1.0099, + "step": 1037 + }, + { + "epoch": 0.16, + "grad_norm": 3.6872390618227047, + "learning_rate": 1.914135972597841e-05, + "loss": 1.006, + "step": 1038 + }, + { + "epoch": 0.16, + "grad_norm": 3.4511173229460845, + "learning_rate": 1.9139348777986443e-05, + "loss": 0.9577, + "step": 1039 + }, + { + "epoch": 0.16, + "grad_norm": 3.246134239880193, + "learning_rate": 1.913733558379886e-05, + "loss": 0.9671, + "step": 1040 + }, + { + "epoch": 0.16, + "grad_norm": 3.16485398398124, + "learning_rate": 1.9135320143910465e-05, + "loss": 1.0102, + "step": 1041 + }, + { + "epoch": 0.16, + "grad_norm": 3.498137213691302, + "learning_rate": 1.9133302458816586e-05, + "loss": 1.1029, + "step": 1042 + }, + { + "epoch": 0.16, + "grad_norm": 3.9868504509756595, + "learning_rate": 1.9131282529013114e-05, + "loss": 0.9275, + "step": 1043 + }, + { + "epoch": 0.16, + "grad_norm": 3.2756053563969894, + "learning_rate": 1.9129260354996493e-05, + "loss": 1.0949, + "step": 1044 + }, + { + "epoch": 0.16, + "grad_norm": 3.7173141603029385, + "learning_rate": 1.9127235937263716e-05, + "loss": 1.0517, + "step": 1045 + }, + { + "epoch": 0.16, + "grad_norm": 3.085399324277153, + "learning_rate": 1.912520927631233e-05, + "loss": 1.1233, + "step": 1046 + }, + { + "epoch": 0.16, + "grad_norm": 4.118322366645487, + "learning_rate": 1.9123180372640428e-05, + "loss": 1.0455, + "step": 1047 + }, + { + "epoch": 0.16, + "grad_norm": 3.29119929398604, + "learning_rate": 1.912114922674666e-05, + "loss": 0.9441, + "step": 1048 + }, + { + "epoch": 0.16, + "grad_norm": 3.1222949030253933, + "learning_rate": 1.9119115839130227e-05, + "loss": 0.9119, + "step": 1049 + }, + { + "epoch": 0.16, + "grad_norm": 3.436195624820573, + "learning_rate": 1.9117080210290873e-05, + "loss": 1.0198, + "step": 1050 + }, + { + "epoch": 0.16, + "grad_norm": 3.5871822829406406, + "learning_rate": 1.9115042340728904e-05, + "loss": 1.0135, + "step": 1051 + }, + { + "epoch": 0.16, + "grad_norm": 3.3799660086390766, + "learning_rate": 1.9113002230945166e-05, + "loss": 1.0078, + "step": 1052 + }, + { + "epoch": 0.16, + "grad_norm": 3.2646985796547536, + "learning_rate": 1.9110959881441064e-05, + "loss": 0.8787, + "step": 1053 + }, + { + "epoch": 0.16, + "grad_norm": 3.15046936919999, + "learning_rate": 1.910891529271855e-05, + "loss": 1.0025, + "step": 1054 + }, + { + "epoch": 0.16, + "grad_norm": 3.7288170635268973, + "learning_rate": 1.9106868465280125e-05, + "loss": 0.87, + "step": 1055 + }, + { + "epoch": 0.16, + "grad_norm": 3.454722575329552, + "learning_rate": 1.9104819399628846e-05, + "loss": 1.0159, + "step": 1056 + }, + { + "epoch": 0.16, + "grad_norm": 3.3362831788943454, + "learning_rate": 1.9102768096268312e-05, + "loss": 1.0137, + "step": 1057 + }, + { + "epoch": 0.16, + "grad_norm": 3.6138909028367387, + "learning_rate": 1.9100714555702673e-05, + "loss": 0.8808, + "step": 1058 + }, + { + "epoch": 0.16, + "grad_norm": 3.289557942859989, + "learning_rate": 1.909865877843664e-05, + "loss": 0.9233, + "step": 1059 + }, + { + "epoch": 0.16, + "grad_norm": 3.448109506123443, + "learning_rate": 1.9096600764975458e-05, + "loss": 1.0014, + "step": 1060 + }, + { + "epoch": 0.16, + "grad_norm": 2.9931691189641203, + "learning_rate": 1.9094540515824933e-05, + "loss": 0.9862, + "step": 1061 + }, + { + "epoch": 0.16, + "grad_norm": 3.22334136087793, + "learning_rate": 1.909247803149141e-05, + "loss": 1.0753, + "step": 1062 + }, + { + "epoch": 0.16, + "grad_norm": 3.3609406665630064, + "learning_rate": 1.9090413312481803e-05, + "loss": 1.0485, + "step": 1063 + }, + { + "epoch": 0.16, + "grad_norm": 3.3705566669272056, + "learning_rate": 1.908834635930355e-05, + "loss": 1.0542, + "step": 1064 + }, + { + "epoch": 0.16, + "grad_norm": 3.5487167698301945, + "learning_rate": 1.9086277172464657e-05, + "loss": 0.9729, + "step": 1065 + }, + { + "epoch": 0.16, + "grad_norm": 3.33753925640636, + "learning_rate": 1.9084205752473665e-05, + "loss": 1.0363, + "step": 1066 + }, + { + "epoch": 0.16, + "grad_norm": 3.43946877330562, + "learning_rate": 1.9082132099839678e-05, + "loss": 1.053, + "step": 1067 + }, + { + "epoch": 0.16, + "grad_norm": 3.5135991771708275, + "learning_rate": 1.9080056215072335e-05, + "loss": 1.0468, + "step": 1068 + }, + { + "epoch": 0.16, + "grad_norm": 3.2571006947721535, + "learning_rate": 1.9077978098681838e-05, + "loss": 1.0529, + "step": 1069 + }, + { + "epoch": 0.16, + "grad_norm": 3.041552540240493, + "learning_rate": 1.9075897751178924e-05, + "loss": 0.9847, + "step": 1070 + }, + { + "epoch": 0.16, + "grad_norm": 3.571357087030962, + "learning_rate": 1.9073815173074887e-05, + "loss": 0.945, + "step": 1071 + }, + { + "epoch": 0.16, + "grad_norm": 9.476556745270255, + "learning_rate": 1.9071730364881564e-05, + "loss": 1.0811, + "step": 1072 + }, + { + "epoch": 0.16, + "grad_norm": 3.4951504929385124, + "learning_rate": 1.9069643327111343e-05, + "loss": 0.9277, + "step": 1073 + }, + { + "epoch": 0.16, + "grad_norm": 3.124336692811566, + "learning_rate": 1.9067554060277163e-05, + "loss": 0.9597, + "step": 1074 + }, + { + "epoch": 0.16, + "grad_norm": 3.527427864527887, + "learning_rate": 1.90654625648925e-05, + "loss": 1.0146, + "step": 1075 + }, + { + "epoch": 0.16, + "grad_norm": 3.455911351310847, + "learning_rate": 1.9063368841471394e-05, + "loss": 0.9454, + "step": 1076 + }, + { + "epoch": 0.16, + "grad_norm": 3.3105290669891643, + "learning_rate": 1.9061272890528414e-05, + "loss": 0.9481, + "step": 1077 + }, + { + "epoch": 0.17, + "grad_norm": 3.2685108467907966, + "learning_rate": 1.9059174712578692e-05, + "loss": 0.9618, + "step": 1078 + }, + { + "epoch": 0.17, + "grad_norm": 3.9964576477210114, + "learning_rate": 1.90570743081379e-05, + "loss": 1.0274, + "step": 1079 + }, + { + "epoch": 0.17, + "grad_norm": 3.6542742179123198, + "learning_rate": 1.905497167772226e-05, + "loss": 0.9668, + "step": 1080 + }, + { + "epoch": 0.17, + "grad_norm": 3.5924874165069554, + "learning_rate": 1.9052866821848536e-05, + "loss": 1.0075, + "step": 1081 + }, + { + "epoch": 0.17, + "grad_norm": 3.276204761156857, + "learning_rate": 1.9050759741034043e-05, + "loss": 0.9846, + "step": 1082 + }, + { + "epoch": 0.17, + "grad_norm": 3.327735856352041, + "learning_rate": 1.904865043579664e-05, + "loss": 0.9602, + "step": 1083 + }, + { + "epoch": 0.17, + "grad_norm": 5.197330224276992, + "learning_rate": 1.9046538906654745e-05, + "loss": 1.0542, + "step": 1084 + }, + { + "epoch": 0.17, + "grad_norm": 3.855628629594563, + "learning_rate": 1.904442515412731e-05, + "loss": 0.9439, + "step": 1085 + }, + { + "epoch": 0.17, + "grad_norm": 11.381720411209631, + "learning_rate": 1.9042309178733825e-05, + "loss": 1.1813, + "step": 1086 + }, + { + "epoch": 0.17, + "grad_norm": 4.714974701017107, + "learning_rate": 1.904019098099435e-05, + "loss": 1.0041, + "step": 1087 + }, + { + "epoch": 0.17, + "grad_norm": 3.882190357917079, + "learning_rate": 1.9038070561429468e-05, + "loss": 0.986, + "step": 1088 + }, + { + "epoch": 0.17, + "grad_norm": 3.4545158928815023, + "learning_rate": 1.9035947920560327e-05, + "loss": 1.0752, + "step": 1089 + }, + { + "epoch": 0.17, + "grad_norm": 3.5499121222160874, + "learning_rate": 1.903382305890861e-05, + "loss": 1.0289, + "step": 1090 + }, + { + "epoch": 0.17, + "grad_norm": 3.4587592105542635, + "learning_rate": 1.903169597699655e-05, + "loss": 0.9598, + "step": 1091 + }, + { + "epoch": 0.17, + "grad_norm": 3.334344279452804, + "learning_rate": 1.9029566675346916e-05, + "loss": 0.9692, + "step": 1092 + }, + { + "epoch": 0.17, + "grad_norm": 3.3315491706075857, + "learning_rate": 1.9027435154483044e-05, + "loss": 1.0866, + "step": 1093 + }, + { + "epoch": 0.17, + "grad_norm": 3.383403403950702, + "learning_rate": 1.9025301414928795e-05, + "loss": 1.0768, + "step": 1094 + }, + { + "epoch": 0.17, + "grad_norm": 3.440092757443575, + "learning_rate": 1.9023165457208577e-05, + "loss": 0.9649, + "step": 1095 + }, + { + "epoch": 0.17, + "grad_norm": 3.193803361721852, + "learning_rate": 1.9021027281847354e-05, + "loss": 1.0055, + "step": 1096 + }, + { + "epoch": 0.17, + "grad_norm": 3.809350222993406, + "learning_rate": 1.9018886889370636e-05, + "loss": 0.9537, + "step": 1097 + }, + { + "epoch": 0.17, + "grad_norm": 3.502375873447698, + "learning_rate": 1.9016744280304457e-05, + "loss": 0.9742, + "step": 1098 + }, + { + "epoch": 0.17, + "grad_norm": 3.4995535233150092, + "learning_rate": 1.901459945517542e-05, + "loss": 0.9055, + "step": 1099 + }, + { + "epoch": 0.17, + "grad_norm": 3.4377117018648042, + "learning_rate": 1.9012452414510667e-05, + "loss": 1.0599, + "step": 1100 + }, + { + "epoch": 0.17, + "grad_norm": 3.895500725772442, + "learning_rate": 1.901030315883787e-05, + "loss": 1.0256, + "step": 1101 + }, + { + "epoch": 0.17, + "grad_norm": 3.370139212744162, + "learning_rate": 1.9008151688685255e-05, + "loss": 0.9934, + "step": 1102 + }, + { + "epoch": 0.17, + "grad_norm": 3.177175252089327, + "learning_rate": 1.9005998004581606e-05, + "loss": 1.0087, + "step": 1103 + }, + { + "epoch": 0.17, + "grad_norm": 3.0968416904480254, + "learning_rate": 1.9003842107056224e-05, + "loss": 0.9516, + "step": 1104 + }, + { + "epoch": 0.17, + "grad_norm": 4.683234071334093, + "learning_rate": 1.9001683996638978e-05, + "loss": 0.9738, + "step": 1105 + }, + { + "epoch": 0.17, + "grad_norm": 3.7105828983358378, + "learning_rate": 1.8999523673860266e-05, + "loss": 0.9792, + "step": 1106 + }, + { + "epoch": 0.17, + "grad_norm": 3.890400305329194, + "learning_rate": 1.8997361139251036e-05, + "loss": 1.0313, + "step": 1107 + }, + { + "epoch": 0.17, + "grad_norm": 3.4767511809652216, + "learning_rate": 1.8995196393342778e-05, + "loss": 1.1165, + "step": 1108 + }, + { + "epoch": 0.17, + "grad_norm": 3.2470769090829967, + "learning_rate": 1.899302943666753e-05, + "loss": 1.0464, + "step": 1109 + }, + { + "epoch": 0.17, + "grad_norm": 3.1368112228941754, + "learning_rate": 1.899086026975786e-05, + "loss": 0.9509, + "step": 1110 + }, + { + "epoch": 0.17, + "grad_norm": 3.3698732139606413, + "learning_rate": 1.89886888931469e-05, + "loss": 1.0738, + "step": 1111 + }, + { + "epoch": 0.17, + "grad_norm": 3.37952733028334, + "learning_rate": 1.89865153073683e-05, + "loss": 0.8875, + "step": 1112 + }, + { + "epoch": 0.17, + "grad_norm": 3.3612014588697523, + "learning_rate": 1.898433951295628e-05, + "loss": 1.0317, + "step": 1113 + }, + { + "epoch": 0.17, + "grad_norm": 3.574115698052471, + "learning_rate": 1.8982161510445577e-05, + "loss": 1.0568, + "step": 1114 + }, + { + "epoch": 0.17, + "grad_norm": 3.5775869562020173, + "learning_rate": 1.8979981300371492e-05, + "loss": 1.0596, + "step": 1115 + }, + { + "epoch": 0.17, + "grad_norm": 3.464328670626159, + "learning_rate": 1.8977798883269858e-05, + "loss": 0.9411, + "step": 1116 + }, + { + "epoch": 0.17, + "grad_norm": 3.801939625366067, + "learning_rate": 1.897561425967705e-05, + "loss": 0.9531, + "step": 1117 + }, + { + "epoch": 0.17, + "grad_norm": 3.8808600285040273, + "learning_rate": 1.8973427430129986e-05, + "loss": 1.0827, + "step": 1118 + }, + { + "epoch": 0.17, + "grad_norm": 3.4145714567838223, + "learning_rate": 1.8971238395166128e-05, + "loss": 0.9949, + "step": 1119 + }, + { + "epoch": 0.17, + "grad_norm": 3.097114782795992, + "learning_rate": 1.8969047155323484e-05, + "loss": 0.9522, + "step": 1120 + }, + { + "epoch": 0.17, + "grad_norm": 3.3808431452573715, + "learning_rate": 1.896685371114059e-05, + "loss": 1.0558, + "step": 1121 + }, + { + "epoch": 0.17, + "grad_norm": 3.213562730595572, + "learning_rate": 1.8964658063156542e-05, + "loss": 0.9506, + "step": 1122 + }, + { + "epoch": 0.17, + "grad_norm": 3.23503323959287, + "learning_rate": 1.8962460211910968e-05, + "loss": 0.9897, + "step": 1123 + }, + { + "epoch": 0.17, + "grad_norm": 3.6174475409775577, + "learning_rate": 1.896026015794403e-05, + "loss": 0.9811, + "step": 1124 + }, + { + "epoch": 0.17, + "grad_norm": 3.3036656051786983, + "learning_rate": 1.8958057901796446e-05, + "loss": 1.0645, + "step": 1125 + }, + { + "epoch": 0.17, + "grad_norm": 3.317419068376816, + "learning_rate": 1.895585344400947e-05, + "loss": 1.0342, + "step": 1126 + }, + { + "epoch": 0.17, + "grad_norm": 3.558894837151238, + "learning_rate": 1.8953646785124895e-05, + "loss": 0.9451, + "step": 1127 + }, + { + "epoch": 0.17, + "grad_norm": 4.262813533733853, + "learning_rate": 1.8951437925685053e-05, + "loss": 1.0044, + "step": 1128 + }, + { + "epoch": 0.17, + "grad_norm": 4.615319625164075, + "learning_rate": 1.8949226866232822e-05, + "loss": 1.0139, + "step": 1129 + }, + { + "epoch": 0.17, + "grad_norm": 3.7198693704317325, + "learning_rate": 1.8947013607311614e-05, + "loss": 1.1314, + "step": 1130 + }, + { + "epoch": 0.17, + "grad_norm": 4.179425008195092, + "learning_rate": 1.8944798149465394e-05, + "loss": 1.066, + "step": 1131 + }, + { + "epoch": 0.17, + "grad_norm": 3.3484987417758982, + "learning_rate": 1.8942580493238655e-05, + "loss": 1.0742, + "step": 1132 + }, + { + "epoch": 0.17, + "grad_norm": 3.610034457301421, + "learning_rate": 1.8940360639176435e-05, + "loss": 1.0215, + "step": 1133 + }, + { + "epoch": 0.17, + "grad_norm": 3.4479854456032344, + "learning_rate": 1.893813858782431e-05, + "loss": 1.096, + "step": 1134 + }, + { + "epoch": 0.17, + "grad_norm": 2.989140158166724, + "learning_rate": 1.89359143397284e-05, + "loss": 0.9756, + "step": 1135 + }, + { + "epoch": 0.17, + "grad_norm": 3.5927108470611304, + "learning_rate": 1.8933687895435363e-05, + "loss": 0.9714, + "step": 1136 + }, + { + "epoch": 0.17, + "grad_norm": 3.4687364701321726, + "learning_rate": 1.8931459255492396e-05, + "loss": 1.0522, + "step": 1137 + }, + { + "epoch": 0.17, + "grad_norm": 3.355312740447251, + "learning_rate": 1.8929228420447234e-05, + "loss": 1.0474, + "step": 1138 + }, + { + "epoch": 0.17, + "grad_norm": 5.329216562737884, + "learning_rate": 1.8926995390848158e-05, + "loss": 0.9987, + "step": 1139 + }, + { + "epoch": 0.17, + "grad_norm": 3.4784093304330126, + "learning_rate": 1.892476016724398e-05, + "loss": 0.9661, + "step": 1140 + }, + { + "epoch": 0.17, + "grad_norm": 4.578408602821964, + "learning_rate": 1.892252275018406e-05, + "loss": 0.9503, + "step": 1141 + }, + { + "epoch": 0.17, + "grad_norm": 3.291121071892807, + "learning_rate": 1.892028314021829e-05, + "loss": 1.0053, + "step": 1142 + }, + { + "epoch": 0.17, + "grad_norm": 3.6034792441405075, + "learning_rate": 1.89180413378971e-05, + "loss": 1.0765, + "step": 1143 + }, + { + "epoch": 0.18, + "grad_norm": 3.558401358394951, + "learning_rate": 1.8915797343771464e-05, + "loss": 1.0834, + "step": 1144 + }, + { + "epoch": 0.18, + "grad_norm": 3.230448244024029, + "learning_rate": 1.89135511583929e-05, + "loss": 1.0834, + "step": 1145 + }, + { + "epoch": 0.18, + "grad_norm": 3.040874118640548, + "learning_rate": 1.8911302782313442e-05, + "loss": 0.9802, + "step": 1146 + }, + { + "epoch": 0.18, + "grad_norm": 3.330212581063196, + "learning_rate": 1.8909052216085695e-05, + "loss": 1.0496, + "step": 1147 + }, + { + "epoch": 0.18, + "grad_norm": 3.276501255977922, + "learning_rate": 1.8906799460262776e-05, + "loss": 0.9585, + "step": 1148 + }, + { + "epoch": 0.18, + "grad_norm": 3.7767677443907792, + "learning_rate": 1.8904544515398348e-05, + "loss": 0.8529, + "step": 1149 + }, + { + "epoch": 0.18, + "grad_norm": 3.339419253619894, + "learning_rate": 1.8902287382046613e-05, + "loss": 0.9477, + "step": 1150 + }, + { + "epoch": 0.18, + "grad_norm": 14.351120059873713, + "learning_rate": 1.8900028060762317e-05, + "loss": 1.3391, + "step": 1151 + }, + { + "epoch": 0.18, + "grad_norm": 3.7045864230149754, + "learning_rate": 1.8897766552100735e-05, + "loss": 1.0245, + "step": 1152 + }, + { + "epoch": 0.18, + "grad_norm": 3.5387993543284884, + "learning_rate": 1.889550285661768e-05, + "loss": 1.1662, + "step": 1153 + }, + { + "epoch": 0.18, + "grad_norm": 3.363800219601907, + "learning_rate": 1.8893236974869503e-05, + "loss": 0.9452, + "step": 1154 + }, + { + "epoch": 0.18, + "grad_norm": 3.775347794310135, + "learning_rate": 1.8890968907413103e-05, + "loss": 1.0074, + "step": 1155 + }, + { + "epoch": 0.18, + "grad_norm": 3.444627030455306, + "learning_rate": 1.88886986548059e-05, + "loss": 1.0151, + "step": 1156 + }, + { + "epoch": 0.18, + "grad_norm": 3.4675043582061207, + "learning_rate": 1.888642621760586e-05, + "loss": 0.9661, + "step": 1157 + }, + { + "epoch": 0.18, + "grad_norm": 3.7803298567162296, + "learning_rate": 1.8884151596371487e-05, + "loss": 1.0353, + "step": 1158 + }, + { + "epoch": 0.18, + "grad_norm": 3.3184377487122805, + "learning_rate": 1.888187479166182e-05, + "loss": 0.9759, + "step": 1159 + }, + { + "epoch": 0.18, + "grad_norm": 3.816499168073624, + "learning_rate": 1.8879595804036424e-05, + "loss": 1.0366, + "step": 1160 + }, + { + "epoch": 0.18, + "grad_norm": 3.2112431734482842, + "learning_rate": 1.8877314634055418e-05, + "loss": 1.0806, + "step": 1161 + }, + { + "epoch": 0.18, + "grad_norm": 2.818032573141438, + "learning_rate": 1.8875031282279452e-05, + "loss": 0.9278, + "step": 1162 + }, + { + "epoch": 0.18, + "grad_norm": 3.3892480376956096, + "learning_rate": 1.88727457492697e-05, + "loss": 1.0231, + "step": 1163 + }, + { + "epoch": 0.18, + "grad_norm": 3.338034412494329, + "learning_rate": 1.8870458035587896e-05, + "loss": 0.9942, + "step": 1164 + }, + { + "epoch": 0.18, + "grad_norm": 3.4858890392363016, + "learning_rate": 1.886816814179629e-05, + "loss": 1.0469, + "step": 1165 + }, + { + "epoch": 0.18, + "grad_norm": 2.932865879989182, + "learning_rate": 1.8865876068457663e-05, + "loss": 0.9496, + "step": 1166 + }, + { + "epoch": 0.18, + "grad_norm": 20.12195654806366, + "learning_rate": 1.8863581816135355e-05, + "loss": 1.2516, + "step": 1167 + }, + { + "epoch": 0.18, + "grad_norm": 3.3345948724808117, + "learning_rate": 1.8861285385393226e-05, + "loss": 1.0154, + "step": 1168 + }, + { + "epoch": 0.18, + "grad_norm": 3.5548649180893577, + "learning_rate": 1.8858986776795673e-05, + "loss": 0.945, + "step": 1169 + }, + { + "epoch": 0.18, + "grad_norm": 3.05695734572861, + "learning_rate": 1.8856685990907627e-05, + "loss": 0.9177, + "step": 1170 + }, + { + "epoch": 0.18, + "grad_norm": 3.3215030028624435, + "learning_rate": 1.8854383028294563e-05, + "loss": 0.914, + "step": 1171 + }, + { + "epoch": 0.18, + "grad_norm": 3.534699822951794, + "learning_rate": 1.885207788952248e-05, + "loss": 0.9461, + "step": 1172 + }, + { + "epoch": 0.18, + "grad_norm": 8.630422675866747, + "learning_rate": 1.884977057515792e-05, + "loss": 1.0953, + "step": 1173 + }, + { + "epoch": 0.18, + "grad_norm": 3.3762286736972444, + "learning_rate": 1.884746108576795e-05, + "loss": 1.0774, + "step": 1174 + }, + { + "epoch": 0.18, + "grad_norm": 3.2132032320980506, + "learning_rate": 1.8845149421920183e-05, + "loss": 1.0781, + "step": 1175 + }, + { + "epoch": 0.18, + "grad_norm": 3.140304792219557, + "learning_rate": 1.884283558418276e-05, + "loss": 0.9953, + "step": 1176 + }, + { + "epoch": 0.18, + "grad_norm": 3.586259201431184, + "learning_rate": 1.8840519573124355e-05, + "loss": 0.9602, + "step": 1177 + }, + { + "epoch": 0.18, + "grad_norm": 3.0692544921950002, + "learning_rate": 1.8838201389314186e-05, + "loss": 0.9531, + "step": 1178 + }, + { + "epoch": 0.18, + "grad_norm": 4.059529893306157, + "learning_rate": 1.8835881033321987e-05, + "loss": 0.9284, + "step": 1179 + }, + { + "epoch": 0.18, + "grad_norm": 3.8748227160913618, + "learning_rate": 1.883355850571804e-05, + "loss": 0.9953, + "step": 1180 + }, + { + "epoch": 0.18, + "grad_norm": 3.547535182182359, + "learning_rate": 1.8831233807073162e-05, + "loss": 1.0275, + "step": 1181 + }, + { + "epoch": 0.18, + "grad_norm": 3.332665967214049, + "learning_rate": 1.8828906937958697e-05, + "loss": 1.0284, + "step": 1182 + }, + { + "epoch": 0.18, + "grad_norm": 3.997463814001846, + "learning_rate": 1.882657789894652e-05, + "loss": 1.0457, + "step": 1183 + }, + { + "epoch": 0.18, + "grad_norm": 3.167722059482026, + "learning_rate": 1.882424669060905e-05, + "loss": 0.9619, + "step": 1184 + }, + { + "epoch": 0.18, + "grad_norm": 3.3420295940411417, + "learning_rate": 1.8821913313519222e-05, + "loss": 1.0286, + "step": 1185 + }, + { + "epoch": 0.18, + "grad_norm": 3.4323628920909, + "learning_rate": 1.8819577768250527e-05, + "loss": 0.9414, + "step": 1186 + }, + { + "epoch": 0.18, + "grad_norm": 3.471810980202804, + "learning_rate": 1.8817240055376967e-05, + "loss": 1.0593, + "step": 1187 + }, + { + "epoch": 0.18, + "grad_norm": 4.014072585779561, + "learning_rate": 1.8814900175473095e-05, + "loss": 1.0279, + "step": 1188 + }, + { + "epoch": 0.18, + "grad_norm": 3.5142431015633555, + "learning_rate": 1.881255812911398e-05, + "loss": 0.9126, + "step": 1189 + }, + { + "epoch": 0.18, + "grad_norm": 3.1249815251555946, + "learning_rate": 1.8810213916875232e-05, + "loss": 0.9398, + "step": 1190 + }, + { + "epoch": 0.18, + "grad_norm": 3.095309690894207, + "learning_rate": 1.8807867539333e-05, + "loss": 1.0398, + "step": 1191 + }, + { + "epoch": 0.18, + "grad_norm": 3.2039191939862124, + "learning_rate": 1.8805518997063947e-05, + "loss": 1.0105, + "step": 1192 + }, + { + "epoch": 0.18, + "grad_norm": 3.3474256293689044, + "learning_rate": 1.8803168290645287e-05, + "loss": 0.9162, + "step": 1193 + }, + { + "epoch": 0.18, + "grad_norm": 3.0466196271546186, + "learning_rate": 1.8800815420654758e-05, + "loss": 0.9939, + "step": 1194 + }, + { + "epoch": 0.18, + "grad_norm": 3.346909179921009, + "learning_rate": 1.8798460387670622e-05, + "loss": 0.9594, + "step": 1195 + }, + { + "epoch": 0.18, + "grad_norm": 3.486888312468145, + "learning_rate": 1.8796103192271688e-05, + "loss": 1.064, + "step": 1196 + }, + { + "epoch": 0.18, + "grad_norm": 3.5432823849117385, + "learning_rate": 1.8793743835037287e-05, + "loss": 1.0371, + "step": 1197 + }, + { + "epoch": 0.18, + "grad_norm": 3.7968288431929107, + "learning_rate": 1.8791382316547276e-05, + "loss": 1.0197, + "step": 1198 + }, + { + "epoch": 0.18, + "grad_norm": 3.7397975872652305, + "learning_rate": 1.878901863738206e-05, + "loss": 1.0208, + "step": 1199 + }, + { + "epoch": 0.18, + "grad_norm": 3.039320733733214, + "learning_rate": 1.8786652798122557e-05, + "loss": 1.0011, + "step": 1200 + }, + { + "epoch": 0.18, + "grad_norm": 3.443305156795146, + "learning_rate": 1.878428479935023e-05, + "loss": 1.0169, + "step": 1201 + }, + { + "epoch": 0.18, + "grad_norm": 3.4890962089355315, + "learning_rate": 1.878191464164706e-05, + "loss": 1.0539, + "step": 1202 + }, + { + "epoch": 0.18, + "grad_norm": 3.3572622978384468, + "learning_rate": 1.8779542325595572e-05, + "loss": 1.0377, + "step": 1203 + }, + { + "epoch": 0.18, + "grad_norm": 3.4645394707459998, + "learning_rate": 1.877716785177881e-05, + "loss": 1.0387, + "step": 1204 + }, + { + "epoch": 0.18, + "grad_norm": 3.4755656750309, + "learning_rate": 1.8774791220780358e-05, + "loss": 1.0699, + "step": 1205 + }, + { + "epoch": 0.18, + "grad_norm": 3.2676798552234088, + "learning_rate": 1.877241243318432e-05, + "loss": 0.9958, + "step": 1206 + }, + { + "epoch": 0.18, + "grad_norm": 3.379353075378151, + "learning_rate": 1.8770031489575336e-05, + "loss": 0.9279, + "step": 1207 + }, + { + "epoch": 0.18, + "grad_norm": 4.009969046904054, + "learning_rate": 1.876764839053858e-05, + "loss": 0.9824, + "step": 1208 + }, + { + "epoch": 0.19, + "grad_norm": 3.195814747484976, + "learning_rate": 1.8765263136659747e-05, + "loss": 1.0223, + "step": 1209 + }, + { + "epoch": 0.19, + "grad_norm": 2.98702869224199, + "learning_rate": 1.8762875728525063e-05, + "loss": 0.9445, + "step": 1210 + }, + { + "epoch": 0.19, + "grad_norm": 3.5172367808354177, + "learning_rate": 1.876048616672129e-05, + "loss": 0.8926, + "step": 1211 + }, + { + "epoch": 0.19, + "grad_norm": 3.4871172798691954, + "learning_rate": 1.875809445183572e-05, + "loss": 1.0432, + "step": 1212 + }, + { + "epoch": 0.19, + "grad_norm": 3.5064858806387686, + "learning_rate": 1.8755700584456156e-05, + "loss": 1.0091, + "step": 1213 + }, + { + "epoch": 0.19, + "grad_norm": 3.1161036668435775, + "learning_rate": 1.8753304565170953e-05, + "loss": 0.987, + "step": 1214 + }, + { + "epoch": 0.19, + "grad_norm": 3.4399247964110287, + "learning_rate": 1.8750906394568984e-05, + "loss": 1.132, + "step": 1215 + }, + { + "epoch": 0.19, + "grad_norm": 5.439032864890778, + "learning_rate": 1.8748506073239654e-05, + "loss": 0.8858, + "step": 1216 + }, + { + "epoch": 0.19, + "grad_norm": 3.9288136576207817, + "learning_rate": 1.874610360177289e-05, + "loss": 0.9878, + "step": 1217 + }, + { + "epoch": 0.19, + "grad_norm": 5.412264557712734, + "learning_rate": 1.8743698980759154e-05, + "loss": 1.0101, + "step": 1218 + }, + { + "epoch": 0.19, + "grad_norm": 3.0818804914527385, + "learning_rate": 1.8741292210789435e-05, + "loss": 0.9747, + "step": 1219 + }, + { + "epoch": 0.19, + "grad_norm": 3.3379575642167576, + "learning_rate": 1.873888329245525e-05, + "loss": 1.0409, + "step": 1220 + }, + { + "epoch": 0.19, + "grad_norm": 3.108719942681841, + "learning_rate": 1.8736472226348643e-05, + "loss": 0.8671, + "step": 1221 + }, + { + "epoch": 0.19, + "grad_norm": 3.433163389985319, + "learning_rate": 1.873405901306219e-05, + "loss": 1.0531, + "step": 1222 + }, + { + "epoch": 0.19, + "grad_norm": 3.367416135930543, + "learning_rate": 1.873164365318898e-05, + "loss": 1.115, + "step": 1223 + }, + { + "epoch": 0.19, + "grad_norm": 3.1944459496476703, + "learning_rate": 1.8729226147322653e-05, + "loss": 1.0592, + "step": 1224 + }, + { + "epoch": 0.19, + "grad_norm": 3.3534934567247827, + "learning_rate": 1.8726806496057356e-05, + "loss": 0.9829, + "step": 1225 + }, + { + "epoch": 0.19, + "grad_norm": 3.4050693643260876, + "learning_rate": 1.8724384699987776e-05, + "loss": 1.0231, + "step": 1226 + }, + { + "epoch": 0.19, + "grad_norm": 3.2027668780710354, + "learning_rate": 1.872196075970912e-05, + "loss": 0.9412, + "step": 1227 + }, + { + "epoch": 0.19, + "grad_norm": 3.225637604293947, + "learning_rate": 1.871953467581713e-05, + "loss": 1.0382, + "step": 1228 + }, + { + "epoch": 0.19, + "grad_norm": 3.0737196348462033, + "learning_rate": 1.8717106448908065e-05, + "loss": 1.0373, + "step": 1229 + }, + { + "epoch": 0.19, + "grad_norm": 3.091074286664361, + "learning_rate": 1.871467607957871e-05, + "loss": 1.0822, + "step": 1230 + }, + { + "epoch": 0.19, + "grad_norm": 3.1267167205958373, + "learning_rate": 1.871224356842639e-05, + "loss": 0.9143, + "step": 1231 + }, + { + "epoch": 0.19, + "grad_norm": 3.697620074634605, + "learning_rate": 1.8709808916048948e-05, + "loss": 0.9823, + "step": 1232 + }, + { + "epoch": 0.19, + "grad_norm": 3.5889398544631512, + "learning_rate": 1.8707372123044746e-05, + "loss": 0.9262, + "step": 1233 + }, + { + "epoch": 0.19, + "grad_norm": 3.5183700822735426, + "learning_rate": 1.8704933190012683e-05, + "loss": 1.0104, + "step": 1234 + }, + { + "epoch": 0.19, + "grad_norm": 3.0718014289597306, + "learning_rate": 1.8702492117552185e-05, + "loss": 0.9683, + "step": 1235 + }, + { + "epoch": 0.19, + "grad_norm": 3.6196911699554666, + "learning_rate": 1.870004890626319e-05, + "loss": 0.8832, + "step": 1236 + }, + { + "epoch": 0.19, + "grad_norm": 3.072246372090567, + "learning_rate": 1.869760355674618e-05, + "loss": 0.8994, + "step": 1237 + }, + { + "epoch": 0.19, + "grad_norm": 3.1411597515237917, + "learning_rate": 1.869515606960215e-05, + "loss": 0.9609, + "step": 1238 + }, + { + "epoch": 0.19, + "grad_norm": 3.5541454860928097, + "learning_rate": 1.869270644543262e-05, + "loss": 1.1036, + "step": 1239 + }, + { + "epoch": 0.19, + "grad_norm": 3.4779650247855365, + "learning_rate": 1.869025468483964e-05, + "loss": 1.0456, + "step": 1240 + }, + { + "epoch": 0.19, + "grad_norm": 3.2139546208099206, + "learning_rate": 1.868780078842579e-05, + "loss": 1.021, + "step": 1241 + }, + { + "epoch": 0.19, + "grad_norm": 3.225726212496146, + "learning_rate": 1.8685344756794163e-05, + "loss": 0.9912, + "step": 1242 + }, + { + "epoch": 0.19, + "grad_norm": 3.186249770125133, + "learning_rate": 1.8682886590548385e-05, + "loss": 0.9572, + "step": 1243 + }, + { + "epoch": 0.19, + "grad_norm": 3.422670951947607, + "learning_rate": 1.8680426290292603e-05, + "loss": 0.9215, + "step": 1244 + }, + { + "epoch": 0.19, + "grad_norm": 3.374702483528041, + "learning_rate": 1.867796385663149e-05, + "loss": 1.0541, + "step": 1245 + }, + { + "epoch": 0.19, + "grad_norm": 3.7650617765604655, + "learning_rate": 1.8675499290170243e-05, + "loss": 0.964, + "step": 1246 + }, + { + "epoch": 0.19, + "grad_norm": 3.366569229191678, + "learning_rate": 1.8673032591514586e-05, + "loss": 0.9945, + "step": 1247 + }, + { + "epoch": 0.19, + "grad_norm": 3.3031430086247657, + "learning_rate": 1.8670563761270762e-05, + "loss": 1.0205, + "step": 1248 + }, + { + "epoch": 0.19, + "grad_norm": 3.004205593212754, + "learning_rate": 1.866809280004554e-05, + "loss": 0.8999, + "step": 1249 + }, + { + "epoch": 0.19, + "grad_norm": 3.9436629690025033, + "learning_rate": 1.8665619708446216e-05, + "loss": 0.9898, + "step": 1250 + }, + { + "epoch": 0.19, + "grad_norm": 3.263914489945494, + "learning_rate": 1.86631444870806e-05, + "loss": 0.9416, + "step": 1251 + }, + { + "epoch": 0.19, + "grad_norm": 3.388649773428647, + "learning_rate": 1.8660667136557038e-05, + "loss": 1.0842, + "step": 1252 + }, + { + "epoch": 0.19, + "grad_norm": 3.284605163224415, + "learning_rate": 1.865818765748439e-05, + "loss": 0.9032, + "step": 1253 + }, + { + "epoch": 0.19, + "grad_norm": 3.36238307880044, + "learning_rate": 1.8655706050472045e-05, + "loss": 0.9571, + "step": 1254 + }, + { + "epoch": 0.19, + "grad_norm": 3.3955441759934106, + "learning_rate": 1.8653222316129914e-05, + "loss": 0.9705, + "step": 1255 + }, + { + "epoch": 0.19, + "grad_norm": 3.4215491117960726, + "learning_rate": 1.865073645506842e-05, + "loss": 0.96, + "step": 1256 + }, + { + "epoch": 0.19, + "grad_norm": 3.0799731990389914, + "learning_rate": 1.864824846789853e-05, + "loss": 0.9825, + "step": 1257 + }, + { + "epoch": 0.19, + "grad_norm": 3.16621852615012, + "learning_rate": 1.8645758355231712e-05, + "loss": 0.9223, + "step": 1258 + }, + { + "epoch": 0.19, + "grad_norm": 2.9485309307936913, + "learning_rate": 1.8643266117679973e-05, + "loss": 0.9407, + "step": 1259 + }, + { + "epoch": 0.19, + "grad_norm": 3.0035964787739, + "learning_rate": 1.864077175585583e-05, + "loss": 1.0411, + "step": 1260 + }, + { + "epoch": 0.19, + "grad_norm": 3.4098162622882207, + "learning_rate": 1.863827527037233e-05, + "loss": 0.9541, + "step": 1261 + }, + { + "epoch": 0.19, + "grad_norm": 3.11021479917164, + "learning_rate": 1.8635776661843037e-05, + "loss": 0.9688, + "step": 1262 + }, + { + "epoch": 0.19, + "grad_norm": 11.319618014901573, + "learning_rate": 1.863327593088204e-05, + "loss": 1.1829, + "step": 1263 + }, + { + "epoch": 0.19, + "grad_norm": 3.3061764646598077, + "learning_rate": 1.8630773078103947e-05, + "loss": 0.8817, + "step": 1264 + }, + { + "epoch": 0.19, + "grad_norm": 3.189631081214496, + "learning_rate": 1.8628268104123895e-05, + "loss": 0.9856, + "step": 1265 + }, + { + "epoch": 0.19, + "grad_norm": 6.2252370788231, + "learning_rate": 1.8625761009557527e-05, + "loss": 1.0615, + "step": 1266 + }, + { + "epoch": 0.19, + "grad_norm": 4.112050914940334, + "learning_rate": 1.8623251795021026e-05, + "loss": 1.0279, + "step": 1267 + }, + { + "epoch": 0.19, + "grad_norm": 3.472255495162265, + "learning_rate": 1.8620740461131078e-05, + "loss": 1.02, + "step": 1268 + }, + { + "epoch": 0.19, + "grad_norm": 3.3616466067796567, + "learning_rate": 1.8618227008504903e-05, + "loss": 1.0231, + "step": 1269 + }, + { + "epoch": 0.19, + "grad_norm": 3.135356926756423, + "learning_rate": 1.861571143776024e-05, + "loss": 0.9841, + "step": 1270 + }, + { + "epoch": 0.19, + "grad_norm": 3.1447769298644737, + "learning_rate": 1.8613193749515336e-05, + "loss": 0.9787, + "step": 1271 + }, + { + "epoch": 0.19, + "grad_norm": 2.9614186561247378, + "learning_rate": 1.861067394438898e-05, + "loss": 1.0221, + "step": 1272 + }, + { + "epoch": 0.19, + "grad_norm": 3.629189000456825, + "learning_rate": 1.860815202300046e-05, + "loss": 1.0493, + "step": 1273 + }, + { + "epoch": 0.2, + "grad_norm": 3.5198156291852416, + "learning_rate": 1.86056279859696e-05, + "loss": 1.1185, + "step": 1274 + }, + { + "epoch": 0.2, + "grad_norm": 3.228813921955044, + "learning_rate": 1.860310183391673e-05, + "loss": 1.0301, + "step": 1275 + }, + { + "epoch": 0.2, + "grad_norm": 3.294294729255725, + "learning_rate": 1.8600573567462722e-05, + "loss": 1.0082, + "step": 1276 + }, + { + "epoch": 0.2, + "grad_norm": 9.191062319084121, + "learning_rate": 1.8598043187228937e-05, + "loss": 1.1555, + "step": 1277 + }, + { + "epoch": 0.2, + "grad_norm": 3.2943202829958955, + "learning_rate": 1.8595510693837277e-05, + "loss": 0.9557, + "step": 1278 + }, + { + "epoch": 0.2, + "grad_norm": 3.3228994938865206, + "learning_rate": 1.8592976087910162e-05, + "loss": 1.0104, + "step": 1279 + }, + { + "epoch": 0.2, + "grad_norm": 3.4589290611274968, + "learning_rate": 1.8590439370070518e-05, + "loss": 1.0474, + "step": 1280 + }, + { + "epoch": 0.2, + "grad_norm": 3.089947078240719, + "learning_rate": 1.858790054094181e-05, + "loss": 0.8952, + "step": 1281 + }, + { + "epoch": 0.2, + "grad_norm": 3.223869917514333, + "learning_rate": 1.8585359601148005e-05, + "loss": 0.9582, + "step": 1282 + }, + { + "epoch": 0.2, + "grad_norm": 3.5241596955063565, + "learning_rate": 1.8582816551313594e-05, + "loss": 1.0233, + "step": 1283 + }, + { + "epoch": 0.2, + "grad_norm": 3.1393632796116306, + "learning_rate": 1.858027139206359e-05, + "loss": 0.9223, + "step": 1284 + }, + { + "epoch": 0.2, + "grad_norm": 3.0147943589181088, + "learning_rate": 1.857772412402352e-05, + "loss": 0.9559, + "step": 1285 + }, + { + "epoch": 0.2, + "grad_norm": 3.76710874031879, + "learning_rate": 1.857517474781943e-05, + "loss": 1.0139, + "step": 1286 + }, + { + "epoch": 0.2, + "grad_norm": 3.012356243143458, + "learning_rate": 1.857262326407789e-05, + "loss": 0.9859, + "step": 1287 + }, + { + "epoch": 0.2, + "grad_norm": 3.214110568695533, + "learning_rate": 1.8570069673425978e-05, + "loss": 1.0213, + "step": 1288 + }, + { + "epoch": 0.2, + "grad_norm": 3.4750350660104727, + "learning_rate": 1.8567513976491298e-05, + "loss": 0.988, + "step": 1289 + }, + { + "epoch": 0.2, + "grad_norm": 10.760410578608612, + "learning_rate": 1.856495617390197e-05, + "loss": 1.2646, + "step": 1290 + }, + { + "epoch": 0.2, + "grad_norm": 3.1666923521830657, + "learning_rate": 1.856239626628662e-05, + "loss": 0.9592, + "step": 1291 + }, + { + "epoch": 0.2, + "grad_norm": 4.017141775920228, + "learning_rate": 1.8559834254274413e-05, + "loss": 1.0568, + "step": 1292 + }, + { + "epoch": 0.2, + "grad_norm": 3.2580613936555403, + "learning_rate": 1.8557270138495017e-05, + "loss": 1.0296, + "step": 1293 + }, + { + "epoch": 0.2, + "grad_norm": 6.722825184968627, + "learning_rate": 1.8554703919578617e-05, + "loss": 1.0888, + "step": 1294 + }, + { + "epoch": 0.2, + "grad_norm": 3.8520569991398985, + "learning_rate": 1.855213559815592e-05, + "loss": 1.0294, + "step": 1295 + }, + { + "epoch": 0.2, + "grad_norm": 3.4248927635388777, + "learning_rate": 1.8549565174858148e-05, + "loss": 1.0494, + "step": 1296 + }, + { + "epoch": 0.2, + "grad_norm": 3.382377772814438, + "learning_rate": 1.8546992650317035e-05, + "loss": 0.9951, + "step": 1297 + }, + { + "epoch": 0.2, + "grad_norm": 3.3623590404197627, + "learning_rate": 1.854441802516484e-05, + "loss": 1.0029, + "step": 1298 + }, + { + "epoch": 0.2, + "grad_norm": 12.938670771717353, + "learning_rate": 1.854184130003433e-05, + "loss": 1.1985, + "step": 1299 + }, + { + "epoch": 0.2, + "grad_norm": 3.177523092609845, + "learning_rate": 1.8539262475558794e-05, + "loss": 1.0393, + "step": 1300 + }, + { + "epoch": 0.2, + "grad_norm": 3.142874464959742, + "learning_rate": 1.8536681552372035e-05, + "loss": 1.0258, + "step": 1301 + }, + { + "epoch": 0.2, + "grad_norm": 3.1804710183341443, + "learning_rate": 1.8534098531108374e-05, + "loss": 0.8893, + "step": 1302 + }, + { + "epoch": 0.2, + "grad_norm": 7.385721670884511, + "learning_rate": 1.8531513412402635e-05, + "loss": 1.1044, + "step": 1303 + }, + { + "epoch": 0.2, + "grad_norm": 3.1355109726054384, + "learning_rate": 1.852892619689018e-05, + "loss": 1.0309, + "step": 1304 + }, + { + "epoch": 0.2, + "grad_norm": 3.2700105399947113, + "learning_rate": 1.8526336885206863e-05, + "loss": 1.0062, + "step": 1305 + }, + { + "epoch": 0.2, + "grad_norm": 3.83603218845176, + "learning_rate": 1.8523745477989074e-05, + "loss": 0.9573, + "step": 1306 + }, + { + "epoch": 0.2, + "grad_norm": 3.274591564653697, + "learning_rate": 1.85211519758737e-05, + "loss": 0.9857, + "step": 1307 + }, + { + "epoch": 0.2, + "grad_norm": 3.738378402447023, + "learning_rate": 1.851855637949816e-05, + "loss": 0.9904, + "step": 1308 + }, + { + "epoch": 0.2, + "grad_norm": 3.4655143401997606, + "learning_rate": 1.851595868950037e-05, + "loss": 1.032, + "step": 1309 + }, + { + "epoch": 0.2, + "grad_norm": 3.267631524880856, + "learning_rate": 1.8513358906518773e-05, + "loss": 0.9451, + "step": 1310 + }, + { + "epoch": 0.2, + "grad_norm": 3.1241374456914595, + "learning_rate": 1.8510757031192325e-05, + "loss": 0.9012, + "step": 1311 + }, + { + "epoch": 0.2, + "grad_norm": 3.1106665906055904, + "learning_rate": 1.850815306416049e-05, + "loss": 0.9004, + "step": 1312 + }, + { + "epoch": 0.2, + "grad_norm": 4.57024670362329, + "learning_rate": 1.8505547006063254e-05, + "loss": 1.0498, + "step": 1313 + }, + { + "epoch": 0.2, + "grad_norm": 3.1600728276232752, + "learning_rate": 1.8502938857541112e-05, + "loss": 1.0602, + "step": 1314 + }, + { + "epoch": 0.2, + "grad_norm": 3.301046353011309, + "learning_rate": 1.850032861923507e-05, + "loss": 1.1654, + "step": 1315 + }, + { + "epoch": 0.2, + "grad_norm": 3.238340991080974, + "learning_rate": 1.8497716291786653e-05, + "loss": 0.9199, + "step": 1316 + }, + { + "epoch": 0.2, + "grad_norm": 2.824416857467904, + "learning_rate": 1.8495101875837903e-05, + "loss": 1.0123, + "step": 1317 + }, + { + "epoch": 0.2, + "grad_norm": 2.962611707799049, + "learning_rate": 1.8492485372031363e-05, + "loss": 0.9427, + "step": 1318 + }, + { + "epoch": 0.2, + "grad_norm": 3.2661569512729365, + "learning_rate": 1.84898667810101e-05, + "loss": 1.0405, + "step": 1319 + }, + { + "epoch": 0.2, + "grad_norm": 3.0660251603874067, + "learning_rate": 1.8487246103417692e-05, + "loss": 0.8864, + "step": 1320 + }, + { + "epoch": 0.2, + "grad_norm": 3.321859969901298, + "learning_rate": 1.848462333989822e-05, + "loss": 1.0434, + "step": 1321 + }, + { + "epoch": 0.2, + "grad_norm": 3.5085205805452113, + "learning_rate": 1.8481998491096294e-05, + "loss": 0.992, + "step": 1322 + }, + { + "epoch": 0.2, + "grad_norm": 3.729147238125545, + "learning_rate": 1.8479371557657028e-05, + "loss": 1.0681, + "step": 1323 + }, + { + "epoch": 0.2, + "grad_norm": 3.314751970939591, + "learning_rate": 1.847674254022604e-05, + "loss": 0.9572, + "step": 1324 + }, + { + "epoch": 0.2, + "grad_norm": 3.0172757269056842, + "learning_rate": 1.847411143944948e-05, + "loss": 0.9826, + "step": 1325 + }, + { + "epoch": 0.2, + "grad_norm": 3.285621040400702, + "learning_rate": 1.847147825597399e-05, + "loss": 1.1254, + "step": 1326 + }, + { + "epoch": 0.2, + "grad_norm": 3.448844738758746, + "learning_rate": 1.8468842990446738e-05, + "loss": 1.052, + "step": 1327 + }, + { + "epoch": 0.2, + "grad_norm": 3.495073379772717, + "learning_rate": 1.84662056435154e-05, + "loss": 1.0485, + "step": 1328 + }, + { + "epoch": 0.2, + "grad_norm": 12.588068668936893, + "learning_rate": 1.8463566215828153e-05, + "loss": 1.1313, + "step": 1329 + }, + { + "epoch": 0.2, + "grad_norm": 3.5046696564398703, + "learning_rate": 1.8460924708033703e-05, + "loss": 1.0228, + "step": 1330 + }, + { + "epoch": 0.2, + "grad_norm": 3.311822770954448, + "learning_rate": 1.845828112078125e-05, + "loss": 0.9789, + "step": 1331 + }, + { + "epoch": 0.2, + "grad_norm": 3.6065329287172045, + "learning_rate": 1.8455635454720523e-05, + "loss": 0.952, + "step": 1332 + }, + { + "epoch": 0.2, + "grad_norm": 3.337957980232742, + "learning_rate": 1.845298771050175e-05, + "loss": 1.039, + "step": 1333 + }, + { + "epoch": 0.2, + "grad_norm": 3.1850080920372354, + "learning_rate": 1.845033788877567e-05, + "loss": 0.933, + "step": 1334 + }, + { + "epoch": 0.2, + "grad_norm": 3.230879748045431, + "learning_rate": 1.8447685990193535e-05, + "loss": 1.0526, + "step": 1335 + }, + { + "epoch": 0.2, + "grad_norm": 2.85396414990987, + "learning_rate": 1.8445032015407107e-05, + "loss": 1.0069, + "step": 1336 + }, + { + "epoch": 0.2, + "grad_norm": 2.9378848623087728, + "learning_rate": 1.844237596506866e-05, + "loss": 0.9669, + "step": 1337 + }, + { + "epoch": 0.2, + "grad_norm": 2.854668930026993, + "learning_rate": 1.8439717839830974e-05, + "loss": 1.0146, + "step": 1338 + }, + { + "epoch": 0.2, + "grad_norm": 3.099600996688908, + "learning_rate": 1.843705764034735e-05, + "loss": 1.0386, + "step": 1339 + }, + { + "epoch": 0.21, + "grad_norm": 3.2855360587889026, + "learning_rate": 1.843439536727158e-05, + "loss": 0.9487, + "step": 1340 + }, + { + "epoch": 0.21, + "grad_norm": 3.4579980415822007, + "learning_rate": 1.843173102125798e-05, + "loss": 0.9574, + "step": 1341 + }, + { + "epoch": 0.21, + "grad_norm": 3.7273952322801116, + "learning_rate": 1.842906460296137e-05, + "loss": 0.9395, + "step": 1342 + }, + { + "epoch": 0.21, + "grad_norm": 3.275683451454534, + "learning_rate": 1.8426396113037085e-05, + "loss": 0.9334, + "step": 1343 + }, + { + "epoch": 0.21, + "grad_norm": 3.210881842639474, + "learning_rate": 1.8423725552140964e-05, + "loss": 0.986, + "step": 1344 + }, + { + "epoch": 0.21, + "grad_norm": 34.68569610373674, + "learning_rate": 1.842105292092935e-05, + "loss": 1.1653, + "step": 1345 + }, + { + "epoch": 0.21, + "grad_norm": 2.802505121467872, + "learning_rate": 1.841837822005911e-05, + "loss": 0.9074, + "step": 1346 + }, + { + "epoch": 0.21, + "grad_norm": 3.1070745294033553, + "learning_rate": 1.8415701450187603e-05, + "loss": 1.0476, + "step": 1347 + }, + { + "epoch": 0.21, + "grad_norm": 3.262380215220285, + "learning_rate": 1.8413022611972707e-05, + "loss": 1.0297, + "step": 1348 + }, + { + "epoch": 0.21, + "grad_norm": 3.033952372865329, + "learning_rate": 1.8410341706072805e-05, + "loss": 1.015, + "step": 1349 + }, + { + "epoch": 0.21, + "grad_norm": 3.323263789902725, + "learning_rate": 1.8407658733146788e-05, + "loss": 0.9336, + "step": 1350 + }, + { + "epoch": 0.21, + "grad_norm": 3.3046967974580457, + "learning_rate": 1.8404973693854056e-05, + "loss": 0.9665, + "step": 1351 + }, + { + "epoch": 0.21, + "grad_norm": 3.2788674106321425, + "learning_rate": 1.840228658885452e-05, + "loss": 1.001, + "step": 1352 + }, + { + "epoch": 0.21, + "grad_norm": 3.1826543622998025, + "learning_rate": 1.8399597418808588e-05, + "loss": 0.9061, + "step": 1353 + }, + { + "epoch": 0.21, + "grad_norm": 3.232063653180614, + "learning_rate": 1.8396906184377185e-05, + "loss": 0.9828, + "step": 1354 + }, + { + "epoch": 0.21, + "grad_norm": 3.1539792259539645, + "learning_rate": 1.8394212886221743e-05, + "loss": 1.0339, + "step": 1355 + }, + { + "epoch": 0.21, + "grad_norm": 3.2276005454751435, + "learning_rate": 1.8391517525004202e-05, + "loss": 0.9972, + "step": 1356 + }, + { + "epoch": 0.21, + "grad_norm": 2.933260145178024, + "learning_rate": 1.8388820101387e-05, + "loss": 0.8192, + "step": 1357 + }, + { + "epoch": 0.21, + "grad_norm": 3.302982115720018, + "learning_rate": 1.8386120616033088e-05, + "loss": 1.0183, + "step": 1358 + }, + { + "epoch": 0.21, + "grad_norm": 3.6318377709444007, + "learning_rate": 1.8383419069605928e-05, + "loss": 1.0127, + "step": 1359 + }, + { + "epoch": 0.21, + "grad_norm": 3.0740579352046966, + "learning_rate": 1.838071546276948e-05, + "loss": 0.9438, + "step": 1360 + }, + { + "epoch": 0.21, + "grad_norm": 2.7634617730211786, + "learning_rate": 1.837800979618822e-05, + "loss": 0.9507, + "step": 1361 + }, + { + "epoch": 0.21, + "grad_norm": 3.340650030000716, + "learning_rate": 1.837530207052712e-05, + "loss": 0.9877, + "step": 1362 + }, + { + "epoch": 0.21, + "grad_norm": 3.1075200381421957, + "learning_rate": 1.8372592286451668e-05, + "loss": 1.0457, + "step": 1363 + }, + { + "epoch": 0.21, + "grad_norm": 3.336523570898075, + "learning_rate": 1.836988044462785e-05, + "loss": 1.0177, + "step": 1364 + }, + { + "epoch": 0.21, + "grad_norm": 3.314086950533041, + "learning_rate": 1.836716654572216e-05, + "loss": 1.0487, + "step": 1365 + }, + { + "epoch": 0.21, + "grad_norm": 3.6581734234286944, + "learning_rate": 1.83644505904016e-05, + "loss": 0.9876, + "step": 1366 + }, + { + "epoch": 0.21, + "grad_norm": 3.224387145206634, + "learning_rate": 1.836173257933367e-05, + "loss": 0.9458, + "step": 1367 + }, + { + "epoch": 0.21, + "grad_norm": 3.140033745322203, + "learning_rate": 1.835901251318639e-05, + "loss": 0.9106, + "step": 1368 + }, + { + "epoch": 0.21, + "grad_norm": 2.875829759401323, + "learning_rate": 1.835629039262827e-05, + "loss": 0.9946, + "step": 1369 + }, + { + "epoch": 0.21, + "grad_norm": 7.450143819458855, + "learning_rate": 1.8353566218328333e-05, + "loss": 1.1555, + "step": 1370 + }, + { + "epoch": 0.21, + "grad_norm": 3.442123035895518, + "learning_rate": 1.8350839990956104e-05, + "loss": 0.9642, + "step": 1371 + }, + { + "epoch": 0.21, + "grad_norm": 3.1978975027659757, + "learning_rate": 1.8348111711181614e-05, + "loss": 0.8898, + "step": 1372 + }, + { + "epoch": 0.21, + "grad_norm": 3.259845525408301, + "learning_rate": 1.8345381379675397e-05, + "loss": 1.0292, + "step": 1373 + }, + { + "epoch": 0.21, + "grad_norm": 3.165231681661386, + "learning_rate": 1.834264899710849e-05, + "loss": 0.9839, + "step": 1374 + }, + { + "epoch": 0.21, + "grad_norm": 2.895179041315747, + "learning_rate": 1.8339914564152442e-05, + "loss": 0.912, + "step": 1375 + }, + { + "epoch": 0.21, + "grad_norm": 3.33805355430569, + "learning_rate": 1.8337178081479292e-05, + "loss": 1.0233, + "step": 1376 + }, + { + "epoch": 0.21, + "grad_norm": 5.270798423226357, + "learning_rate": 1.8334439549761596e-05, + "loss": 1.0438, + "step": 1377 + }, + { + "epoch": 0.21, + "grad_norm": 3.386659692187951, + "learning_rate": 1.8331698969672405e-05, + "loss": 1.0603, + "step": 1378 + }, + { + "epoch": 0.21, + "grad_norm": 3.29273059399376, + "learning_rate": 1.832895634188528e-05, + "loss": 0.9176, + "step": 1379 + }, + { + "epoch": 0.21, + "grad_norm": 3.2139646813203813, + "learning_rate": 1.832621166707428e-05, + "loss": 1.0601, + "step": 1380 + }, + { + "epoch": 0.21, + "grad_norm": 3.591974463038327, + "learning_rate": 1.8323464945913967e-05, + "loss": 0.9984, + "step": 1381 + }, + { + "epoch": 0.21, + "grad_norm": 3.0970045925072758, + "learning_rate": 1.8320716179079414e-05, + "loss": 0.9418, + "step": 1382 + }, + { + "epoch": 0.21, + "grad_norm": 3.3020292537179117, + "learning_rate": 1.831796536724619e-05, + "loss": 0.977, + "step": 1383 + }, + { + "epoch": 0.21, + "grad_norm": 3.1556072875509744, + "learning_rate": 1.831521251109036e-05, + "loss": 0.9942, + "step": 1384 + }, + { + "epoch": 0.21, + "grad_norm": 3.4902091848804098, + "learning_rate": 1.8312457611288506e-05, + "loss": 1.038, + "step": 1385 + }, + { + "epoch": 0.21, + "grad_norm": 4.785598451488263, + "learning_rate": 1.8309700668517703e-05, + "loss": 0.9436, + "step": 1386 + }, + { + "epoch": 0.21, + "grad_norm": 3.0068519029089527, + "learning_rate": 1.8306941683455528e-05, + "loss": 1.0645, + "step": 1387 + }, + { + "epoch": 0.21, + "grad_norm": 3.220487909077531, + "learning_rate": 1.8304180656780062e-05, + "loss": 0.9674, + "step": 1388 + }, + { + "epoch": 0.21, + "grad_norm": 3.163243004696232, + "learning_rate": 1.8301417589169898e-05, + "loss": 1.0972, + "step": 1389 + }, + { + "epoch": 0.21, + "grad_norm": 4.4175119770718325, + "learning_rate": 1.8298652481304104e-05, + "loss": 0.8804, + "step": 1390 + }, + { + "epoch": 0.21, + "grad_norm": 3.820628451840013, + "learning_rate": 1.829588533386228e-05, + "loss": 1.0035, + "step": 1391 + }, + { + "epoch": 0.21, + "grad_norm": 3.0343947926471664, + "learning_rate": 1.8293116147524506e-05, + "loss": 0.9804, + "step": 1392 + }, + { + "epoch": 0.21, + "grad_norm": 3.48329283864859, + "learning_rate": 1.829034492297137e-05, + "loss": 1.0318, + "step": 1393 + }, + { + "epoch": 0.21, + "grad_norm": 3.2509945928878103, + "learning_rate": 1.8287571660883967e-05, + "loss": 0.9648, + "step": 1394 + }, + { + "epoch": 0.21, + "grad_norm": 3.3114408715390753, + "learning_rate": 1.828479636194388e-05, + "loss": 1.0345, + "step": 1395 + }, + { + "epoch": 0.21, + "grad_norm": 3.026796233762158, + "learning_rate": 1.8282019026833205e-05, + "loss": 1.0983, + "step": 1396 + }, + { + "epoch": 0.21, + "grad_norm": 3.3602743957091996, + "learning_rate": 1.8279239656234537e-05, + "loss": 0.9512, + "step": 1397 + }, + { + "epoch": 0.21, + "grad_norm": 2.9546018486657926, + "learning_rate": 1.8276458250830954e-05, + "loss": 0.9391, + "step": 1398 + }, + { + "epoch": 0.21, + "grad_norm": 2.8325729704271416, + "learning_rate": 1.8273674811306056e-05, + "loss": 1.0347, + "step": 1399 + }, + { + "epoch": 0.21, + "grad_norm": 3.0823581307912495, + "learning_rate": 1.8270889338343934e-05, + "loss": 0.9359, + "step": 1400 + }, + { + "epoch": 0.21, + "grad_norm": 3.1512300975836927, + "learning_rate": 1.8268101832629177e-05, + "loss": 0.9626, + "step": 1401 + }, + { + "epoch": 0.21, + "grad_norm": 3.3099205822582833, + "learning_rate": 1.826531229484688e-05, + "loss": 0.8663, + "step": 1402 + }, + { + "epoch": 0.21, + "grad_norm": 3.183353819847706, + "learning_rate": 1.8262520725682628e-05, + "loss": 1.008, + "step": 1403 + }, + { + "epoch": 0.21, + "grad_norm": 3.123487813760808, + "learning_rate": 1.8259727125822514e-05, + "loss": 0.9991, + "step": 1404 + }, + { + "epoch": 0.22, + "grad_norm": 3.1737631440311107, + "learning_rate": 1.825693149595312e-05, + "loss": 1.0189, + "step": 1405 + }, + { + "epoch": 0.22, + "grad_norm": 3.2698968208545995, + "learning_rate": 1.8254133836761547e-05, + "loss": 1.0219, + "step": 1406 + }, + { + "epoch": 0.22, + "grad_norm": 2.993678348890785, + "learning_rate": 1.825133414893537e-05, + "loss": 0.978, + "step": 1407 + }, + { + "epoch": 0.22, + "grad_norm": 2.8830861585649417, + "learning_rate": 1.8248532433162672e-05, + "loss": 1.0617, + "step": 1408 + }, + { + "epoch": 0.22, + "grad_norm": 3.5798528339110396, + "learning_rate": 1.8245728690132044e-05, + "loss": 1.0044, + "step": 1409 + }, + { + "epoch": 0.22, + "grad_norm": 3.211449496027972, + "learning_rate": 1.8242922920532566e-05, + "loss": 0.9857, + "step": 1410 + }, + { + "epoch": 0.22, + "grad_norm": 3.243617650417417, + "learning_rate": 1.8240115125053816e-05, + "loss": 0.9601, + "step": 1411 + }, + { + "epoch": 0.22, + "grad_norm": 3.259960752667389, + "learning_rate": 1.823730530438587e-05, + "loss": 1.0182, + "step": 1412 + }, + { + "epoch": 0.22, + "grad_norm": 3.3528873993989965, + "learning_rate": 1.82344934592193e-05, + "loss": 0.9254, + "step": 1413 + }, + { + "epoch": 0.22, + "grad_norm": 3.3073550478896805, + "learning_rate": 1.8231679590245185e-05, + "loss": 0.9833, + "step": 1414 + }, + { + "epoch": 0.22, + "grad_norm": 3.3731207113327057, + "learning_rate": 1.8228863698155095e-05, + "loss": 1.0441, + "step": 1415 + }, + { + "epoch": 0.22, + "grad_norm": 3.4125271371405015, + "learning_rate": 1.8226045783641094e-05, + "loss": 0.9709, + "step": 1416 + }, + { + "epoch": 0.22, + "grad_norm": 3.4114700891629424, + "learning_rate": 1.8223225847395745e-05, + "loss": 1.1116, + "step": 1417 + }, + { + "epoch": 0.22, + "grad_norm": 3.302800754545404, + "learning_rate": 1.822040389011212e-05, + "loss": 1.0117, + "step": 1418 + }, + { + "epoch": 0.22, + "grad_norm": 3.472788092538862, + "learning_rate": 1.8217579912483758e-05, + "loss": 0.9642, + "step": 1419 + }, + { + "epoch": 0.22, + "grad_norm": 3.4919990970322954, + "learning_rate": 1.821475391520473e-05, + "loss": 0.9361, + "step": 1420 + }, + { + "epoch": 0.22, + "grad_norm": 3.578796053182178, + "learning_rate": 1.821192589896958e-05, + "loss": 0.9699, + "step": 1421 + }, + { + "epoch": 0.22, + "grad_norm": 2.943372444254199, + "learning_rate": 1.8209095864473357e-05, + "loss": 0.9645, + "step": 1422 + }, + { + "epoch": 0.22, + "grad_norm": 2.957107692774212, + "learning_rate": 1.82062638124116e-05, + "loss": 1.1112, + "step": 1423 + }, + { + "epoch": 0.22, + "grad_norm": 3.3304564555728913, + "learning_rate": 1.8203429743480356e-05, + "loss": 0.9292, + "step": 1424 + }, + { + "epoch": 0.22, + "grad_norm": 3.4338118492304623, + "learning_rate": 1.8200593658376152e-05, + "loss": 1.0365, + "step": 1425 + }, + { + "epoch": 0.22, + "grad_norm": 3.6715752225401705, + "learning_rate": 1.819775555779602e-05, + "loss": 0.9023, + "step": 1426 + }, + { + "epoch": 0.22, + "grad_norm": 2.9897434792770268, + "learning_rate": 1.8194915442437487e-05, + "loss": 0.9809, + "step": 1427 + }, + { + "epoch": 0.22, + "grad_norm": 3.2133451138736357, + "learning_rate": 1.8192073312998574e-05, + "loss": 1.0375, + "step": 1428 + }, + { + "epoch": 0.22, + "grad_norm": 3.2079646767332197, + "learning_rate": 1.8189229170177797e-05, + "loss": 0.9501, + "step": 1429 + }, + { + "epoch": 0.22, + "grad_norm": 3.5101854117888256, + "learning_rate": 1.8186383014674164e-05, + "loss": 0.9791, + "step": 1430 + }, + { + "epoch": 0.22, + "grad_norm": 3.195330863399731, + "learning_rate": 1.8183534847187184e-05, + "loss": 0.9585, + "step": 1431 + }, + { + "epoch": 0.22, + "grad_norm": 2.942786231263773, + "learning_rate": 1.818068466841685e-05, + "loss": 0.9436, + "step": 1432 + }, + { + "epoch": 0.22, + "grad_norm": 3.380362306882057, + "learning_rate": 1.8177832479063663e-05, + "loss": 1.0471, + "step": 1433 + }, + { + "epoch": 0.22, + "grad_norm": 2.9912397523219942, + "learning_rate": 1.817497827982861e-05, + "loss": 0.9627, + "step": 1434 + }, + { + "epoch": 0.22, + "grad_norm": 3.131458645693922, + "learning_rate": 1.817212207141317e-05, + "loss": 1.0078, + "step": 1435 + }, + { + "epoch": 0.22, + "grad_norm": 3.5044622984176814, + "learning_rate": 1.816926385451932e-05, + "loss": 0.9924, + "step": 1436 + }, + { + "epoch": 0.22, + "grad_norm": 3.2764286835338834, + "learning_rate": 1.8166403629849533e-05, + "loss": 0.9193, + "step": 1437 + }, + { + "epoch": 0.22, + "grad_norm": 2.955062497191095, + "learning_rate": 1.8163541398106765e-05, + "loss": 0.967, + "step": 1438 + }, + { + "epoch": 0.22, + "grad_norm": 3.2063986435021348, + "learning_rate": 1.8160677159994482e-05, + "loss": 0.9361, + "step": 1439 + }, + { + "epoch": 0.22, + "grad_norm": 3.3253753539110837, + "learning_rate": 1.815781091621662e-05, + "loss": 0.9221, + "step": 1440 + }, + { + "epoch": 0.22, + "grad_norm": 3.147461882152192, + "learning_rate": 1.815494266747764e-05, + "loss": 1.0155, + "step": 1441 + }, + { + "epoch": 0.22, + "grad_norm": 3.05539303881804, + "learning_rate": 1.8152072414482456e-05, + "loss": 1.026, + "step": 1442 + }, + { + "epoch": 0.22, + "grad_norm": 3.0298394868604213, + "learning_rate": 1.8149200157936512e-05, + "loss": 1.0316, + "step": 1443 + }, + { + "epoch": 0.22, + "grad_norm": 7.056299294425606, + "learning_rate": 1.814632589854572e-05, + "loss": 1.1486, + "step": 1444 + }, + { + "epoch": 0.22, + "grad_norm": 2.9964798081837896, + "learning_rate": 1.8143449637016495e-05, + "loss": 0.9409, + "step": 1445 + }, + { + "epoch": 0.22, + "grad_norm": 3.500281476121371, + "learning_rate": 1.8140571374055737e-05, + "loss": 0.9786, + "step": 1446 + }, + { + "epoch": 0.22, + "grad_norm": 2.7525323910569925, + "learning_rate": 1.8137691110370852e-05, + "loss": 0.9384, + "step": 1447 + }, + { + "epoch": 0.22, + "grad_norm": 3.005090782342863, + "learning_rate": 1.813480884666972e-05, + "loss": 1.0044, + "step": 1448 + }, + { + "epoch": 0.22, + "grad_norm": 2.996105565791517, + "learning_rate": 1.813192458366072e-05, + "loss": 1.0387, + "step": 1449 + }, + { + "epoch": 0.22, + "grad_norm": 3.2121612732948535, + "learning_rate": 1.8129038322052727e-05, + "loss": 0.9612, + "step": 1450 + }, + { + "epoch": 0.22, + "grad_norm": 3.0154524615068325, + "learning_rate": 1.8126150062555107e-05, + "loss": 0.9509, + "step": 1451 + }, + { + "epoch": 0.22, + "grad_norm": 3.124974923490825, + "learning_rate": 1.81232598058777e-05, + "loss": 0.9746, + "step": 1452 + }, + { + "epoch": 0.22, + "grad_norm": 3.264530323861853, + "learning_rate": 1.8120367552730865e-05, + "loss": 0.9343, + "step": 1453 + }, + { + "epoch": 0.22, + "grad_norm": 3.394005356454698, + "learning_rate": 1.8117473303825426e-05, + "loss": 1.0168, + "step": 1454 + }, + { + "epoch": 0.22, + "grad_norm": 3.0760671123580106, + "learning_rate": 1.811457705987271e-05, + "loss": 1.067, + "step": 1455 + }, + { + "epoch": 0.22, + "grad_norm": 3.196152023485417, + "learning_rate": 1.811167882158454e-05, + "loss": 0.9804, + "step": 1456 + }, + { + "epoch": 0.22, + "grad_norm": 3.5596382958999953, + "learning_rate": 1.8108778589673216e-05, + "loss": 1.0635, + "step": 1457 + }, + { + "epoch": 0.22, + "grad_norm": 3.116134009243598, + "learning_rate": 1.810587636485153e-05, + "loss": 1.1408, + "step": 1458 + }, + { + "epoch": 0.22, + "grad_norm": 3.0966651802438867, + "learning_rate": 1.8102972147832775e-05, + "loss": 0.9698, + "step": 1459 + }, + { + "epoch": 0.22, + "grad_norm": 2.9877198835598753, + "learning_rate": 1.8100065939330716e-05, + "loss": 0.9354, + "step": 1460 + }, + { + "epoch": 0.22, + "grad_norm": 3.0404801902248355, + "learning_rate": 1.809715774005963e-05, + "loss": 1.0685, + "step": 1461 + }, + { + "epoch": 0.22, + "grad_norm": 2.948143128537915, + "learning_rate": 1.8094247550734262e-05, + "loss": 1.0719, + "step": 1462 + }, + { + "epoch": 0.22, + "grad_norm": 3.066447715290956, + "learning_rate": 1.8091335372069857e-05, + "loss": 0.9708, + "step": 1463 + }, + { + "epoch": 0.22, + "grad_norm": 3.115030578189605, + "learning_rate": 1.8088421204782153e-05, + "loss": 1.0377, + "step": 1464 + }, + { + "epoch": 0.22, + "grad_norm": 3.1425095469293893, + "learning_rate": 1.808550504958736e-05, + "loss": 0.9705, + "step": 1465 + }, + { + "epoch": 0.22, + "grad_norm": 3.244217587585244, + "learning_rate": 1.8082586907202194e-05, + "loss": 0.9619, + "step": 1466 + }, + { + "epoch": 0.22, + "grad_norm": 3.20412918400006, + "learning_rate": 1.8079666778343853e-05, + "loss": 1.0447, + "step": 1467 + }, + { + "epoch": 0.22, + "grad_norm": 4.459745892769258, + "learning_rate": 1.8076744663730016e-05, + "loss": 0.9423, + "step": 1468 + }, + { + "epoch": 0.22, + "grad_norm": 3.138069027731141, + "learning_rate": 1.8073820564078865e-05, + "loss": 1.0486, + "step": 1469 + }, + { + "epoch": 0.23, + "grad_norm": 3.425622599914716, + "learning_rate": 1.8070894480109056e-05, + "loss": 0.8424, + "step": 1470 + }, + { + "epoch": 0.23, + "grad_norm": 3.2962585539079887, + "learning_rate": 1.806796641253974e-05, + "loss": 1.0893, + "step": 1471 + }, + { + "epoch": 0.23, + "grad_norm": 3.2303926782782053, + "learning_rate": 1.8065036362090555e-05, + "loss": 0.975, + "step": 1472 + }, + { + "epoch": 0.23, + "grad_norm": 9.037522825764029, + "learning_rate": 1.806210432948163e-05, + "loss": 1.1881, + "step": 1473 + }, + { + "epoch": 0.23, + "grad_norm": 3.288380690108407, + "learning_rate": 1.8059170315433565e-05, + "loss": 0.9964, + "step": 1474 + }, + { + "epoch": 0.23, + "grad_norm": 2.920521462332691, + "learning_rate": 1.8056234320667465e-05, + "loss": 0.9707, + "step": 1475 + }, + { + "epoch": 0.23, + "grad_norm": 5.257974512775954, + "learning_rate": 1.8053296345904913e-05, + "loss": 1.1267, + "step": 1476 + }, + { + "epoch": 0.23, + "grad_norm": 3.307161089261329, + "learning_rate": 1.8050356391867988e-05, + "loss": 0.9739, + "step": 1477 + }, + { + "epoch": 0.23, + "grad_norm": 3.24739860932814, + "learning_rate": 1.804741445927924e-05, + "loss": 0.8637, + "step": 1478 + }, + { + "epoch": 0.23, + "grad_norm": 2.9989659358800385, + "learning_rate": 1.8044470548861715e-05, + "loss": 0.9047, + "step": 1479 + }, + { + "epoch": 0.23, + "grad_norm": 3.10437368249323, + "learning_rate": 1.8041524661338943e-05, + "loss": 0.9233, + "step": 1480 + }, + { + "epoch": 0.23, + "grad_norm": 2.794739597626496, + "learning_rate": 1.8038576797434945e-05, + "loss": 0.827, + "step": 1481 + }, + { + "epoch": 0.23, + "grad_norm": 3.1128014833427837, + "learning_rate": 1.803562695787422e-05, + "loss": 0.9067, + "step": 1482 + }, + { + "epoch": 0.23, + "grad_norm": 2.9988136690702203, + "learning_rate": 1.8032675143381756e-05, + "loss": 0.9164, + "step": 1483 + }, + { + "epoch": 0.23, + "grad_norm": 3.011789954198862, + "learning_rate": 1.8029721354683024e-05, + "loss": 0.8821, + "step": 1484 + }, + { + "epoch": 0.23, + "grad_norm": 3.1678259833469715, + "learning_rate": 1.802676559250399e-05, + "loss": 0.9442, + "step": 1485 + }, + { + "epoch": 0.23, + "grad_norm": 3.186525158553192, + "learning_rate": 1.8023807857571092e-05, + "loss": 0.9632, + "step": 1486 + }, + { + "epoch": 0.23, + "grad_norm": 3.397793056047527, + "learning_rate": 1.8020848150611257e-05, + "loss": 0.9299, + "step": 1487 + }, + { + "epoch": 0.23, + "grad_norm": 3.1337686121653165, + "learning_rate": 1.8017886472351898e-05, + "loss": 1.0272, + "step": 1488 + }, + { + "epoch": 0.23, + "grad_norm": 3.2023763342435103, + "learning_rate": 1.8014922823520918e-05, + "loss": 1.0095, + "step": 1489 + }, + { + "epoch": 0.23, + "grad_norm": 3.179613140976094, + "learning_rate": 1.8011957204846694e-05, + "loss": 0.9999, + "step": 1490 + }, + { + "epoch": 0.23, + "grad_norm": 2.9569570559790943, + "learning_rate": 1.8008989617058088e-05, + "loss": 0.9663, + "step": 1491 + }, + { + "epoch": 0.23, + "grad_norm": 2.819221986668142, + "learning_rate": 1.800602006088446e-05, + "loss": 0.9449, + "step": 1492 + }, + { + "epoch": 0.23, + "grad_norm": 3.0987259678730616, + "learning_rate": 1.8003048537055634e-05, + "loss": 0.9663, + "step": 1493 + }, + { + "epoch": 0.23, + "grad_norm": 3.3105056929517525, + "learning_rate": 1.8000075046301937e-05, + "loss": 0.971, + "step": 1494 + }, + { + "epoch": 0.23, + "grad_norm": 3.0776614929117736, + "learning_rate": 1.7997099589354162e-05, + "loss": 0.9827, + "step": 1495 + }, + { + "epoch": 0.23, + "grad_norm": 2.808460903546118, + "learning_rate": 1.799412216694359e-05, + "loss": 0.8083, + "step": 1496 + }, + { + "epoch": 0.23, + "grad_norm": 4.12190977184157, + "learning_rate": 1.7991142779801997e-05, + "loss": 0.9388, + "step": 1497 + }, + { + "epoch": 0.23, + "grad_norm": 3.4668326908121068, + "learning_rate": 1.798816142866163e-05, + "loss": 1.0586, + "step": 1498 + }, + { + "epoch": 0.23, + "grad_norm": 3.0588969927516345, + "learning_rate": 1.798517811425522e-05, + "loss": 0.9958, + "step": 1499 + }, + { + "epoch": 0.23, + "grad_norm": 3.0271086429161507, + "learning_rate": 1.798219283731598e-05, + "loss": 1.0154, + "step": 1500 + }, + { + "epoch": 0.23, + "grad_norm": 3.113264536411894, + "learning_rate": 1.797920559857761e-05, + "loss": 0.9389, + "step": 1501 + }, + { + "epoch": 0.23, + "grad_norm": 3.181762870336013, + "learning_rate": 1.7976216398774292e-05, + "loss": 1.0066, + "step": 1502 + }, + { + "epoch": 0.23, + "grad_norm": 2.957030913988016, + "learning_rate": 1.7973225238640682e-05, + "loss": 1.0518, + "step": 1503 + }, + { + "epoch": 0.23, + "grad_norm": 3.1225430695720218, + "learning_rate": 1.7970232118911927e-05, + "loss": 0.954, + "step": 1504 + }, + { + "epoch": 0.23, + "grad_norm": 2.8346355377657373, + "learning_rate": 1.796723704032365e-05, + "loss": 1.0051, + "step": 1505 + }, + { + "epoch": 0.23, + "grad_norm": 3.222418799430063, + "learning_rate": 1.7964240003611958e-05, + "loss": 0.9165, + "step": 1506 + }, + { + "epoch": 0.23, + "grad_norm": 2.943155474907022, + "learning_rate": 1.7961241009513436e-05, + "loss": 0.9589, + "step": 1507 + }, + { + "epoch": 0.23, + "grad_norm": 10.091694549571042, + "learning_rate": 1.795824005876516e-05, + "loss": 1.2324, + "step": 1508 + }, + { + "epoch": 0.23, + "grad_norm": 3.289114095658659, + "learning_rate": 1.7955237152104673e-05, + "loss": 0.9287, + "step": 1509 + }, + { + "epoch": 0.23, + "grad_norm": 3.59972267883229, + "learning_rate": 1.7952232290270007e-05, + "loss": 0.9872, + "step": 1510 + }, + { + "epoch": 0.23, + "grad_norm": 3.5090128528213906, + "learning_rate": 1.794922547399967e-05, + "loss": 0.9683, + "step": 1511 + }, + { + "epoch": 0.23, + "grad_norm": 3.1106921697068057, + "learning_rate": 1.7946216704032662e-05, + "loss": 0.9422, + "step": 1512 + }, + { + "epoch": 0.23, + "grad_norm": 3.0391493782354364, + "learning_rate": 1.7943205981108442e-05, + "loss": 0.9405, + "step": 1513 + }, + { + "epoch": 0.23, + "grad_norm": 2.8944574754838523, + "learning_rate": 1.794019330596697e-05, + "loss": 0.9658, + "step": 1514 + }, + { + "epoch": 0.23, + "grad_norm": 3.0285972606047755, + "learning_rate": 1.7937178679348675e-05, + "loss": 0.9986, + "step": 1515 + }, + { + "epoch": 0.23, + "grad_norm": 3.0896134059161215, + "learning_rate": 1.7934162101994464e-05, + "loss": 0.8707, + "step": 1516 + }, + { + "epoch": 0.23, + "grad_norm": 3.3401810543095163, + "learning_rate": 1.7931143574645735e-05, + "loss": 1.0286, + "step": 1517 + }, + { + "epoch": 0.23, + "grad_norm": 3.711419672994701, + "learning_rate": 1.7928123098044353e-05, + "loss": 0.9151, + "step": 1518 + }, + { + "epoch": 0.23, + "grad_norm": 3.2190767791084656, + "learning_rate": 1.792510067293266e-05, + "loss": 1.0052, + "step": 1519 + }, + { + "epoch": 0.23, + "grad_norm": 3.570504511484273, + "learning_rate": 1.7922076300053493e-05, + "loss": 0.9412, + "step": 1520 + }, + { + "epoch": 0.23, + "grad_norm": 3.212523605322943, + "learning_rate": 1.7919049980150155e-05, + "loss": 1.0195, + "step": 1521 + }, + { + "epoch": 0.23, + "grad_norm": 3.3752111549249815, + "learning_rate": 1.791602171396643e-05, + "loss": 0.9564, + "step": 1522 + }, + { + "epoch": 0.23, + "grad_norm": 2.8989772659100908, + "learning_rate": 1.7912991502246578e-05, + "loss": 0.9777, + "step": 1523 + }, + { + "epoch": 0.23, + "grad_norm": 3.0079429607041592, + "learning_rate": 1.7909959345735346e-05, + "loss": 0.9082, + "step": 1524 + }, + { + "epoch": 0.23, + "grad_norm": 3.3863097419634585, + "learning_rate": 1.790692524517795e-05, + "loss": 0.9536, + "step": 1525 + }, + { + "epoch": 0.23, + "grad_norm": 3.2678279877371126, + "learning_rate": 1.7903889201320083e-05, + "loss": 0.9782, + "step": 1526 + }, + { + "epoch": 0.23, + "grad_norm": 3.219374266336708, + "learning_rate": 1.7900851214907925e-05, + "loss": 0.9392, + "step": 1527 + }, + { + "epoch": 0.23, + "grad_norm": 3.04359995207335, + "learning_rate": 1.789781128668813e-05, + "loss": 1.068, + "step": 1528 + }, + { + "epoch": 0.23, + "grad_norm": 3.190840020564337, + "learning_rate": 1.789476941740782e-05, + "loss": 0.9424, + "step": 1529 + }, + { + "epoch": 0.23, + "grad_norm": 4.107360791628095, + "learning_rate": 1.7891725607814602e-05, + "loss": 0.9268, + "step": 1530 + }, + { + "epoch": 0.23, + "grad_norm": 2.8123151853339428, + "learning_rate": 1.7888679858656566e-05, + "loss": 0.909, + "step": 1531 + }, + { + "epoch": 0.23, + "grad_norm": 2.9281300713797394, + "learning_rate": 1.7885632170682266e-05, + "loss": 0.9875, + "step": 1532 + }, + { + "epoch": 0.23, + "grad_norm": 3.1581987594287066, + "learning_rate": 1.7882582544640736e-05, + "loss": 0.8797, + "step": 1533 + }, + { + "epoch": 0.23, + "grad_norm": 2.955887862870306, + "learning_rate": 1.7879530981281492e-05, + "loss": 0.956, + "step": 1534 + }, + { + "epoch": 0.23, + "grad_norm": 2.9509542667470274, + "learning_rate": 1.7876477481354527e-05, + "loss": 1.0239, + "step": 1535 + }, + { + "epoch": 0.24, + "grad_norm": 2.543176942825485, + "learning_rate": 1.78734220456103e-05, + "loss": 0.8289, + "step": 1536 + }, + { + "epoch": 0.24, + "grad_norm": 3.1781665264428587, + "learning_rate": 1.787036467479975e-05, + "loss": 0.9332, + "step": 1537 + }, + { + "epoch": 0.24, + "grad_norm": 3.009086672085074, + "learning_rate": 1.7867305369674297e-05, + "loss": 1.0283, + "step": 1538 + }, + { + "epoch": 0.24, + "grad_norm": 3.001359683433981, + "learning_rate": 1.7864244130985827e-05, + "loss": 0.9153, + "step": 1539 + }, + { + "epoch": 0.24, + "grad_norm": 3.5693255061400704, + "learning_rate": 1.7861180959486718e-05, + "loss": 1.0207, + "step": 1540 + }, + { + "epoch": 0.24, + "grad_norm": 3.1355580568313433, + "learning_rate": 1.7858115855929797e-05, + "loss": 0.8707, + "step": 1541 + }, + { + "epoch": 0.24, + "grad_norm": 7.535492268525197, + "learning_rate": 1.785504882106839e-05, + "loss": 1.1984, + "step": 1542 + }, + { + "epoch": 0.24, + "grad_norm": 3.2349686394576134, + "learning_rate": 1.7851979855656292e-05, + "loss": 1.023, + "step": 1543 + }, + { + "epoch": 0.24, + "grad_norm": 3.128701067205983, + "learning_rate": 1.784890896044776e-05, + "loss": 0.931, + "step": 1544 + }, + { + "epoch": 0.24, + "grad_norm": 3.1756175385193623, + "learning_rate": 1.784583613619753e-05, + "loss": 1.0761, + "step": 1545 + }, + { + "epoch": 0.24, + "grad_norm": 3.2455472161944567, + "learning_rate": 1.784276138366083e-05, + "loss": 0.9923, + "step": 1546 + }, + { + "epoch": 0.24, + "grad_norm": 3.372209180487672, + "learning_rate": 1.7839684703593333e-05, + "loss": 0.9046, + "step": 1547 + }, + { + "epoch": 0.24, + "grad_norm": 3.7337452784886636, + "learning_rate": 1.7836606096751215e-05, + "loss": 0.9483, + "step": 1548 + }, + { + "epoch": 0.24, + "grad_norm": 4.047376458044384, + "learning_rate": 1.78335255638911e-05, + "loss": 0.9749, + "step": 1549 + }, + { + "epoch": 0.24, + "grad_norm": 2.9377508290351053, + "learning_rate": 1.7830443105770104e-05, + "loss": 0.9275, + "step": 1550 + }, + { + "epoch": 0.24, + "grad_norm": 3.3213718081164423, + "learning_rate": 1.7827358723145803e-05, + "loss": 1.0016, + "step": 1551 + }, + { + "epoch": 0.24, + "grad_norm": 3.125648807004082, + "learning_rate": 1.7824272416776255e-05, + "loss": 0.9499, + "step": 1552 + }, + { + "epoch": 0.24, + "grad_norm": 2.992772454537541, + "learning_rate": 1.7821184187419986e-05, + "loss": 1.0947, + "step": 1553 + }, + { + "epoch": 0.24, + "grad_norm": 3.3498984461989743, + "learning_rate": 1.7818094035835997e-05, + "loss": 0.977, + "step": 1554 + }, + { + "epoch": 0.24, + "grad_norm": 9.808823836859759, + "learning_rate": 1.781500196278376e-05, + "loss": 1.1493, + "step": 1555 + }, + { + "epoch": 0.24, + "grad_norm": 2.7414312834078487, + "learning_rate": 1.7811907969023226e-05, + "loss": 0.9074, + "step": 1556 + }, + { + "epoch": 0.24, + "grad_norm": 3.096725210487838, + "learning_rate": 1.7808812055314803e-05, + "loss": 1.0496, + "step": 1557 + }, + { + "epoch": 0.24, + "grad_norm": 3.1454246260790453, + "learning_rate": 1.7805714222419383e-05, + "loss": 0.9642, + "step": 1558 + }, + { + "epoch": 0.24, + "grad_norm": 4.39456934603716, + "learning_rate": 1.7802614471098325e-05, + "loss": 0.9703, + "step": 1559 + }, + { + "epoch": 0.24, + "grad_norm": 2.8039173071331853, + "learning_rate": 1.7799512802113463e-05, + "loss": 0.8879, + "step": 1560 + }, + { + "epoch": 0.24, + "grad_norm": 2.9674655561333063, + "learning_rate": 1.77964092162271e-05, + "loss": 1.0401, + "step": 1561 + }, + { + "epoch": 0.24, + "grad_norm": 2.9232148337048396, + "learning_rate": 1.7793303714202012e-05, + "loss": 0.941, + "step": 1562 + }, + { + "epoch": 0.24, + "grad_norm": 2.9934636053586705, + "learning_rate": 1.7790196296801443e-05, + "loss": 0.9383, + "step": 1563 + }, + { + "epoch": 0.24, + "grad_norm": 2.9223907005030956, + "learning_rate": 1.7787086964789107e-05, + "loss": 0.9159, + "step": 1564 + }, + { + "epoch": 0.24, + "grad_norm": 3.032596903774553, + "learning_rate": 1.7783975718929188e-05, + "loss": 0.8643, + "step": 1565 + }, + { + "epoch": 0.24, + "grad_norm": 2.829783049588635, + "learning_rate": 1.7780862559986353e-05, + "loss": 0.9304, + "step": 1566 + }, + { + "epoch": 0.24, + "grad_norm": 3.371369832138475, + "learning_rate": 1.777774748872572e-05, + "loss": 1.0539, + "step": 1567 + }, + { + "epoch": 0.24, + "grad_norm": 3.098014467231705, + "learning_rate": 1.7774630505912894e-05, + "loss": 1.0586, + "step": 1568 + }, + { + "epoch": 0.24, + "grad_norm": 6.854963050985832, + "learning_rate": 1.7771511612313932e-05, + "loss": 1.0926, + "step": 1569 + }, + { + "epoch": 0.24, + "grad_norm": 3.03748855360975, + "learning_rate": 1.7768390808695376e-05, + "loss": 1.0332, + "step": 1570 + }, + { + "epoch": 0.24, + "grad_norm": 3.2395041609630377, + "learning_rate": 1.7765268095824234e-05, + "loss": 0.8884, + "step": 1571 + }, + { + "epoch": 0.24, + "grad_norm": 3.5766715300588996, + "learning_rate": 1.776214347446798e-05, + "loss": 1.0497, + "step": 1572 + }, + { + "epoch": 0.24, + "grad_norm": 3.115277159581461, + "learning_rate": 1.7759016945394554e-05, + "loss": 0.9077, + "step": 1573 + }, + { + "epoch": 0.24, + "grad_norm": 2.9805465907515285, + "learning_rate": 1.7755888509372378e-05, + "loss": 0.9555, + "step": 1574 + }, + { + "epoch": 0.24, + "grad_norm": 2.8237610297834492, + "learning_rate": 1.7752758167170327e-05, + "loss": 0.8722, + "step": 1575 + }, + { + "epoch": 0.24, + "grad_norm": 3.346858502108061, + "learning_rate": 1.774962591955775e-05, + "loss": 0.9513, + "step": 1576 + }, + { + "epoch": 0.24, + "grad_norm": 6.972990157972103, + "learning_rate": 1.774649176730447e-05, + "loss": 1.0721, + "step": 1577 + }, + { + "epoch": 0.24, + "grad_norm": 3.475587952247821, + "learning_rate": 1.7743355711180772e-05, + "loss": 1.0924, + "step": 1578 + }, + { + "epoch": 0.24, + "grad_norm": 3.3373101780404073, + "learning_rate": 1.774021775195741e-05, + "loss": 0.9527, + "step": 1579 + }, + { + "epoch": 0.24, + "grad_norm": 3.680026816225568, + "learning_rate": 1.773707789040561e-05, + "loss": 1.0351, + "step": 1580 + }, + { + "epoch": 0.24, + "grad_norm": 3.20231272940428, + "learning_rate": 1.7733936127297053e-05, + "loss": 0.9025, + "step": 1581 + }, + { + "epoch": 0.24, + "grad_norm": 3.135934758066214, + "learning_rate": 1.7730792463403907e-05, + "loss": 0.9604, + "step": 1582 + }, + { + "epoch": 0.24, + "grad_norm": 3.048731265002102, + "learning_rate": 1.772764689949879e-05, + "loss": 0.9117, + "step": 1583 + }, + { + "epoch": 0.24, + "grad_norm": 3.067612401660131, + "learning_rate": 1.7724499436354796e-05, + "loss": 0.9069, + "step": 1584 + }, + { + "epoch": 0.24, + "grad_norm": 3.3240696511439345, + "learning_rate": 1.7721350074745483e-05, + "loss": 0.8682, + "step": 1585 + }, + { + "epoch": 0.24, + "grad_norm": 3.2752756682214947, + "learning_rate": 1.771819881544487e-05, + "loss": 1.0783, + "step": 1586 + }, + { + "epoch": 0.24, + "grad_norm": 2.882160412513278, + "learning_rate": 1.771504565922746e-05, + "loss": 0.9463, + "step": 1587 + }, + { + "epoch": 0.24, + "grad_norm": 2.9978510227502273, + "learning_rate": 1.7711890606868202e-05, + "loss": 0.9297, + "step": 1588 + }, + { + "epoch": 0.24, + "grad_norm": 2.9646358151861225, + "learning_rate": 1.770873365914252e-05, + "loss": 0.923, + "step": 1589 + }, + { + "epoch": 0.24, + "grad_norm": 3.1091547510176922, + "learning_rate": 1.77055748168263e-05, + "loss": 0.9384, + "step": 1590 + }, + { + "epoch": 0.24, + "grad_norm": 3.1877979522883124, + "learning_rate": 1.7702414080695907e-05, + "loss": 1.0714, + "step": 1591 + }, + { + "epoch": 0.24, + "grad_norm": 3.2640181587855484, + "learning_rate": 1.7699251451528152e-05, + "loss": 1.0194, + "step": 1592 + }, + { + "epoch": 0.24, + "grad_norm": 2.925145799537389, + "learning_rate": 1.7696086930100323e-05, + "loss": 0.8909, + "step": 1593 + }, + { + "epoch": 0.24, + "grad_norm": 3.133756814413861, + "learning_rate": 1.7692920517190175e-05, + "loss": 0.9, + "step": 1594 + }, + { + "epoch": 0.24, + "grad_norm": 3.2710171605177805, + "learning_rate": 1.768975221357592e-05, + "loss": 0.9672, + "step": 1595 + }, + { + "epoch": 0.24, + "grad_norm": 3.068629248668608, + "learning_rate": 1.7686582020036234e-05, + "loss": 0.9769, + "step": 1596 + }, + { + "epoch": 0.24, + "grad_norm": 2.830114814388996, + "learning_rate": 1.7683409937350267e-05, + "loss": 0.9226, + "step": 1597 + }, + { + "epoch": 0.24, + "grad_norm": 3.052722151159689, + "learning_rate": 1.768023596629763e-05, + "loss": 0.9782, + "step": 1598 + }, + { + "epoch": 0.24, + "grad_norm": 3.181907971299097, + "learning_rate": 1.7677060107658387e-05, + "loss": 0.94, + "step": 1599 + }, + { + "epoch": 0.24, + "grad_norm": 3.0584100002745003, + "learning_rate": 1.7673882362213085e-05, + "loss": 0.9347, + "step": 1600 + }, + { + "epoch": 0.25, + "grad_norm": 3.0730524882941292, + "learning_rate": 1.7670702730742722e-05, + "loss": 0.9766, + "step": 1601 + }, + { + "epoch": 0.25, + "grad_norm": 2.743519182081731, + "learning_rate": 1.7667521214028757e-05, + "loss": 0.8648, + "step": 1602 + }, + { + "epoch": 0.25, + "grad_norm": 3.0055379073044355, + "learning_rate": 1.7664337812853122e-05, + "loss": 0.9404, + "step": 1603 + }, + { + "epoch": 0.25, + "grad_norm": 3.1226930811580136, + "learning_rate": 1.766115252799821e-05, + "loss": 1.0251, + "step": 1604 + }, + { + "epoch": 0.25, + "grad_norm": 3.2624488224353567, + "learning_rate": 1.7657965360246867e-05, + "loss": 0.9208, + "step": 1605 + }, + { + "epoch": 0.25, + "grad_norm": 3.0925944435585944, + "learning_rate": 1.7654776310382417e-05, + "loss": 0.8821, + "step": 1606 + }, + { + "epoch": 0.25, + "grad_norm": 3.3295149139126234, + "learning_rate": 1.7651585379188635e-05, + "loss": 1.0782, + "step": 1607 + }, + { + "epoch": 0.25, + "grad_norm": 2.903090564857671, + "learning_rate": 1.7648392567449764e-05, + "loss": 0.9306, + "step": 1608 + }, + { + "epoch": 0.25, + "grad_norm": 2.981070901378658, + "learning_rate": 1.7645197875950507e-05, + "loss": 1.0098, + "step": 1609 + }, + { + "epoch": 0.25, + "grad_norm": 3.2721472207784768, + "learning_rate": 1.764200130547603e-05, + "loss": 1.0286, + "step": 1610 + }, + { + "epoch": 0.25, + "grad_norm": 3.0694378697061078, + "learning_rate": 1.763880285681196e-05, + "loss": 1.0652, + "step": 1611 + }, + { + "epoch": 0.25, + "grad_norm": 3.6228799725758836, + "learning_rate": 1.7635602530744387e-05, + "loss": 0.9625, + "step": 1612 + }, + { + "epoch": 0.25, + "grad_norm": 7.298349477713538, + "learning_rate": 1.763240032805986e-05, + "loss": 1.1157, + "step": 1613 + }, + { + "epoch": 0.25, + "grad_norm": 3.4453013308981912, + "learning_rate": 1.762919624954539e-05, + "loss": 1.0214, + "step": 1614 + }, + { + "epoch": 0.25, + "grad_norm": 3.025070577653514, + "learning_rate": 1.762599029598845e-05, + "loss": 0.9589, + "step": 1615 + }, + { + "epoch": 0.25, + "grad_norm": 3.233682546527499, + "learning_rate": 1.7622782468176974e-05, + "loss": 1.036, + "step": 1616 + }, + { + "epoch": 0.25, + "grad_norm": 3.211561445588491, + "learning_rate": 1.761957276689936e-05, + "loss": 0.9992, + "step": 1617 + }, + { + "epoch": 0.25, + "grad_norm": 3.277215642818311, + "learning_rate": 1.761636119294446e-05, + "loss": 0.9019, + "step": 1618 + }, + { + "epoch": 0.25, + "grad_norm": 6.402280666544251, + "learning_rate": 1.761314774710158e-05, + "loss": 1.0616, + "step": 1619 + }, + { + "epoch": 0.25, + "grad_norm": 3.736470200541887, + "learning_rate": 1.760993243016051e-05, + "loss": 0.92, + "step": 1620 + }, + { + "epoch": 0.25, + "grad_norm": 2.940226552076836, + "learning_rate": 1.7606715242911468e-05, + "loss": 0.8812, + "step": 1621 + }, + { + "epoch": 0.25, + "grad_norm": 3.1723309239716233, + "learning_rate": 1.7603496186145168e-05, + "loss": 0.8744, + "step": 1622 + }, + { + "epoch": 0.25, + "grad_norm": 3.191071987973035, + "learning_rate": 1.7600275260652746e-05, + "loss": 0.9277, + "step": 1623 + }, + { + "epoch": 0.25, + "grad_norm": 3.239417253255005, + "learning_rate": 1.7597052467225827e-05, + "loss": 0.9204, + "step": 1624 + }, + { + "epoch": 0.25, + "grad_norm": 3.192265372865029, + "learning_rate": 1.7593827806656477e-05, + "loss": 0.9358, + "step": 1625 + }, + { + "epoch": 0.25, + "grad_norm": 3.0184333324198205, + "learning_rate": 1.7590601279737232e-05, + "loss": 0.9797, + "step": 1626 + }, + { + "epoch": 0.25, + "grad_norm": 3.279109165748821, + "learning_rate": 1.7587372887261077e-05, + "loss": 1.0752, + "step": 1627 + }, + { + "epoch": 0.25, + "grad_norm": 2.6953986178070153, + "learning_rate": 1.7584142630021458e-05, + "loss": 0.9401, + "step": 1628 + }, + { + "epoch": 0.25, + "grad_norm": 3.172191903175026, + "learning_rate": 1.758091050881229e-05, + "loss": 1.0515, + "step": 1629 + }, + { + "epoch": 0.25, + "grad_norm": 3.1727497820346677, + "learning_rate": 1.7577676524427934e-05, + "loss": 1.0884, + "step": 1630 + }, + { + "epoch": 0.25, + "grad_norm": 3.1850780747179583, + "learning_rate": 1.7574440677663212e-05, + "loss": 1.1565, + "step": 1631 + }, + { + "epoch": 0.25, + "grad_norm": 3.02776283992973, + "learning_rate": 1.7571202969313403e-05, + "loss": 1.0804, + "step": 1632 + }, + { + "epoch": 0.25, + "grad_norm": 4.1377205276174385, + "learning_rate": 1.7567963400174245e-05, + "loss": 0.9776, + "step": 1633 + }, + { + "epoch": 0.25, + "grad_norm": 3.2465903735570545, + "learning_rate": 1.7564721971041937e-05, + "loss": 0.8512, + "step": 1634 + }, + { + "epoch": 0.25, + "grad_norm": 3.453705119144487, + "learning_rate": 1.756147868271313e-05, + "loss": 0.9562, + "step": 1635 + }, + { + "epoch": 0.25, + "grad_norm": 3.1922476204360235, + "learning_rate": 1.7558233535984933e-05, + "loss": 0.9605, + "step": 1636 + }, + { + "epoch": 0.25, + "grad_norm": 3.700710967013734, + "learning_rate": 1.755498653165491e-05, + "loss": 1.0348, + "step": 1637 + }, + { + "epoch": 0.25, + "grad_norm": 3.2615697733633113, + "learning_rate": 1.7551737670521084e-05, + "loss": 1.0719, + "step": 1638 + }, + { + "epoch": 0.25, + "grad_norm": 3.1274414203254888, + "learning_rate": 1.7548486953381937e-05, + "loss": 0.9591, + "step": 1639 + }, + { + "epoch": 0.25, + "grad_norm": 3.1152780749906666, + "learning_rate": 1.75452343810364e-05, + "loss": 0.9675, + "step": 1640 + }, + { + "epoch": 0.25, + "grad_norm": 3.0035425673242537, + "learning_rate": 1.7541979954283864e-05, + "loss": 1.0109, + "step": 1641 + }, + { + "epoch": 0.25, + "grad_norm": 3.077978228296154, + "learning_rate": 1.7538723673924185e-05, + "loss": 0.922, + "step": 1642 + }, + { + "epoch": 0.25, + "grad_norm": 2.739028913696504, + "learning_rate": 1.7535465540757654e-05, + "loss": 0.8579, + "step": 1643 + }, + { + "epoch": 0.25, + "grad_norm": 3.3548693985388605, + "learning_rate": 1.7532205555585032e-05, + "loss": 0.8721, + "step": 1644 + }, + { + "epoch": 0.25, + "grad_norm": 3.1780558911413643, + "learning_rate": 1.7528943719207536e-05, + "loss": 0.9789, + "step": 1645 + }, + { + "epoch": 0.25, + "grad_norm": 8.872343924062486, + "learning_rate": 1.752568003242683e-05, + "loss": 1.1482, + "step": 1646 + }, + { + "epoch": 0.25, + "grad_norm": 2.722081445165806, + "learning_rate": 1.7522414496045037e-05, + "loss": 0.9274, + "step": 1647 + }, + { + "epoch": 0.25, + "grad_norm": 3.109960299502892, + "learning_rate": 1.7519147110864736e-05, + "loss": 1.0573, + "step": 1648 + }, + { + "epoch": 0.25, + "grad_norm": 2.9004201318080516, + "learning_rate": 1.7515877877688957e-05, + "loss": 0.912, + "step": 1649 + }, + { + "epoch": 0.25, + "grad_norm": 3.069111610446601, + "learning_rate": 1.7512606797321185e-05, + "loss": 1.1247, + "step": 1650 + }, + { + "epoch": 0.25, + "grad_norm": 2.716118441546312, + "learning_rate": 1.7509333870565364e-05, + "loss": 0.9329, + "step": 1651 + }, + { + "epoch": 0.25, + "grad_norm": 3.1907520999620735, + "learning_rate": 1.7506059098225884e-05, + "loss": 0.9952, + "step": 1652 + }, + { + "epoch": 0.25, + "grad_norm": 3.0379065945927852, + "learning_rate": 1.750278248110759e-05, + "loss": 0.9498, + "step": 1653 + }, + { + "epoch": 0.25, + "grad_norm": 3.037290982411241, + "learning_rate": 1.749950402001579e-05, + "loss": 0.9708, + "step": 1654 + }, + { + "epoch": 0.25, + "grad_norm": 3.107830375806654, + "learning_rate": 1.749622371575623e-05, + "loss": 1.0052, + "step": 1655 + }, + { + "epoch": 0.25, + "grad_norm": 2.839118227383374, + "learning_rate": 1.749294156913512e-05, + "loss": 0.9887, + "step": 1656 + }, + { + "epoch": 0.25, + "grad_norm": 3.296029959912079, + "learning_rate": 1.748965758095912e-05, + "loss": 0.9337, + "step": 1657 + }, + { + "epoch": 0.25, + "grad_norm": 3.0948027241708327, + "learning_rate": 1.7486371752035346e-05, + "loss": 0.9437, + "step": 1658 + }, + { + "epoch": 0.25, + "grad_norm": 3.095667626766412, + "learning_rate": 1.7483084083171353e-05, + "loss": 0.9261, + "step": 1659 + }, + { + "epoch": 0.25, + "grad_norm": 2.9205509097298443, + "learning_rate": 1.7479794575175167e-05, + "loss": 0.9924, + "step": 1660 + }, + { + "epoch": 0.25, + "grad_norm": 2.982852037709161, + "learning_rate": 1.7476503228855254e-05, + "loss": 0.9751, + "step": 1661 + }, + { + "epoch": 0.25, + "grad_norm": 3.0325740886574923, + "learning_rate": 1.747321004502053e-05, + "loss": 0.9428, + "step": 1662 + }, + { + "epoch": 0.25, + "grad_norm": 9.321678349275457, + "learning_rate": 1.746991502448037e-05, + "loss": 1.2062, + "step": 1663 + }, + { + "epoch": 0.25, + "grad_norm": 3.207975724754327, + "learning_rate": 1.7466618168044604e-05, + "loss": 0.9685, + "step": 1664 + }, + { + "epoch": 0.25, + "grad_norm": 2.972145230828689, + "learning_rate": 1.74633194765235e-05, + "loss": 0.9395, + "step": 1665 + }, + { + "epoch": 0.26, + "grad_norm": 3.0197396910817815, + "learning_rate": 1.746001895072778e-05, + "loss": 1.0408, + "step": 1666 + }, + { + "epoch": 0.26, + "grad_norm": 3.391534577184481, + "learning_rate": 1.7456716591468632e-05, + "loss": 0.8894, + "step": 1667 + }, + { + "epoch": 0.26, + "grad_norm": 2.6510303451802515, + "learning_rate": 1.7453412399557673e-05, + "loss": 0.9196, + "step": 1668 + }, + { + "epoch": 0.26, + "grad_norm": 5.69220613921482, + "learning_rate": 1.7450106375806988e-05, + "loss": 1.0706, + "step": 1669 + }, + { + "epoch": 0.26, + "grad_norm": 3.4899235907480253, + "learning_rate": 1.7446798521029104e-05, + "loss": 0.9742, + "step": 1670 + }, + { + "epoch": 0.26, + "grad_norm": 3.403430566508585, + "learning_rate": 1.744348883603699e-05, + "loss": 1.0179, + "step": 1671 + }, + { + "epoch": 0.26, + "grad_norm": 2.8503841944131776, + "learning_rate": 1.7440177321644085e-05, + "loss": 0.8921, + "step": 1672 + }, + { + "epoch": 0.26, + "grad_norm": 3.117540564950011, + "learning_rate": 1.7436863978664264e-05, + "loss": 0.8934, + "step": 1673 + }, + { + "epoch": 0.26, + "grad_norm": 3.0231089470141677, + "learning_rate": 1.7433548807911846e-05, + "loss": 1.0579, + "step": 1674 + }, + { + "epoch": 0.26, + "grad_norm": 2.91070938996234, + "learning_rate": 1.7430231810201616e-05, + "loss": 0.9335, + "step": 1675 + }, + { + "epoch": 0.26, + "grad_norm": 3.499268591374772, + "learning_rate": 1.7426912986348797e-05, + "loss": 0.9894, + "step": 1676 + }, + { + "epoch": 0.26, + "grad_norm": 3.024082227265866, + "learning_rate": 1.742359233716906e-05, + "loss": 0.9656, + "step": 1677 + }, + { + "epoch": 0.26, + "grad_norm": 2.9461564317791638, + "learning_rate": 1.742026986347853e-05, + "loss": 0.9275, + "step": 1678 + }, + { + "epoch": 0.26, + "grad_norm": 8.29468643253041, + "learning_rate": 1.7416945566093775e-05, + "loss": 1.1882, + "step": 1679 + }, + { + "epoch": 0.26, + "grad_norm": 3.3109274899198193, + "learning_rate": 1.7413619445831815e-05, + "loss": 0.9028, + "step": 1680 + }, + { + "epoch": 0.26, + "grad_norm": 2.883852212607492, + "learning_rate": 1.741029150351012e-05, + "loss": 0.9543, + "step": 1681 + }, + { + "epoch": 0.26, + "grad_norm": 3.1513321934934866, + "learning_rate": 1.7406961739946605e-05, + "loss": 1.012, + "step": 1682 + }, + { + "epoch": 0.26, + "grad_norm": 3.0608795599630514, + "learning_rate": 1.7403630155959626e-05, + "loss": 0.9721, + "step": 1683 + }, + { + "epoch": 0.26, + "grad_norm": 2.9213729732013003, + "learning_rate": 1.7400296752368e-05, + "loss": 1.0405, + "step": 1684 + }, + { + "epoch": 0.26, + "grad_norm": 2.8757556386881253, + "learning_rate": 1.7396961529990978e-05, + "loss": 0.9275, + "step": 1685 + }, + { + "epoch": 0.26, + "grad_norm": 3.0883656707818514, + "learning_rate": 1.739362448964827e-05, + "loss": 1.0599, + "step": 1686 + }, + { + "epoch": 0.26, + "grad_norm": 2.8710024780202064, + "learning_rate": 1.7390285632160025e-05, + "loss": 0.94, + "step": 1687 + }, + { + "epoch": 0.26, + "grad_norm": 2.9563153036212664, + "learning_rate": 1.738694495834684e-05, + "loss": 0.9108, + "step": 1688 + }, + { + "epoch": 0.26, + "grad_norm": 3.0319132328021308, + "learning_rate": 1.738360246902976e-05, + "loss": 0.9292, + "step": 1689 + }, + { + "epoch": 0.26, + "grad_norm": 2.9543356755124495, + "learning_rate": 1.738025816503027e-05, + "loss": 0.9085, + "step": 1690 + }, + { + "epoch": 0.26, + "grad_norm": 3.127768570393247, + "learning_rate": 1.7376912047170312e-05, + "loss": 0.9394, + "step": 1691 + }, + { + "epoch": 0.26, + "grad_norm": 3.149126382527672, + "learning_rate": 1.7373564116272268e-05, + "loss": 1.0142, + "step": 1692 + }, + { + "epoch": 0.26, + "grad_norm": 3.16266300650271, + "learning_rate": 1.7370214373158962e-05, + "loss": 1.0159, + "step": 1693 + }, + { + "epoch": 0.26, + "grad_norm": 2.7613314349272127, + "learning_rate": 1.7366862818653668e-05, + "loss": 0.8128, + "step": 1694 + }, + { + "epoch": 0.26, + "grad_norm": 3.3646451143842917, + "learning_rate": 1.7363509453580104e-05, + "loss": 0.9478, + "step": 1695 + }, + { + "epoch": 0.26, + "grad_norm": 2.7744638924974123, + "learning_rate": 1.7360154278762437e-05, + "loss": 0.9192, + "step": 1696 + }, + { + "epoch": 0.26, + "grad_norm": 2.9470719493608946, + "learning_rate": 1.7356797295025267e-05, + "loss": 0.9279, + "step": 1697 + }, + { + "epoch": 0.26, + "grad_norm": 3.0824620731472856, + "learning_rate": 1.7353438503193657e-05, + "loss": 0.9528, + "step": 1698 + }, + { + "epoch": 0.26, + "grad_norm": 3.0844674185118532, + "learning_rate": 1.7350077904093094e-05, + "loss": 0.9883, + "step": 1699 + }, + { + "epoch": 0.26, + "grad_norm": 3.093740006730346, + "learning_rate": 1.734671549854952e-05, + "loss": 0.8871, + "step": 1700 + }, + { + "epoch": 0.26, + "grad_norm": 2.9962269846127003, + "learning_rate": 1.7343351287389328e-05, + "loss": 1.0069, + "step": 1701 + }, + { + "epoch": 0.26, + "grad_norm": 3.0301979248106408, + "learning_rate": 1.733998527143934e-05, + "loss": 0.9567, + "step": 1702 + }, + { + "epoch": 0.26, + "grad_norm": 2.900471623801645, + "learning_rate": 1.733661745152683e-05, + "loss": 0.8398, + "step": 1703 + }, + { + "epoch": 0.26, + "grad_norm": 3.1403426337374203, + "learning_rate": 1.733324782847951e-05, + "loss": 1.0183, + "step": 1704 + }, + { + "epoch": 0.26, + "grad_norm": 3.4374172819757125, + "learning_rate": 1.7329876403125547e-05, + "loss": 0.8892, + "step": 1705 + }, + { + "epoch": 0.26, + "grad_norm": 3.030795047947954, + "learning_rate": 1.7326503176293536e-05, + "loss": 0.9275, + "step": 1706 + }, + { + "epoch": 0.26, + "grad_norm": 2.8903678182970776, + "learning_rate": 1.7323128148812525e-05, + "loss": 0.8777, + "step": 1707 + }, + { + "epoch": 0.26, + "grad_norm": 3.287031736581128, + "learning_rate": 1.7319751321511998e-05, + "loss": 0.9923, + "step": 1708 + }, + { + "epoch": 0.26, + "grad_norm": 3.0188191593876836, + "learning_rate": 1.7316372695221888e-05, + "loss": 0.8788, + "step": 1709 + }, + { + "epoch": 0.26, + "grad_norm": 3.019615610447504, + "learning_rate": 1.7312992270772564e-05, + "loss": 0.9385, + "step": 1710 + }, + { + "epoch": 0.26, + "grad_norm": 2.8104085965673415, + "learning_rate": 1.730961004899484e-05, + "loss": 0.906, + "step": 1711 + }, + { + "epoch": 0.26, + "grad_norm": 2.907845279180565, + "learning_rate": 1.7306226030719972e-05, + "loss": 0.9527, + "step": 1712 + }, + { + "epoch": 0.26, + "grad_norm": 3.4993671502111137, + "learning_rate": 1.7302840216779657e-05, + "loss": 1.0291, + "step": 1713 + }, + { + "epoch": 0.26, + "grad_norm": 4.106702965434131, + "learning_rate": 1.7299452608006034e-05, + "loss": 0.93, + "step": 1714 + }, + { + "epoch": 0.26, + "grad_norm": 3.1101385424088748, + "learning_rate": 1.7296063205231676e-05, + "loss": 0.972, + "step": 1715 + }, + { + "epoch": 0.26, + "grad_norm": 3.1293898506147744, + "learning_rate": 1.729267200928961e-05, + "loss": 1.104, + "step": 1716 + }, + { + "epoch": 0.26, + "grad_norm": 2.9730924394570124, + "learning_rate": 1.7289279021013298e-05, + "loss": 1.0663, + "step": 1717 + }, + { + "epoch": 0.26, + "grad_norm": 2.931559610901426, + "learning_rate": 1.7285884241236636e-05, + "loss": 0.8909, + "step": 1718 + }, + { + "epoch": 0.26, + "grad_norm": 2.945747264549108, + "learning_rate": 1.7282487670793967e-05, + "loss": 0.9308, + "step": 1719 + }, + { + "epoch": 0.26, + "grad_norm": 3.372598482443039, + "learning_rate": 1.7279089310520075e-05, + "loss": 0.9671, + "step": 1720 + }, + { + "epoch": 0.26, + "grad_norm": 3.252864971605568, + "learning_rate": 1.727568916125018e-05, + "loss": 0.9744, + "step": 1721 + }, + { + "epoch": 0.26, + "grad_norm": 3.0895113567236314, + "learning_rate": 1.727228722381994e-05, + "loss": 0.9229, + "step": 1722 + }, + { + "epoch": 0.26, + "grad_norm": 2.882664161383922, + "learning_rate": 1.7268883499065467e-05, + "loss": 0.9749, + "step": 1723 + }, + { + "epoch": 0.26, + "grad_norm": 3.1085081031820945, + "learning_rate": 1.7265477987823287e-05, + "loss": 1.0048, + "step": 1724 + }, + { + "epoch": 0.26, + "grad_norm": 3.0004823214772047, + "learning_rate": 1.7262070690930386e-05, + "loss": 0.8328, + "step": 1725 + }, + { + "epoch": 0.26, + "grad_norm": 3.038832818626629, + "learning_rate": 1.7258661609224186e-05, + "loss": 0.9228, + "step": 1726 + }, + { + "epoch": 0.26, + "grad_norm": 3.192020710399445, + "learning_rate": 1.7255250743542533e-05, + "loss": 1.0519, + "step": 1727 + }, + { + "epoch": 0.26, + "grad_norm": 2.6668903579950114, + "learning_rate": 1.7251838094723732e-05, + "loss": 0.8243, + "step": 1728 + }, + { + "epoch": 0.26, + "grad_norm": 3.0670559820655092, + "learning_rate": 1.7248423663606514e-05, + "loss": 0.8683, + "step": 1729 + }, + { + "epoch": 0.26, + "grad_norm": 2.8676037134225485, + "learning_rate": 1.7245007451030046e-05, + "loss": 0.9981, + "step": 1730 + }, + { + "epoch": 0.26, + "grad_norm": 8.79497576470677, + "learning_rate": 1.724158945783394e-05, + "loss": 1.1708, + "step": 1731 + }, + { + "epoch": 0.27, + "grad_norm": 2.99738917376387, + "learning_rate": 1.723816968485825e-05, + "loss": 1.0504, + "step": 1732 + }, + { + "epoch": 0.27, + "grad_norm": 3.2178324841637616, + "learning_rate": 1.7234748132943445e-05, + "loss": 1.0838, + "step": 1733 + }, + { + "epoch": 0.27, + "grad_norm": 3.4115054171733132, + "learning_rate": 1.723132480293046e-05, + "loss": 0.9939, + "step": 1734 + }, + { + "epoch": 0.27, + "grad_norm": 2.7712272440729984, + "learning_rate": 1.7227899695660647e-05, + "loss": 0.8403, + "step": 1735 + }, + { + "epoch": 0.27, + "grad_norm": 3.402421858255346, + "learning_rate": 1.7224472811975803e-05, + "loss": 0.9411, + "step": 1736 + }, + { + "epoch": 0.27, + "grad_norm": 3.0223444412321774, + "learning_rate": 1.722104415271816e-05, + "loss": 0.9427, + "step": 1737 + }, + { + "epoch": 0.27, + "grad_norm": 3.017597258777127, + "learning_rate": 1.7217613718730385e-05, + "loss": 0.9807, + "step": 1738 + }, + { + "epoch": 0.27, + "grad_norm": 3.1635316587108906, + "learning_rate": 1.7214181510855582e-05, + "loss": 0.8457, + "step": 1739 + }, + { + "epoch": 0.27, + "grad_norm": 3.7802900453298824, + "learning_rate": 1.7210747529937296e-05, + "loss": 0.9754, + "step": 1740 + }, + { + "epoch": 0.27, + "grad_norm": 2.7253571537480985, + "learning_rate": 1.72073117768195e-05, + "loss": 0.8775, + "step": 1741 + }, + { + "epoch": 0.27, + "grad_norm": 2.86569813967416, + "learning_rate": 1.7203874252346607e-05, + "loss": 0.9749, + "step": 1742 + }, + { + "epoch": 0.27, + "grad_norm": 3.0864039605414426, + "learning_rate": 1.720043495736346e-05, + "loss": 0.8447, + "step": 1743 + }, + { + "epoch": 0.27, + "grad_norm": 2.790503841646772, + "learning_rate": 1.7196993892715344e-05, + "loss": 0.9175, + "step": 1744 + }, + { + "epoch": 0.27, + "grad_norm": 2.9046962258179656, + "learning_rate": 1.719355105924798e-05, + "loss": 0.9491, + "step": 1745 + }, + { + "epoch": 0.27, + "grad_norm": 3.188869353506275, + "learning_rate": 1.7190106457807515e-05, + "loss": 0.8645, + "step": 1746 + }, + { + "epoch": 0.27, + "grad_norm": 2.886800885546785, + "learning_rate": 1.7186660089240535e-05, + "loss": 0.8278, + "step": 1747 + }, + { + "epoch": 0.27, + "grad_norm": 3.055015385983957, + "learning_rate": 1.7183211954394063e-05, + "loss": 0.9109, + "step": 1748 + }, + { + "epoch": 0.27, + "grad_norm": 3.6597265454520023, + "learning_rate": 1.7179762054115553e-05, + "loss": 0.9243, + "step": 1749 + }, + { + "epoch": 0.27, + "grad_norm": 3.2097148429469717, + "learning_rate": 1.7176310389252897e-05, + "loss": 0.9359, + "step": 1750 + }, + { + "epoch": 0.27, + "grad_norm": 3.1357290092403733, + "learning_rate": 1.717285696065441e-05, + "loss": 0.8657, + "step": 1751 + }, + { + "epoch": 0.27, + "grad_norm": 3.542033237018684, + "learning_rate": 1.7169401769168855e-05, + "loss": 0.9101, + "step": 1752 + }, + { + "epoch": 0.27, + "grad_norm": 3.104045813974907, + "learning_rate": 1.7165944815645418e-05, + "loss": 0.9854, + "step": 1753 + }, + { + "epoch": 0.27, + "grad_norm": 3.0470300166305817, + "learning_rate": 1.7162486100933725e-05, + "loss": 0.927, + "step": 1754 + }, + { + "epoch": 0.27, + "grad_norm": 3.036460270383953, + "learning_rate": 1.7159025625883823e-05, + "loss": 0.8991, + "step": 1755 + }, + { + "epoch": 0.27, + "grad_norm": 20.187501332975838, + "learning_rate": 1.7155563391346207e-05, + "loss": 1.2687, + "step": 1756 + }, + { + "epoch": 0.27, + "grad_norm": 3.127249750494202, + "learning_rate": 1.7152099398171796e-05, + "loss": 1.001, + "step": 1757 + }, + { + "epoch": 0.27, + "grad_norm": 3.086661240260967, + "learning_rate": 1.7148633647211936e-05, + "loss": 0.9008, + "step": 1758 + }, + { + "epoch": 0.27, + "grad_norm": 3.6321929427561344, + "learning_rate": 1.714516613931842e-05, + "loss": 1.0084, + "step": 1759 + }, + { + "epoch": 0.27, + "grad_norm": 3.03961360149895, + "learning_rate": 1.7141696875343463e-05, + "loss": 0.933, + "step": 1760 + }, + { + "epoch": 0.27, + "grad_norm": 3.351519334357102, + "learning_rate": 1.7138225856139707e-05, + "loss": 0.9046, + "step": 1761 + }, + { + "epoch": 0.27, + "grad_norm": 3.097830329883883, + "learning_rate": 1.7134753082560236e-05, + "loss": 1.0162, + "step": 1762 + }, + { + "epoch": 0.27, + "grad_norm": 3.101611726104539, + "learning_rate": 1.7131278555458558e-05, + "loss": 0.9609, + "step": 1763 + }, + { + "epoch": 0.27, + "grad_norm": 2.9844918351327556, + "learning_rate": 1.7127802275688615e-05, + "loss": 0.9106, + "step": 1764 + }, + { + "epoch": 0.27, + "grad_norm": 3.2691109853035045, + "learning_rate": 1.7124324244104782e-05, + "loss": 0.8839, + "step": 1765 + }, + { + "epoch": 0.27, + "grad_norm": 3.120653729928492, + "learning_rate": 1.7120844461561857e-05, + "loss": 0.8986, + "step": 1766 + }, + { + "epoch": 0.27, + "grad_norm": 3.085688777871114, + "learning_rate": 1.7117362928915073e-05, + "loss": 0.9608, + "step": 1767 + }, + { + "epoch": 0.27, + "grad_norm": 2.988794942793776, + "learning_rate": 1.7113879647020098e-05, + "loss": 0.9892, + "step": 1768 + }, + { + "epoch": 0.27, + "grad_norm": 4.142175207922706, + "learning_rate": 1.711039461673302e-05, + "loss": 0.846, + "step": 1769 + }, + { + "epoch": 0.27, + "grad_norm": 3.1702848317984467, + "learning_rate": 1.7106907838910365e-05, + "loss": 0.9765, + "step": 1770 + }, + { + "epoch": 0.27, + "grad_norm": 18.480872514729015, + "learning_rate": 1.7103419314409084e-05, + "loss": 1.1757, + "step": 1771 + }, + { + "epoch": 0.27, + "grad_norm": 2.7328141943559916, + "learning_rate": 1.709992904408656e-05, + "loss": 0.8963, + "step": 1772 + }, + { + "epoch": 0.27, + "grad_norm": 3.0810657826869616, + "learning_rate": 1.70964370288006e-05, + "loss": 1.0049, + "step": 1773 + }, + { + "epoch": 0.27, + "grad_norm": 3.1060015118018565, + "learning_rate": 1.7092943269409442e-05, + "loss": 0.8715, + "step": 1774 + }, + { + "epoch": 0.27, + "grad_norm": 3.1102710293969507, + "learning_rate": 1.7089447766771762e-05, + "loss": 0.994, + "step": 1775 + }, + { + "epoch": 0.27, + "grad_norm": 3.2154911009841762, + "learning_rate": 1.708595052174665e-05, + "loss": 0.9058, + "step": 1776 + }, + { + "epoch": 0.27, + "grad_norm": 8.557012466247219, + "learning_rate": 1.7082451535193635e-05, + "loss": 1.167, + "step": 1777 + }, + { + "epoch": 0.27, + "grad_norm": 2.783641336634145, + "learning_rate": 1.7078950807972667e-05, + "loss": 0.8901, + "step": 1778 + }, + { + "epoch": 0.27, + "grad_norm": 2.9934657945221828, + "learning_rate": 1.7075448340944125e-05, + "loss": 0.9141, + "step": 1779 + }, + { + "epoch": 0.27, + "grad_norm": 3.150396420366335, + "learning_rate": 1.7071944134968817e-05, + "loss": 0.8861, + "step": 1780 + }, + { + "epoch": 0.27, + "grad_norm": 3.3958232210218937, + "learning_rate": 1.7068438190907987e-05, + "loss": 0.9127, + "step": 1781 + }, + { + "epoch": 0.27, + "grad_norm": 3.0008540440950373, + "learning_rate": 1.7064930509623287e-05, + "loss": 0.9078, + "step": 1782 + }, + { + "epoch": 0.27, + "grad_norm": 2.890349461668927, + "learning_rate": 1.706142109197681e-05, + "loss": 0.9415, + "step": 1783 + }, + { + "epoch": 0.27, + "grad_norm": 2.865060978081574, + "learning_rate": 1.7057909938831077e-05, + "loss": 1.0334, + "step": 1784 + }, + { + "epoch": 0.27, + "grad_norm": 3.231093802050666, + "learning_rate": 1.7054397051049028e-05, + "loss": 0.9508, + "step": 1785 + }, + { + "epoch": 0.27, + "grad_norm": 2.8203188281216938, + "learning_rate": 1.705088242949403e-05, + "loss": 0.9999, + "step": 1786 + }, + { + "epoch": 0.27, + "grad_norm": 3.0897255312247514, + "learning_rate": 1.704736607502988e-05, + "loss": 1.0371, + "step": 1787 + }, + { + "epoch": 0.27, + "grad_norm": 2.9172885478578827, + "learning_rate": 1.70438479885208e-05, + "loss": 0.9821, + "step": 1788 + }, + { + "epoch": 0.27, + "grad_norm": 2.9720055648574664, + "learning_rate": 1.7040328170831438e-05, + "loss": 0.8796, + "step": 1789 + }, + { + "epoch": 0.27, + "grad_norm": 3.1382099352612167, + "learning_rate": 1.703680662282686e-05, + "loss": 0.8977, + "step": 1790 + }, + { + "epoch": 0.27, + "grad_norm": 2.9843929380153864, + "learning_rate": 1.7033283345372577e-05, + "loss": 0.8788, + "step": 1791 + }, + { + "epoch": 0.27, + "grad_norm": 2.7407504477935687, + "learning_rate": 1.7029758339334493e-05, + "loss": 0.9148, + "step": 1792 + }, + { + "epoch": 0.27, + "grad_norm": 3.1593277458544846, + "learning_rate": 1.7026231605578977e-05, + "loss": 0.9894, + "step": 1793 + }, + { + "epoch": 0.27, + "grad_norm": 2.8811698457573636, + "learning_rate": 1.7022703144972783e-05, + "loss": 0.9389, + "step": 1794 + }, + { + "epoch": 0.27, + "grad_norm": 8.019957969047658, + "learning_rate": 1.7019172958383117e-05, + "loss": 1.1372, + "step": 1795 + }, + { + "epoch": 0.27, + "grad_norm": 3.1741523508416507, + "learning_rate": 1.70156410466776e-05, + "loss": 0.9265, + "step": 1796 + }, + { + "epoch": 0.28, + "grad_norm": 3.031657184880145, + "learning_rate": 1.7012107410724272e-05, + "loss": 1.009, + "step": 1797 + }, + { + "epoch": 0.28, + "grad_norm": 3.046990652580771, + "learning_rate": 1.7008572051391605e-05, + "loss": 1.0, + "step": 1798 + }, + { + "epoch": 0.28, + "grad_norm": 3.0803455961577884, + "learning_rate": 1.7005034969548494e-05, + "loss": 0.9237, + "step": 1799 + }, + { + "epoch": 0.28, + "grad_norm": 2.935693160122931, + "learning_rate": 1.7001496166064247e-05, + "loss": 1.0491, + "step": 1800 + }, + { + "epoch": 0.28, + "grad_norm": 2.7763607875745375, + "learning_rate": 1.6997955641808607e-05, + "loss": 0.9119, + "step": 1801 + }, + { + "epoch": 0.28, + "grad_norm": 3.1123010538326614, + "learning_rate": 1.6994413397651736e-05, + "loss": 1.0113, + "step": 1802 + }, + { + "epoch": 0.28, + "grad_norm": 3.130483097020963, + "learning_rate": 1.6990869434464217e-05, + "loss": 0.9616, + "step": 1803 + }, + { + "epoch": 0.28, + "grad_norm": 3.3471090454376884, + "learning_rate": 1.698732375311706e-05, + "loss": 0.939, + "step": 1804 + }, + { + "epoch": 0.28, + "grad_norm": 2.9468683554247557, + "learning_rate": 1.698377635448169e-05, + "loss": 0.8409, + "step": 1805 + }, + { + "epoch": 0.28, + "grad_norm": 3.293572112069917, + "learning_rate": 1.6980227239429957e-05, + "loss": 0.9963, + "step": 1806 + }, + { + "epoch": 0.28, + "grad_norm": 2.8948531850277086, + "learning_rate": 1.6976676408834137e-05, + "loss": 0.9784, + "step": 1807 + }, + { + "epoch": 0.28, + "grad_norm": 3.139711415530516, + "learning_rate": 1.6973123863566927e-05, + "loss": 0.9313, + "step": 1808 + }, + { + "epoch": 0.28, + "grad_norm": 3.1516894042204884, + "learning_rate": 1.6969569604501437e-05, + "loss": 0.9585, + "step": 1809 + }, + { + "epoch": 0.28, + "grad_norm": 2.973758728621806, + "learning_rate": 1.6966013632511207e-05, + "loss": 0.8765, + "step": 1810 + }, + { + "epoch": 0.28, + "grad_norm": 3.0817783963642067, + "learning_rate": 1.6962455948470197e-05, + "loss": 0.9369, + "step": 1811 + }, + { + "epoch": 0.28, + "grad_norm": 2.790610696099825, + "learning_rate": 1.6958896553252783e-05, + "loss": 0.893, + "step": 1812 + }, + { + "epoch": 0.28, + "grad_norm": 2.816385602375839, + "learning_rate": 1.6955335447733768e-05, + "loss": 0.9152, + "step": 1813 + }, + { + "epoch": 0.28, + "grad_norm": 2.8898860914694873, + "learning_rate": 1.6951772632788366e-05, + "loss": 1.0058, + "step": 1814 + }, + { + "epoch": 0.28, + "grad_norm": 2.9058069000027467, + "learning_rate": 1.6948208109292224e-05, + "loss": 0.9882, + "step": 1815 + }, + { + "epoch": 0.28, + "grad_norm": 3.0880966053571, + "learning_rate": 1.6944641878121397e-05, + "loss": 0.9876, + "step": 1816 + }, + { + "epoch": 0.28, + "grad_norm": 3.077793953569075, + "learning_rate": 1.6941073940152367e-05, + "loss": 0.905, + "step": 1817 + }, + { + "epoch": 0.28, + "grad_norm": 3.197647594173069, + "learning_rate": 1.693750429626203e-05, + "loss": 0.9701, + "step": 1818 + }, + { + "epoch": 0.28, + "grad_norm": 2.87426005324195, + "learning_rate": 1.693393294732771e-05, + "loss": 0.8155, + "step": 1819 + }, + { + "epoch": 0.28, + "grad_norm": 2.8098810857134957, + "learning_rate": 1.6930359894227137e-05, + "loss": 1.0062, + "step": 1820 + }, + { + "epoch": 0.28, + "grad_norm": 3.236471667352018, + "learning_rate": 1.6926785137838475e-05, + "loss": 1.1021, + "step": 1821 + }, + { + "epoch": 0.28, + "grad_norm": 2.7811463349506473, + "learning_rate": 1.6923208679040292e-05, + "loss": 0.9271, + "step": 1822 + }, + { + "epoch": 0.28, + "grad_norm": 3.119477770391288, + "learning_rate": 1.6919630518711588e-05, + "loss": 1.0113, + "step": 1823 + }, + { + "epoch": 0.28, + "grad_norm": 2.7893762696875952, + "learning_rate": 1.691605065773177e-05, + "loss": 1.0375, + "step": 1824 + }, + { + "epoch": 0.28, + "grad_norm": 2.6966638468986184, + "learning_rate": 1.6912469096980664e-05, + "loss": 0.9959, + "step": 1825 + }, + { + "epoch": 0.28, + "grad_norm": 3.271609100047215, + "learning_rate": 1.6908885837338525e-05, + "loss": 0.9358, + "step": 1826 + }, + { + "epoch": 0.28, + "grad_norm": 2.6535814458895586, + "learning_rate": 1.6905300879686012e-05, + "loss": 0.9314, + "step": 1827 + }, + { + "epoch": 0.28, + "grad_norm": 3.0781102996355814, + "learning_rate": 1.6901714224904215e-05, + "loss": 0.9368, + "step": 1828 + }, + { + "epoch": 0.28, + "grad_norm": 2.8514555295039825, + "learning_rate": 1.689812587387462e-05, + "loss": 0.9585, + "step": 1829 + }, + { + "epoch": 0.28, + "grad_norm": 3.1607038231576516, + "learning_rate": 1.6894535827479152e-05, + "loss": 1.0184, + "step": 1830 + }, + { + "epoch": 0.28, + "grad_norm": 2.7494442796069913, + "learning_rate": 1.6890944086600145e-05, + "loss": 0.9984, + "step": 1831 + }, + { + "epoch": 0.28, + "grad_norm": 3.2687465123361275, + "learning_rate": 1.6887350652120346e-05, + "loss": 0.9707, + "step": 1832 + }, + { + "epoch": 0.28, + "grad_norm": 3.078713456184605, + "learning_rate": 1.688375552492292e-05, + "loss": 0.9926, + "step": 1833 + }, + { + "epoch": 0.28, + "grad_norm": 2.7217043180958616, + "learning_rate": 1.688015870589144e-05, + "loss": 0.9223, + "step": 1834 + }, + { + "epoch": 0.28, + "grad_norm": 2.806529041950445, + "learning_rate": 1.6876560195909916e-05, + "loss": 0.9107, + "step": 1835 + }, + { + "epoch": 0.28, + "grad_norm": 15.422810929651483, + "learning_rate": 1.687295999586276e-05, + "loss": 1.2869, + "step": 1836 + }, + { + "epoch": 0.28, + "grad_norm": 3.194812363041196, + "learning_rate": 1.6869358106634794e-05, + "loss": 1.0234, + "step": 1837 + }, + { + "epoch": 0.28, + "grad_norm": 3.0263600740187413, + "learning_rate": 1.686575452911126e-05, + "loss": 0.9931, + "step": 1838 + }, + { + "epoch": 0.28, + "grad_norm": 3.217364912389536, + "learning_rate": 1.6862149264177826e-05, + "loss": 0.9718, + "step": 1839 + }, + { + "epoch": 0.28, + "grad_norm": 2.864268872192165, + "learning_rate": 1.6858542312720555e-05, + "loss": 0.971, + "step": 1840 + }, + { + "epoch": 0.28, + "grad_norm": 2.9868935967262358, + "learning_rate": 1.685493367562594e-05, + "loss": 0.9497, + "step": 1841 + }, + { + "epoch": 0.28, + "grad_norm": 3.02682450886872, + "learning_rate": 1.6851323353780883e-05, + "loss": 1.0055, + "step": 1842 + }, + { + "epoch": 0.28, + "grad_norm": 2.985454978585505, + "learning_rate": 1.6847711348072694e-05, + "loss": 0.9894, + "step": 1843 + }, + { + "epoch": 0.28, + "grad_norm": 3.062161794366507, + "learning_rate": 1.684409765938911e-05, + "loss": 0.9619, + "step": 1844 + }, + { + "epoch": 0.28, + "grad_norm": 2.920527000454507, + "learning_rate": 1.684048228861827e-05, + "loss": 0.9533, + "step": 1845 + }, + { + "epoch": 0.28, + "grad_norm": 2.8508734491132364, + "learning_rate": 1.6836865236648736e-05, + "loss": 0.8948, + "step": 1846 + }, + { + "epoch": 0.28, + "grad_norm": 3.109326001566116, + "learning_rate": 1.683324650436947e-05, + "loss": 0.866, + "step": 1847 + }, + { + "epoch": 0.28, + "grad_norm": 2.9223301296017796, + "learning_rate": 1.682962609266986e-05, + "loss": 0.9989, + "step": 1848 + }, + { + "epoch": 0.28, + "grad_norm": 2.9451164618486563, + "learning_rate": 1.68260040024397e-05, + "loss": 0.9472, + "step": 1849 + }, + { + "epoch": 0.28, + "grad_norm": 3.06203765888581, + "learning_rate": 1.68223802345692e-05, + "loss": 0.9352, + "step": 1850 + }, + { + "epoch": 0.28, + "grad_norm": 2.947449793617199, + "learning_rate": 1.6818754789948974e-05, + "loss": 0.9157, + "step": 1851 + }, + { + "epoch": 0.28, + "grad_norm": 3.2220571907558564, + "learning_rate": 1.6815127669470066e-05, + "loss": 1.0802, + "step": 1852 + }, + { + "epoch": 0.28, + "grad_norm": 2.9745392570414517, + "learning_rate": 1.6811498874023914e-05, + "loss": 0.9811, + "step": 1853 + }, + { + "epoch": 0.28, + "grad_norm": 2.9466446307315883, + "learning_rate": 1.680786840450237e-05, + "loss": 0.8016, + "step": 1854 + }, + { + "epoch": 0.28, + "grad_norm": 3.221723283817632, + "learning_rate": 1.6804236261797707e-05, + "loss": 0.9568, + "step": 1855 + }, + { + "epoch": 0.28, + "grad_norm": 2.8783807082840136, + "learning_rate": 1.6800602446802604e-05, + "loss": 1.0294, + "step": 1856 + }, + { + "epoch": 0.28, + "grad_norm": 3.3234673136431776, + "learning_rate": 1.679696696041015e-05, + "loss": 1.0243, + "step": 1857 + }, + { + "epoch": 0.28, + "grad_norm": 3.1457564055469462, + "learning_rate": 1.6793329803513845e-05, + "loss": 1.0493, + "step": 1858 + }, + { + "epoch": 0.28, + "grad_norm": 2.833612508072633, + "learning_rate": 1.67896909770076e-05, + "loss": 0.9366, + "step": 1859 + }, + { + "epoch": 0.28, + "grad_norm": 3.0762053615072427, + "learning_rate": 1.678605048178574e-05, + "loss": 0.9819, + "step": 1860 + }, + { + "epoch": 0.28, + "grad_norm": 3.2781884938178423, + "learning_rate": 1.678240831874299e-05, + "loss": 0.9688, + "step": 1861 + }, + { + "epoch": 0.29, + "grad_norm": 3.3692602382750247, + "learning_rate": 1.677876448877449e-05, + "loss": 0.8875, + "step": 1862 + }, + { + "epoch": 0.29, + "grad_norm": 3.013326890826228, + "learning_rate": 1.6775118992775805e-05, + "loss": 0.8707, + "step": 1863 + }, + { + "epoch": 0.29, + "grad_norm": 3.459040885528723, + "learning_rate": 1.6771471831642885e-05, + "loss": 1.0118, + "step": 1864 + }, + { + "epoch": 0.29, + "grad_norm": 2.901135398632171, + "learning_rate": 1.67678230062721e-05, + "loss": 0.9964, + "step": 1865 + }, + { + "epoch": 0.29, + "grad_norm": 3.146941247952338, + "learning_rate": 1.6764172517560232e-05, + "loss": 1.0776, + "step": 1866 + }, + { + "epoch": 0.29, + "grad_norm": 3.239123497963529, + "learning_rate": 1.6760520366404465e-05, + "loss": 0.935, + "step": 1867 + }, + { + "epoch": 0.29, + "grad_norm": 3.1929933652649822, + "learning_rate": 1.67568665537024e-05, + "loss": 0.8425, + "step": 1868 + }, + { + "epoch": 0.29, + "grad_norm": 3.033169365361725, + "learning_rate": 1.675321108035204e-05, + "loss": 0.9379, + "step": 1869 + }, + { + "epoch": 0.29, + "grad_norm": 3.0333266108265007, + "learning_rate": 1.6749553947251796e-05, + "loss": 0.8782, + "step": 1870 + }, + { + "epoch": 0.29, + "grad_norm": 2.876592953022209, + "learning_rate": 1.674589515530049e-05, + "loss": 0.8542, + "step": 1871 + }, + { + "epoch": 0.29, + "grad_norm": 3.302964058161183, + "learning_rate": 1.6742234705397353e-05, + "loss": 0.9858, + "step": 1872 + }, + { + "epoch": 0.29, + "grad_norm": 3.124949533715328, + "learning_rate": 1.6738572598442017e-05, + "loss": 0.9404, + "step": 1873 + }, + { + "epoch": 0.29, + "grad_norm": 3.0780370387166376, + "learning_rate": 1.6734908835334528e-05, + "loss": 0.9963, + "step": 1874 + }, + { + "epoch": 0.29, + "grad_norm": 2.8498009806918034, + "learning_rate": 1.673124341697533e-05, + "loss": 0.9845, + "step": 1875 + }, + { + "epoch": 0.29, + "grad_norm": 2.816079342172047, + "learning_rate": 1.672757634426529e-05, + "loss": 0.9985, + "step": 1876 + }, + { + "epoch": 0.29, + "grad_norm": 2.7436541873730276, + "learning_rate": 1.6723907618105664e-05, + "loss": 0.9205, + "step": 1877 + }, + { + "epoch": 0.29, + "grad_norm": 2.95311268150872, + "learning_rate": 1.6720237239398125e-05, + "loss": 0.8629, + "step": 1878 + }, + { + "epoch": 0.29, + "grad_norm": 19.916438316118874, + "learning_rate": 1.671656520904475e-05, + "loss": 1.22, + "step": 1879 + }, + { + "epoch": 0.29, + "grad_norm": 3.0861564067907, + "learning_rate": 1.671289152794802e-05, + "loss": 0.9189, + "step": 1880 + }, + { + "epoch": 0.29, + "grad_norm": 3.089133404397287, + "learning_rate": 1.670921619701082e-05, + "loss": 0.9411, + "step": 1881 + }, + { + "epoch": 0.29, + "grad_norm": 3.221747175011406, + "learning_rate": 1.6705539217136447e-05, + "loss": 0.9864, + "step": 1882 + }, + { + "epoch": 0.29, + "grad_norm": 2.996888161089025, + "learning_rate": 1.6701860589228597e-05, + "loss": 1.0954, + "step": 1883 + }, + { + "epoch": 0.29, + "grad_norm": 8.534999187383088, + "learning_rate": 1.6698180314191375e-05, + "loss": 1.1416, + "step": 1884 + }, + { + "epoch": 0.29, + "grad_norm": 3.250891562903199, + "learning_rate": 1.6694498392929293e-05, + "loss": 1.0242, + "step": 1885 + }, + { + "epoch": 0.29, + "grad_norm": 3.1721803755932974, + "learning_rate": 1.669081482634726e-05, + "loss": 1.0573, + "step": 1886 + }, + { + "epoch": 0.29, + "grad_norm": 3.0931656989393654, + "learning_rate": 1.668712961535059e-05, + "loss": 0.9245, + "step": 1887 + }, + { + "epoch": 0.29, + "grad_norm": 2.7676923458266494, + "learning_rate": 1.668344276084501e-05, + "loss": 0.9846, + "step": 1888 + }, + { + "epoch": 0.29, + "grad_norm": 3.2333026082841565, + "learning_rate": 1.6679754263736644e-05, + "loss": 0.9653, + "step": 1889 + }, + { + "epoch": 0.29, + "grad_norm": 2.9318513606737286, + "learning_rate": 1.6676064124932016e-05, + "loss": 0.942, + "step": 1890 + }, + { + "epoch": 0.29, + "grad_norm": 2.804796108438422, + "learning_rate": 1.6672372345338067e-05, + "loss": 1.0292, + "step": 1891 + }, + { + "epoch": 0.29, + "grad_norm": 2.7689755284678292, + "learning_rate": 1.666867892586213e-05, + "loss": 0.9791, + "step": 1892 + }, + { + "epoch": 0.29, + "grad_norm": 2.730471857480553, + "learning_rate": 1.6664983867411947e-05, + "loss": 0.9998, + "step": 1893 + }, + { + "epoch": 0.29, + "grad_norm": 2.9574138452567618, + "learning_rate": 1.6661287170895647e-05, + "loss": 1.0932, + "step": 1894 + }, + { + "epoch": 0.29, + "grad_norm": 2.9343442651527223, + "learning_rate": 1.665758883722179e-05, + "loss": 0.9637, + "step": 1895 + }, + { + "epoch": 0.29, + "grad_norm": 2.928698965252209, + "learning_rate": 1.6653888867299312e-05, + "loss": 0.9635, + "step": 1896 + }, + { + "epoch": 0.29, + "grad_norm": 3.0528707428961575, + "learning_rate": 1.6650187262037567e-05, + "loss": 1.0008, + "step": 1897 + }, + { + "epoch": 0.29, + "grad_norm": 3.0335541906489136, + "learning_rate": 1.6646484022346305e-05, + "loss": 0.8778, + "step": 1898 + }, + { + "epoch": 0.29, + "grad_norm": 3.2308635269804653, + "learning_rate": 1.6642779149135677e-05, + "loss": 0.9607, + "step": 1899 + }, + { + "epoch": 0.29, + "grad_norm": 3.0332734710472575, + "learning_rate": 1.663907264331624e-05, + "loss": 0.8383, + "step": 1900 + }, + { + "epoch": 0.29, + "grad_norm": 2.840711811673916, + "learning_rate": 1.6635364505798946e-05, + "loss": 0.9219, + "step": 1901 + }, + { + "epoch": 0.29, + "grad_norm": 16.051039006193275, + "learning_rate": 1.663165473749515e-05, + "loss": 1.1543, + "step": 1902 + }, + { + "epoch": 0.29, + "grad_norm": 3.045344183831331, + "learning_rate": 1.6627943339316616e-05, + "loss": 0.9771, + "step": 1903 + }, + { + "epoch": 0.29, + "grad_norm": 2.9859000516392347, + "learning_rate": 1.662423031217549e-05, + "loss": 0.9071, + "step": 1904 + }, + { + "epoch": 0.29, + "grad_norm": 3.274059140274478, + "learning_rate": 1.6620515656984343e-05, + "loss": 0.8979, + "step": 1905 + }, + { + "epoch": 0.29, + "grad_norm": 2.9111963302746187, + "learning_rate": 1.6616799374656124e-05, + "loss": 0.8854, + "step": 1906 + }, + { + "epoch": 0.29, + "grad_norm": 3.6802138181802873, + "learning_rate": 1.6613081466104196e-05, + "loss": 0.9194, + "step": 1907 + }, + { + "epoch": 0.29, + "grad_norm": 2.973806688301168, + "learning_rate": 1.660936193224231e-05, + "loss": 0.9397, + "step": 1908 + }, + { + "epoch": 0.29, + "grad_norm": 2.9352989064745683, + "learning_rate": 1.660564077398463e-05, + "loss": 0.9442, + "step": 1909 + }, + { + "epoch": 0.29, + "grad_norm": 2.9344758621160834, + "learning_rate": 1.6601917992245712e-05, + "loss": 0.9395, + "step": 1910 + }, + { + "epoch": 0.29, + "grad_norm": 2.8050275279211845, + "learning_rate": 1.6598193587940508e-05, + "loss": 1.0315, + "step": 1911 + }, + { + "epoch": 0.29, + "grad_norm": 3.226670654091148, + "learning_rate": 1.659446756198437e-05, + "loss": 0.9383, + "step": 1912 + }, + { + "epoch": 0.29, + "grad_norm": 3.112674249923903, + "learning_rate": 1.6590739915293056e-05, + "loss": 1.0231, + "step": 1913 + }, + { + "epoch": 0.29, + "grad_norm": 2.665704117888447, + "learning_rate": 1.6587010648782717e-05, + "loss": 0.9799, + "step": 1914 + }, + { + "epoch": 0.29, + "grad_norm": 9.465238430343106, + "learning_rate": 1.65832797633699e-05, + "loss": 1.1079, + "step": 1915 + }, + { + "epoch": 0.29, + "grad_norm": 2.911361683645425, + "learning_rate": 1.657954725997155e-05, + "loss": 0.9573, + "step": 1916 + }, + { + "epoch": 0.29, + "grad_norm": 2.927834883238085, + "learning_rate": 1.6575813139505016e-05, + "loss": 0.9304, + "step": 1917 + }, + { + "epoch": 0.29, + "grad_norm": 2.880364399662898, + "learning_rate": 1.6572077402888037e-05, + "loss": 0.8702, + "step": 1918 + }, + { + "epoch": 0.29, + "grad_norm": 2.9971621634760734, + "learning_rate": 1.6568340051038754e-05, + "loss": 0.963, + "step": 1919 + }, + { + "epoch": 0.29, + "grad_norm": 2.7330086419109856, + "learning_rate": 1.6564601084875703e-05, + "loss": 0.9331, + "step": 1920 + }, + { + "epoch": 0.29, + "grad_norm": 3.2160871480825253, + "learning_rate": 1.6560860505317813e-05, + "loss": 1.0767, + "step": 1921 + }, + { + "epoch": 0.29, + "grad_norm": 3.6222550750639892, + "learning_rate": 1.655711831328442e-05, + "loss": 0.8794, + "step": 1922 + }, + { + "epoch": 0.29, + "grad_norm": 3.4903844136454523, + "learning_rate": 1.6553374509695244e-05, + "loss": 0.8383, + "step": 1923 + }, + { + "epoch": 0.29, + "grad_norm": 2.8607341149397123, + "learning_rate": 1.6549629095470413e-05, + "loss": 0.9757, + "step": 1924 + }, + { + "epoch": 0.29, + "grad_norm": 3.501136761886263, + "learning_rate": 1.6545882071530443e-05, + "loss": 1.1234, + "step": 1925 + }, + { + "epoch": 0.29, + "grad_norm": 3.003942035104415, + "learning_rate": 1.654213343879624e-05, + "loss": 1.0772, + "step": 1926 + }, + { + "epoch": 0.29, + "grad_norm": 6.455659166595698, + "learning_rate": 1.6538383198189122e-05, + "loss": 1.0843, + "step": 1927 + }, + { + "epoch": 0.3, + "grad_norm": 2.885447760378492, + "learning_rate": 1.653463135063079e-05, + "loss": 0.9104, + "step": 1928 + }, + { + "epoch": 0.3, + "grad_norm": 3.144702023071639, + "learning_rate": 1.6530877897043343e-05, + "loss": 1.0566, + "step": 1929 + }, + { + "epoch": 0.3, + "grad_norm": 3.1471032152462115, + "learning_rate": 1.6527122838349274e-05, + "loss": 1.0278, + "step": 1930 + }, + { + "epoch": 0.3, + "grad_norm": 2.890264832007854, + "learning_rate": 1.652336617547147e-05, + "loss": 0.9163, + "step": 1931 + }, + { + "epoch": 0.3, + "grad_norm": 2.9317766549252466, + "learning_rate": 1.6519607909333216e-05, + "loss": 0.8429, + "step": 1932 + }, + { + "epoch": 0.3, + "grad_norm": 3.4212716262094105, + "learning_rate": 1.6515848040858186e-05, + "loss": 0.9769, + "step": 1933 + }, + { + "epoch": 0.3, + "grad_norm": 3.248556388386718, + "learning_rate": 1.6512086570970455e-05, + "loss": 0.8718, + "step": 1934 + }, + { + "epoch": 0.3, + "grad_norm": 2.903299186840235, + "learning_rate": 1.650832350059448e-05, + "loss": 1.0638, + "step": 1935 + }, + { + "epoch": 0.3, + "grad_norm": 2.8786177507895068, + "learning_rate": 1.650455883065512e-05, + "loss": 0.991, + "step": 1936 + }, + { + "epoch": 0.3, + "grad_norm": 2.8504920475696833, + "learning_rate": 1.650079256207763e-05, + "loss": 0.9286, + "step": 1937 + }, + { + "epoch": 0.3, + "grad_norm": 3.108477746343841, + "learning_rate": 1.6497024695787646e-05, + "loss": 0.9307, + "step": 1938 + }, + { + "epoch": 0.3, + "grad_norm": 2.7398926753706903, + "learning_rate": 1.649325523271121e-05, + "loss": 0.9631, + "step": 1939 + }, + { + "epoch": 0.3, + "grad_norm": 3.05048561325502, + "learning_rate": 1.6489484173774747e-05, + "loss": 0.934, + "step": 1940 + }, + { + "epoch": 0.3, + "grad_norm": 3.1390413170814235, + "learning_rate": 1.648571151990508e-05, + "loss": 0.9479, + "step": 1941 + }, + { + "epoch": 0.3, + "grad_norm": 6.797072943904557, + "learning_rate": 1.648193727202942e-05, + "loss": 1.1066, + "step": 1942 + }, + { + "epoch": 0.3, + "grad_norm": 2.838565494232607, + "learning_rate": 1.647816143107537e-05, + "loss": 1.0361, + "step": 1943 + }, + { + "epoch": 0.3, + "grad_norm": 3.0395818003900956, + "learning_rate": 1.6474383997970928e-05, + "loss": 0.9884, + "step": 1944 + }, + { + "epoch": 0.3, + "grad_norm": 2.9470028853067123, + "learning_rate": 1.6470604973644483e-05, + "loss": 0.9513, + "step": 1945 + }, + { + "epoch": 0.3, + "grad_norm": 2.7727680957102336, + "learning_rate": 1.6466824359024803e-05, + "loss": 0.966, + "step": 1946 + }, + { + "epoch": 0.3, + "grad_norm": 6.713985789430561, + "learning_rate": 1.646304215504107e-05, + "loss": 1.1203, + "step": 1947 + }, + { + "epoch": 0.3, + "grad_norm": 3.0833960862474434, + "learning_rate": 1.645925836262284e-05, + "loss": 0.9877, + "step": 1948 + }, + { + "epoch": 0.3, + "grad_norm": 2.7057696057868297, + "learning_rate": 1.6455472982700055e-05, + "loss": 0.8799, + "step": 1949 + }, + { + "epoch": 0.3, + "grad_norm": 2.884971897595675, + "learning_rate": 1.6451686016203065e-05, + "loss": 0.9695, + "step": 1950 + }, + { + "epoch": 0.3, + "grad_norm": 3.159371915076138, + "learning_rate": 1.6447897464062593e-05, + "loss": 0.9409, + "step": 1951 + }, + { + "epoch": 0.3, + "grad_norm": 3.0243298134150836, + "learning_rate": 1.644410732720977e-05, + "loss": 0.904, + "step": 1952 + }, + { + "epoch": 0.3, + "grad_norm": 2.8079014057457963, + "learning_rate": 1.644031560657609e-05, + "loss": 0.9069, + "step": 1953 + }, + { + "epoch": 0.3, + "grad_norm": 3.102887289670685, + "learning_rate": 1.6436522303093462e-05, + "loss": 0.8864, + "step": 1954 + }, + { + "epoch": 0.3, + "grad_norm": 2.8903116065538312, + "learning_rate": 1.6432727417694172e-05, + "loss": 0.9538, + "step": 1955 + }, + { + "epoch": 0.3, + "grad_norm": 3.2832123605271866, + "learning_rate": 1.6428930951310895e-05, + "loss": 0.9247, + "step": 1956 + }, + { + "epoch": 0.3, + "grad_norm": 3.2273969880571656, + "learning_rate": 1.6425132904876696e-05, + "loss": 0.9876, + "step": 1957 + }, + { + "epoch": 0.3, + "grad_norm": 3.013714545246212, + "learning_rate": 1.642133327932503e-05, + "loss": 0.9537, + "step": 1958 + }, + { + "epoch": 0.3, + "grad_norm": 2.7463430437492122, + "learning_rate": 1.6417532075589733e-05, + "loss": 0.9166, + "step": 1959 + }, + { + "epoch": 0.3, + "grad_norm": 2.844332090271417, + "learning_rate": 1.6413729294605043e-05, + "loss": 0.9816, + "step": 1960 + }, + { + "epoch": 0.3, + "grad_norm": 8.087235521306631, + "learning_rate": 1.6409924937305567e-05, + "loss": 1.0865, + "step": 1961 + }, + { + "epoch": 0.3, + "grad_norm": 2.752478659387522, + "learning_rate": 1.640611900462632e-05, + "loss": 0.9745, + "step": 1962 + }, + { + "epoch": 0.3, + "grad_norm": 2.8605402043191, + "learning_rate": 1.6402311497502685e-05, + "loss": 0.9028, + "step": 1963 + }, + { + "epoch": 0.3, + "grad_norm": 2.8371013619172736, + "learning_rate": 1.6398502416870444e-05, + "loss": 0.943, + "step": 1964 + }, + { + "epoch": 0.3, + "grad_norm": 3.1775931381327656, + "learning_rate": 1.6394691763665762e-05, + "loss": 0.8965, + "step": 1965 + }, + { + "epoch": 0.3, + "grad_norm": 3.1302370672192414, + "learning_rate": 1.6390879538825188e-05, + "loss": 1.0056, + "step": 1966 + }, + { + "epoch": 0.3, + "grad_norm": 3.0799939612715064, + "learning_rate": 1.6387065743285667e-05, + "loss": 0.9704, + "step": 1967 + }, + { + "epoch": 0.3, + "grad_norm": 2.6241877817898187, + "learning_rate": 1.6383250377984515e-05, + "loss": 0.9161, + "step": 1968 + }, + { + "epoch": 0.3, + "grad_norm": 3.0591703288395715, + "learning_rate": 1.637943344385944e-05, + "loss": 0.9756, + "step": 1969 + }, + { + "epoch": 0.3, + "grad_norm": 2.780749719595347, + "learning_rate": 1.6375614941848553e-05, + "loss": 0.9215, + "step": 1970 + }, + { + "epoch": 0.3, + "grad_norm": 2.78679062321962, + "learning_rate": 1.637179487289032e-05, + "loss": 0.9447, + "step": 1971 + }, + { + "epoch": 0.3, + "grad_norm": 2.916558730513451, + "learning_rate": 1.6367973237923606e-05, + "loss": 1.017, + "step": 1972 + }, + { + "epoch": 0.3, + "grad_norm": 2.8714051172595014, + "learning_rate": 1.636415003788767e-05, + "loss": 0.9574, + "step": 1973 + }, + { + "epoch": 0.3, + "grad_norm": 2.424562176046629, + "learning_rate": 1.6360325273722148e-05, + "loss": 0.7982, + "step": 1974 + }, + { + "epoch": 0.3, + "grad_norm": 2.7338308340261883, + "learning_rate": 1.6356498946367052e-05, + "loss": 0.9081, + "step": 1975 + }, + { + "epoch": 0.3, + "grad_norm": 3.27037857350359, + "learning_rate": 1.635267105676279e-05, + "loss": 1.0226, + "step": 1976 + }, + { + "epoch": 0.3, + "grad_norm": 3.1300143332121175, + "learning_rate": 1.634884160585015e-05, + "loss": 0.9527, + "step": 1977 + }, + { + "epoch": 0.3, + "grad_norm": 3.106799123943485, + "learning_rate": 1.6345010594570303e-05, + "loss": 0.9228, + "step": 1978 + }, + { + "epoch": 0.3, + "grad_norm": 2.9397095069102734, + "learning_rate": 1.6341178023864803e-05, + "loss": 0.9816, + "step": 1979 + }, + { + "epoch": 0.3, + "grad_norm": 3.0107007075493217, + "learning_rate": 1.6337343894675594e-05, + "loss": 0.8376, + "step": 1980 + }, + { + "epoch": 0.3, + "grad_norm": 3.238057799122706, + "learning_rate": 1.6333508207944987e-05, + "loss": 0.9929, + "step": 1981 + }, + { + "epoch": 0.3, + "grad_norm": 2.767232716037717, + "learning_rate": 1.6329670964615698e-05, + "loss": 0.9193, + "step": 1982 + }, + { + "epoch": 0.3, + "grad_norm": 3.068081714153525, + "learning_rate": 1.6325832165630804e-05, + "loss": 0.9252, + "step": 1983 + }, + { + "epoch": 0.3, + "grad_norm": 10.403815270134738, + "learning_rate": 1.6321991811933778e-05, + "loss": 1.1198, + "step": 1984 + }, + { + "epoch": 0.3, + "grad_norm": 6.99459133747253, + "learning_rate": 1.6318149904468476e-05, + "loss": 1.2209, + "step": 1985 + }, + { + "epoch": 0.3, + "grad_norm": 3.3344886744220963, + "learning_rate": 1.6314306444179124e-05, + "loss": 0.9521, + "step": 1986 + }, + { + "epoch": 0.3, + "grad_norm": 8.507952799865198, + "learning_rate": 1.631046143201034e-05, + "loss": 1.1653, + "step": 1987 + }, + { + "epoch": 0.3, + "grad_norm": 3.0183257520527236, + "learning_rate": 1.6306614868907118e-05, + "loss": 0.9675, + "step": 1988 + }, + { + "epoch": 0.3, + "grad_norm": 3.2917705295673354, + "learning_rate": 1.6302766755814837e-05, + "loss": 0.9624, + "step": 1989 + }, + { + "epoch": 0.3, + "grad_norm": 2.6996186995769325, + "learning_rate": 1.6298917093679256e-05, + "loss": 0.9873, + "step": 1990 + }, + { + "epoch": 0.3, + "grad_norm": 2.92846853659815, + "learning_rate": 1.6295065883446514e-05, + "loss": 1.0206, + "step": 1991 + }, + { + "epoch": 0.3, + "grad_norm": 2.6153606760314583, + "learning_rate": 1.6291213126063127e-05, + "loss": 0.8827, + "step": 1992 + }, + { + "epoch": 0.31, + "grad_norm": 3.0482692285445476, + "learning_rate": 1.6287358822476003e-05, + "loss": 0.9584, + "step": 1993 + }, + { + "epoch": 0.31, + "grad_norm": 2.945888562890866, + "learning_rate": 1.628350297363241e-05, + "loss": 1.0334, + "step": 1994 + }, + { + "epoch": 0.31, + "grad_norm": 2.9809895023289705, + "learning_rate": 1.6279645580480016e-05, + "loss": 0.8799, + "step": 1995 + }, + { + "epoch": 0.31, + "grad_norm": 3.040358876025292, + "learning_rate": 1.6275786643966857e-05, + "loss": 0.9606, + "step": 1996 + }, + { + "epoch": 0.31, + "grad_norm": 3.120304648204235, + "learning_rate": 1.627192616504135e-05, + "loss": 0.9562, + "step": 1997 + }, + { + "epoch": 0.31, + "grad_norm": 2.8891192132781764, + "learning_rate": 1.6268064144652298e-05, + "loss": 0.9539, + "step": 1998 + }, + { + "epoch": 0.31, + "grad_norm": 12.618369413745189, + "learning_rate": 1.6264200583748872e-05, + "loss": 1.1742, + "step": 1999 + }, + { + "epoch": 0.31, + "grad_norm": 2.562754164287327, + "learning_rate": 1.6260335483280628e-05, + "loss": 0.9873, + "step": 2000 + }, + { + "epoch": 0.31, + "grad_norm": 2.7923306530773155, + "learning_rate": 1.6256468844197503e-05, + "loss": 0.906, + "step": 2001 + }, + { + "epoch": 0.31, + "grad_norm": 2.7896490661006172, + "learning_rate": 1.6252600667449803e-05, + "loss": 1.0057, + "step": 2002 + }, + { + "epoch": 0.31, + "grad_norm": 3.0281024080777756, + "learning_rate": 1.624873095398822e-05, + "loss": 0.9022, + "step": 2003 + }, + { + "epoch": 0.31, + "grad_norm": 2.881684994302156, + "learning_rate": 1.6244859704763822e-05, + "loss": 0.9479, + "step": 2004 + }, + { + "epoch": 0.31, + "grad_norm": 2.9444445700411137, + "learning_rate": 1.6240986920728047e-05, + "loss": 0.8036, + "step": 2005 + }, + { + "epoch": 0.31, + "grad_norm": 2.7270622289384363, + "learning_rate": 1.6237112602832725e-05, + "loss": 0.8149, + "step": 2006 + }, + { + "epoch": 0.31, + "grad_norm": 2.932690790849735, + "learning_rate": 1.6233236752030055e-05, + "loss": 0.894, + "step": 2007 + }, + { + "epoch": 0.31, + "grad_norm": 2.9384491189874713, + "learning_rate": 1.6229359369272604e-05, + "loss": 0.9274, + "step": 2008 + }, + { + "epoch": 0.31, + "grad_norm": 3.2749757856588406, + "learning_rate": 1.622548045551333e-05, + "loss": 1.0205, + "step": 2009 + }, + { + "epoch": 0.31, + "grad_norm": 3.2986011656446235, + "learning_rate": 1.6221600011705562e-05, + "loss": 0.9656, + "step": 2010 + }, + { + "epoch": 0.31, + "grad_norm": 2.899694894395896, + "learning_rate": 1.6217718038803004e-05, + "loss": 0.9747, + "step": 2011 + }, + { + "epoch": 0.31, + "grad_norm": 2.805963080021518, + "learning_rate": 1.621383453775973e-05, + "loss": 0.9258, + "step": 2012 + }, + { + "epoch": 0.31, + "grad_norm": 3.079907285065247, + "learning_rate": 1.6209949509530206e-05, + "loss": 1.0381, + "step": 2013 + }, + { + "epoch": 0.31, + "grad_norm": 2.85919475322715, + "learning_rate": 1.6206062955069252e-05, + "loss": 0.9665, + "step": 2014 + }, + { + "epoch": 0.31, + "grad_norm": 3.009738760174339, + "learning_rate": 1.6202174875332082e-05, + "loss": 0.9749, + "step": 2015 + }, + { + "epoch": 0.31, + "grad_norm": 2.7899818371791323, + "learning_rate": 1.619828527127427e-05, + "loss": 0.8872, + "step": 2016 + }, + { + "epoch": 0.31, + "grad_norm": 2.9113651910730733, + "learning_rate": 1.619439414385178e-05, + "loss": 0.93, + "step": 2017 + }, + { + "epoch": 0.31, + "grad_norm": 3.1385972777593194, + "learning_rate": 1.6190501494020938e-05, + "loss": 0.878, + "step": 2018 + }, + { + "epoch": 0.31, + "grad_norm": 2.902655972971181, + "learning_rate": 1.6186607322738446e-05, + "loss": 0.9324, + "step": 2019 + }, + { + "epoch": 0.31, + "grad_norm": 36.255687160999884, + "learning_rate": 1.6182711630961385e-05, + "loss": 1.2675, + "step": 2020 + }, + { + "epoch": 0.31, + "grad_norm": 2.8813953340320717, + "learning_rate": 1.6178814419647207e-05, + "loss": 0.9405, + "step": 2021 + }, + { + "epoch": 0.31, + "grad_norm": 3.146214173693879, + "learning_rate": 1.6174915689753733e-05, + "loss": 0.9028, + "step": 2022 + }, + { + "epoch": 0.31, + "grad_norm": 3.295327922343884, + "learning_rate": 1.6171015442239167e-05, + "loss": 0.8791, + "step": 2023 + }, + { + "epoch": 0.31, + "grad_norm": 2.7436426736490667, + "learning_rate": 1.6167113678062074e-05, + "loss": 0.8547, + "step": 2024 + }, + { + "epoch": 0.31, + "grad_norm": 2.983002667352024, + "learning_rate": 1.6163210398181405e-05, + "loss": 0.9734, + "step": 2025 + }, + { + "epoch": 0.31, + "grad_norm": 3.21413114511768, + "learning_rate": 1.6159305603556474e-05, + "loss": 0.9882, + "step": 2026 + }, + { + "epoch": 0.31, + "grad_norm": 2.8282094732335454, + "learning_rate": 1.615539929514697e-05, + "loss": 0.921, + "step": 2027 + }, + { + "epoch": 0.31, + "grad_norm": 3.1604216542592587, + "learning_rate": 1.615149147391295e-05, + "loss": 0.9523, + "step": 2028 + }, + { + "epoch": 0.31, + "grad_norm": 2.856666507616215, + "learning_rate": 1.614758214081486e-05, + "loss": 0.936, + "step": 2029 + }, + { + "epoch": 0.31, + "grad_norm": 2.991532447251548, + "learning_rate": 1.6143671296813487e-05, + "loss": 0.9874, + "step": 2030 + }, + { + "epoch": 0.31, + "grad_norm": 3.840642279858671, + "learning_rate": 1.6139758942870015e-05, + "loss": 0.9078, + "step": 2031 + }, + { + "epoch": 0.31, + "grad_norm": 2.9496055825239855, + "learning_rate": 1.6135845079945994e-05, + "loss": 0.9009, + "step": 2032 + }, + { + "epoch": 0.31, + "grad_norm": 3.4997572003360498, + "learning_rate": 1.6131929709003338e-05, + "loss": 0.8366, + "step": 2033 + }, + { + "epoch": 0.31, + "grad_norm": 2.854286295482546, + "learning_rate": 1.6128012831004334e-05, + "loss": 0.9281, + "step": 2034 + }, + { + "epoch": 0.31, + "grad_norm": 3.0047445073134647, + "learning_rate": 1.6124094446911643e-05, + "loss": 0.9288, + "step": 2035 + }, + { + "epoch": 0.31, + "grad_norm": 3.0747711973099534, + "learning_rate": 1.6120174557688296e-05, + "loss": 0.9777, + "step": 2036 + }, + { + "epoch": 0.31, + "grad_norm": 2.836496039165514, + "learning_rate": 1.6116253164297688e-05, + "loss": 1.0003, + "step": 2037 + }, + { + "epoch": 0.31, + "grad_norm": 2.9495829885617866, + "learning_rate": 1.611233026770359e-05, + "loss": 0.958, + "step": 2038 + }, + { + "epoch": 0.31, + "grad_norm": 4.040177556037205, + "learning_rate": 1.6108405868870138e-05, + "loss": 1.0066, + "step": 2039 + }, + { + "epoch": 0.31, + "grad_norm": 2.8392312040901073, + "learning_rate": 1.610447996876184e-05, + "loss": 0.9305, + "step": 2040 + }, + { + "epoch": 0.31, + "grad_norm": 2.858555557096998, + "learning_rate": 1.6100552568343575e-05, + "loss": 0.9561, + "step": 2041 + }, + { + "epoch": 0.31, + "grad_norm": 2.925244748188534, + "learning_rate": 1.609662366858058e-05, + "loss": 0.9858, + "step": 2042 + }, + { + "epoch": 0.31, + "grad_norm": 18.775907505689492, + "learning_rate": 1.6092693270438477e-05, + "loss": 1.1733, + "step": 2043 + }, + { + "epoch": 0.31, + "grad_norm": 2.8831230007137116, + "learning_rate": 1.6088761374883244e-05, + "loss": 0.9558, + "step": 2044 + }, + { + "epoch": 0.31, + "grad_norm": 3.0027686259492707, + "learning_rate": 1.608482798288123e-05, + "loss": 0.9855, + "step": 2045 + }, + { + "epoch": 0.31, + "grad_norm": 3.3576253702197185, + "learning_rate": 1.6080893095399154e-05, + "loss": 0.996, + "step": 2046 + }, + { + "epoch": 0.31, + "grad_norm": 2.9658837524613344, + "learning_rate": 1.6076956713404096e-05, + "loss": 0.9021, + "step": 2047 + }, + { + "epoch": 0.31, + "grad_norm": 3.1626601985594966, + "learning_rate": 1.607301883786352e-05, + "loss": 1.0554, + "step": 2048 + }, + { + "epoch": 0.31, + "grad_norm": 3.0553939427426196, + "learning_rate": 1.6069079469745232e-05, + "loss": 0.7729, + "step": 2049 + }, + { + "epoch": 0.31, + "grad_norm": 2.7411280804515896, + "learning_rate": 1.6065138610017425e-05, + "loss": 0.975, + "step": 2050 + }, + { + "epoch": 0.31, + "grad_norm": 2.774742963619658, + "learning_rate": 1.606119625964865e-05, + "loss": 0.7768, + "step": 2051 + }, + { + "epoch": 0.31, + "grad_norm": 38.98687845629915, + "learning_rate": 1.605725241960783e-05, + "loss": 1.1693, + "step": 2052 + }, + { + "epoch": 0.31, + "grad_norm": 2.7458443144950424, + "learning_rate": 1.6053307090864247e-05, + "loss": 0.9069, + "step": 2053 + }, + { + "epoch": 0.31, + "grad_norm": 2.919021481663924, + "learning_rate": 1.604936027438755e-05, + "loss": 0.9223, + "step": 2054 + }, + { + "epoch": 0.31, + "grad_norm": 2.948893360420509, + "learning_rate": 1.604541197114776e-05, + "loss": 0.94, + "step": 2055 + }, + { + "epoch": 0.31, + "grad_norm": 2.8116682169195295, + "learning_rate": 1.6041462182115257e-05, + "loss": 0.9397, + "step": 2056 + }, + { + "epoch": 0.31, + "grad_norm": 2.701273752635258, + "learning_rate": 1.603751090826079e-05, + "loss": 0.9303, + "step": 2057 + }, + { + "epoch": 0.32, + "grad_norm": 2.9204076878350715, + "learning_rate": 1.6033558150555468e-05, + "loss": 0.9636, + "step": 2058 + }, + { + "epoch": 0.32, + "grad_norm": 2.9249074674731808, + "learning_rate": 1.602960390997077e-05, + "loss": 0.8089, + "step": 2059 + }, + { + "epoch": 0.32, + "grad_norm": 2.8604826058914865, + "learning_rate": 1.602564818747854e-05, + "loss": 0.9502, + "step": 2060 + }, + { + "epoch": 0.32, + "grad_norm": 2.642150556596377, + "learning_rate": 1.602169098405098e-05, + "loss": 0.8838, + "step": 2061 + }, + { + "epoch": 0.32, + "grad_norm": 2.671658746125172, + "learning_rate": 1.601773230066066e-05, + "loss": 0.8714, + "step": 2062 + }, + { + "epoch": 0.32, + "grad_norm": 3.0325050799154147, + "learning_rate": 1.6013772138280516e-05, + "loss": 0.945, + "step": 2063 + }, + { + "epoch": 0.32, + "grad_norm": 3.0000570968848552, + "learning_rate": 1.600981049788384e-05, + "loss": 1.0131, + "step": 2064 + }, + { + "epoch": 0.32, + "grad_norm": 2.904729128627338, + "learning_rate": 1.6005847380444296e-05, + "loss": 0.9075, + "step": 2065 + }, + { + "epoch": 0.32, + "grad_norm": 2.716502377018324, + "learning_rate": 1.6001882786935906e-05, + "loss": 0.8516, + "step": 2066 + }, + { + "epoch": 0.32, + "grad_norm": 3.0878573330752905, + "learning_rate": 1.599791671833306e-05, + "loss": 0.8876, + "step": 2067 + }, + { + "epoch": 0.32, + "grad_norm": 3.0105827423255227, + "learning_rate": 1.5993949175610496e-05, + "loss": 0.9043, + "step": 2068 + }, + { + "epoch": 0.32, + "grad_norm": 2.8610460528157593, + "learning_rate": 1.5989980159743336e-05, + "loss": 0.9194, + "step": 2069 + }, + { + "epoch": 0.32, + "grad_norm": 3.2183249053149567, + "learning_rate": 1.5986009671707048e-05, + "loss": 1.097, + "step": 2070 + }, + { + "epoch": 0.32, + "grad_norm": 2.866509114415062, + "learning_rate": 1.5982037712477466e-05, + "loss": 0.9069, + "step": 2071 + }, + { + "epoch": 0.32, + "grad_norm": 2.9793432999723133, + "learning_rate": 1.5978064283030784e-05, + "loss": 1.0063, + "step": 2072 + }, + { + "epoch": 0.32, + "grad_norm": 3.366780686052711, + "learning_rate": 1.597408938434356e-05, + "loss": 0.9817, + "step": 2073 + }, + { + "epoch": 0.32, + "grad_norm": 2.7600076399826334, + "learning_rate": 1.5970113017392724e-05, + "loss": 0.9071, + "step": 2074 + }, + { + "epoch": 0.32, + "grad_norm": 3.1655593506091955, + "learning_rate": 1.5966135183155542e-05, + "loss": 0.8892, + "step": 2075 + }, + { + "epoch": 0.32, + "grad_norm": 3.04836429836989, + "learning_rate": 1.5962155882609657e-05, + "loss": 0.9741, + "step": 2076 + }, + { + "epoch": 0.32, + "grad_norm": 8.977526763468514, + "learning_rate": 1.5958175116733074e-05, + "loss": 1.2192, + "step": 2077 + }, + { + "epoch": 0.32, + "grad_norm": 2.726548999880068, + "learning_rate": 1.595419288650415e-05, + "loss": 0.9317, + "step": 2078 + }, + { + "epoch": 0.32, + "grad_norm": 3.011217259674851, + "learning_rate": 1.5950209192901603e-05, + "loss": 0.9827, + "step": 2079 + }, + { + "epoch": 0.32, + "grad_norm": 3.29525534444165, + "learning_rate": 1.594622403690452e-05, + "loss": 0.9244, + "step": 2080 + }, + { + "epoch": 0.32, + "grad_norm": 3.229445003686063, + "learning_rate": 1.5942237419492334e-05, + "loss": 0.9362, + "step": 2081 + }, + { + "epoch": 0.32, + "grad_norm": 2.9204486236688245, + "learning_rate": 1.5938249341644847e-05, + "loss": 0.9227, + "step": 2082 + }, + { + "epoch": 0.32, + "grad_norm": 2.927607260082721, + "learning_rate": 1.5934259804342218e-05, + "loss": 0.9349, + "step": 2083 + }, + { + "epoch": 0.32, + "grad_norm": 2.837295266420575, + "learning_rate": 1.593026880856496e-05, + "loss": 0.8942, + "step": 2084 + }, + { + "epoch": 0.32, + "grad_norm": 2.651081252949924, + "learning_rate": 1.5926276355293948e-05, + "loss": 0.8587, + "step": 2085 + }, + { + "epoch": 0.32, + "grad_norm": 8.857579788362404, + "learning_rate": 1.5922282445510422e-05, + "loss": 1.1825, + "step": 2086 + }, + { + "epoch": 0.32, + "grad_norm": 2.641208085873871, + "learning_rate": 1.5918287080195962e-05, + "loss": 0.9027, + "step": 2087 + }, + { + "epoch": 0.32, + "grad_norm": 2.80746351383302, + "learning_rate": 1.5914290260332524e-05, + "loss": 0.9674, + "step": 2088 + }, + { + "epoch": 0.32, + "grad_norm": 3.0202119253982875, + "learning_rate": 1.5910291986902415e-05, + "loss": 1.0477, + "step": 2089 + }, + { + "epoch": 0.32, + "grad_norm": 2.7121329518000694, + "learning_rate": 1.590629226088829e-05, + "loss": 0.9407, + "step": 2090 + }, + { + "epoch": 0.32, + "grad_norm": 2.8600057410215216, + "learning_rate": 1.5902291083273182e-05, + "loss": 0.812, + "step": 2091 + }, + { + "epoch": 0.32, + "grad_norm": 2.95858748031997, + "learning_rate": 1.589828845504046e-05, + "loss": 0.8556, + "step": 2092 + }, + { + "epoch": 0.32, + "grad_norm": 5.746810792832964, + "learning_rate": 1.589428437717386e-05, + "loss": 1.1038, + "step": 2093 + }, + { + "epoch": 0.32, + "grad_norm": 2.917450104375745, + "learning_rate": 1.5890278850657468e-05, + "loss": 0.963, + "step": 2094 + }, + { + "epoch": 0.32, + "grad_norm": 3.148069985975481, + "learning_rate": 1.5886271876475733e-05, + "loss": 1.0287, + "step": 2095 + }, + { + "epoch": 0.32, + "grad_norm": 2.919608954475873, + "learning_rate": 1.588226345561346e-05, + "loss": 1.0119, + "step": 2096 + }, + { + "epoch": 0.32, + "grad_norm": 3.0119482294019226, + "learning_rate": 1.5878253589055807e-05, + "loss": 0.9433, + "step": 2097 + }, + { + "epoch": 0.32, + "grad_norm": 2.745844362970861, + "learning_rate": 1.587424227778828e-05, + "loss": 0.9186, + "step": 2098 + }, + { + "epoch": 0.32, + "grad_norm": 2.7795582081388304, + "learning_rate": 1.587022952279675e-05, + "loss": 0.939, + "step": 2099 + }, + { + "epoch": 0.32, + "grad_norm": 2.9849086913750154, + "learning_rate": 1.5866215325067442e-05, + "loss": 0.9054, + "step": 2100 + }, + { + "epoch": 0.32, + "grad_norm": 2.830979510479411, + "learning_rate": 1.586219968558693e-05, + "loss": 0.9651, + "step": 2101 + }, + { + "epoch": 0.32, + "grad_norm": 2.921574916421984, + "learning_rate": 1.5858182605342146e-05, + "loss": 0.8421, + "step": 2102 + }, + { + "epoch": 0.32, + "grad_norm": 2.8984836166465087, + "learning_rate": 1.5854164085320375e-05, + "loss": 0.9944, + "step": 2103 + }, + { + "epoch": 0.32, + "grad_norm": 2.96415686502195, + "learning_rate": 1.5850144126509263e-05, + "loss": 0.8441, + "step": 2104 + }, + { + "epoch": 0.32, + "grad_norm": 8.399398202622699, + "learning_rate": 1.5846122729896796e-05, + "loss": 1.0563, + "step": 2105 + }, + { + "epoch": 0.32, + "grad_norm": 2.8074715800805694, + "learning_rate": 1.5842099896471322e-05, + "loss": 0.8831, + "step": 2106 + }, + { + "epoch": 0.32, + "grad_norm": 3.036041778722464, + "learning_rate": 1.583807562722154e-05, + "loss": 0.9816, + "step": 2107 + }, + { + "epoch": 0.32, + "grad_norm": 3.0284113302543822, + "learning_rate": 1.5834049923136508e-05, + "loss": 1.0098, + "step": 2108 + }, + { + "epoch": 0.32, + "grad_norm": 2.7496789939771475, + "learning_rate": 1.5830022785205623e-05, + "loss": 0.8986, + "step": 2109 + }, + { + "epoch": 0.32, + "grad_norm": 2.953041494056494, + "learning_rate": 1.5825994214418646e-05, + "loss": 0.8537, + "step": 2110 + }, + { + "epoch": 0.32, + "grad_norm": 2.659977164215324, + "learning_rate": 1.582196421176569e-05, + "loss": 0.9785, + "step": 2111 + }, + { + "epoch": 0.32, + "grad_norm": 3.204140063929111, + "learning_rate": 1.5817932778237217e-05, + "loss": 0.9483, + "step": 2112 + }, + { + "epoch": 0.32, + "grad_norm": 2.6887868024183654, + "learning_rate": 1.581389991482403e-05, + "loss": 0.9564, + "step": 2113 + }, + { + "epoch": 0.32, + "grad_norm": 2.7755813582969027, + "learning_rate": 1.5809865622517303e-05, + "loss": 0.9934, + "step": 2114 + }, + { + "epoch": 0.32, + "grad_norm": 2.845892550045217, + "learning_rate": 1.5805829902308555e-05, + "loss": 0.9452, + "step": 2115 + }, + { + "epoch": 0.32, + "grad_norm": 2.853050815785061, + "learning_rate": 1.580179275518964e-05, + "loss": 0.93, + "step": 2116 + }, + { + "epoch": 0.32, + "grad_norm": 3.2733220558775726, + "learning_rate": 1.5797754182152786e-05, + "loss": 1.0112, + "step": 2117 + }, + { + "epoch": 0.32, + "grad_norm": 2.979523192746589, + "learning_rate": 1.579371418419056e-05, + "loss": 0.8896, + "step": 2118 + }, + { + "epoch": 0.32, + "grad_norm": 3.4450783956006923, + "learning_rate": 1.578967276229588e-05, + "loss": 1.1442, + "step": 2119 + }, + { + "epoch": 0.32, + "grad_norm": 2.864541500830383, + "learning_rate": 1.5785629917462005e-05, + "loss": 0.8686, + "step": 2120 + }, + { + "epoch": 0.32, + "grad_norm": 2.8665462714350847, + "learning_rate": 1.5781585650682565e-05, + "loss": 0.8522, + "step": 2121 + }, + { + "epoch": 0.32, + "grad_norm": 7.2402800359131625, + "learning_rate": 1.5777539962951526e-05, + "loss": 1.0849, + "step": 2122 + }, + { + "epoch": 0.32, + "grad_norm": 2.9401554666052516, + "learning_rate": 1.5773492855263196e-05, + "loss": 0.9797, + "step": 2123 + }, + { + "epoch": 0.33, + "grad_norm": 2.826087551554652, + "learning_rate": 1.5769444328612248e-05, + "loss": 0.9366, + "step": 2124 + }, + { + "epoch": 0.33, + "grad_norm": 2.9748307430738388, + "learning_rate": 1.5765394383993693e-05, + "loss": 0.9461, + "step": 2125 + }, + { + "epoch": 0.33, + "grad_norm": 2.917612348057046, + "learning_rate": 1.57613430224029e-05, + "loss": 0.9599, + "step": 2126 + }, + { + "epoch": 0.33, + "grad_norm": 2.9492782151559545, + "learning_rate": 1.5757290244835566e-05, + "loss": 0.9607, + "step": 2127 + }, + { + "epoch": 0.33, + "grad_norm": 3.104337592341858, + "learning_rate": 1.5753236052287766e-05, + "loss": 0.9517, + "step": 2128 + }, + { + "epoch": 0.33, + "grad_norm": 2.8612025981659652, + "learning_rate": 1.57491804457559e-05, + "loss": 1.0167, + "step": 2129 + }, + { + "epoch": 0.33, + "grad_norm": 2.8123553472731198, + "learning_rate": 1.5745123426236716e-05, + "loss": 0.9264, + "step": 2130 + }, + { + "epoch": 0.33, + "grad_norm": 2.830018363337432, + "learning_rate": 1.5741064994727327e-05, + "loss": 0.8245, + "step": 2131 + }, + { + "epoch": 0.33, + "grad_norm": 2.8110191476042683, + "learning_rate": 1.5737005152225176e-05, + "loss": 0.8709, + "step": 2132 + }, + { + "epoch": 0.33, + "grad_norm": 2.8502055155928923, + "learning_rate": 1.573294389972806e-05, + "loss": 1.0445, + "step": 2133 + }, + { + "epoch": 0.33, + "grad_norm": 3.0421641138681763, + "learning_rate": 1.5728881238234118e-05, + "loss": 1.0078, + "step": 2134 + }, + { + "epoch": 0.33, + "grad_norm": 2.80236736793243, + "learning_rate": 1.572481716874184e-05, + "loss": 0.9258, + "step": 2135 + }, + { + "epoch": 0.33, + "grad_norm": 3.1396454859019043, + "learning_rate": 1.572075169225006e-05, + "loss": 1.0275, + "step": 2136 + }, + { + "epoch": 0.33, + "grad_norm": 3.01442584800464, + "learning_rate": 1.571668480975796e-05, + "loss": 0.9114, + "step": 2137 + }, + { + "epoch": 0.33, + "grad_norm": 2.9439289256223495, + "learning_rate": 1.5712616522265062e-05, + "loss": 0.9387, + "step": 2138 + }, + { + "epoch": 0.33, + "grad_norm": 2.86630479136464, + "learning_rate": 1.5708546830771242e-05, + "loss": 0.8924, + "step": 2139 + }, + { + "epoch": 0.33, + "grad_norm": 2.9053380514019573, + "learning_rate": 1.5704475736276708e-05, + "loss": 0.9631, + "step": 2140 + }, + { + "epoch": 0.33, + "grad_norm": 3.368103684245844, + "learning_rate": 1.5700403239782035e-05, + "loss": 0.913, + "step": 2141 + }, + { + "epoch": 0.33, + "grad_norm": 2.857579536349821, + "learning_rate": 1.569632934228811e-05, + "loss": 1.0226, + "step": 2142 + }, + { + "epoch": 0.33, + "grad_norm": 3.0564847208325254, + "learning_rate": 1.56922540447962e-05, + "loss": 0.9765, + "step": 2143 + }, + { + "epoch": 0.33, + "grad_norm": 2.9287560858591966, + "learning_rate": 1.568817734830789e-05, + "loss": 0.8979, + "step": 2144 + }, + { + "epoch": 0.33, + "grad_norm": 2.8933213409616583, + "learning_rate": 1.5684099253825117e-05, + "loss": 0.9059, + "step": 2145 + }, + { + "epoch": 0.33, + "grad_norm": 2.639794473491297, + "learning_rate": 1.5680019762350162e-05, + "loss": 0.9327, + "step": 2146 + }, + { + "epoch": 0.33, + "grad_norm": 2.891538719719891, + "learning_rate": 1.567593887488565e-05, + "loss": 1.0255, + "step": 2147 + }, + { + "epoch": 0.33, + "grad_norm": 2.8097333302096015, + "learning_rate": 1.5671856592434557e-05, + "loss": 0.9987, + "step": 2148 + }, + { + "epoch": 0.33, + "grad_norm": 2.788275914343878, + "learning_rate": 1.5667772916000182e-05, + "loss": 0.8716, + "step": 2149 + }, + { + "epoch": 0.33, + "grad_norm": 2.968827801790018, + "learning_rate": 1.5663687846586183e-05, + "loss": 0.8922, + "step": 2150 + }, + { + "epoch": 0.33, + "grad_norm": 2.9789373637634182, + "learning_rate": 1.5659601385196555e-05, + "loss": 0.7631, + "step": 2151 + }, + { + "epoch": 0.33, + "grad_norm": 2.641680662522926, + "learning_rate": 1.565551353283564e-05, + "loss": 0.8783, + "step": 2152 + }, + { + "epoch": 0.33, + "grad_norm": 2.992765042257369, + "learning_rate": 1.565142429050811e-05, + "loss": 0.9534, + "step": 2153 + }, + { + "epoch": 0.33, + "grad_norm": 2.912789857674366, + "learning_rate": 1.564733365921899e-05, + "loss": 0.9623, + "step": 2154 + }, + { + "epoch": 0.33, + "grad_norm": 3.1939374684533623, + "learning_rate": 1.564324163997364e-05, + "loss": 0.9484, + "step": 2155 + }, + { + "epoch": 0.33, + "grad_norm": 2.910399280383399, + "learning_rate": 1.563914823377777e-05, + "loss": 1.05, + "step": 2156 + }, + { + "epoch": 0.33, + "grad_norm": 2.7829633029832346, + "learning_rate": 1.5635053441637416e-05, + "loss": 0.899, + "step": 2157 + }, + { + "epoch": 0.33, + "grad_norm": 2.8483703755801475, + "learning_rate": 1.563095726455897e-05, + "loss": 0.9353, + "step": 2158 + }, + { + "epoch": 0.33, + "grad_norm": 3.1605453107869805, + "learning_rate": 1.5626859703549153e-05, + "loss": 0.9756, + "step": 2159 + }, + { + "epoch": 0.33, + "grad_norm": 2.6579454798900675, + "learning_rate": 1.5622760759615033e-05, + "loss": 0.9316, + "step": 2160 + }, + { + "epoch": 0.33, + "grad_norm": 2.895988425803534, + "learning_rate": 1.561866043376401e-05, + "loss": 0.8316, + "step": 2161 + }, + { + "epoch": 0.33, + "grad_norm": 3.1091023855621067, + "learning_rate": 1.5614558727003838e-05, + "loss": 0.8095, + "step": 2162 + }, + { + "epoch": 0.33, + "grad_norm": 2.723661965840028, + "learning_rate": 1.56104556403426e-05, + "loss": 0.9086, + "step": 2163 + }, + { + "epoch": 0.33, + "grad_norm": 3.1270182582775274, + "learning_rate": 1.560635117478871e-05, + "loss": 0.9554, + "step": 2164 + }, + { + "epoch": 0.33, + "grad_norm": 2.9598489071966627, + "learning_rate": 1.560224533135094e-05, + "loss": 0.864, + "step": 2165 + }, + { + "epoch": 0.33, + "grad_norm": 2.912429840064479, + "learning_rate": 1.559813811103839e-05, + "loss": 0.9377, + "step": 2166 + }, + { + "epoch": 0.33, + "grad_norm": 2.8075984318631653, + "learning_rate": 1.55940295148605e-05, + "loss": 0.9434, + "step": 2167 + }, + { + "epoch": 0.33, + "grad_norm": 2.9689893677610657, + "learning_rate": 1.5589919543827038e-05, + "loss": 0.8091, + "step": 2168 + }, + { + "epoch": 0.33, + "grad_norm": 2.7527341533964913, + "learning_rate": 1.5585808198948135e-05, + "loss": 0.9535, + "step": 2169 + }, + { + "epoch": 0.33, + "grad_norm": 3.240281727452343, + "learning_rate": 1.5581695481234234e-05, + "loss": 0.9207, + "step": 2170 + }, + { + "epoch": 0.33, + "grad_norm": 2.9346697305866134, + "learning_rate": 1.5577581391696125e-05, + "loss": 0.9847, + "step": 2171 + }, + { + "epoch": 0.33, + "grad_norm": 3.2363181840874184, + "learning_rate": 1.5573465931344943e-05, + "loss": 0.9466, + "step": 2172 + }, + { + "epoch": 0.33, + "grad_norm": 3.1311202216405905, + "learning_rate": 1.5569349101192147e-05, + "loss": 0.7614, + "step": 2173 + }, + { + "epoch": 0.33, + "grad_norm": 2.9897802189566147, + "learning_rate": 1.556523090224954e-05, + "loss": 0.9406, + "step": 2174 + }, + { + "epoch": 0.33, + "grad_norm": 2.866293002770438, + "learning_rate": 1.5561111335529262e-05, + "loss": 0.856, + "step": 2175 + }, + { + "epoch": 0.33, + "grad_norm": 2.7666487683268266, + "learning_rate": 1.5556990402043785e-05, + "loss": 0.8607, + "step": 2176 + }, + { + "epoch": 0.33, + "grad_norm": 3.206477614388165, + "learning_rate": 1.5552868102805914e-05, + "loss": 0.9272, + "step": 2177 + }, + { + "epoch": 0.33, + "grad_norm": 2.703502589931452, + "learning_rate": 1.5548744438828806e-05, + "loss": 0.8436, + "step": 2178 + }, + { + "epoch": 0.33, + "grad_norm": 2.9045504062867398, + "learning_rate": 1.5544619411125932e-05, + "loss": 0.9542, + "step": 2179 + }, + { + "epoch": 0.33, + "grad_norm": 3.4383119642842557, + "learning_rate": 1.554049302071111e-05, + "loss": 0.9043, + "step": 2180 + }, + { + "epoch": 0.33, + "grad_norm": 2.749910787471238, + "learning_rate": 1.5536365268598495e-05, + "loss": 0.9428, + "step": 2181 + }, + { + "epoch": 0.33, + "grad_norm": 3.0887495819218884, + "learning_rate": 1.5532236155802568e-05, + "loss": 0.8238, + "step": 2182 + }, + { + "epoch": 0.33, + "grad_norm": 2.7762753591598526, + "learning_rate": 1.5528105683338153e-05, + "loss": 0.9144, + "step": 2183 + }, + { + "epoch": 0.33, + "grad_norm": 2.8725126638936267, + "learning_rate": 1.5523973852220403e-05, + "loss": 0.8462, + "step": 2184 + }, + { + "epoch": 0.33, + "grad_norm": 2.9885596304544615, + "learning_rate": 1.5519840663464803e-05, + "loss": 0.8969, + "step": 2185 + }, + { + "epoch": 0.33, + "grad_norm": 3.473266378506294, + "learning_rate": 1.5515706118087178e-05, + "loss": 1.0156, + "step": 2186 + }, + { + "epoch": 0.33, + "grad_norm": 7.417240186483622, + "learning_rate": 1.5511570217103686e-05, + "loss": 1.1446, + "step": 2187 + }, + { + "epoch": 0.33, + "grad_norm": 3.084298206467197, + "learning_rate": 1.550743296153081e-05, + "loss": 0.9202, + "step": 2188 + }, + { + "epoch": 0.34, + "grad_norm": 2.9819144441801195, + "learning_rate": 1.5503294352385376e-05, + "loss": 0.9559, + "step": 2189 + }, + { + "epoch": 0.34, + "grad_norm": 8.69663067192699, + "learning_rate": 1.5499154390684534e-05, + "loss": 1.1093, + "step": 2190 + }, + { + "epoch": 0.34, + "grad_norm": 2.9797669448476896, + "learning_rate": 1.5495013077445773e-05, + "loss": 0.9774, + "step": 2191 + }, + { + "epoch": 0.34, + "grad_norm": 3.1250248267064844, + "learning_rate": 1.5490870413686913e-05, + "loss": 0.8908, + "step": 2192 + }, + { + "epoch": 0.34, + "grad_norm": 4.978205398929475, + "learning_rate": 1.54867264004261e-05, + "loss": 1.0935, + "step": 2193 + }, + { + "epoch": 0.34, + "grad_norm": 3.3254960105618383, + "learning_rate": 1.5482581038681817e-05, + "loss": 0.9489, + "step": 2194 + }, + { + "epoch": 0.34, + "grad_norm": 3.423818665589391, + "learning_rate": 1.5478434329472883e-05, + "loss": 0.9161, + "step": 2195 + }, + { + "epoch": 0.34, + "grad_norm": 4.997633325042334, + "learning_rate": 1.5474286273818437e-05, + "loss": 1.0483, + "step": 2196 + }, + { + "epoch": 0.34, + "grad_norm": 2.901737431846852, + "learning_rate": 1.547013687273796e-05, + "loss": 0.8797, + "step": 2197 + }, + { + "epoch": 0.34, + "grad_norm": 2.9155767889936506, + "learning_rate": 1.5465986127251253e-05, + "loss": 1.0351, + "step": 2198 + }, + { + "epoch": 0.34, + "grad_norm": 2.893278694107767, + "learning_rate": 1.546183403837845e-05, + "loss": 0.9258, + "step": 2199 + }, + { + "epoch": 0.34, + "grad_norm": 2.995767480013683, + "learning_rate": 1.545768060714003e-05, + "loss": 0.8585, + "step": 2200 + }, + { + "epoch": 0.34, + "grad_norm": 2.9454418666606514, + "learning_rate": 1.5453525834556784e-05, + "loss": 0.9562, + "step": 2201 + }, + { + "epoch": 0.34, + "grad_norm": 2.743360233087499, + "learning_rate": 1.5449369721649835e-05, + "loss": 0.8992, + "step": 2202 + }, + { + "epoch": 0.34, + "grad_norm": 3.0252718804825993, + "learning_rate": 1.5445212269440644e-05, + "loss": 1.0418, + "step": 2203 + }, + { + "epoch": 0.34, + "grad_norm": 2.8837846598545855, + "learning_rate": 1.544105347895099e-05, + "loss": 0.9565, + "step": 2204 + }, + { + "epoch": 0.34, + "grad_norm": 3.2789960918949608, + "learning_rate": 1.5436893351202993e-05, + "loss": 0.9387, + "step": 2205 + }, + { + "epoch": 0.34, + "grad_norm": 6.385976907084884, + "learning_rate": 1.5432731887219094e-05, + "loss": 1.1257, + "step": 2206 + }, + { + "epoch": 0.34, + "grad_norm": 2.8727045249703735, + "learning_rate": 1.5428569088022067e-05, + "loss": 0.9819, + "step": 2207 + }, + { + "epoch": 0.34, + "grad_norm": 2.9735127238030588, + "learning_rate": 1.5424404954635e-05, + "loss": 0.9494, + "step": 2208 + }, + { + "epoch": 0.34, + "grad_norm": 3.0445408913692784, + "learning_rate": 1.5420239488081335e-05, + "loss": 0.8285, + "step": 2209 + }, + { + "epoch": 0.34, + "grad_norm": 7.763826104396605, + "learning_rate": 1.5416072689384818e-05, + "loss": 1.1142, + "step": 2210 + }, + { + "epoch": 0.34, + "grad_norm": 3.061026010845337, + "learning_rate": 1.5411904559569536e-05, + "loss": 0.8954, + "step": 2211 + }, + { + "epoch": 0.34, + "grad_norm": 3.084396939692579, + "learning_rate": 1.540773509965989e-05, + "loss": 0.9391, + "step": 2212 + }, + { + "epoch": 0.34, + "grad_norm": 2.8981176045057295, + "learning_rate": 1.5403564310680627e-05, + "loss": 0.841, + "step": 2213 + }, + { + "epoch": 0.34, + "grad_norm": 2.7909013188405085, + "learning_rate": 1.5399392193656802e-05, + "loss": 0.9828, + "step": 2214 + }, + { + "epoch": 0.34, + "grad_norm": 2.957934625908896, + "learning_rate": 1.5395218749613808e-05, + "loss": 0.9562, + "step": 2215 + }, + { + "epoch": 0.34, + "grad_norm": 2.752532249725703, + "learning_rate": 1.5391043979577364e-05, + "loss": 0.9208, + "step": 2216 + }, + { + "epoch": 0.34, + "grad_norm": 2.9615314376475936, + "learning_rate": 1.5386867884573505e-05, + "loss": 0.9258, + "step": 2217 + }, + { + "epoch": 0.34, + "grad_norm": 3.010242558942028, + "learning_rate": 1.53826904656286e-05, + "loss": 0.9155, + "step": 2218 + }, + { + "epoch": 0.34, + "grad_norm": 3.2902012119762647, + "learning_rate": 1.537851172376934e-05, + "loss": 0.9412, + "step": 2219 + }, + { + "epoch": 0.34, + "grad_norm": 2.9225946084002095, + "learning_rate": 1.5374331660022744e-05, + "loss": 0.9045, + "step": 2220 + }, + { + "epoch": 0.34, + "grad_norm": 2.939000118464009, + "learning_rate": 1.537015027541616e-05, + "loss": 0.9167, + "step": 2221 + }, + { + "epoch": 0.34, + "grad_norm": 2.8952624644975042, + "learning_rate": 1.5365967570977244e-05, + "loss": 0.9866, + "step": 2222 + }, + { + "epoch": 0.34, + "grad_norm": 2.9487056282258512, + "learning_rate": 1.5361783547733997e-05, + "loss": 0.9799, + "step": 2223 + }, + { + "epoch": 0.34, + "grad_norm": 3.0272505909958296, + "learning_rate": 1.5357598206714726e-05, + "loss": 0.8923, + "step": 2224 + }, + { + "epoch": 0.34, + "grad_norm": 2.9581269346386927, + "learning_rate": 1.535341154894808e-05, + "loss": 0.9712, + "step": 2225 + }, + { + "epoch": 0.34, + "grad_norm": 2.6448689391213436, + "learning_rate": 1.5349223575463015e-05, + "loss": 0.9218, + "step": 2226 + }, + { + "epoch": 0.34, + "grad_norm": 3.7704869063269126, + "learning_rate": 1.534503428728882e-05, + "loss": 0.9657, + "step": 2227 + }, + { + "epoch": 0.34, + "grad_norm": 2.663305732820632, + "learning_rate": 1.5340843685455105e-05, + "loss": 0.8638, + "step": 2228 + }, + { + "epoch": 0.34, + "grad_norm": 2.90419193223228, + "learning_rate": 1.5336651770991806e-05, + "loss": 0.9674, + "step": 2229 + }, + { + "epoch": 0.34, + "grad_norm": 2.888348711422464, + "learning_rate": 1.533245854492917e-05, + "loss": 0.8858, + "step": 2230 + }, + { + "epoch": 0.34, + "grad_norm": 28.443076215648286, + "learning_rate": 1.5328264008297774e-05, + "loss": 1.1028, + "step": 2231 + }, + { + "epoch": 0.34, + "grad_norm": 2.67306035825745, + "learning_rate": 1.5324068162128524e-05, + "loss": 0.9006, + "step": 2232 + }, + { + "epoch": 0.34, + "grad_norm": 2.883306393381524, + "learning_rate": 1.5319871007452643e-05, + "loss": 0.9638, + "step": 2233 + }, + { + "epoch": 0.34, + "grad_norm": 2.9956980307536196, + "learning_rate": 1.5315672545301668e-05, + "loss": 0.8938, + "step": 2234 + }, + { + "epoch": 0.34, + "grad_norm": 2.8465615901796655, + "learning_rate": 1.531147277670746e-05, + "loss": 0.9024, + "step": 2235 + }, + { + "epoch": 0.34, + "grad_norm": 3.0133716494370693, + "learning_rate": 1.5307271702702215e-05, + "loss": 0.8917, + "step": 2236 + }, + { + "epoch": 0.34, + "grad_norm": 2.7472224187281222, + "learning_rate": 1.530306932431843e-05, + "loss": 0.8313, + "step": 2237 + }, + { + "epoch": 0.34, + "grad_norm": 2.9205674411797533, + "learning_rate": 1.529886564258894e-05, + "loss": 0.9551, + "step": 2238 + }, + { + "epoch": 0.34, + "grad_norm": 2.8579615936508076, + "learning_rate": 1.529466065854688e-05, + "loss": 0.9034, + "step": 2239 + }, + { + "epoch": 0.34, + "grad_norm": 3.077793196274853, + "learning_rate": 1.5290454373225728e-05, + "loss": 0.9184, + "step": 2240 + }, + { + "epoch": 0.34, + "grad_norm": 2.8593478786344995, + "learning_rate": 1.528624678765927e-05, + "loss": 1.0188, + "step": 2241 + }, + { + "epoch": 0.34, + "grad_norm": 2.786932569982691, + "learning_rate": 1.528203790288161e-05, + "loss": 0.8707, + "step": 2242 + }, + { + "epoch": 0.34, + "grad_norm": 2.986711280532268, + "learning_rate": 1.5277827719927168e-05, + "loss": 1.0005, + "step": 2243 + }, + { + "epoch": 0.34, + "grad_norm": 2.8440787062164885, + "learning_rate": 1.5273616239830697e-05, + "loss": 0.9213, + "step": 2244 + }, + { + "epoch": 0.34, + "grad_norm": 7.778194347417419, + "learning_rate": 1.5269403463627263e-05, + "loss": 1.0728, + "step": 2245 + }, + { + "epoch": 0.34, + "grad_norm": 3.2806136694782992, + "learning_rate": 1.5265189392352238e-05, + "loss": 0.9671, + "step": 2246 + }, + { + "epoch": 0.34, + "grad_norm": 3.2636437257793274, + "learning_rate": 1.5260974027041328e-05, + "loss": 0.9756, + "step": 2247 + }, + { + "epoch": 0.34, + "grad_norm": 2.7949362496071135, + "learning_rate": 1.5256757368730557e-05, + "loss": 1.0169, + "step": 2248 + }, + { + "epoch": 0.34, + "grad_norm": 2.883127148241771, + "learning_rate": 1.5252539418456253e-05, + "loss": 0.9109, + "step": 2249 + }, + { + "epoch": 0.34, + "grad_norm": 3.079927358289439, + "learning_rate": 1.5248320177255074e-05, + "loss": 0.9541, + "step": 2250 + }, + { + "epoch": 0.34, + "grad_norm": 3.066797245053238, + "learning_rate": 1.5244099646163987e-05, + "loss": 1.009, + "step": 2251 + }, + { + "epoch": 0.34, + "grad_norm": 3.196708284476865, + "learning_rate": 1.5239877826220286e-05, + "loss": 0.9755, + "step": 2252 + }, + { + "epoch": 0.34, + "grad_norm": 2.972529111217515, + "learning_rate": 1.5235654718461572e-05, + "loss": 0.8432, + "step": 2253 + }, + { + "epoch": 0.35, + "grad_norm": 2.6898726536996396, + "learning_rate": 1.5231430323925768e-05, + "loss": 0.8175, + "step": 2254 + }, + { + "epoch": 0.35, + "grad_norm": 3.052173706002016, + "learning_rate": 1.5227204643651112e-05, + "loss": 0.979, + "step": 2255 + }, + { + "epoch": 0.35, + "grad_norm": 3.370484652099961, + "learning_rate": 1.5222977678676159e-05, + "loss": 0.955, + "step": 2256 + }, + { + "epoch": 0.35, + "grad_norm": 2.857037218718541, + "learning_rate": 1.5218749430039772e-05, + "loss": 0.9497, + "step": 2257 + }, + { + "epoch": 0.35, + "grad_norm": 2.686950220905987, + "learning_rate": 1.5214519898781141e-05, + "loss": 0.8466, + "step": 2258 + }, + { + "epoch": 0.35, + "grad_norm": 3.023274578834779, + "learning_rate": 1.5210289085939769e-05, + "loss": 1.0174, + "step": 2259 + }, + { + "epoch": 0.35, + "grad_norm": 3.017154767031705, + "learning_rate": 1.5206056992555465e-05, + "loss": 0.9454, + "step": 2260 + }, + { + "epoch": 0.35, + "grad_norm": 2.693819407507513, + "learning_rate": 1.5201823619668362e-05, + "loss": 0.9532, + "step": 2261 + }, + { + "epoch": 0.35, + "grad_norm": 2.7126130445896273, + "learning_rate": 1.5197588968318904e-05, + "loss": 0.957, + "step": 2262 + }, + { + "epoch": 0.35, + "grad_norm": 2.585559358851023, + "learning_rate": 1.519335303954785e-05, + "loss": 0.9274, + "step": 2263 + }, + { + "epoch": 0.35, + "grad_norm": 2.9563349968700394, + "learning_rate": 1.518911583439627e-05, + "loss": 0.9926, + "step": 2264 + }, + { + "epoch": 0.35, + "grad_norm": 3.3004301639856544, + "learning_rate": 1.5184877353905556e-05, + "loss": 0.946, + "step": 2265 + }, + { + "epoch": 0.35, + "grad_norm": 2.7952021511451006, + "learning_rate": 1.5180637599117401e-05, + "loss": 0.942, + "step": 2266 + }, + { + "epoch": 0.35, + "grad_norm": 3.0215029152891892, + "learning_rate": 1.5176396571073821e-05, + "loss": 0.9685, + "step": 2267 + }, + { + "epoch": 0.35, + "grad_norm": 2.7965263658600503, + "learning_rate": 1.517215427081714e-05, + "loss": 0.9508, + "step": 2268 + }, + { + "epoch": 0.35, + "grad_norm": 2.917787388458425, + "learning_rate": 1.516791069939e-05, + "loss": 0.9753, + "step": 2269 + }, + { + "epoch": 0.35, + "grad_norm": 2.7820103402362046, + "learning_rate": 1.5163665857835348e-05, + "loss": 0.9348, + "step": 2270 + }, + { + "epoch": 0.35, + "grad_norm": 3.0049793733461914, + "learning_rate": 1.515941974719645e-05, + "loss": 0.9459, + "step": 2271 + }, + { + "epoch": 0.35, + "grad_norm": 2.9106205504262777, + "learning_rate": 1.515517236851688e-05, + "loss": 0.8804, + "step": 2272 + }, + { + "epoch": 0.35, + "grad_norm": 2.9958370089998407, + "learning_rate": 1.5150923722840523e-05, + "loss": 0.9294, + "step": 2273 + }, + { + "epoch": 0.35, + "grad_norm": 2.861271074958744, + "learning_rate": 1.5146673811211576e-05, + "loss": 0.9511, + "step": 2274 + }, + { + "epoch": 0.35, + "grad_norm": 2.8958602073341666, + "learning_rate": 1.5142422634674551e-05, + "loss": 0.9407, + "step": 2275 + }, + { + "epoch": 0.35, + "grad_norm": 2.847941228108326, + "learning_rate": 1.5138170194274269e-05, + "loss": 1.0268, + "step": 2276 + }, + { + "epoch": 0.35, + "grad_norm": 2.902937511503127, + "learning_rate": 1.5133916491055858e-05, + "loss": 0.8619, + "step": 2277 + }, + { + "epoch": 0.35, + "grad_norm": 2.941672149117542, + "learning_rate": 1.512966152606476e-05, + "loss": 1.0764, + "step": 2278 + }, + { + "epoch": 0.35, + "grad_norm": 3.1262853598966927, + "learning_rate": 1.5125405300346722e-05, + "loss": 0.9714, + "step": 2279 + }, + { + "epoch": 0.35, + "grad_norm": 2.502183056519954, + "learning_rate": 1.5121147814947812e-05, + "loss": 0.8329, + "step": 2280 + }, + { + "epoch": 0.35, + "grad_norm": 2.779765611110997, + "learning_rate": 1.5116889070914397e-05, + "loss": 0.9328, + "step": 2281 + }, + { + "epoch": 0.35, + "grad_norm": 3.347802675409558, + "learning_rate": 1.5112629069293156e-05, + "loss": 0.9771, + "step": 2282 + }, + { + "epoch": 0.35, + "grad_norm": 2.7310814686797293, + "learning_rate": 1.5108367811131079e-05, + "loss": 0.8361, + "step": 2283 + }, + { + "epoch": 0.35, + "grad_norm": 2.9784430820973946, + "learning_rate": 1.5104105297475462e-05, + "loss": 0.9534, + "step": 2284 + }, + { + "epoch": 0.35, + "grad_norm": 2.90244730992472, + "learning_rate": 1.5099841529373918e-05, + "loss": 1.0294, + "step": 2285 + }, + { + "epoch": 0.35, + "grad_norm": 5.741448217823855, + "learning_rate": 1.5095576507874353e-05, + "loss": 1.0374, + "step": 2286 + }, + { + "epoch": 0.35, + "grad_norm": 2.84138712585894, + "learning_rate": 1.5091310234024991e-05, + "loss": 0.8247, + "step": 2287 + }, + { + "epoch": 0.35, + "grad_norm": 3.113982915053296, + "learning_rate": 1.5087042708874368e-05, + "loss": 0.8825, + "step": 2288 + }, + { + "epoch": 0.35, + "grad_norm": 2.9396202859487635, + "learning_rate": 1.508277393347132e-05, + "loss": 0.8591, + "step": 2289 + }, + { + "epoch": 0.35, + "grad_norm": 2.804939430938009, + "learning_rate": 1.5078503908864985e-05, + "loss": 0.9577, + "step": 2290 + }, + { + "epoch": 0.35, + "grad_norm": 3.106725332329504, + "learning_rate": 1.5074232636104824e-05, + "loss": 0.9256, + "step": 2291 + }, + { + "epoch": 0.35, + "grad_norm": 2.8731329023678387, + "learning_rate": 1.5069960116240597e-05, + "loss": 0.8268, + "step": 2292 + }, + { + "epoch": 0.35, + "grad_norm": 2.561527056009055, + "learning_rate": 1.506568635032236e-05, + "loss": 0.8696, + "step": 2293 + }, + { + "epoch": 0.35, + "grad_norm": 3.293742479742836, + "learning_rate": 1.5061411339400494e-05, + "loss": 0.9943, + "step": 2294 + }, + { + "epoch": 0.35, + "grad_norm": 3.2364700641093647, + "learning_rate": 1.5057135084525671e-05, + "loss": 1.0094, + "step": 2295 + }, + { + "epoch": 0.35, + "grad_norm": 2.8922317115255423, + "learning_rate": 1.5052857586748881e-05, + "loss": 0.9652, + "step": 2296 + }, + { + "epoch": 0.35, + "grad_norm": 2.7080024377532146, + "learning_rate": 1.5048578847121405e-05, + "loss": 0.8256, + "step": 2297 + }, + { + "epoch": 0.35, + "grad_norm": 2.6647130744588314, + "learning_rate": 1.5044298866694842e-05, + "loss": 1.0209, + "step": 2298 + }, + { + "epoch": 0.35, + "grad_norm": 2.7470063263170243, + "learning_rate": 1.504001764652109e-05, + "loss": 0.8917, + "step": 2299 + }, + { + "epoch": 0.35, + "grad_norm": 2.8360347324676005, + "learning_rate": 1.5035735187652353e-05, + "loss": 0.9454, + "step": 2300 + }, + { + "epoch": 0.35, + "grad_norm": 2.900545216220249, + "learning_rate": 1.503145149114114e-05, + "loss": 0.9046, + "step": 2301 + }, + { + "epoch": 0.35, + "grad_norm": 2.8178594587932593, + "learning_rate": 1.5027166558040262e-05, + "loss": 1.0504, + "step": 2302 + }, + { + "epoch": 0.35, + "grad_norm": 3.01761000475965, + "learning_rate": 1.502288038940284e-05, + "loss": 0.9777, + "step": 2303 + }, + { + "epoch": 0.35, + "grad_norm": 2.979517837174316, + "learning_rate": 1.5018592986282286e-05, + "loss": 0.848, + "step": 2304 + }, + { + "epoch": 0.35, + "grad_norm": 2.9140866574800333, + "learning_rate": 1.5014304349732327e-05, + "loss": 0.8894, + "step": 2305 + }, + { + "epoch": 0.35, + "grad_norm": 2.8764694466178775, + "learning_rate": 1.5010014480806994e-05, + "loss": 1.0152, + "step": 2306 + }, + { + "epoch": 0.35, + "grad_norm": 3.081285640863487, + "learning_rate": 1.5005723380560613e-05, + "loss": 0.7847, + "step": 2307 + }, + { + "epoch": 0.35, + "grad_norm": 2.636984699828142, + "learning_rate": 1.5001431050047814e-05, + "loss": 0.9056, + "step": 2308 + }, + { + "epoch": 0.35, + "grad_norm": 2.833963909345822, + "learning_rate": 1.499713749032353e-05, + "loss": 0.9704, + "step": 2309 + }, + { + "epoch": 0.35, + "grad_norm": 3.0114563448994174, + "learning_rate": 1.4992842702443005e-05, + "loss": 0.982, + "step": 2310 + }, + { + "epoch": 0.35, + "grad_norm": 2.7306229723513553, + "learning_rate": 1.4988546687461774e-05, + "loss": 0.8364, + "step": 2311 + }, + { + "epoch": 0.35, + "grad_norm": 3.0296755610921164, + "learning_rate": 1.4984249446435674e-05, + "loss": 0.9381, + "step": 2312 + }, + { + "epoch": 0.35, + "grad_norm": 2.991527609025026, + "learning_rate": 1.4979950980420847e-05, + "loss": 0.9938, + "step": 2313 + }, + { + "epoch": 0.35, + "grad_norm": 2.9612173589174624, + "learning_rate": 1.4975651290473741e-05, + "loss": 0.7781, + "step": 2314 + }, + { + "epoch": 0.35, + "grad_norm": 2.8810874165072233, + "learning_rate": 1.4971350377651093e-05, + "loss": 1.0572, + "step": 2315 + }, + { + "epoch": 0.35, + "grad_norm": 2.8538670159087545, + "learning_rate": 1.496704824300995e-05, + "loss": 1.0219, + "step": 2316 + }, + { + "epoch": 0.35, + "grad_norm": 3.0232911034956933, + "learning_rate": 1.4962744887607654e-05, + "loss": 0.9658, + "step": 2317 + }, + { + "epoch": 0.35, + "grad_norm": 2.7966552102771116, + "learning_rate": 1.4958440312501852e-05, + "loss": 0.9018, + "step": 2318 + }, + { + "epoch": 0.35, + "grad_norm": 3.008138253863501, + "learning_rate": 1.4954134518750483e-05, + "loss": 0.8631, + "step": 2319 + }, + { + "epoch": 0.36, + "grad_norm": 2.99847234940024, + "learning_rate": 1.4949827507411791e-05, + "loss": 1.0202, + "step": 2320 + }, + { + "epoch": 0.36, + "grad_norm": 3.105716574195463, + "learning_rate": 1.4945519279544325e-05, + "loss": 0.8817, + "step": 2321 + }, + { + "epoch": 0.36, + "grad_norm": 2.613668461496793, + "learning_rate": 1.4941209836206922e-05, + "loss": 0.9624, + "step": 2322 + }, + { + "epoch": 0.36, + "grad_norm": 2.8192085782602696, + "learning_rate": 1.4936899178458724e-05, + "loss": 0.976, + "step": 2323 + }, + { + "epoch": 0.36, + "grad_norm": 2.931181999122721, + "learning_rate": 1.4932587307359165e-05, + "loss": 0.9704, + "step": 2324 + }, + { + "epoch": 0.36, + "grad_norm": 2.767933016701002, + "learning_rate": 1.4928274223967986e-05, + "loss": 0.8961, + "step": 2325 + }, + { + "epoch": 0.36, + "grad_norm": 2.7663837703593295, + "learning_rate": 1.4923959929345225e-05, + "loss": 0.9543, + "step": 2326 + }, + { + "epoch": 0.36, + "grad_norm": 2.8819865289259616, + "learning_rate": 1.4919644424551205e-05, + "loss": 0.92, + "step": 2327 + }, + { + "epoch": 0.36, + "grad_norm": 2.935705325811252, + "learning_rate": 1.4915327710646568e-05, + "loss": 0.8172, + "step": 2328 + }, + { + "epoch": 0.36, + "grad_norm": 2.8807778391035526, + "learning_rate": 1.4911009788692235e-05, + "loss": 0.8079, + "step": 2329 + }, + { + "epoch": 0.36, + "grad_norm": 2.7340974992817775, + "learning_rate": 1.4906690659749426e-05, + "loss": 0.9884, + "step": 2330 + }, + { + "epoch": 0.36, + "grad_norm": 2.8010092611333235, + "learning_rate": 1.4902370324879668e-05, + "loss": 0.8415, + "step": 2331 + }, + { + "epoch": 0.36, + "grad_norm": 2.805620053742415, + "learning_rate": 1.4898048785144775e-05, + "loss": 0.9433, + "step": 2332 + }, + { + "epoch": 0.36, + "grad_norm": 2.806019089524397, + "learning_rate": 1.4893726041606864e-05, + "loss": 0.9695, + "step": 2333 + }, + { + "epoch": 0.36, + "grad_norm": 2.8069677773537647, + "learning_rate": 1.4889402095328343e-05, + "loss": 0.8722, + "step": 2334 + }, + { + "epoch": 0.36, + "grad_norm": 2.906550234653562, + "learning_rate": 1.488507694737191e-05, + "loss": 0.9039, + "step": 2335 + }, + { + "epoch": 0.36, + "grad_norm": 3.1084072393256705, + "learning_rate": 1.4880750598800574e-05, + "loss": 0.9259, + "step": 2336 + }, + { + "epoch": 0.36, + "grad_norm": 2.749123472026514, + "learning_rate": 1.4876423050677627e-05, + "loss": 0.9346, + "step": 2337 + }, + { + "epoch": 0.36, + "grad_norm": 2.6330518662858595, + "learning_rate": 1.4872094304066656e-05, + "loss": 0.987, + "step": 2338 + }, + { + "epoch": 0.36, + "grad_norm": 2.788749903404174, + "learning_rate": 1.486776436003155e-05, + "loss": 0.9606, + "step": 2339 + }, + { + "epoch": 0.36, + "grad_norm": 2.7976341491384056, + "learning_rate": 1.4863433219636488e-05, + "loss": 0.9753, + "step": 2340 + }, + { + "epoch": 0.36, + "grad_norm": 2.665942700415535, + "learning_rate": 1.4859100883945936e-05, + "loss": 0.9599, + "step": 2341 + }, + { + "epoch": 0.36, + "grad_norm": 4.930986893091736, + "learning_rate": 1.4854767354024668e-05, + "loss": 0.8953, + "step": 2342 + }, + { + "epoch": 0.36, + "grad_norm": 3.0580837922604696, + "learning_rate": 1.4850432630937741e-05, + "loss": 1.0139, + "step": 2343 + }, + { + "epoch": 0.36, + "grad_norm": 2.8370525920262972, + "learning_rate": 1.4846096715750509e-05, + "loss": 0.898, + "step": 2344 + }, + { + "epoch": 0.36, + "grad_norm": 3.0053150475311012, + "learning_rate": 1.4841759609528619e-05, + "loss": 0.9479, + "step": 2345 + }, + { + "epoch": 0.36, + "grad_norm": 2.7670959422581003, + "learning_rate": 1.4837421313338008e-05, + "loss": 0.915, + "step": 2346 + }, + { + "epoch": 0.36, + "grad_norm": 2.9217310890737367, + "learning_rate": 1.4833081828244908e-05, + "loss": 0.8599, + "step": 2347 + }, + { + "epoch": 0.36, + "grad_norm": 3.490086661578173, + "learning_rate": 1.4828741155315844e-05, + "loss": 0.9828, + "step": 2348 + }, + { + "epoch": 0.36, + "grad_norm": 2.849816628823393, + "learning_rate": 1.4824399295617631e-05, + "loss": 0.8926, + "step": 2349 + }, + { + "epoch": 0.36, + "grad_norm": 2.999178133361301, + "learning_rate": 1.4820056250217377e-05, + "loss": 0.9283, + "step": 2350 + }, + { + "epoch": 0.36, + "grad_norm": 3.2252969975357586, + "learning_rate": 1.4815712020182482e-05, + "loss": 0.8567, + "step": 2351 + }, + { + "epoch": 0.36, + "grad_norm": 2.6421881777254463, + "learning_rate": 1.4811366606580633e-05, + "loss": 0.9335, + "step": 2352 + }, + { + "epoch": 0.36, + "grad_norm": 2.859144819762544, + "learning_rate": 1.480702001047981e-05, + "loss": 0.9672, + "step": 2353 + }, + { + "epoch": 0.36, + "grad_norm": 3.149151970612346, + "learning_rate": 1.4802672232948287e-05, + "loss": 0.9672, + "step": 2354 + }, + { + "epoch": 0.36, + "grad_norm": 2.961148024285571, + "learning_rate": 1.4798323275054627e-05, + "loss": 0.8123, + "step": 2355 + }, + { + "epoch": 0.36, + "grad_norm": 3.0213153897706237, + "learning_rate": 1.4793973137867679e-05, + "loss": 0.9199, + "step": 2356 + }, + { + "epoch": 0.36, + "grad_norm": 2.6555091707488243, + "learning_rate": 1.4789621822456585e-05, + "loss": 0.8879, + "step": 2357 + }, + { + "epoch": 0.36, + "grad_norm": 3.1922995389939093, + "learning_rate": 1.4785269329890779e-05, + "loss": 0.8563, + "step": 2358 + }, + { + "epoch": 0.36, + "grad_norm": 2.877845027283451, + "learning_rate": 1.4780915661239979e-05, + "loss": 0.9435, + "step": 2359 + }, + { + "epoch": 0.36, + "grad_norm": 2.9264636830785347, + "learning_rate": 1.4776560817574192e-05, + "loss": 0.9215, + "step": 2360 + }, + { + "epoch": 0.36, + "grad_norm": 2.8938324963130237, + "learning_rate": 1.4772204799963723e-05, + "loss": 0.9374, + "step": 2361 + }, + { + "epoch": 0.36, + "grad_norm": 3.039456953253997, + "learning_rate": 1.4767847609479155e-05, + "loss": 0.9589, + "step": 2362 + }, + { + "epoch": 0.36, + "grad_norm": 2.699834713722586, + "learning_rate": 1.4763489247191367e-05, + "loss": 0.9505, + "step": 2363 + }, + { + "epoch": 0.36, + "grad_norm": 2.894943910673886, + "learning_rate": 1.4759129714171515e-05, + "loss": 1.0437, + "step": 2364 + }, + { + "epoch": 0.36, + "grad_norm": 2.7951532789142117, + "learning_rate": 1.4754769011491052e-05, + "loss": 0.8761, + "step": 2365 + }, + { + "epoch": 0.36, + "grad_norm": 2.8583742591953425, + "learning_rate": 1.4750407140221723e-05, + "loss": 0.8651, + "step": 2366 + }, + { + "epoch": 0.36, + "grad_norm": 2.8018366439100553, + "learning_rate": 1.4746044101435546e-05, + "loss": 0.9273, + "step": 2367 + }, + { + "epoch": 0.36, + "grad_norm": 3.4891138367062005, + "learning_rate": 1.4741679896204842e-05, + "loss": 0.935, + "step": 2368 + }, + { + "epoch": 0.36, + "grad_norm": 2.806867292338262, + "learning_rate": 1.47373145256022e-05, + "loss": 0.851, + "step": 2369 + }, + { + "epoch": 0.36, + "grad_norm": 2.9532302608146273, + "learning_rate": 1.4732947990700512e-05, + "loss": 0.9913, + "step": 2370 + }, + { + "epoch": 0.36, + "grad_norm": 2.777258410564699, + "learning_rate": 1.4728580292572947e-05, + "loss": 0.9188, + "step": 2371 + }, + { + "epoch": 0.36, + "grad_norm": 2.8586548046452687, + "learning_rate": 1.4724211432292965e-05, + "loss": 1.0016, + "step": 2372 + }, + { + "epoch": 0.36, + "grad_norm": 2.7528671137789376, + "learning_rate": 1.4719841410934307e-05, + "loss": 0.9907, + "step": 2373 + }, + { + "epoch": 0.36, + "grad_norm": 2.711606229508736, + "learning_rate": 1.4715470229571007e-05, + "loss": 0.938, + "step": 2374 + }, + { + "epoch": 0.36, + "grad_norm": 2.6358126001386037, + "learning_rate": 1.4711097889277373e-05, + "loss": 0.9436, + "step": 2375 + }, + { + "epoch": 0.36, + "grad_norm": 3.3690839599101254, + "learning_rate": 1.4706724391128004e-05, + "loss": 0.8859, + "step": 2376 + }, + { + "epoch": 0.36, + "grad_norm": 2.857511482655727, + "learning_rate": 1.4702349736197787e-05, + "loss": 0.995, + "step": 2377 + }, + { + "epoch": 0.36, + "grad_norm": 2.657782775593164, + "learning_rate": 1.4697973925561885e-05, + "loss": 0.9377, + "step": 2378 + }, + { + "epoch": 0.36, + "grad_norm": 3.0674842520445136, + "learning_rate": 1.4693596960295754e-05, + "loss": 0.8398, + "step": 2379 + }, + { + "epoch": 0.36, + "grad_norm": 2.6013761676243523, + "learning_rate": 1.4689218841475126e-05, + "loss": 0.8126, + "step": 2380 + }, + { + "epoch": 0.36, + "grad_norm": 2.864805727482045, + "learning_rate": 1.468483957017602e-05, + "loss": 0.8925, + "step": 2381 + }, + { + "epoch": 0.36, + "grad_norm": 2.800285854028283, + "learning_rate": 1.4680459147474739e-05, + "loss": 0.8958, + "step": 2382 + }, + { + "epoch": 0.36, + "grad_norm": 3.2943055652064226, + "learning_rate": 1.4676077574447867e-05, + "loss": 1.0121, + "step": 2383 + }, + { + "epoch": 0.36, + "grad_norm": 2.8279058966790025, + "learning_rate": 1.4671694852172276e-05, + "loss": 0.8836, + "step": 2384 + }, + { + "epoch": 0.37, + "grad_norm": 2.5618607083364355, + "learning_rate": 1.4667310981725113e-05, + "loss": 0.8828, + "step": 2385 + }, + { + "epoch": 0.37, + "grad_norm": 2.919796341251823, + "learning_rate": 1.4662925964183807e-05, + "loss": 0.8746, + "step": 2386 + }, + { + "epoch": 0.37, + "grad_norm": 2.5276980612176696, + "learning_rate": 1.4658539800626078e-05, + "loss": 0.8823, + "step": 2387 + }, + { + "epoch": 0.37, + "grad_norm": 2.6098839300692678, + "learning_rate": 1.4654152492129918e-05, + "loss": 0.948, + "step": 2388 + }, + { + "epoch": 0.37, + "grad_norm": 2.5115034465656607, + "learning_rate": 1.4649764039773606e-05, + "loss": 0.9205, + "step": 2389 + }, + { + "epoch": 0.37, + "grad_norm": 2.731287358860139, + "learning_rate": 1.4645374444635703e-05, + "loss": 0.9242, + "step": 2390 + }, + { + "epoch": 0.37, + "grad_norm": 3.1344035395663186, + "learning_rate": 1.4640983707795042e-05, + "loss": 0.8341, + "step": 2391 + }, + { + "epoch": 0.37, + "grad_norm": 2.8115383914664194, + "learning_rate": 1.463659183033075e-05, + "loss": 0.8833, + "step": 2392 + }, + { + "epoch": 0.37, + "grad_norm": 2.7697574743430575, + "learning_rate": 1.4632198813322223e-05, + "loss": 0.9018, + "step": 2393 + }, + { + "epoch": 0.37, + "grad_norm": 2.83330791104321, + "learning_rate": 1.4627804657849143e-05, + "loss": 0.9454, + "step": 2394 + }, + { + "epoch": 0.37, + "grad_norm": 2.775273132429049, + "learning_rate": 1.462340936499147e-05, + "loss": 0.8233, + "step": 2395 + }, + { + "epoch": 0.37, + "grad_norm": 2.5958681049130883, + "learning_rate": 1.4619012935829444e-05, + "loss": 0.8597, + "step": 2396 + }, + { + "epoch": 0.37, + "grad_norm": 2.8992675561202574, + "learning_rate": 1.4614615371443583e-05, + "loss": 0.9323, + "step": 2397 + }, + { + "epoch": 0.37, + "grad_norm": 3.2927009271512806, + "learning_rate": 1.4610216672914683e-05, + "loss": 0.8577, + "step": 2398 + }, + { + "epoch": 0.37, + "grad_norm": 2.9704436448904388, + "learning_rate": 1.4605816841323827e-05, + "loss": 0.997, + "step": 2399 + }, + { + "epoch": 0.37, + "grad_norm": 7.353962376662028, + "learning_rate": 1.4601415877752362e-05, + "loss": 1.1038, + "step": 2400 + }, + { + "epoch": 0.37, + "grad_norm": 2.7215183764031874, + "learning_rate": 1.459701378328193e-05, + "loss": 0.9097, + "step": 2401 + }, + { + "epoch": 0.37, + "grad_norm": 2.933424502380309, + "learning_rate": 1.4592610558994436e-05, + "loss": 0.7999, + "step": 2402 + }, + { + "epoch": 0.37, + "grad_norm": 2.926601447869054, + "learning_rate": 1.4588206205972074e-05, + "loss": 0.9358, + "step": 2403 + }, + { + "epoch": 0.37, + "grad_norm": 2.9496117509119233, + "learning_rate": 1.4583800725297303e-05, + "loss": 0.9057, + "step": 2404 + }, + { + "epoch": 0.37, + "grad_norm": 3.254611314774993, + "learning_rate": 1.4579394118052874e-05, + "loss": 0.7935, + "step": 2405 + }, + { + "epoch": 0.37, + "grad_norm": 2.851822266069201, + "learning_rate": 1.4574986385321803e-05, + "loss": 1.0653, + "step": 2406 + }, + { + "epoch": 0.37, + "grad_norm": 3.1348344649384035, + "learning_rate": 1.457057752818739e-05, + "loss": 0.8486, + "step": 2407 + }, + { + "epoch": 0.37, + "grad_norm": 2.579387784667669, + "learning_rate": 1.456616754773321e-05, + "loss": 0.8526, + "step": 2408 + }, + { + "epoch": 0.37, + "grad_norm": 3.0468680359639984, + "learning_rate": 1.4561756445043104e-05, + "loss": 0.9041, + "step": 2409 + }, + { + "epoch": 0.37, + "grad_norm": 3.0982245467165415, + "learning_rate": 1.4557344221201206e-05, + "loss": 0.9566, + "step": 2410 + }, + { + "epoch": 0.37, + "grad_norm": 2.9565652107469536, + "learning_rate": 1.4552930877291915e-05, + "loss": 1.0297, + "step": 2411 + }, + { + "epoch": 0.37, + "grad_norm": 2.8739467747522607, + "learning_rate": 1.4548516414399904e-05, + "loss": 0.8787, + "step": 2412 + }, + { + "epoch": 0.37, + "grad_norm": 2.6437720748691778, + "learning_rate": 1.4544100833610132e-05, + "loss": 0.9208, + "step": 2413 + }, + { + "epoch": 0.37, + "grad_norm": 2.9485877441613635, + "learning_rate": 1.4539684136007815e-05, + "loss": 0.987, + "step": 2414 + }, + { + "epoch": 0.37, + "grad_norm": 2.7206452418675786, + "learning_rate": 1.4535266322678455e-05, + "loss": 0.9639, + "step": 2415 + }, + { + "epoch": 0.37, + "grad_norm": 3.0018829599804486, + "learning_rate": 1.453084739470783e-05, + "loss": 0.8887, + "step": 2416 + }, + { + "epoch": 0.37, + "grad_norm": 2.9505090712062456, + "learning_rate": 1.452642735318199e-05, + "loss": 0.9423, + "step": 2417 + }, + { + "epoch": 0.37, + "grad_norm": 2.9778830127369367, + "learning_rate": 1.4522006199187254e-05, + "loss": 0.9154, + "step": 2418 + }, + { + "epoch": 0.37, + "grad_norm": 2.657643161124756, + "learning_rate": 1.4517583933810219e-05, + "loss": 0.9314, + "step": 2419 + }, + { + "epoch": 0.37, + "grad_norm": 2.8830635226181176, + "learning_rate": 1.4513160558137753e-05, + "loss": 0.9039, + "step": 2420 + }, + { + "epoch": 0.37, + "grad_norm": 3.050561549706014, + "learning_rate": 1.4508736073256997e-05, + "loss": 0.873, + "step": 2421 + }, + { + "epoch": 0.37, + "grad_norm": 3.3887870469835066, + "learning_rate": 1.450431048025537e-05, + "loss": 0.8954, + "step": 2422 + }, + { + "epoch": 0.37, + "grad_norm": 2.9279258595634317, + "learning_rate": 1.4499883780220552e-05, + "loss": 0.9408, + "step": 2423 + }, + { + "epoch": 0.37, + "grad_norm": 3.0884637941438986, + "learning_rate": 1.4495455974240507e-05, + "loss": 0.8688, + "step": 2424 + }, + { + "epoch": 0.37, + "grad_norm": 2.855867004287512, + "learning_rate": 1.4491027063403462e-05, + "loss": 0.9584, + "step": 2425 + }, + { + "epoch": 0.37, + "grad_norm": 2.8528079362086025, + "learning_rate": 1.4486597048797922e-05, + "loss": 0.9052, + "step": 2426 + }, + { + "epoch": 0.37, + "grad_norm": 2.8168958102750477, + "learning_rate": 1.4482165931512655e-05, + "loss": 0.9273, + "step": 2427 + }, + { + "epoch": 0.37, + "grad_norm": 2.879624150643087, + "learning_rate": 1.447773371263671e-05, + "loss": 0.9153, + "step": 2428 + }, + { + "epoch": 0.37, + "grad_norm": 3.6327227672284925, + "learning_rate": 1.4473300393259404e-05, + "loss": 0.9121, + "step": 2429 + }, + { + "epoch": 0.37, + "grad_norm": 2.7857463283637314, + "learning_rate": 1.4468865974470319e-05, + "loss": 0.9462, + "step": 2430 + }, + { + "epoch": 0.37, + "grad_norm": 2.8478943189794337, + "learning_rate": 1.4464430457359308e-05, + "loss": 0.9284, + "step": 2431 + }, + { + "epoch": 0.37, + "grad_norm": 3.0260805043516834, + "learning_rate": 1.4459993843016501e-05, + "loss": 0.9206, + "step": 2432 + }, + { + "epoch": 0.37, + "grad_norm": 2.77957086923648, + "learning_rate": 1.4455556132532298e-05, + "loss": 0.8637, + "step": 2433 + }, + { + "epoch": 0.37, + "grad_norm": 2.844634762005502, + "learning_rate": 1.4451117326997355e-05, + "loss": 0.9007, + "step": 2434 + }, + { + "epoch": 0.37, + "grad_norm": 2.7510660636079756, + "learning_rate": 1.444667742750261e-05, + "loss": 0.9752, + "step": 2435 + }, + { + "epoch": 0.37, + "grad_norm": 2.729279351837006, + "learning_rate": 1.4442236435139265e-05, + "loss": 0.9481, + "step": 2436 + }, + { + "epoch": 0.37, + "grad_norm": 2.791939618896989, + "learning_rate": 1.4437794350998791e-05, + "loss": 0.9578, + "step": 2437 + }, + { + "epoch": 0.37, + "grad_norm": 2.8060592460027998, + "learning_rate": 1.4433351176172925e-05, + "loss": 0.8578, + "step": 2438 + }, + { + "epoch": 0.37, + "grad_norm": 6.666012032688352, + "learning_rate": 1.4428906911753679e-05, + "loss": 1.0896, + "step": 2439 + }, + { + "epoch": 0.37, + "grad_norm": 2.8237416644883946, + "learning_rate": 1.4424461558833332e-05, + "loss": 0.8734, + "step": 2440 + }, + { + "epoch": 0.37, + "grad_norm": 2.6397829663448653, + "learning_rate": 1.442001511850442e-05, + "loss": 0.8617, + "step": 2441 + }, + { + "epoch": 0.37, + "grad_norm": 2.6334892262776126, + "learning_rate": 1.4415567591859753e-05, + "loss": 0.7519, + "step": 2442 + }, + { + "epoch": 0.37, + "grad_norm": 3.1957681156517204, + "learning_rate": 1.4411118979992416e-05, + "loss": 1.0172, + "step": 2443 + }, + { + "epoch": 0.37, + "grad_norm": 5.69562700633565, + "learning_rate": 1.4406669283995747e-05, + "loss": 1.2042, + "step": 2444 + }, + { + "epoch": 0.37, + "grad_norm": 3.060077822738249, + "learning_rate": 1.4402218504963355e-05, + "loss": 0.9895, + "step": 2445 + }, + { + "epoch": 0.37, + "grad_norm": 2.7709821109960866, + "learning_rate": 1.4397766643989123e-05, + "loss": 0.8656, + "step": 2446 + }, + { + "epoch": 0.37, + "grad_norm": 2.6858104251237664, + "learning_rate": 1.4393313702167185e-05, + "loss": 0.9585, + "step": 2447 + }, + { + "epoch": 0.37, + "grad_norm": 2.9475283693916645, + "learning_rate": 1.4388859680591957e-05, + "loss": 0.7974, + "step": 2448 + }, + { + "epoch": 0.37, + "grad_norm": 2.7918155152676922, + "learning_rate": 1.4384404580358112e-05, + "loss": 0.9301, + "step": 2449 + }, + { + "epoch": 0.38, + "grad_norm": 2.6422292443267072, + "learning_rate": 1.4379948402560581e-05, + "loss": 0.8223, + "step": 2450 + }, + { + "epoch": 0.38, + "grad_norm": 2.790804524918857, + "learning_rate": 1.4375491148294578e-05, + "loss": 0.9744, + "step": 2451 + }, + { + "epoch": 0.38, + "grad_norm": 2.862406171144873, + "learning_rate": 1.4371032818655564e-05, + "loss": 0.837, + "step": 2452 + }, + { + "epoch": 0.38, + "grad_norm": 2.772748271528916, + "learning_rate": 1.4366573414739273e-05, + "loss": 0.9354, + "step": 2453 + }, + { + "epoch": 0.38, + "grad_norm": 2.5730897546706624, + "learning_rate": 1.4362112937641702e-05, + "loss": 0.9264, + "step": 2454 + }, + { + "epoch": 0.38, + "grad_norm": 2.9648680762416584, + "learning_rate": 1.435765138845911e-05, + "loss": 0.9132, + "step": 2455 + }, + { + "epoch": 0.38, + "grad_norm": 2.96891879680553, + "learning_rate": 1.4353188768288022e-05, + "loss": 0.771, + "step": 2456 + }, + { + "epoch": 0.38, + "grad_norm": 3.027731011106476, + "learning_rate": 1.4348725078225228e-05, + "loss": 0.9532, + "step": 2457 + }, + { + "epoch": 0.38, + "grad_norm": 2.9217523567988866, + "learning_rate": 1.434426031936777e-05, + "loss": 0.9065, + "step": 2458 + }, + { + "epoch": 0.38, + "grad_norm": 2.902108117677878, + "learning_rate": 1.4339794492812966e-05, + "loss": 1.0494, + "step": 2459 + }, + { + "epoch": 0.38, + "grad_norm": 3.0654388452077606, + "learning_rate": 1.433532759965839e-05, + "loss": 0.8442, + "step": 2460 + }, + { + "epoch": 0.38, + "grad_norm": 2.934914525516883, + "learning_rate": 1.433085964100188e-05, + "loss": 0.8166, + "step": 2461 + }, + { + "epoch": 0.38, + "grad_norm": 2.7968856258452566, + "learning_rate": 1.4326390617941533e-05, + "loss": 0.9282, + "step": 2462 + }, + { + "epoch": 0.38, + "grad_norm": 2.7773284905132862, + "learning_rate": 1.4321920531575708e-05, + "loss": 0.9476, + "step": 2463 + }, + { + "epoch": 0.38, + "grad_norm": 2.8341525979924986, + "learning_rate": 1.4317449383003032e-05, + "loss": 0.8395, + "step": 2464 + }, + { + "epoch": 0.38, + "grad_norm": 2.7509817488619133, + "learning_rate": 1.4312977173322384e-05, + "loss": 1.0142, + "step": 2465 + }, + { + "epoch": 0.38, + "grad_norm": 3.0310197179879648, + "learning_rate": 1.430850390363291e-05, + "loss": 0.8766, + "step": 2466 + }, + { + "epoch": 0.38, + "grad_norm": 2.9366419645028676, + "learning_rate": 1.430402957503401e-05, + "loss": 1.0466, + "step": 2467 + }, + { + "epoch": 0.38, + "grad_norm": 2.6504441981995255, + "learning_rate": 1.4299554188625352e-05, + "loss": 0.8638, + "step": 2468 + }, + { + "epoch": 0.38, + "grad_norm": 2.9048530088304023, + "learning_rate": 1.4295077745506865e-05, + "loss": 0.9458, + "step": 2469 + }, + { + "epoch": 0.38, + "grad_norm": 2.8117932431738164, + "learning_rate": 1.4290600246778726e-05, + "loss": 0.9039, + "step": 2470 + }, + { + "epoch": 0.38, + "grad_norm": 2.6908191575161906, + "learning_rate": 1.4286121693541378e-05, + "loss": 0.9676, + "step": 2471 + }, + { + "epoch": 0.38, + "grad_norm": 3.0444535921462332, + "learning_rate": 1.4281642086895527e-05, + "loss": 0.9215, + "step": 2472 + }, + { + "epoch": 0.38, + "grad_norm": 3.0096304418790654, + "learning_rate": 1.4277161427942137e-05, + "loss": 0.9357, + "step": 2473 + }, + { + "epoch": 0.38, + "grad_norm": 2.9660334572945746, + "learning_rate": 1.427267971778242e-05, + "loss": 0.9491, + "step": 2474 + }, + { + "epoch": 0.38, + "grad_norm": 2.6068280607700136, + "learning_rate": 1.4268196957517866e-05, + "loss": 0.973, + "step": 2475 + }, + { + "epoch": 0.38, + "grad_norm": 3.076600308765351, + "learning_rate": 1.4263713148250203e-05, + "loss": 0.85, + "step": 2476 + }, + { + "epoch": 0.38, + "grad_norm": 3.127527029548337, + "learning_rate": 1.4259228291081431e-05, + "loss": 1.0009, + "step": 2477 + }, + { + "epoch": 0.38, + "grad_norm": 2.796536067560663, + "learning_rate": 1.4254742387113795e-05, + "loss": 0.8847, + "step": 2478 + }, + { + "epoch": 0.38, + "grad_norm": 2.7476099402360243, + "learning_rate": 1.4250255437449812e-05, + "loss": 0.9103, + "step": 2479 + }, + { + "epoch": 0.38, + "grad_norm": 3.4654239989935136, + "learning_rate": 1.4245767443192246e-05, + "loss": 0.986, + "step": 2480 + }, + { + "epoch": 0.38, + "grad_norm": 3.0750075287655423, + "learning_rate": 1.424127840544412e-05, + "loss": 0.924, + "step": 2481 + }, + { + "epoch": 0.38, + "grad_norm": 2.649710861424403, + "learning_rate": 1.423678832530871e-05, + "loss": 0.8283, + "step": 2482 + }, + { + "epoch": 0.38, + "grad_norm": 7.986231643973584, + "learning_rate": 1.4232297203889556e-05, + "loss": 1.1026, + "step": 2483 + }, + { + "epoch": 0.38, + "grad_norm": 2.6902403214536084, + "learning_rate": 1.4227805042290447e-05, + "loss": 0.858, + "step": 2484 + }, + { + "epoch": 0.38, + "grad_norm": 2.86598463124649, + "learning_rate": 1.4223311841615435e-05, + "loss": 0.9599, + "step": 2485 + }, + { + "epoch": 0.38, + "grad_norm": 2.6725105892389482, + "learning_rate": 1.421881760296882e-05, + "loss": 0.9663, + "step": 2486 + }, + { + "epoch": 0.38, + "grad_norm": 2.5386181291260144, + "learning_rate": 1.4214322327455157e-05, + "loss": 0.9455, + "step": 2487 + }, + { + "epoch": 0.38, + "grad_norm": 2.9625059832666265, + "learning_rate": 1.4209826016179263e-05, + "loss": 1.0126, + "step": 2488 + }, + { + "epoch": 0.38, + "grad_norm": 2.761055038983844, + "learning_rate": 1.42053286702462e-05, + "loss": 0.8517, + "step": 2489 + }, + { + "epoch": 0.38, + "grad_norm": 2.8698649570185073, + "learning_rate": 1.4200830290761295e-05, + "loss": 0.9186, + "step": 2490 + }, + { + "epoch": 0.38, + "grad_norm": 3.176874951282763, + "learning_rate": 1.419633087883012e-05, + "loss": 0.9243, + "step": 2491 + }, + { + "epoch": 0.38, + "grad_norm": 2.7541805305928913, + "learning_rate": 1.419183043555851e-05, + "loss": 0.8945, + "step": 2492 + }, + { + "epoch": 0.38, + "grad_norm": 2.7154264957294645, + "learning_rate": 1.4187328962052536e-05, + "loss": 0.9352, + "step": 2493 + }, + { + "epoch": 0.38, + "grad_norm": 2.769275581735851, + "learning_rate": 1.4182826459418543e-05, + "loss": 0.9409, + "step": 2494 + }, + { + "epoch": 0.38, + "grad_norm": 2.54796386871129, + "learning_rate": 1.417832292876312e-05, + "loss": 0.9307, + "step": 2495 + }, + { + "epoch": 0.38, + "grad_norm": 2.869832085250058, + "learning_rate": 1.4173818371193106e-05, + "loss": 0.9986, + "step": 2496 + }, + { + "epoch": 0.38, + "grad_norm": 2.737659375631931, + "learning_rate": 1.41693127878156e-05, + "loss": 0.8104, + "step": 2497 + }, + { + "epoch": 0.38, + "grad_norm": 3.0286718411841465, + "learning_rate": 1.4164806179737935e-05, + "loss": 0.8746, + "step": 2498 + }, + { + "epoch": 0.38, + "grad_norm": 2.940764782727735, + "learning_rate": 1.4160298548067725e-05, + "loss": 0.8917, + "step": 2499 + }, + { + "epoch": 0.38, + "grad_norm": 2.896939827544822, + "learning_rate": 1.4155789893912808e-05, + "loss": 0.8441, + "step": 2500 + }, + { + "epoch": 0.38, + "grad_norm": 2.5839952314690016, + "learning_rate": 1.4151280218381287e-05, + "loss": 0.9311, + "step": 2501 + }, + { + "epoch": 0.38, + "grad_norm": 2.845449847456862, + "learning_rate": 1.4146769522581519e-05, + "loss": 0.897, + "step": 2502 + }, + { + "epoch": 0.38, + "grad_norm": 2.6564675143300276, + "learning_rate": 1.4142257807622103e-05, + "loss": 0.8739, + "step": 2503 + }, + { + "epoch": 0.38, + "grad_norm": 2.80321608838917, + "learning_rate": 1.4137745074611888e-05, + "loss": 0.9843, + "step": 2504 + }, + { + "epoch": 0.38, + "grad_norm": 2.6055525554314296, + "learning_rate": 1.4133231324659984e-05, + "loss": 0.8857, + "step": 2505 + }, + { + "epoch": 0.38, + "grad_norm": 2.7794862939293057, + "learning_rate": 1.412871655887574e-05, + "loss": 0.8227, + "step": 2506 + }, + { + "epoch": 0.38, + "grad_norm": 2.860975874901178, + "learning_rate": 1.412420077836876e-05, + "loss": 0.9598, + "step": 2507 + }, + { + "epoch": 0.38, + "grad_norm": 2.95525509948893, + "learning_rate": 1.4119683984248898e-05, + "loss": 0.9288, + "step": 2508 + }, + { + "epoch": 0.38, + "grad_norm": 3.2260253091053848, + "learning_rate": 1.4115166177626252e-05, + "loss": 1.0039, + "step": 2509 + }, + { + "epoch": 0.38, + "grad_norm": 2.8290548924846135, + "learning_rate": 1.4110647359611175e-05, + "loss": 0.8938, + "step": 2510 + }, + { + "epoch": 0.38, + "grad_norm": 2.9476608734997245, + "learning_rate": 1.4106127531314261e-05, + "loss": 0.8894, + "step": 2511 + }, + { + "epoch": 0.38, + "grad_norm": 2.652241950773867, + "learning_rate": 1.4101606693846363e-05, + "loss": 0.9125, + "step": 2512 + }, + { + "epoch": 0.38, + "grad_norm": 2.7796707100209352, + "learning_rate": 1.4097084848318574e-05, + "loss": 0.9231, + "step": 2513 + }, + { + "epoch": 0.38, + "grad_norm": 5.863374745844007, + "learning_rate": 1.4092561995842238e-05, + "loss": 1.0797, + "step": 2514 + }, + { + "epoch": 0.38, + "grad_norm": 2.7617959711169444, + "learning_rate": 1.4088038137528938e-05, + "loss": 0.9663, + "step": 2515 + }, + { + "epoch": 0.39, + "grad_norm": 3.163921586737289, + "learning_rate": 1.408351327449052e-05, + "loss": 0.9333, + "step": 2516 + }, + { + "epoch": 0.39, + "grad_norm": 2.7792699930149816, + "learning_rate": 1.4078987407839066e-05, + "loss": 0.8374, + "step": 2517 + }, + { + "epoch": 0.39, + "grad_norm": 2.7188267540892985, + "learning_rate": 1.4074460538686908e-05, + "loss": 0.8946, + "step": 2518 + }, + { + "epoch": 0.39, + "grad_norm": 2.6558835918581876, + "learning_rate": 1.4069932668146619e-05, + "loss": 0.9045, + "step": 2519 + }, + { + "epoch": 0.39, + "grad_norm": 2.866876242378881, + "learning_rate": 1.406540379733103e-05, + "loss": 0.8737, + "step": 2520 + }, + { + "epoch": 0.39, + "grad_norm": 3.198276090274672, + "learning_rate": 1.4060873927353203e-05, + "loss": 0.9967, + "step": 2521 + }, + { + "epoch": 0.39, + "grad_norm": 2.5484619529654458, + "learning_rate": 1.4056343059326458e-05, + "loss": 0.8782, + "step": 2522 + }, + { + "epoch": 0.39, + "grad_norm": 2.743265614544366, + "learning_rate": 1.405181119436435e-05, + "loss": 0.8819, + "step": 2523 + }, + { + "epoch": 0.39, + "grad_norm": 3.084943695798522, + "learning_rate": 1.4047278333580689e-05, + "loss": 0.8367, + "step": 2524 + }, + { + "epoch": 0.39, + "grad_norm": 2.6568453931938603, + "learning_rate": 1.4042744478089528e-05, + "loss": 0.9544, + "step": 2525 + }, + { + "epoch": 0.39, + "grad_norm": 2.9522303052046026, + "learning_rate": 1.4038209629005156e-05, + "loss": 0.8682, + "step": 2526 + }, + { + "epoch": 0.39, + "grad_norm": 3.0082325997961306, + "learning_rate": 1.4033673787442108e-05, + "loss": 0.8472, + "step": 2527 + }, + { + "epoch": 0.39, + "grad_norm": 2.5897995504801026, + "learning_rate": 1.4029136954515175e-05, + "loss": 0.9053, + "step": 2528 + }, + { + "epoch": 0.39, + "grad_norm": 4.056262723244552, + "learning_rate": 1.402459913133938e-05, + "loss": 0.9708, + "step": 2529 + }, + { + "epoch": 0.39, + "grad_norm": 2.830129743021095, + "learning_rate": 1.4020060319029991e-05, + "loss": 1.0032, + "step": 2530 + }, + { + "epoch": 0.39, + "grad_norm": 2.6939067046894354, + "learning_rate": 1.4015520518702526e-05, + "loss": 0.8888, + "step": 2531 + }, + { + "epoch": 0.39, + "grad_norm": 2.655218785250219, + "learning_rate": 1.4010979731472731e-05, + "loss": 0.8829, + "step": 2532 + }, + { + "epoch": 0.39, + "grad_norm": 2.67010313060736, + "learning_rate": 1.4006437958456616e-05, + "loss": 0.8782, + "step": 2533 + }, + { + "epoch": 0.39, + "grad_norm": 2.861776718201834, + "learning_rate": 1.4001895200770412e-05, + "loss": 0.9893, + "step": 2534 + }, + { + "epoch": 0.39, + "grad_norm": 2.64690624389765, + "learning_rate": 1.3997351459530605e-05, + "loss": 0.8615, + "step": 2535 + }, + { + "epoch": 0.39, + "grad_norm": 2.82273974702354, + "learning_rate": 1.3992806735853924e-05, + "loss": 1.0044, + "step": 2536 + }, + { + "epoch": 0.39, + "grad_norm": 2.7952746643921755, + "learning_rate": 1.3988261030857327e-05, + "loss": 0.853, + "step": 2537 + }, + { + "epoch": 0.39, + "grad_norm": 3.0034032826719534, + "learning_rate": 1.3983714345658021e-05, + "loss": 1.0211, + "step": 2538 + }, + { + "epoch": 0.39, + "grad_norm": 2.683248570522212, + "learning_rate": 1.3979166681373459e-05, + "loss": 0.8442, + "step": 2539 + }, + { + "epoch": 0.39, + "grad_norm": 2.8977292429823303, + "learning_rate": 1.3974618039121326e-05, + "loss": 0.8503, + "step": 2540 + }, + { + "epoch": 0.39, + "grad_norm": 2.838726270766741, + "learning_rate": 1.3970068420019552e-05, + "loss": 0.9449, + "step": 2541 + }, + { + "epoch": 0.39, + "grad_norm": 2.6119929718476276, + "learning_rate": 1.3965517825186306e-05, + "loss": 0.9552, + "step": 2542 + }, + { + "epoch": 0.39, + "grad_norm": 2.691711091798615, + "learning_rate": 1.3960966255739992e-05, + "loss": 0.9775, + "step": 2543 + }, + { + "epoch": 0.39, + "grad_norm": 2.886609800357804, + "learning_rate": 1.3956413712799263e-05, + "loss": 0.9099, + "step": 2544 + }, + { + "epoch": 0.39, + "grad_norm": 2.5510696879764474, + "learning_rate": 1.3951860197483008e-05, + "loss": 0.8674, + "step": 2545 + }, + { + "epoch": 0.39, + "grad_norm": 3.2071989200026714, + "learning_rate": 1.3947305710910346e-05, + "loss": 0.8818, + "step": 2546 + }, + { + "epoch": 0.39, + "grad_norm": 2.7232029556025936, + "learning_rate": 1.394275025420065e-05, + "loss": 0.8767, + "step": 2547 + }, + { + "epoch": 0.39, + "grad_norm": 2.822174025808589, + "learning_rate": 1.3938193828473521e-05, + "loss": 1.0398, + "step": 2548 + }, + { + "epoch": 0.39, + "grad_norm": 2.91246822858765, + "learning_rate": 1.3933636434848797e-05, + "loss": 0.8926, + "step": 2549 + }, + { + "epoch": 0.39, + "grad_norm": 2.7026549086967, + "learning_rate": 1.3929078074446561e-05, + "loss": 1.0069, + "step": 2550 + }, + { + "epoch": 0.39, + "grad_norm": 3.3907508424871993, + "learning_rate": 1.392451874838713e-05, + "loss": 0.8861, + "step": 2551 + }, + { + "epoch": 0.39, + "grad_norm": 3.365881535947219, + "learning_rate": 1.3919958457791056e-05, + "loss": 0.9715, + "step": 2552 + }, + { + "epoch": 0.39, + "grad_norm": 2.9741631822721617, + "learning_rate": 1.3915397203779138e-05, + "loss": 0.8546, + "step": 2553 + }, + { + "epoch": 0.39, + "grad_norm": 2.4551308205268096, + "learning_rate": 1.3910834987472393e-05, + "loss": 0.8067, + "step": 2554 + }, + { + "epoch": 0.39, + "grad_norm": 2.7640869588849273, + "learning_rate": 1.3906271809992093e-05, + "loss": 0.8744, + "step": 2555 + }, + { + "epoch": 0.39, + "grad_norm": 2.7444788971223937, + "learning_rate": 1.3901707672459738e-05, + "loss": 0.9381, + "step": 2556 + }, + { + "epoch": 0.39, + "grad_norm": 2.8966581939330647, + "learning_rate": 1.3897142575997062e-05, + "loss": 0.8446, + "step": 2557 + }, + { + "epoch": 0.39, + "grad_norm": 2.7516082432017703, + "learning_rate": 1.3892576521726045e-05, + "loss": 0.8426, + "step": 2558 + }, + { + "epoch": 0.39, + "grad_norm": 2.7161833682050105, + "learning_rate": 1.388800951076889e-05, + "loss": 0.927, + "step": 2559 + }, + { + "epoch": 0.39, + "grad_norm": 2.8069829382886025, + "learning_rate": 1.3883441544248037e-05, + "loss": 0.903, + "step": 2560 + }, + { + "epoch": 0.39, + "grad_norm": 2.8060823802410213, + "learning_rate": 1.3878872623286169e-05, + "loss": 0.9407, + "step": 2561 + }, + { + "epoch": 0.39, + "grad_norm": 3.1672323489059027, + "learning_rate": 1.38743027490062e-05, + "loss": 0.8627, + "step": 2562 + }, + { + "epoch": 0.39, + "grad_norm": 2.9899067313211556, + "learning_rate": 1.386973192253127e-05, + "loss": 0.9436, + "step": 2563 + }, + { + "epoch": 0.39, + "grad_norm": 2.742952405237023, + "learning_rate": 1.3865160144984766e-05, + "loss": 0.7929, + "step": 2564 + }, + { + "epoch": 0.39, + "grad_norm": 2.709277125727227, + "learning_rate": 1.38605874174903e-05, + "loss": 0.8206, + "step": 2565 + }, + { + "epoch": 0.39, + "grad_norm": 2.8380760313223443, + "learning_rate": 1.3856013741171723e-05, + "loss": 0.9272, + "step": 2566 + }, + { + "epoch": 0.39, + "grad_norm": 2.557049466152109, + "learning_rate": 1.3851439117153114e-05, + "loss": 0.8834, + "step": 2567 + }, + { + "epoch": 0.39, + "grad_norm": 2.674380736873026, + "learning_rate": 1.3846863546558783e-05, + "loss": 0.8271, + "step": 2568 + }, + { + "epoch": 0.39, + "grad_norm": 2.729408030623172, + "learning_rate": 1.3842287030513287e-05, + "loss": 0.917, + "step": 2569 + }, + { + "epoch": 0.39, + "grad_norm": 2.4925015801647987, + "learning_rate": 1.3837709570141401e-05, + "loss": 0.8981, + "step": 2570 + }, + { + "epoch": 0.39, + "grad_norm": 2.881328573934881, + "learning_rate": 1.3833131166568132e-05, + "loss": 0.9229, + "step": 2571 + }, + { + "epoch": 0.39, + "grad_norm": 2.6522807333490324, + "learning_rate": 1.3828551820918726e-05, + "loss": 1.0144, + "step": 2572 + }, + { + "epoch": 0.39, + "grad_norm": 2.631217876018588, + "learning_rate": 1.382397153431866e-05, + "loss": 0.8376, + "step": 2573 + }, + { + "epoch": 0.39, + "grad_norm": 2.752984398348073, + "learning_rate": 1.3819390307893637e-05, + "loss": 0.8932, + "step": 2574 + }, + { + "epoch": 0.39, + "grad_norm": 2.6337997335427636, + "learning_rate": 1.3814808142769596e-05, + "loss": 0.8712, + "step": 2575 + }, + { + "epoch": 0.39, + "grad_norm": 2.767756879305521, + "learning_rate": 1.3810225040072702e-05, + "loss": 0.8711, + "step": 2576 + }, + { + "epoch": 0.39, + "grad_norm": 2.833671564740066, + "learning_rate": 1.380564100092936e-05, + "loss": 0.9122, + "step": 2577 + }, + { + "epoch": 0.39, + "grad_norm": 2.6823193648331993, + "learning_rate": 1.3801056026466187e-05, + "loss": 0.9515, + "step": 2578 + }, + { + "epoch": 0.39, + "grad_norm": 2.7052167356636985, + "learning_rate": 1.3796470117810047e-05, + "loss": 0.9358, + "step": 2579 + }, + { + "epoch": 0.39, + "grad_norm": 2.7684445261992323, + "learning_rate": 1.3791883276088032e-05, + "loss": 0.9435, + "step": 2580 + }, + { + "epoch": 0.4, + "grad_norm": 2.7770127368337647, + "learning_rate": 1.3787295502427456e-05, + "loss": 1.0437, + "step": 2581 + }, + { + "epoch": 0.4, + "grad_norm": 2.7540315264272577, + "learning_rate": 1.3782706797955862e-05, + "loss": 0.8721, + "step": 2582 + }, + { + "epoch": 0.4, + "grad_norm": 2.6708408424496337, + "learning_rate": 1.3778117163801026e-05, + "loss": 0.8377, + "step": 2583 + }, + { + "epoch": 0.4, + "grad_norm": 2.4750890755484996, + "learning_rate": 1.3773526601090953e-05, + "loss": 0.9632, + "step": 2584 + }, + { + "epoch": 0.4, + "grad_norm": 2.8539918952627445, + "learning_rate": 1.3768935110953876e-05, + "loss": 0.8852, + "step": 2585 + }, + { + "epoch": 0.4, + "grad_norm": 2.7756479228382283, + "learning_rate": 1.376434269451825e-05, + "loss": 0.9469, + "step": 2586 + }, + { + "epoch": 0.4, + "grad_norm": 2.8092528873183813, + "learning_rate": 1.3759749352912766e-05, + "loss": 0.8656, + "step": 2587 + }, + { + "epoch": 0.4, + "grad_norm": 2.7460581990366535, + "learning_rate": 1.3755155087266339e-05, + "loss": 0.9428, + "step": 2588 + }, + { + "epoch": 0.4, + "grad_norm": 2.6634448363350254, + "learning_rate": 1.3750559898708104e-05, + "loss": 0.7914, + "step": 2589 + }, + { + "epoch": 0.4, + "grad_norm": 3.3016500768582326, + "learning_rate": 1.3745963788367438e-05, + "loss": 0.9247, + "step": 2590 + }, + { + "epoch": 0.4, + "grad_norm": 2.8604673823201012, + "learning_rate": 1.3741366757373928e-05, + "loss": 0.7247, + "step": 2591 + }, + { + "epoch": 0.4, + "grad_norm": 2.634064735941571, + "learning_rate": 1.3736768806857405e-05, + "loss": 0.7969, + "step": 2592 + }, + { + "epoch": 0.4, + "grad_norm": 2.6425265646373126, + "learning_rate": 1.373216993794791e-05, + "loss": 0.7945, + "step": 2593 + }, + { + "epoch": 0.4, + "grad_norm": 2.8211610865763577, + "learning_rate": 1.3727570151775716e-05, + "loss": 0.9039, + "step": 2594 + }, + { + "epoch": 0.4, + "grad_norm": 2.569205222997264, + "learning_rate": 1.3722969449471319e-05, + "loss": 0.9151, + "step": 2595 + }, + { + "epoch": 0.4, + "grad_norm": 2.716633530129442, + "learning_rate": 1.3718367832165451e-05, + "loss": 0.8682, + "step": 2596 + }, + { + "epoch": 0.4, + "grad_norm": 2.9394906838196153, + "learning_rate": 1.3713765300989053e-05, + "loss": 0.8901, + "step": 2597 + }, + { + "epoch": 0.4, + "grad_norm": 2.8753101369751275, + "learning_rate": 1.37091618570733e-05, + "loss": 1.0159, + "step": 2598 + }, + { + "epoch": 0.4, + "grad_norm": 2.6976133669347346, + "learning_rate": 1.3704557501549594e-05, + "loss": 0.8259, + "step": 2599 + }, + { + "epoch": 0.4, + "grad_norm": 2.9676339712488287, + "learning_rate": 1.3699952235549547e-05, + "loss": 0.8872, + "step": 2600 + }, + { + "epoch": 0.4, + "grad_norm": 3.015750805388782, + "learning_rate": 1.369534606020501e-05, + "loss": 0.9446, + "step": 2601 + }, + { + "epoch": 0.4, + "grad_norm": 3.0731985649847657, + "learning_rate": 1.3690738976648053e-05, + "loss": 1.0013, + "step": 2602 + }, + { + "epoch": 0.4, + "grad_norm": 3.163854041137835, + "learning_rate": 1.3686130986010965e-05, + "loss": 0.9987, + "step": 2603 + }, + { + "epoch": 0.4, + "grad_norm": 2.706206243594574, + "learning_rate": 1.3681522089426265e-05, + "loss": 0.8524, + "step": 2604 + }, + { + "epoch": 0.4, + "grad_norm": 2.575483493975654, + "learning_rate": 1.3676912288026685e-05, + "loss": 0.7937, + "step": 2605 + }, + { + "epoch": 0.4, + "grad_norm": 2.9879921457113077, + "learning_rate": 1.3672301582945187e-05, + "loss": 0.8572, + "step": 2606 + }, + { + "epoch": 0.4, + "grad_norm": 2.853495583921802, + "learning_rate": 1.3667689975314955e-05, + "loss": 0.98, + "step": 2607 + }, + { + "epoch": 0.4, + "grad_norm": 2.90434906178557, + "learning_rate": 1.366307746626939e-05, + "loss": 0.8291, + "step": 2608 + }, + { + "epoch": 0.4, + "grad_norm": 2.837540695181841, + "learning_rate": 1.365846405694212e-05, + "loss": 0.8312, + "step": 2609 + }, + { + "epoch": 0.4, + "grad_norm": 6.641247798531113, + "learning_rate": 1.3653849748466991e-05, + "loss": 1.1737, + "step": 2610 + }, + { + "epoch": 0.4, + "grad_norm": 2.845182795899973, + "learning_rate": 1.364923454197807e-05, + "loss": 0.9513, + "step": 2611 + }, + { + "epoch": 0.4, + "grad_norm": 2.8796126601879175, + "learning_rate": 1.3644618438609643e-05, + "loss": 0.8865, + "step": 2612 + }, + { + "epoch": 0.4, + "grad_norm": 2.8526823560302756, + "learning_rate": 1.364000143949622e-05, + "loss": 0.9331, + "step": 2613 + }, + { + "epoch": 0.4, + "grad_norm": 4.3528468609485875, + "learning_rate": 1.3635383545772534e-05, + "loss": 0.8366, + "step": 2614 + }, + { + "epoch": 0.4, + "grad_norm": 3.41234900636786, + "learning_rate": 1.3630764758573529e-05, + "loss": 0.834, + "step": 2615 + }, + { + "epoch": 0.4, + "grad_norm": 2.988943225087445, + "learning_rate": 1.3626145079034374e-05, + "loss": 0.9219, + "step": 2616 + }, + { + "epoch": 0.4, + "grad_norm": 2.9776376496628636, + "learning_rate": 1.3621524508290457e-05, + "loss": 0.9286, + "step": 2617 + }, + { + "epoch": 0.4, + "grad_norm": 3.229730135176999, + "learning_rate": 1.361690304747739e-05, + "loss": 0.9246, + "step": 2618 + }, + { + "epoch": 0.4, + "grad_norm": 2.9490639624794284, + "learning_rate": 1.361228069773099e-05, + "loss": 0.9191, + "step": 2619 + }, + { + "epoch": 0.4, + "grad_norm": 2.9437329662922713, + "learning_rate": 1.3607657460187307e-05, + "loss": 0.9408, + "step": 2620 + }, + { + "epoch": 0.4, + "grad_norm": 2.5307046738228607, + "learning_rate": 1.36030333359826e-05, + "loss": 0.8944, + "step": 2621 + }, + { + "epoch": 0.4, + "grad_norm": 2.6236321071528637, + "learning_rate": 1.3598408326253348e-05, + "loss": 0.8936, + "step": 2622 + }, + { + "epoch": 0.4, + "grad_norm": 3.0871202845956263, + "learning_rate": 1.3593782432136251e-05, + "loss": 0.9609, + "step": 2623 + }, + { + "epoch": 0.4, + "grad_norm": 2.6858071831140933, + "learning_rate": 1.3589155654768224e-05, + "loss": 0.8933, + "step": 2624 + }, + { + "epoch": 0.4, + "grad_norm": 8.045716570735623, + "learning_rate": 1.35845279952864e-05, + "loss": 1.1436, + "step": 2625 + }, + { + "epoch": 0.4, + "grad_norm": 2.8137361622080816, + "learning_rate": 1.3579899454828126e-05, + "loss": 0.9405, + "step": 2626 + }, + { + "epoch": 0.4, + "grad_norm": 2.8962261675877774, + "learning_rate": 1.3575270034530967e-05, + "loss": 0.8779, + "step": 2627 + }, + { + "epoch": 0.4, + "grad_norm": 2.6786738209282155, + "learning_rate": 1.3570639735532707e-05, + "loss": 0.8522, + "step": 2628 + }, + { + "epoch": 0.4, + "grad_norm": 2.8984792337350838, + "learning_rate": 1.3566008558971342e-05, + "loss": 0.8925, + "step": 2629 + }, + { + "epoch": 0.4, + "grad_norm": 2.6947101762433547, + "learning_rate": 1.3561376505985085e-05, + "loss": 0.8026, + "step": 2630 + }, + { + "epoch": 0.4, + "grad_norm": 2.7174665627606913, + "learning_rate": 1.3556743577712363e-05, + "loss": 0.929, + "step": 2631 + }, + { + "epoch": 0.4, + "grad_norm": 2.8030923315646157, + "learning_rate": 1.3552109775291828e-05, + "loss": 0.9614, + "step": 2632 + }, + { + "epoch": 0.4, + "grad_norm": 3.067036667463295, + "learning_rate": 1.354747509986233e-05, + "loss": 0.8985, + "step": 2633 + }, + { + "epoch": 0.4, + "grad_norm": 2.737467558758066, + "learning_rate": 1.3542839552562945e-05, + "loss": 0.9098, + "step": 2634 + }, + { + "epoch": 0.4, + "grad_norm": 2.9905535153970537, + "learning_rate": 1.353820313453296e-05, + "loss": 0.8895, + "step": 2635 + }, + { + "epoch": 0.4, + "grad_norm": 2.8996950341165877, + "learning_rate": 1.353356584691188e-05, + "loss": 0.9421, + "step": 2636 + }, + { + "epoch": 0.4, + "grad_norm": 3.1401490422536424, + "learning_rate": 1.3528927690839414e-05, + "loss": 0.8608, + "step": 2637 + }, + { + "epoch": 0.4, + "grad_norm": 2.7015344706197664, + "learning_rate": 1.3524288667455497e-05, + "loss": 0.9702, + "step": 2638 + }, + { + "epoch": 0.4, + "grad_norm": 2.580217257775127, + "learning_rate": 1.3519648777900264e-05, + "loss": 0.8723, + "step": 2639 + }, + { + "epoch": 0.4, + "grad_norm": 3.0334489963523077, + "learning_rate": 1.3515008023314077e-05, + "loss": 0.9329, + "step": 2640 + }, + { + "epoch": 0.4, + "grad_norm": 2.94368863839663, + "learning_rate": 1.3510366404837499e-05, + "loss": 0.8787, + "step": 2641 + }, + { + "epoch": 0.4, + "grad_norm": 2.4523195891235248, + "learning_rate": 1.3505723923611309e-05, + "loss": 0.85, + "step": 2642 + }, + { + "epoch": 0.4, + "grad_norm": 2.845812910882723, + "learning_rate": 1.3501080580776504e-05, + "loss": 0.9852, + "step": 2643 + }, + { + "epoch": 0.4, + "grad_norm": 2.6528192407446602, + "learning_rate": 1.3496436377474282e-05, + "loss": 0.8928, + "step": 2644 + }, + { + "epoch": 0.4, + "grad_norm": 2.815621799890215, + "learning_rate": 1.3491791314846059e-05, + "loss": 0.857, + "step": 2645 + }, + { + "epoch": 0.41, + "grad_norm": 3.398570829960979, + "learning_rate": 1.348714539403346e-05, + "loss": 0.8943, + "step": 2646 + }, + { + "epoch": 0.41, + "grad_norm": 2.7354191271811326, + "learning_rate": 1.3482498616178329e-05, + "loss": 0.9616, + "step": 2647 + }, + { + "epoch": 0.41, + "grad_norm": 3.0440592594486175, + "learning_rate": 1.3477850982422704e-05, + "loss": 0.8634, + "step": 2648 + }, + { + "epoch": 0.41, + "grad_norm": 2.6829014105178377, + "learning_rate": 1.3473202493908847e-05, + "loss": 0.9402, + "step": 2649 + }, + { + "epoch": 0.41, + "grad_norm": 2.6462089340973725, + "learning_rate": 1.3468553151779229e-05, + "loss": 0.7961, + "step": 2650 + }, + { + "epoch": 0.41, + "grad_norm": 2.9826913776524546, + "learning_rate": 1.3463902957176526e-05, + "loss": 0.8488, + "step": 2651 + }, + { + "epoch": 0.41, + "grad_norm": 2.8634755106514653, + "learning_rate": 1.3459251911243623e-05, + "loss": 0.8495, + "step": 2652 + }, + { + "epoch": 0.41, + "grad_norm": 2.6810739867840625, + "learning_rate": 1.345460001512362e-05, + "loss": 1.0048, + "step": 2653 + }, + { + "epoch": 0.41, + "grad_norm": 2.8634739647852445, + "learning_rate": 1.344994726995982e-05, + "loss": 0.9138, + "step": 2654 + }, + { + "epoch": 0.41, + "grad_norm": 3.0049448169785546, + "learning_rate": 1.3445293676895742e-05, + "loss": 0.9287, + "step": 2655 + }, + { + "epoch": 0.41, + "grad_norm": 2.8368502452839635, + "learning_rate": 1.34406392370751e-05, + "loss": 0.8992, + "step": 2656 + }, + { + "epoch": 0.41, + "grad_norm": 2.7559802643598608, + "learning_rate": 1.3435983951641831e-05, + "loss": 0.844, + "step": 2657 + }, + { + "epoch": 0.41, + "grad_norm": 3.1639489232846976, + "learning_rate": 1.3431327821740074e-05, + "loss": 0.9783, + "step": 2658 + }, + { + "epoch": 0.41, + "grad_norm": 2.5013584115803975, + "learning_rate": 1.3426670848514172e-05, + "loss": 0.8197, + "step": 2659 + }, + { + "epoch": 0.41, + "grad_norm": 2.6903599600202748, + "learning_rate": 1.3422013033108683e-05, + "loss": 0.8648, + "step": 2660 + }, + { + "epoch": 0.41, + "grad_norm": 2.889426640643883, + "learning_rate": 1.341735437666836e-05, + "loss": 0.9477, + "step": 2661 + }, + { + "epoch": 0.41, + "grad_norm": 2.561490050920701, + "learning_rate": 1.341269488033818e-05, + "loss": 0.8577, + "step": 2662 + }, + { + "epoch": 0.41, + "grad_norm": 2.93139917411127, + "learning_rate": 1.3408034545263307e-05, + "loss": 0.8558, + "step": 2663 + }, + { + "epoch": 0.41, + "grad_norm": 2.8483228045095426, + "learning_rate": 1.3403373372589126e-05, + "loss": 0.8472, + "step": 2664 + }, + { + "epoch": 0.41, + "grad_norm": 2.8900801444678326, + "learning_rate": 1.339871136346122e-05, + "loss": 0.9375, + "step": 2665 + }, + { + "epoch": 0.41, + "grad_norm": 2.761176789590491, + "learning_rate": 1.3394048519025385e-05, + "loss": 0.88, + "step": 2666 + }, + { + "epoch": 0.41, + "grad_norm": 2.995974666022481, + "learning_rate": 1.3389384840427609e-05, + "loss": 0.9282, + "step": 2667 + }, + { + "epoch": 0.41, + "grad_norm": 2.795462437751262, + "learning_rate": 1.3384720328814101e-05, + "loss": 0.8563, + "step": 2668 + }, + { + "epoch": 0.41, + "grad_norm": 2.8992183719809117, + "learning_rate": 1.338005498533126e-05, + "loss": 0.9361, + "step": 2669 + }, + { + "epoch": 0.41, + "grad_norm": 2.6516545078942926, + "learning_rate": 1.3375388811125707e-05, + "loss": 0.9369, + "step": 2670 + }, + { + "epoch": 0.41, + "grad_norm": 2.369168985699152, + "learning_rate": 1.337072180734425e-05, + "loss": 0.7759, + "step": 2671 + }, + { + "epoch": 0.41, + "grad_norm": 3.088417388011945, + "learning_rate": 1.3366053975133904e-05, + "loss": 0.8851, + "step": 2672 + }, + { + "epoch": 0.41, + "grad_norm": 2.9663036317798768, + "learning_rate": 1.3361385315641898e-05, + "loss": 0.9257, + "step": 2673 + }, + { + "epoch": 0.41, + "grad_norm": 3.1256307601041797, + "learning_rate": 1.3356715830015652e-05, + "loss": 0.9925, + "step": 2674 + }, + { + "epoch": 0.41, + "grad_norm": 7.090405001302476, + "learning_rate": 1.3352045519402799e-05, + "loss": 1.1464, + "step": 2675 + }, + { + "epoch": 0.41, + "grad_norm": 2.604324707239911, + "learning_rate": 1.3347374384951171e-05, + "loss": 0.8574, + "step": 2676 + }, + { + "epoch": 0.41, + "grad_norm": 2.7363852452328645, + "learning_rate": 1.33427024278088e-05, + "loss": 0.8872, + "step": 2677 + }, + { + "epoch": 0.41, + "grad_norm": 3.7479083095110153, + "learning_rate": 1.333802964912392e-05, + "loss": 0.8885, + "step": 2678 + }, + { + "epoch": 0.41, + "grad_norm": 2.7348959309497, + "learning_rate": 1.333335605004497e-05, + "loss": 0.8658, + "step": 2679 + }, + { + "epoch": 0.41, + "grad_norm": 3.015353432771644, + "learning_rate": 1.332868163172059e-05, + "loss": 0.949, + "step": 2680 + }, + { + "epoch": 0.41, + "grad_norm": 2.513831609319815, + "learning_rate": 1.3324006395299624e-05, + "loss": 0.8491, + "step": 2681 + }, + { + "epoch": 0.41, + "grad_norm": 2.646123238994641, + "learning_rate": 1.3319330341931112e-05, + "loss": 0.8096, + "step": 2682 + }, + { + "epoch": 0.41, + "grad_norm": 2.870007829546668, + "learning_rate": 1.3314653472764293e-05, + "loss": 0.8376, + "step": 2683 + }, + { + "epoch": 0.41, + "grad_norm": 5.54424602096051, + "learning_rate": 1.3309975788948616e-05, + "loss": 1.0279, + "step": 2684 + }, + { + "epoch": 0.41, + "grad_norm": 2.650425831800214, + "learning_rate": 1.330529729163372e-05, + "loss": 0.8417, + "step": 2685 + }, + { + "epoch": 0.41, + "grad_norm": 2.793823761613536, + "learning_rate": 1.330061798196945e-05, + "loss": 0.858, + "step": 2686 + }, + { + "epoch": 0.41, + "grad_norm": 3.0661166656301595, + "learning_rate": 1.3295937861105848e-05, + "loss": 0.8369, + "step": 2687 + }, + { + "epoch": 0.41, + "grad_norm": 2.8365228650619096, + "learning_rate": 1.3291256930193164e-05, + "loss": 0.8982, + "step": 2688 + }, + { + "epoch": 0.41, + "grad_norm": 2.829947055923032, + "learning_rate": 1.3286575190381828e-05, + "loss": 0.9457, + "step": 2689 + }, + { + "epoch": 0.41, + "grad_norm": 2.8245219608219343, + "learning_rate": 1.3281892642822488e-05, + "loss": 0.8713, + "step": 2690 + }, + { + "epoch": 0.41, + "grad_norm": 2.7515076639581935, + "learning_rate": 1.3277209288665977e-05, + "loss": 0.8659, + "step": 2691 + }, + { + "epoch": 0.41, + "grad_norm": 3.039281991650958, + "learning_rate": 1.3272525129063339e-05, + "loss": 0.9434, + "step": 2692 + }, + { + "epoch": 0.41, + "grad_norm": 2.78357479127376, + "learning_rate": 1.3267840165165802e-05, + "loss": 0.9466, + "step": 2693 + }, + { + "epoch": 0.41, + "grad_norm": 2.708980316331865, + "learning_rate": 1.3263154398124807e-05, + "loss": 0.7815, + "step": 2694 + }, + { + "epoch": 0.41, + "grad_norm": 2.539856934636554, + "learning_rate": 1.325846782909198e-05, + "loss": 0.8458, + "step": 2695 + }, + { + "epoch": 0.41, + "grad_norm": 2.9120051675086893, + "learning_rate": 1.3253780459219143e-05, + "loss": 0.9523, + "step": 2696 + }, + { + "epoch": 0.41, + "grad_norm": 2.6221844576287014, + "learning_rate": 1.3249092289658327e-05, + "loss": 0.8075, + "step": 2697 + }, + { + "epoch": 0.41, + "grad_norm": 3.1776531212135213, + "learning_rate": 1.324440332156175e-05, + "loss": 0.8592, + "step": 2698 + }, + { + "epoch": 0.41, + "grad_norm": 2.8181917492879025, + "learning_rate": 1.323971355608183e-05, + "loss": 0.9806, + "step": 2699 + }, + { + "epoch": 0.41, + "grad_norm": 2.916697538337927, + "learning_rate": 1.323502299437118e-05, + "loss": 0.9191, + "step": 2700 + }, + { + "epoch": 0.41, + "grad_norm": 2.682544239590553, + "learning_rate": 1.3230331637582605e-05, + "loss": 0.9452, + "step": 2701 + }, + { + "epoch": 0.41, + "grad_norm": 2.803323300095509, + "learning_rate": 1.3225639486869113e-05, + "loss": 0.8157, + "step": 2702 + }, + { + "epoch": 0.41, + "grad_norm": 2.7628897421404925, + "learning_rate": 1.3220946543383904e-05, + "loss": 0.8419, + "step": 2703 + }, + { + "epoch": 0.41, + "grad_norm": 2.9899136183471016, + "learning_rate": 1.3216252808280366e-05, + "loss": 0.9206, + "step": 2704 + }, + { + "epoch": 0.41, + "grad_norm": 2.9304013279937533, + "learning_rate": 1.3211558282712092e-05, + "loss": 0.8583, + "step": 2705 + }, + { + "epoch": 0.41, + "grad_norm": 3.0558376279085238, + "learning_rate": 1.3206862967832863e-05, + "loss": 0.9753, + "step": 2706 + }, + { + "epoch": 0.41, + "grad_norm": 7.049415755795687, + "learning_rate": 1.3202166864796659e-05, + "loss": 1.0676, + "step": 2707 + }, + { + "epoch": 0.41, + "grad_norm": 2.903397325916613, + "learning_rate": 1.3197469974757644e-05, + "loss": 0.8897, + "step": 2708 + }, + { + "epoch": 0.41, + "grad_norm": 2.927310383815381, + "learning_rate": 1.3192772298870187e-05, + "loss": 0.8699, + "step": 2709 + }, + { + "epoch": 0.41, + "grad_norm": 2.7231277330666335, + "learning_rate": 1.3188073838288844e-05, + "loss": 0.7612, + "step": 2710 + }, + { + "epoch": 0.41, + "grad_norm": 2.9162454164657357, + "learning_rate": 1.3183374594168368e-05, + "loss": 1.0089, + "step": 2711 + }, + { + "epoch": 0.42, + "grad_norm": 2.759293862016098, + "learning_rate": 1.3178674567663692e-05, + "loss": 0.9146, + "step": 2712 + }, + { + "epoch": 0.42, + "grad_norm": 2.873108983408784, + "learning_rate": 1.3173973759929956e-05, + "loss": 0.8961, + "step": 2713 + }, + { + "epoch": 0.42, + "grad_norm": 2.900800727457516, + "learning_rate": 1.3169272172122493e-05, + "loss": 0.8252, + "step": 2714 + }, + { + "epoch": 0.42, + "grad_norm": 2.671252646096681, + "learning_rate": 1.3164569805396812e-05, + "loss": 0.9027, + "step": 2715 + }, + { + "epoch": 0.42, + "grad_norm": 2.4209487105746272, + "learning_rate": 1.3159866660908629e-05, + "loss": 0.818, + "step": 2716 + }, + { + "epoch": 0.42, + "grad_norm": 2.596162460313096, + "learning_rate": 1.315516273981384e-05, + "loss": 0.8533, + "step": 2717 + }, + { + "epoch": 0.42, + "grad_norm": 2.8102198032524477, + "learning_rate": 1.3150458043268541e-05, + "loss": 0.9579, + "step": 2718 + }, + { + "epoch": 0.42, + "grad_norm": 2.7144446997601652, + "learning_rate": 1.3145752572429012e-05, + "loss": 0.9368, + "step": 2719 + }, + { + "epoch": 0.42, + "grad_norm": 2.9364685528235817, + "learning_rate": 1.3141046328451724e-05, + "loss": 0.8868, + "step": 2720 + }, + { + "epoch": 0.42, + "grad_norm": 4.064401225713515, + "learning_rate": 1.3136339312493346e-05, + "loss": 0.9467, + "step": 2721 + }, + { + "epoch": 0.42, + "grad_norm": 2.5323842705217143, + "learning_rate": 1.3131631525710729e-05, + "loss": 0.822, + "step": 2722 + }, + { + "epoch": 0.42, + "grad_norm": 2.936538224523595, + "learning_rate": 1.3126922969260908e-05, + "loss": 0.8762, + "step": 2723 + }, + { + "epoch": 0.42, + "grad_norm": 2.640141372170476, + "learning_rate": 1.312221364430112e-05, + "loss": 0.9057, + "step": 2724 + }, + { + "epoch": 0.42, + "grad_norm": 2.9851422679608923, + "learning_rate": 1.3117503551988786e-05, + "loss": 0.9964, + "step": 2725 + }, + { + "epoch": 0.42, + "grad_norm": 2.729257381358218, + "learning_rate": 1.311279269348151e-05, + "loss": 0.905, + "step": 2726 + }, + { + "epoch": 0.42, + "grad_norm": 2.8125014739544567, + "learning_rate": 1.3108081069937095e-05, + "loss": 0.8715, + "step": 2727 + }, + { + "epoch": 0.42, + "grad_norm": 3.0387568343577667, + "learning_rate": 1.3103368682513519e-05, + "loss": 0.8337, + "step": 2728 + }, + { + "epoch": 0.42, + "grad_norm": 3.0422667008518354, + "learning_rate": 1.309865553236896e-05, + "loss": 0.8908, + "step": 2729 + }, + { + "epoch": 0.42, + "grad_norm": 2.685792667296222, + "learning_rate": 1.3093941620661777e-05, + "loss": 0.8665, + "step": 2730 + }, + { + "epoch": 0.42, + "grad_norm": 2.864671960381527, + "learning_rate": 1.3089226948550513e-05, + "loss": 0.8474, + "step": 2731 + }, + { + "epoch": 0.42, + "grad_norm": 2.6702622460485546, + "learning_rate": 1.3084511517193908e-05, + "loss": 0.8985, + "step": 2732 + }, + { + "epoch": 0.42, + "grad_norm": 2.943472642915905, + "learning_rate": 1.3079795327750882e-05, + "loss": 0.8237, + "step": 2733 + }, + { + "epoch": 0.42, + "grad_norm": 2.80814879490897, + "learning_rate": 1.307507838138054e-05, + "loss": 0.9702, + "step": 2734 + }, + { + "epoch": 0.42, + "grad_norm": 2.763516539187601, + "learning_rate": 1.3070360679242171e-05, + "loss": 0.9229, + "step": 2735 + }, + { + "epoch": 0.42, + "grad_norm": 2.513463001939668, + "learning_rate": 1.3065642222495263e-05, + "loss": 0.8847, + "step": 2736 + }, + { + "epoch": 0.42, + "grad_norm": 2.6352679845951053, + "learning_rate": 1.3060923012299474e-05, + "loss": 0.808, + "step": 2737 + }, + { + "epoch": 0.42, + "grad_norm": 2.6534091049974515, + "learning_rate": 1.3056203049814657e-05, + "loss": 0.8816, + "step": 2738 + }, + { + "epoch": 0.42, + "grad_norm": 2.6125283309661, + "learning_rate": 1.3051482336200844e-05, + "loss": 0.8314, + "step": 2739 + }, + { + "epoch": 0.42, + "grad_norm": 2.5198187375176504, + "learning_rate": 1.3046760872618256e-05, + "loss": 0.8629, + "step": 2740 + }, + { + "epoch": 0.42, + "grad_norm": 3.314150472453498, + "learning_rate": 1.3042038660227294e-05, + "loss": 0.9143, + "step": 2741 + }, + { + "epoch": 0.42, + "grad_norm": 2.682097793806096, + "learning_rate": 1.3037315700188545e-05, + "loss": 0.8695, + "step": 2742 + }, + { + "epoch": 0.42, + "grad_norm": 2.691789862579373, + "learning_rate": 1.3032591993662782e-05, + "loss": 0.9287, + "step": 2743 + }, + { + "epoch": 0.42, + "grad_norm": 2.7938866440413697, + "learning_rate": 1.3027867541810961e-05, + "loss": 0.9067, + "step": 2744 + }, + { + "epoch": 0.42, + "grad_norm": 2.806362708709313, + "learning_rate": 1.3023142345794217e-05, + "loss": 0.925, + "step": 2745 + }, + { + "epoch": 0.42, + "grad_norm": 2.8228075732201483, + "learning_rate": 1.3018416406773872e-05, + "loss": 0.9025, + "step": 2746 + }, + { + "epoch": 0.42, + "grad_norm": 2.6855012662957063, + "learning_rate": 1.3013689725911429e-05, + "loss": 0.8737, + "step": 2747 + }, + { + "epoch": 0.42, + "grad_norm": 2.816680124122433, + "learning_rate": 1.3008962304368574e-05, + "loss": 0.9318, + "step": 2748 + }, + { + "epoch": 0.42, + "grad_norm": 2.7972551434857094, + "learning_rate": 1.3004234143307173e-05, + "loss": 1.0027, + "step": 2749 + }, + { + "epoch": 0.42, + "grad_norm": 2.8968584915515048, + "learning_rate": 1.2999505243889276e-05, + "loss": 0.8936, + "step": 2750 + }, + { + "epoch": 0.42, + "grad_norm": 2.8214777133442754, + "learning_rate": 1.2994775607277117e-05, + "loss": 0.8905, + "step": 2751 + }, + { + "epoch": 0.42, + "grad_norm": 2.7101579231098265, + "learning_rate": 1.2990045234633103e-05, + "loss": 0.9263, + "step": 2752 + }, + { + "epoch": 0.42, + "grad_norm": 2.677902149989637, + "learning_rate": 1.2985314127119827e-05, + "loss": 0.8503, + "step": 2753 + }, + { + "epoch": 0.42, + "grad_norm": 2.7699643153619378, + "learning_rate": 1.2980582285900067e-05, + "loss": 0.9604, + "step": 2754 + }, + { + "epoch": 0.42, + "grad_norm": 2.9787744770673408, + "learning_rate": 1.2975849712136777e-05, + "loss": 0.9063, + "step": 2755 + }, + { + "epoch": 0.42, + "grad_norm": 2.6750675394118044, + "learning_rate": 1.2971116406993087e-05, + "loss": 1.0037, + "step": 2756 + }, + { + "epoch": 0.42, + "grad_norm": 2.76107592986856, + "learning_rate": 1.2966382371632312e-05, + "loss": 0.8496, + "step": 2757 + }, + { + "epoch": 0.42, + "grad_norm": 2.810847329041354, + "learning_rate": 1.2961647607217947e-05, + "loss": 0.9515, + "step": 2758 + }, + { + "epoch": 0.42, + "grad_norm": 2.5966067731708664, + "learning_rate": 1.295691211491366e-05, + "loss": 0.8798, + "step": 2759 + }, + { + "epoch": 0.42, + "grad_norm": 2.62491735299009, + "learning_rate": 1.2952175895883305e-05, + "loss": 0.8362, + "step": 2760 + }, + { + "epoch": 0.42, + "grad_norm": 2.739999232523196, + "learning_rate": 1.2947438951290914e-05, + "loss": 0.8861, + "step": 2761 + }, + { + "epoch": 0.42, + "grad_norm": 2.794841678738877, + "learning_rate": 1.2942701282300694e-05, + "loss": 0.8742, + "step": 2762 + }, + { + "epoch": 0.42, + "grad_norm": 2.5384183680272097, + "learning_rate": 1.2937962890077025e-05, + "loss": 0.8167, + "step": 2763 + }, + { + "epoch": 0.42, + "grad_norm": 2.618528137650998, + "learning_rate": 1.2933223775784476e-05, + "loss": 0.9138, + "step": 2764 + }, + { + "epoch": 0.42, + "grad_norm": 2.9500922611185936, + "learning_rate": 1.292848394058779e-05, + "loss": 0.9018, + "step": 2765 + }, + { + "epoch": 0.42, + "grad_norm": 2.842181916751234, + "learning_rate": 1.2923743385651886e-05, + "loss": 0.9543, + "step": 2766 + }, + { + "epoch": 0.42, + "grad_norm": 3.0128918616195084, + "learning_rate": 1.2919002112141856e-05, + "loss": 1.0471, + "step": 2767 + }, + { + "epoch": 0.42, + "grad_norm": 2.561823446799556, + "learning_rate": 1.2914260121222973e-05, + "loss": 0.9366, + "step": 2768 + }, + { + "epoch": 0.42, + "grad_norm": 2.70083738483455, + "learning_rate": 1.2909517414060686e-05, + "loss": 0.9575, + "step": 2769 + }, + { + "epoch": 0.42, + "grad_norm": 2.6430987035979, + "learning_rate": 1.2904773991820619e-05, + "loss": 0.9195, + "step": 2770 + }, + { + "epoch": 0.42, + "grad_norm": 2.628729956718192, + "learning_rate": 1.290002985566857e-05, + "loss": 0.8752, + "step": 2771 + }, + { + "epoch": 0.42, + "grad_norm": 5.1639188996877206, + "learning_rate": 1.2895285006770521e-05, + "loss": 1.1261, + "step": 2772 + }, + { + "epoch": 0.42, + "grad_norm": 2.704694091407873, + "learning_rate": 1.2890539446292617e-05, + "loss": 0.8353, + "step": 2773 + }, + { + "epoch": 0.42, + "grad_norm": 2.7133213384391324, + "learning_rate": 1.2885793175401184e-05, + "loss": 0.8487, + "step": 2774 + }, + { + "epoch": 0.42, + "grad_norm": 2.766965649726584, + "learning_rate": 1.2881046195262722e-05, + "loss": 0.8796, + "step": 2775 + }, + { + "epoch": 0.42, + "grad_norm": 2.666040307272094, + "learning_rate": 1.2876298507043909e-05, + "loss": 0.9043, + "step": 2776 + }, + { + "epoch": 0.43, + "grad_norm": 2.6977785742953646, + "learning_rate": 1.2871550111911594e-05, + "loss": 0.9711, + "step": 2777 + }, + { + "epoch": 0.43, + "grad_norm": 2.7691161056205362, + "learning_rate": 1.2866801011032797e-05, + "loss": 0.9293, + "step": 2778 + }, + { + "epoch": 0.43, + "grad_norm": 2.7409823091121863, + "learning_rate": 1.2862051205574711e-05, + "loss": 0.9393, + "step": 2779 + }, + { + "epoch": 0.43, + "grad_norm": 2.729972015716603, + "learning_rate": 1.2857300696704709e-05, + "loss": 0.7922, + "step": 2780 + }, + { + "epoch": 0.43, + "grad_norm": 2.6327159311383377, + "learning_rate": 1.2852549485590333e-05, + "loss": 0.8727, + "step": 2781 + }, + { + "epoch": 0.43, + "grad_norm": 2.780302599242152, + "learning_rate": 1.2847797573399295e-05, + "loss": 0.995, + "step": 2782 + }, + { + "epoch": 0.43, + "grad_norm": 2.8037074207099093, + "learning_rate": 1.2843044961299485e-05, + "loss": 0.9171, + "step": 2783 + }, + { + "epoch": 0.43, + "grad_norm": 2.6445660114893594, + "learning_rate": 1.2838291650458958e-05, + "loss": 0.925, + "step": 2784 + }, + { + "epoch": 0.43, + "grad_norm": 2.837391869572842, + "learning_rate": 1.2833537642045946e-05, + "loss": 0.9195, + "step": 2785 + }, + { + "epoch": 0.43, + "grad_norm": 2.824701378485826, + "learning_rate": 1.282878293722885e-05, + "loss": 0.8747, + "step": 2786 + }, + { + "epoch": 0.43, + "grad_norm": 2.9369630126925377, + "learning_rate": 1.2824027537176245e-05, + "loss": 0.8264, + "step": 2787 + }, + { + "epoch": 0.43, + "grad_norm": 2.979787998965054, + "learning_rate": 1.2819271443056875e-05, + "loss": 0.8666, + "step": 2788 + }, + { + "epoch": 0.43, + "grad_norm": 2.7097297050328284, + "learning_rate": 1.2814514656039654e-05, + "loss": 0.8764, + "step": 2789 + }, + { + "epoch": 0.43, + "grad_norm": 2.7239968607272536, + "learning_rate": 1.2809757177293665e-05, + "loss": 0.8569, + "step": 2790 + }, + { + "epoch": 0.43, + "grad_norm": 5.739216278795826, + "learning_rate": 1.2804999007988164e-05, + "loss": 1.076, + "step": 2791 + }, + { + "epoch": 0.43, + "grad_norm": 2.78097932188797, + "learning_rate": 1.280024014929258e-05, + "loss": 0.9201, + "step": 2792 + }, + { + "epoch": 0.43, + "grad_norm": 2.7816801537901346, + "learning_rate": 1.2795480602376498e-05, + "loss": 0.8948, + "step": 2793 + }, + { + "epoch": 0.43, + "grad_norm": 2.8022224490668406, + "learning_rate": 1.279072036840969e-05, + "loss": 0.9382, + "step": 2794 + }, + { + "epoch": 0.43, + "grad_norm": 3.0675784329509876, + "learning_rate": 1.2785959448562085e-05, + "loss": 0.969, + "step": 2795 + }, + { + "epoch": 0.43, + "grad_norm": 2.6091188213405028, + "learning_rate": 1.2781197844003779e-05, + "loss": 0.7114, + "step": 2796 + }, + { + "epoch": 0.43, + "grad_norm": 2.9346912134531844, + "learning_rate": 1.2776435555905044e-05, + "loss": 0.9375, + "step": 2797 + }, + { + "epoch": 0.43, + "grad_norm": 2.720573376646091, + "learning_rate": 1.2771672585436319e-05, + "loss": 0.9549, + "step": 2798 + }, + { + "epoch": 0.43, + "grad_norm": 2.816531795887586, + "learning_rate": 1.2766908933768208e-05, + "loss": 0.8821, + "step": 2799 + }, + { + "epoch": 0.43, + "grad_norm": 2.8434065002551825, + "learning_rate": 1.276214460207148e-05, + "loss": 0.9636, + "step": 2800 + }, + { + "epoch": 0.43, + "grad_norm": 2.513410606594475, + "learning_rate": 1.2757379591517078e-05, + "loss": 0.8945, + "step": 2801 + }, + { + "epoch": 0.43, + "grad_norm": 2.572942288837356, + "learning_rate": 1.2752613903276105e-05, + "loss": 0.8388, + "step": 2802 + }, + { + "epoch": 0.43, + "grad_norm": 2.3877301322816327, + "learning_rate": 1.2747847538519835e-05, + "loss": 0.8968, + "step": 2803 + }, + { + "epoch": 0.43, + "grad_norm": 2.8973856675083773, + "learning_rate": 1.2743080498419706e-05, + "loss": 0.8581, + "step": 2804 + }, + { + "epoch": 0.43, + "grad_norm": 2.6479081480567146, + "learning_rate": 1.2738312784147321e-05, + "loss": 0.9422, + "step": 2805 + }, + { + "epoch": 0.43, + "grad_norm": 2.658536179740867, + "learning_rate": 1.2733544396874458e-05, + "loss": 0.9756, + "step": 2806 + }, + { + "epoch": 0.43, + "grad_norm": 2.7879058284524008, + "learning_rate": 1.2728775337773045e-05, + "loss": 0.7959, + "step": 2807 + }, + { + "epoch": 0.43, + "grad_norm": 2.56131826463935, + "learning_rate": 1.2724005608015182e-05, + "loss": 0.9383, + "step": 2808 + }, + { + "epoch": 0.43, + "grad_norm": 2.7028874499026885, + "learning_rate": 1.271923520877314e-05, + "loss": 0.9949, + "step": 2809 + }, + { + "epoch": 0.43, + "grad_norm": 2.6867758439522373, + "learning_rate": 1.2714464141219349e-05, + "loss": 0.9218, + "step": 2810 + }, + { + "epoch": 0.43, + "grad_norm": 2.52131520434168, + "learning_rate": 1.2709692406526402e-05, + "loss": 0.8991, + "step": 2811 + }, + { + "epoch": 0.43, + "grad_norm": 2.6569271969175845, + "learning_rate": 1.2704920005867056e-05, + "loss": 0.9066, + "step": 2812 + }, + { + "epoch": 0.43, + "grad_norm": 2.8660174545559434, + "learning_rate": 1.2700146940414235e-05, + "loss": 0.992, + "step": 2813 + }, + { + "epoch": 0.43, + "grad_norm": 2.8443981988384084, + "learning_rate": 1.2695373211341027e-05, + "loss": 0.9334, + "step": 2814 + }, + { + "epoch": 0.43, + "grad_norm": 2.8875230537426404, + "learning_rate": 1.2690598819820673e-05, + "loss": 0.9484, + "step": 2815 + }, + { + "epoch": 0.43, + "grad_norm": 2.4775901913816125, + "learning_rate": 1.2685823767026595e-05, + "loss": 0.891, + "step": 2816 + }, + { + "epoch": 0.43, + "grad_norm": 2.7206299402662686, + "learning_rate": 1.2681048054132362e-05, + "loss": 0.9514, + "step": 2817 + }, + { + "epoch": 0.43, + "grad_norm": 3.167692193895392, + "learning_rate": 1.2676271682311707e-05, + "loss": 0.8848, + "step": 2818 + }, + { + "epoch": 0.43, + "grad_norm": 5.190369872857878, + "learning_rate": 1.2671494652738532e-05, + "loss": 0.9674, + "step": 2819 + }, + { + "epoch": 0.43, + "grad_norm": 2.7075328177894096, + "learning_rate": 1.2666716966586897e-05, + "loss": 0.9712, + "step": 2820 + }, + { + "epoch": 0.43, + "grad_norm": 2.624690499566204, + "learning_rate": 1.2661938625031023e-05, + "loss": 0.8567, + "step": 2821 + }, + { + "epoch": 0.43, + "grad_norm": 2.8968353468717307, + "learning_rate": 1.2657159629245289e-05, + "loss": 0.8855, + "step": 2822 + }, + { + "epoch": 0.43, + "grad_norm": 2.654379274046832, + "learning_rate": 1.2652379980404243e-05, + "loss": 0.864, + "step": 2823 + }, + { + "epoch": 0.43, + "grad_norm": 2.6926644329777125, + "learning_rate": 1.2647599679682587e-05, + "loss": 0.8662, + "step": 2824 + }, + { + "epoch": 0.43, + "grad_norm": 3.082786408851719, + "learning_rate": 1.2642818728255187e-05, + "loss": 0.8975, + "step": 2825 + }, + { + "epoch": 0.43, + "grad_norm": 2.738555091909095, + "learning_rate": 1.2638037127297057e-05, + "loss": 0.846, + "step": 2826 + }, + { + "epoch": 0.43, + "grad_norm": 2.674779193194505, + "learning_rate": 1.2633254877983391e-05, + "loss": 0.8139, + "step": 2827 + }, + { + "epoch": 0.43, + "grad_norm": 2.6945356029820298, + "learning_rate": 1.2628471981489531e-05, + "loss": 1.0136, + "step": 2828 + }, + { + "epoch": 0.43, + "grad_norm": 2.752427401318561, + "learning_rate": 1.2623688438990977e-05, + "loss": 0.8937, + "step": 2829 + }, + { + "epoch": 0.43, + "grad_norm": 2.520523775277756, + "learning_rate": 1.2618904251663383e-05, + "loss": 0.8781, + "step": 2830 + }, + { + "epoch": 0.43, + "grad_norm": 2.7637733636446513, + "learning_rate": 1.2614119420682578e-05, + "loss": 0.8954, + "step": 2831 + }, + { + "epoch": 0.43, + "grad_norm": 2.618376381488524, + "learning_rate": 1.2609333947224536e-05, + "loss": 0.8622, + "step": 2832 + }, + { + "epoch": 0.43, + "grad_norm": 2.6964185788176973, + "learning_rate": 1.2604547832465388e-05, + "loss": 0.9027, + "step": 2833 + }, + { + "epoch": 0.43, + "grad_norm": 2.4961769332020256, + "learning_rate": 1.2599761077581432e-05, + "loss": 0.896, + "step": 2834 + }, + { + "epoch": 0.43, + "grad_norm": 3.1354249286666676, + "learning_rate": 1.2594973683749117e-05, + "loss": 0.917, + "step": 2835 + }, + { + "epoch": 0.43, + "grad_norm": 2.499449814627182, + "learning_rate": 1.2590185652145048e-05, + "loss": 0.876, + "step": 2836 + }, + { + "epoch": 0.43, + "grad_norm": 2.6758994644461054, + "learning_rate": 1.258539698394599e-05, + "loss": 0.9023, + "step": 2837 + }, + { + "epoch": 0.43, + "grad_norm": 2.721132926239213, + "learning_rate": 1.2580607680328862e-05, + "loss": 0.8959, + "step": 2838 + }, + { + "epoch": 0.43, + "grad_norm": 3.012454865319753, + "learning_rate": 1.2575817742470744e-05, + "loss": 0.856, + "step": 2839 + }, + { + "epoch": 0.43, + "grad_norm": 2.704714719021651, + "learning_rate": 1.2571027171548869e-05, + "loss": 0.8977, + "step": 2840 + }, + { + "epoch": 0.43, + "grad_norm": 2.84919928232391, + "learning_rate": 1.2566235968740617e-05, + "loss": 0.9952, + "step": 2841 + }, + { + "epoch": 0.44, + "grad_norm": 2.6574199172892525, + "learning_rate": 1.256144413522354e-05, + "loss": 0.9264, + "step": 2842 + }, + { + "epoch": 0.44, + "grad_norm": 2.614026383064716, + "learning_rate": 1.2556651672175334e-05, + "loss": 0.814, + "step": 2843 + }, + { + "epoch": 0.44, + "grad_norm": 2.701109692270557, + "learning_rate": 1.2551858580773849e-05, + "loss": 0.9096, + "step": 2844 + }, + { + "epoch": 0.44, + "grad_norm": 3.238141437418869, + "learning_rate": 1.2547064862197094e-05, + "loss": 0.9762, + "step": 2845 + }, + { + "epoch": 0.44, + "grad_norm": 2.6739228366592105, + "learning_rate": 1.254227051762323e-05, + "loss": 0.8861, + "step": 2846 + }, + { + "epoch": 0.44, + "grad_norm": 2.939568840529801, + "learning_rate": 1.2537475548230576e-05, + "loss": 0.897, + "step": 2847 + }, + { + "epoch": 0.44, + "grad_norm": 6.158877445042974, + "learning_rate": 1.2532679955197598e-05, + "loss": 1.1339, + "step": 2848 + }, + { + "epoch": 0.44, + "grad_norm": 2.7311145656490585, + "learning_rate": 1.2527883739702915e-05, + "loss": 0.7805, + "step": 2849 + }, + { + "epoch": 0.44, + "grad_norm": 2.887246109250409, + "learning_rate": 1.252308690292531e-05, + "loss": 0.9028, + "step": 2850 + }, + { + "epoch": 0.44, + "grad_norm": 3.1492961703914863, + "learning_rate": 1.2518289446043708e-05, + "loss": 0.8904, + "step": 2851 + }, + { + "epoch": 0.44, + "grad_norm": 2.86800072338607, + "learning_rate": 1.2513491370237185e-05, + "loss": 0.9, + "step": 2852 + }, + { + "epoch": 0.44, + "grad_norm": 3.0258328334393756, + "learning_rate": 1.2508692676684976e-05, + "loss": 0.9429, + "step": 2853 + }, + { + "epoch": 0.44, + "grad_norm": 2.7665456325819693, + "learning_rate": 1.250389336656647e-05, + "loss": 0.9439, + "step": 2854 + }, + { + "epoch": 0.44, + "grad_norm": 2.5646168892460377, + "learning_rate": 1.2499093441061197e-05, + "loss": 0.9178, + "step": 2855 + }, + { + "epoch": 0.44, + "grad_norm": 2.6287532728333125, + "learning_rate": 1.2494292901348843e-05, + "loss": 0.8059, + "step": 2856 + }, + { + "epoch": 0.44, + "grad_norm": 2.958320890926117, + "learning_rate": 1.2489491748609252e-05, + "loss": 0.8785, + "step": 2857 + }, + { + "epoch": 0.44, + "grad_norm": 3.2348670122620264, + "learning_rate": 1.2484689984022411e-05, + "loss": 0.8628, + "step": 2858 + }, + { + "epoch": 0.44, + "grad_norm": 2.980007281557108, + "learning_rate": 1.2479887608768456e-05, + "loss": 0.9237, + "step": 2859 + }, + { + "epoch": 0.44, + "grad_norm": 3.1141166290087186, + "learning_rate": 1.2475084624027676e-05, + "loss": 0.9854, + "step": 2860 + }, + { + "epoch": 0.44, + "grad_norm": 2.6165244904164173, + "learning_rate": 1.2470281030980514e-05, + "loss": 0.8273, + "step": 2861 + }, + { + "epoch": 0.44, + "grad_norm": 2.6790421711796113, + "learning_rate": 1.2465476830807554e-05, + "loss": 0.8995, + "step": 2862 + }, + { + "epoch": 0.44, + "grad_norm": 2.6940258268094537, + "learning_rate": 1.246067202468954e-05, + "loss": 0.7782, + "step": 2863 + }, + { + "epoch": 0.44, + "grad_norm": 2.7195876883706096, + "learning_rate": 1.245586661380735e-05, + "loss": 0.9042, + "step": 2864 + }, + { + "epoch": 0.44, + "grad_norm": 2.6665709592021134, + "learning_rate": 1.2451060599342027e-05, + "loss": 0.8503, + "step": 2865 + }, + { + "epoch": 0.44, + "grad_norm": 2.780535821992812, + "learning_rate": 1.244625398247475e-05, + "loss": 0.8841, + "step": 2866 + }, + { + "epoch": 0.44, + "grad_norm": 2.3861406054156697, + "learning_rate": 1.2441446764386852e-05, + "loss": 0.8027, + "step": 2867 + }, + { + "epoch": 0.44, + "grad_norm": 3.1004373602009925, + "learning_rate": 1.2436638946259812e-05, + "loss": 0.8828, + "step": 2868 + }, + { + "epoch": 0.44, + "grad_norm": 2.6726937897135388, + "learning_rate": 1.2431830529275258e-05, + "loss": 0.8096, + "step": 2869 + }, + { + "epoch": 0.44, + "grad_norm": 2.699417394780327, + "learning_rate": 1.242702151461496e-05, + "loss": 0.8516, + "step": 2870 + }, + { + "epoch": 0.44, + "grad_norm": 2.8101062223473368, + "learning_rate": 1.2422211903460845e-05, + "loss": 0.9078, + "step": 2871 + }, + { + "epoch": 0.44, + "grad_norm": 6.152195380073992, + "learning_rate": 1.2417401696994976e-05, + "loss": 1.0162, + "step": 2872 + }, + { + "epoch": 0.44, + "grad_norm": 2.8336300027831007, + "learning_rate": 1.241259089639957e-05, + "loss": 0.799, + "step": 2873 + }, + { + "epoch": 0.44, + "grad_norm": 3.050238017683165, + "learning_rate": 1.2407779502856987e-05, + "loss": 0.8961, + "step": 2874 + }, + { + "epoch": 0.44, + "grad_norm": 2.808485721043457, + "learning_rate": 1.2402967517549727e-05, + "loss": 0.8932, + "step": 2875 + }, + { + "epoch": 0.44, + "grad_norm": 2.7191297854715617, + "learning_rate": 1.2398154941660444e-05, + "loss": 0.9282, + "step": 2876 + }, + { + "epoch": 0.44, + "grad_norm": 2.724204028606329, + "learning_rate": 1.2393341776371938e-05, + "loss": 0.9891, + "step": 2877 + }, + { + "epoch": 0.44, + "grad_norm": 2.7601559575612145, + "learning_rate": 1.2388528022867149e-05, + "loss": 0.8699, + "step": 2878 + }, + { + "epoch": 0.44, + "grad_norm": 3.000915940449102, + "learning_rate": 1.238371368232916e-05, + "loss": 0.8998, + "step": 2879 + }, + { + "epoch": 0.44, + "grad_norm": 4.492961193488793, + "learning_rate": 1.23788987559412e-05, + "loss": 0.8853, + "step": 2880 + }, + { + "epoch": 0.44, + "grad_norm": 2.8430538524255873, + "learning_rate": 1.2374083244886643e-05, + "loss": 0.9793, + "step": 2881 + }, + { + "epoch": 0.44, + "grad_norm": 2.6505794293919682, + "learning_rate": 1.2369267150349009e-05, + "loss": 0.9107, + "step": 2882 + }, + { + "epoch": 0.44, + "grad_norm": 2.7726946812456537, + "learning_rate": 1.2364450473511958e-05, + "loss": 0.8269, + "step": 2883 + }, + { + "epoch": 0.44, + "grad_norm": 2.5356256975086793, + "learning_rate": 1.2359633215559297e-05, + "loss": 0.856, + "step": 2884 + }, + { + "epoch": 0.44, + "grad_norm": 2.705532891404338, + "learning_rate": 1.235481537767497e-05, + "loss": 0.8162, + "step": 2885 + }, + { + "epoch": 0.44, + "grad_norm": 2.5881663782552518, + "learning_rate": 1.2349996961043063e-05, + "loss": 0.8658, + "step": 2886 + }, + { + "epoch": 0.44, + "grad_norm": 2.7240151945541533, + "learning_rate": 1.234517796684781e-05, + "loss": 0.8341, + "step": 2887 + }, + { + "epoch": 0.44, + "grad_norm": 2.8736602833296305, + "learning_rate": 1.234035839627359e-05, + "loss": 0.8701, + "step": 2888 + }, + { + "epoch": 0.44, + "grad_norm": 2.496571636405041, + "learning_rate": 1.233553825050491e-05, + "loss": 0.8708, + "step": 2889 + }, + { + "epoch": 0.44, + "grad_norm": 2.7426844394459926, + "learning_rate": 1.2330717530726435e-05, + "loss": 0.8606, + "step": 2890 + }, + { + "epoch": 0.44, + "grad_norm": 20.411245162121194, + "learning_rate": 1.2325896238122958e-05, + "loss": 0.9352, + "step": 2891 + }, + { + "epoch": 0.44, + "grad_norm": 3.088228446918177, + "learning_rate": 1.2321074373879416e-05, + "loss": 0.8612, + "step": 2892 + }, + { + "epoch": 0.44, + "grad_norm": 8.92235944442913, + "learning_rate": 1.2316251939180888e-05, + "loss": 0.8304, + "step": 2893 + }, + { + "epoch": 0.44, + "grad_norm": 2.9263552390861607, + "learning_rate": 1.2311428935212598e-05, + "loss": 0.7888, + "step": 2894 + }, + { + "epoch": 0.44, + "grad_norm": 2.9573729539265527, + "learning_rate": 1.23066053631599e-05, + "loss": 0.881, + "step": 2895 + }, + { + "epoch": 0.44, + "grad_norm": 2.592117602341213, + "learning_rate": 1.2301781224208297e-05, + "loss": 0.8186, + "step": 2896 + }, + { + "epoch": 0.44, + "grad_norm": 2.804394996969601, + "learning_rate": 1.2296956519543424e-05, + "loss": 0.9044, + "step": 2897 + }, + { + "epoch": 0.44, + "grad_norm": 2.6173110797940975, + "learning_rate": 1.2292131250351059e-05, + "loss": 0.8435, + "step": 2898 + }, + { + "epoch": 0.44, + "grad_norm": 2.744848357135238, + "learning_rate": 1.228730541781712e-05, + "loss": 0.8331, + "step": 2899 + }, + { + "epoch": 0.44, + "grad_norm": 2.8600823953486634, + "learning_rate": 1.2282479023127656e-05, + "loss": 0.9479, + "step": 2900 + }, + { + "epoch": 0.44, + "grad_norm": 3.128079937009162, + "learning_rate": 1.2277652067468864e-05, + "loss": 0.881, + "step": 2901 + }, + { + "epoch": 0.44, + "grad_norm": 2.6693888567881565, + "learning_rate": 1.2272824552027072e-05, + "loss": 0.8039, + "step": 2902 + }, + { + "epoch": 0.44, + "grad_norm": 2.9620619172529166, + "learning_rate": 1.226799647798875e-05, + "loss": 0.8716, + "step": 2903 + }, + { + "epoch": 0.44, + "grad_norm": 2.7313416619549824, + "learning_rate": 1.2263167846540502e-05, + "loss": 0.9502, + "step": 2904 + }, + { + "epoch": 0.44, + "grad_norm": 5.497128092116304, + "learning_rate": 1.2258338658869069e-05, + "loss": 1.0453, + "step": 2905 + }, + { + "epoch": 0.44, + "grad_norm": 2.5403130711331765, + "learning_rate": 1.2253508916161331e-05, + "loss": 0.792, + "step": 2906 + }, + { + "epoch": 0.44, + "grad_norm": 3.122401354526183, + "learning_rate": 1.2248678619604308e-05, + "loss": 0.8809, + "step": 2907 + }, + { + "epoch": 0.45, + "grad_norm": 6.86419136824606, + "learning_rate": 1.2243847770385142e-05, + "loss": 1.0589, + "step": 2908 + }, + { + "epoch": 0.45, + "grad_norm": 2.8875754346845306, + "learning_rate": 1.2239016369691126e-05, + "loss": 0.835, + "step": 2909 + }, + { + "epoch": 0.45, + "grad_norm": 3.028081918992309, + "learning_rate": 1.2234184418709685e-05, + "loss": 0.8556, + "step": 2910 + }, + { + "epoch": 0.45, + "grad_norm": 2.6065156965874188, + "learning_rate": 1.222935191862837e-05, + "loss": 0.9404, + "step": 2911 + }, + { + "epoch": 0.45, + "grad_norm": 3.1134873620451864, + "learning_rate": 1.2224518870634879e-05, + "loss": 0.8696, + "step": 2912 + }, + { + "epoch": 0.45, + "grad_norm": 2.5175705354858375, + "learning_rate": 1.2219685275917039e-05, + "loss": 0.8818, + "step": 2913 + }, + { + "epoch": 0.45, + "grad_norm": 2.810887240705469, + "learning_rate": 1.2214851135662813e-05, + "loss": 0.8863, + "step": 2914 + }, + { + "epoch": 0.45, + "grad_norm": 2.7008514488137108, + "learning_rate": 1.2210016451060291e-05, + "loss": 0.8713, + "step": 2915 + }, + { + "epoch": 0.45, + "grad_norm": 2.9587433703312325, + "learning_rate": 1.2205181223297712e-05, + "loss": 0.9416, + "step": 2916 + }, + { + "epoch": 0.45, + "grad_norm": 5.448777441858399, + "learning_rate": 1.2200345453563433e-05, + "loss": 1.0207, + "step": 2917 + }, + { + "epoch": 0.45, + "grad_norm": 2.714595830544037, + "learning_rate": 1.2195509143045953e-05, + "loss": 0.8795, + "step": 2918 + }, + { + "epoch": 0.45, + "grad_norm": 2.9810252209169565, + "learning_rate": 1.2190672292933902e-05, + "loss": 0.9086, + "step": 2919 + }, + { + "epoch": 0.45, + "grad_norm": 2.750782059673263, + "learning_rate": 1.218583490441604e-05, + "loss": 0.9838, + "step": 2920 + }, + { + "epoch": 0.45, + "grad_norm": 2.588070443866364, + "learning_rate": 1.2180996978681262e-05, + "loss": 0.8068, + "step": 2921 + }, + { + "epoch": 0.45, + "grad_norm": 3.5146551808157622, + "learning_rate": 1.2176158516918597e-05, + "loss": 0.7791, + "step": 2922 + }, + { + "epoch": 0.45, + "grad_norm": 2.9319161515472665, + "learning_rate": 1.21713195203172e-05, + "loss": 0.8309, + "step": 2923 + }, + { + "epoch": 0.45, + "grad_norm": 2.6399633926053623, + "learning_rate": 1.2166479990066362e-05, + "loss": 0.7922, + "step": 2924 + }, + { + "epoch": 0.45, + "grad_norm": 2.5698088581181113, + "learning_rate": 1.2161639927355503e-05, + "loss": 0.838, + "step": 2925 + }, + { + "epoch": 0.45, + "grad_norm": 2.56267127959082, + "learning_rate": 1.2156799333374176e-05, + "loss": 0.9022, + "step": 2926 + }, + { + "epoch": 0.45, + "grad_norm": 2.6103619596786913, + "learning_rate": 1.2151958209312063e-05, + "loss": 0.9504, + "step": 2927 + }, + { + "epoch": 0.45, + "grad_norm": 2.754608561725593, + "learning_rate": 1.2147116556358975e-05, + "loss": 0.8826, + "step": 2928 + }, + { + "epoch": 0.45, + "grad_norm": 3.05394639932344, + "learning_rate": 1.2142274375704855e-05, + "loss": 0.9252, + "step": 2929 + }, + { + "epoch": 0.45, + "grad_norm": 2.758527341831582, + "learning_rate": 1.2137431668539778e-05, + "loss": 0.8559, + "step": 2930 + }, + { + "epoch": 0.45, + "grad_norm": 2.650067159518404, + "learning_rate": 1.2132588436053942e-05, + "loss": 0.8278, + "step": 2931 + }, + { + "epoch": 0.45, + "grad_norm": 3.2212624060518076, + "learning_rate": 1.2127744679437681e-05, + "loss": 0.795, + "step": 2932 + }, + { + "epoch": 0.45, + "grad_norm": 2.568855356171706, + "learning_rate": 1.212290039988145e-05, + "loss": 0.8856, + "step": 2933 + }, + { + "epoch": 0.45, + "grad_norm": 4.895450975084306, + "learning_rate": 1.211805559857584e-05, + "loss": 1.0471, + "step": 2934 + }, + { + "epoch": 0.45, + "grad_norm": 2.602246628619692, + "learning_rate": 1.211321027671157e-05, + "loss": 0.8483, + "step": 2935 + }, + { + "epoch": 0.45, + "grad_norm": 2.6483466494779972, + "learning_rate": 1.2108364435479478e-05, + "loss": 0.9137, + "step": 2936 + }, + { + "epoch": 0.45, + "grad_norm": 2.708496704441042, + "learning_rate": 1.210351807607054e-05, + "loss": 0.9075, + "step": 2937 + }, + { + "epoch": 0.45, + "grad_norm": 3.2501209914236693, + "learning_rate": 1.2098671199675851e-05, + "loss": 0.9656, + "step": 2938 + }, + { + "epoch": 0.45, + "grad_norm": 2.7307672357696626, + "learning_rate": 1.2093823807486645e-05, + "loss": 0.8287, + "step": 2939 + }, + { + "epoch": 0.45, + "grad_norm": 2.715745559891006, + "learning_rate": 1.2088975900694269e-05, + "loss": 0.7912, + "step": 2940 + }, + { + "epoch": 0.45, + "grad_norm": 2.7187018995015113, + "learning_rate": 1.2084127480490206e-05, + "loss": 0.8247, + "step": 2941 + }, + { + "epoch": 0.45, + "grad_norm": 2.5691527149286326, + "learning_rate": 1.2079278548066058e-05, + "loss": 0.8684, + "step": 2942 + }, + { + "epoch": 0.45, + "grad_norm": 2.7598411907507274, + "learning_rate": 1.2074429104613558e-05, + "loss": 0.8464, + "step": 2943 + }, + { + "epoch": 0.45, + "grad_norm": 5.124777456564711, + "learning_rate": 1.2069579151324563e-05, + "loss": 1.0324, + "step": 2944 + }, + { + "epoch": 0.45, + "grad_norm": 2.553503614208449, + "learning_rate": 1.2064728689391059e-05, + "loss": 0.8478, + "step": 2945 + }, + { + "epoch": 0.45, + "grad_norm": 5.142861541542434, + "learning_rate": 1.2059877720005149e-05, + "loss": 1.0479, + "step": 2946 + }, + { + "epoch": 0.45, + "grad_norm": 2.7464581018539866, + "learning_rate": 1.205502624435907e-05, + "loss": 0.8466, + "step": 2947 + }, + { + "epoch": 0.45, + "grad_norm": 2.437733125582104, + "learning_rate": 1.2050174263645169e-05, + "loss": 0.8697, + "step": 2948 + }, + { + "epoch": 0.45, + "grad_norm": 4.590483344547154, + "learning_rate": 1.2045321779055936e-05, + "loss": 1.0495, + "step": 2949 + }, + { + "epoch": 0.45, + "grad_norm": 4.379367692712861, + "learning_rate": 1.2040468791783973e-05, + "loss": 1.0306, + "step": 2950 + }, + { + "epoch": 0.45, + "grad_norm": 3.3149611334682025, + "learning_rate": 1.203561530302201e-05, + "loss": 1.001, + "step": 2951 + }, + { + "epoch": 0.45, + "grad_norm": 7.087327985322694, + "learning_rate": 1.2030761313962898e-05, + "loss": 1.0608, + "step": 2952 + }, + { + "epoch": 0.45, + "grad_norm": 4.516713422731289, + "learning_rate": 1.2025906825799604e-05, + "loss": 0.9792, + "step": 2953 + }, + { + "epoch": 0.45, + "grad_norm": 2.870450403349401, + "learning_rate": 1.2021051839725235e-05, + "loss": 0.8342, + "step": 2954 + }, + { + "epoch": 0.45, + "grad_norm": 2.623934475340532, + "learning_rate": 1.2016196356933005e-05, + "loss": 0.8126, + "step": 2955 + }, + { + "epoch": 0.45, + "grad_norm": 2.757354876528018, + "learning_rate": 1.2011340378616256e-05, + "loss": 0.9232, + "step": 2956 + }, + { + "epoch": 0.45, + "grad_norm": 2.6140798539319494, + "learning_rate": 1.2006483905968456e-05, + "loss": 0.8552, + "step": 2957 + }, + { + "epoch": 0.45, + "grad_norm": 2.6166843878359862, + "learning_rate": 1.2001626940183185e-05, + "loss": 0.926, + "step": 2958 + }, + { + "epoch": 0.45, + "grad_norm": 2.6471979880923047, + "learning_rate": 1.1996769482454144e-05, + "loss": 0.8624, + "step": 2959 + }, + { + "epoch": 0.45, + "grad_norm": 2.6230161949992663, + "learning_rate": 1.1991911533975172e-05, + "loss": 0.7624, + "step": 2960 + }, + { + "epoch": 0.45, + "grad_norm": 2.7014344511734807, + "learning_rate": 1.1987053095940204e-05, + "loss": 0.86, + "step": 2961 + }, + { + "epoch": 0.45, + "grad_norm": 2.649651028227363, + "learning_rate": 1.198219416954332e-05, + "loss": 0.9156, + "step": 2962 + }, + { + "epoch": 0.45, + "grad_norm": 2.5928321617079844, + "learning_rate": 1.19773347559787e-05, + "loss": 0.8112, + "step": 2963 + }, + { + "epoch": 0.45, + "grad_norm": 2.7488508110619754, + "learning_rate": 1.1972474856440654e-05, + "loss": 0.9098, + "step": 2964 + }, + { + "epoch": 0.45, + "grad_norm": 2.7198204374543224, + "learning_rate": 1.1967614472123607e-05, + "loss": 0.8645, + "step": 2965 + }, + { + "epoch": 0.45, + "grad_norm": 3.05265967983416, + "learning_rate": 1.1962753604222108e-05, + "loss": 0.9255, + "step": 2966 + }, + { + "epoch": 0.45, + "grad_norm": 2.6759537172100645, + "learning_rate": 1.1957892253930819e-05, + "loss": 0.8436, + "step": 2967 + }, + { + "epoch": 0.45, + "grad_norm": 3.2144844492572755, + "learning_rate": 1.1953030422444526e-05, + "loss": 0.914, + "step": 2968 + }, + { + "epoch": 0.45, + "grad_norm": 3.0346285503049057, + "learning_rate": 1.1948168110958132e-05, + "loss": 0.9222, + "step": 2969 + }, + { + "epoch": 0.45, + "grad_norm": 2.919974504371189, + "learning_rate": 1.194330532066665e-05, + "loss": 0.9076, + "step": 2970 + }, + { + "epoch": 0.45, + "grad_norm": 2.916037722150218, + "learning_rate": 1.1938442052765225e-05, + "loss": 0.9252, + "step": 2971 + }, + { + "epoch": 0.45, + "grad_norm": 3.0399673977308037, + "learning_rate": 1.1933578308449108e-05, + "loss": 0.7823, + "step": 2972 + }, + { + "epoch": 0.46, + "grad_norm": 2.554756201579733, + "learning_rate": 1.1928714088913673e-05, + "loss": 0.828, + "step": 2973 + }, + { + "epoch": 0.46, + "grad_norm": 2.7425865424645615, + "learning_rate": 1.1923849395354407e-05, + "loss": 0.8717, + "step": 2974 + }, + { + "epoch": 0.46, + "grad_norm": 2.375571545693288, + "learning_rate": 1.1918984228966917e-05, + "loss": 0.8431, + "step": 2975 + }, + { + "epoch": 0.46, + "grad_norm": 2.9360801702086463, + "learning_rate": 1.1914118590946924e-05, + "loss": 0.8792, + "step": 2976 + }, + { + "epoch": 0.46, + "grad_norm": 2.9626264065606702, + "learning_rate": 1.1909252482490263e-05, + "loss": 0.8869, + "step": 2977 + }, + { + "epoch": 0.46, + "grad_norm": 2.890042514095932, + "learning_rate": 1.190438590479289e-05, + "loss": 0.9294, + "step": 2978 + }, + { + "epoch": 0.46, + "grad_norm": 2.981870606613395, + "learning_rate": 1.1899518859050869e-05, + "loss": 0.9017, + "step": 2979 + }, + { + "epoch": 0.46, + "grad_norm": 2.715493009051848, + "learning_rate": 1.1894651346460391e-05, + "loss": 0.8221, + "step": 2980 + }, + { + "epoch": 0.46, + "grad_norm": 10.855410286020899, + "learning_rate": 1.188978336821775e-05, + "loss": 1.0945, + "step": 2981 + }, + { + "epoch": 0.46, + "grad_norm": 2.7232678577599536, + "learning_rate": 1.1884914925519356e-05, + "loss": 0.8186, + "step": 2982 + }, + { + "epoch": 0.46, + "grad_norm": 2.482457923899972, + "learning_rate": 1.1880046019561735e-05, + "loss": 0.8432, + "step": 2983 + }, + { + "epoch": 0.46, + "grad_norm": 2.639829647704218, + "learning_rate": 1.1875176651541533e-05, + "loss": 0.8063, + "step": 2984 + }, + { + "epoch": 0.46, + "grad_norm": 3.107733287783824, + "learning_rate": 1.1870306822655502e-05, + "loss": 0.9191, + "step": 2985 + }, + { + "epoch": 0.46, + "grad_norm": 2.7323130504400144, + "learning_rate": 1.1865436534100508e-05, + "loss": 0.9172, + "step": 2986 + }, + { + "epoch": 0.46, + "grad_norm": 2.7044458896629044, + "learning_rate": 1.186056578707353e-05, + "loss": 0.9815, + "step": 2987 + }, + { + "epoch": 0.46, + "grad_norm": 2.8592139872621245, + "learning_rate": 1.1855694582771666e-05, + "loss": 0.9378, + "step": 2988 + }, + { + "epoch": 0.46, + "grad_norm": 2.565066517688624, + "learning_rate": 1.1850822922392119e-05, + "loss": 0.8113, + "step": 2989 + }, + { + "epoch": 0.46, + "grad_norm": 2.6779479847097685, + "learning_rate": 1.1845950807132203e-05, + "loss": 0.8412, + "step": 2990 + }, + { + "epoch": 0.46, + "grad_norm": 2.5545351126336584, + "learning_rate": 1.1841078238189352e-05, + "loss": 0.9046, + "step": 2991 + }, + { + "epoch": 0.46, + "grad_norm": 2.6450426222819354, + "learning_rate": 1.1836205216761105e-05, + "loss": 0.8617, + "step": 2992 + }, + { + "epoch": 0.46, + "grad_norm": 2.6275419597565746, + "learning_rate": 1.1831331744045114e-05, + "loss": 0.8616, + "step": 2993 + }, + { + "epoch": 0.46, + "grad_norm": 2.7510452230302787, + "learning_rate": 1.182645782123914e-05, + "loss": 0.8931, + "step": 2994 + }, + { + "epoch": 0.46, + "grad_norm": 2.6643225382375206, + "learning_rate": 1.182158344954106e-05, + "loss": 0.8337, + "step": 2995 + }, + { + "epoch": 0.46, + "grad_norm": 2.618555830855707, + "learning_rate": 1.1816708630148857e-05, + "loss": 0.9202, + "step": 2996 + }, + { + "epoch": 0.46, + "grad_norm": 2.8455305353008713, + "learning_rate": 1.1811833364260625e-05, + "loss": 0.9023, + "step": 2997 + }, + { + "epoch": 0.46, + "grad_norm": 2.7928103126274375, + "learning_rate": 1.1806957653074564e-05, + "loss": 0.8542, + "step": 2998 + }, + { + "epoch": 0.46, + "grad_norm": 2.7731698718633604, + "learning_rate": 1.1802081497788993e-05, + "loss": 0.7557, + "step": 2999 + }, + { + "epoch": 0.46, + "grad_norm": 2.814943969739714, + "learning_rate": 1.1797204899602328e-05, + "loss": 0.8808, + "step": 3000 + }, + { + "epoch": 0.46, + "grad_norm": 2.8355012116827134, + "learning_rate": 1.1792327859713104e-05, + "loss": 0.8385, + "step": 3001 + }, + { + "epoch": 0.46, + "grad_norm": 3.1537111761470373, + "learning_rate": 1.1787450379319963e-05, + "loss": 0.8303, + "step": 3002 + }, + { + "epoch": 0.46, + "grad_norm": 2.744178078536589, + "learning_rate": 1.1782572459621646e-05, + "loss": 0.9255, + "step": 3003 + }, + { + "epoch": 0.46, + "grad_norm": 2.5395370642311623, + "learning_rate": 1.1777694101817014e-05, + "loss": 0.8489, + "step": 3004 + }, + { + "epoch": 0.46, + "grad_norm": 2.9295376938033324, + "learning_rate": 1.1772815307105027e-05, + "loss": 0.9934, + "step": 3005 + }, + { + "epoch": 0.46, + "grad_norm": 2.8432809249138042, + "learning_rate": 1.176793607668476e-05, + "loss": 0.9256, + "step": 3006 + }, + { + "epoch": 0.46, + "grad_norm": 2.6080502082997423, + "learning_rate": 1.176305641175539e-05, + "loss": 0.8401, + "step": 3007 + }, + { + "epoch": 0.46, + "grad_norm": 2.6194753528697103, + "learning_rate": 1.17581763135162e-05, + "loss": 0.8568, + "step": 3008 + }, + { + "epoch": 0.46, + "grad_norm": 2.518047586237937, + "learning_rate": 1.1753295783166581e-05, + "loss": 0.8328, + "step": 3009 + }, + { + "epoch": 0.46, + "grad_norm": 2.4931889524384694, + "learning_rate": 1.1748414821906034e-05, + "loss": 0.8611, + "step": 3010 + }, + { + "epoch": 0.46, + "grad_norm": 2.540801850644384, + "learning_rate": 1.1743533430934155e-05, + "loss": 0.9411, + "step": 3011 + }, + { + "epoch": 0.46, + "grad_norm": 2.7099336287100435, + "learning_rate": 1.173865161145066e-05, + "loss": 0.7831, + "step": 3012 + }, + { + "epoch": 0.46, + "grad_norm": 3.11702898724108, + "learning_rate": 1.1733769364655363e-05, + "loss": 0.9089, + "step": 3013 + }, + { + "epoch": 0.46, + "grad_norm": 5.378925417302546, + "learning_rate": 1.1728886691748183e-05, + "loss": 1.0161, + "step": 3014 + }, + { + "epoch": 0.46, + "grad_norm": 2.7509304800419336, + "learning_rate": 1.1724003593929138e-05, + "loss": 0.9121, + "step": 3015 + }, + { + "epoch": 0.46, + "grad_norm": 2.68915934169062, + "learning_rate": 1.1719120072398361e-05, + "loss": 0.8014, + "step": 3016 + }, + { + "epoch": 0.46, + "grad_norm": 2.7399760762830856, + "learning_rate": 1.1714236128356092e-05, + "loss": 0.8724, + "step": 3017 + }, + { + "epoch": 0.46, + "grad_norm": 3.030162280916283, + "learning_rate": 1.1709351763002652e-05, + "loss": 0.8158, + "step": 3018 + }, + { + "epoch": 0.46, + "grad_norm": 2.585101360406029, + "learning_rate": 1.1704466977538496e-05, + "loss": 0.9078, + "step": 3019 + }, + { + "epoch": 0.46, + "grad_norm": 2.7771498640982206, + "learning_rate": 1.1699581773164155e-05, + "loss": 0.8736, + "step": 3020 + }, + { + "epoch": 0.46, + "grad_norm": 2.9162027035422398, + "learning_rate": 1.1694696151080282e-05, + "loss": 0.9148, + "step": 3021 + }, + { + "epoch": 0.46, + "grad_norm": 2.674802039440239, + "learning_rate": 1.1689810112487626e-05, + "loss": 0.8939, + "step": 3022 + }, + { + "epoch": 0.46, + "grad_norm": 2.9827837301978692, + "learning_rate": 1.1684923658587036e-05, + "loss": 0.7724, + "step": 3023 + }, + { + "epoch": 0.46, + "grad_norm": 2.446858036060685, + "learning_rate": 1.1680036790579465e-05, + "loss": 0.8349, + "step": 3024 + }, + { + "epoch": 0.46, + "grad_norm": 2.7490258161168355, + "learning_rate": 1.1675149509665972e-05, + "loss": 0.8318, + "step": 3025 + }, + { + "epoch": 0.46, + "grad_norm": 2.758105078173259, + "learning_rate": 1.167026181704771e-05, + "loss": 0.8289, + "step": 3026 + }, + { + "epoch": 0.46, + "grad_norm": 7.353607851225301, + "learning_rate": 1.1665373713925936e-05, + "loss": 1.0972, + "step": 3027 + }, + { + "epoch": 0.46, + "grad_norm": 3.3166270913512403, + "learning_rate": 1.1660485201502011e-05, + "loss": 0.8928, + "step": 3028 + }, + { + "epoch": 0.46, + "grad_norm": 2.5484423724642578, + "learning_rate": 1.1655596280977395e-05, + "loss": 0.8378, + "step": 3029 + }, + { + "epoch": 0.46, + "grad_norm": 2.7828491941027105, + "learning_rate": 1.1650706953553644e-05, + "loss": 0.8506, + "step": 3030 + }, + { + "epoch": 0.46, + "grad_norm": 3.222252561223075, + "learning_rate": 1.1645817220432421e-05, + "loss": 0.7596, + "step": 3031 + }, + { + "epoch": 0.46, + "grad_norm": 2.8353519578274593, + "learning_rate": 1.1640927082815485e-05, + "loss": 0.8111, + "step": 3032 + }, + { + "epoch": 0.46, + "grad_norm": 2.712517208964851, + "learning_rate": 1.1636036541904692e-05, + "loss": 0.9596, + "step": 3033 + }, + { + "epoch": 0.46, + "grad_norm": 3.02873034434939, + "learning_rate": 1.1631145598901999e-05, + "loss": 0.934, + "step": 3034 + }, + { + "epoch": 0.46, + "grad_norm": 2.7491870247460337, + "learning_rate": 1.1626254255009465e-05, + "loss": 0.8391, + "step": 3035 + }, + { + "epoch": 0.46, + "grad_norm": 2.522093212738503, + "learning_rate": 1.162136251142925e-05, + "loss": 0.8147, + "step": 3036 + }, + { + "epoch": 0.46, + "grad_norm": 3.03519444835517, + "learning_rate": 1.1616470369363602e-05, + "loss": 0.892, + "step": 3037 + }, + { + "epoch": 0.47, + "grad_norm": 3.769866454847948, + "learning_rate": 1.161157783001487e-05, + "loss": 0.9277, + "step": 3038 + }, + { + "epoch": 0.47, + "grad_norm": 2.8005906197669908, + "learning_rate": 1.1606684894585507e-05, + "loss": 0.9335, + "step": 3039 + }, + { + "epoch": 0.47, + "grad_norm": 2.6771495503120724, + "learning_rate": 1.1601791564278057e-05, + "loss": 0.8585, + "step": 3040 + }, + { + "epoch": 0.47, + "grad_norm": 2.7458170630301493, + "learning_rate": 1.1596897840295165e-05, + "loss": 0.9842, + "step": 3041 + }, + { + "epoch": 0.47, + "grad_norm": 2.42988492519934, + "learning_rate": 1.159200372383957e-05, + "loss": 0.8292, + "step": 3042 + }, + { + "epoch": 0.47, + "grad_norm": 2.5103081399212375, + "learning_rate": 1.1587109216114111e-05, + "loss": 0.9228, + "step": 3043 + }, + { + "epoch": 0.47, + "grad_norm": 2.589221115239344, + "learning_rate": 1.1582214318321718e-05, + "loss": 0.8647, + "step": 3044 + }, + { + "epoch": 0.47, + "grad_norm": 4.811962497245336, + "learning_rate": 1.1577319031665419e-05, + "loss": 1.0228, + "step": 3045 + }, + { + "epoch": 0.47, + "grad_norm": 2.6507114110052887, + "learning_rate": 1.157242335734834e-05, + "loss": 0.845, + "step": 3046 + }, + { + "epoch": 0.47, + "grad_norm": 2.8573693138518306, + "learning_rate": 1.1567527296573702e-05, + "loss": 0.87, + "step": 3047 + }, + { + "epoch": 0.47, + "grad_norm": 2.457127461940877, + "learning_rate": 1.1562630850544816e-05, + "loss": 0.86, + "step": 3048 + }, + { + "epoch": 0.47, + "grad_norm": 2.556657489478441, + "learning_rate": 1.1557734020465093e-05, + "loss": 0.7787, + "step": 3049 + }, + { + "epoch": 0.47, + "grad_norm": 4.76817538447232, + "learning_rate": 1.1552836807538034e-05, + "loss": 1.0745, + "step": 3050 + }, + { + "epoch": 0.47, + "grad_norm": 2.828019544724989, + "learning_rate": 1.154793921296724e-05, + "loss": 0.7933, + "step": 3051 + }, + { + "epoch": 0.47, + "grad_norm": 3.9926852113478217, + "learning_rate": 1.1543041237956403e-05, + "loss": 0.9776, + "step": 3052 + }, + { + "epoch": 0.47, + "grad_norm": 2.7029689770823206, + "learning_rate": 1.1538142883709305e-05, + "loss": 0.9321, + "step": 3053 + }, + { + "epoch": 0.47, + "grad_norm": 2.819858750783005, + "learning_rate": 1.1533244151429825e-05, + "loss": 0.8614, + "step": 3054 + }, + { + "epoch": 0.47, + "grad_norm": 2.5754083770125895, + "learning_rate": 1.1528345042321933e-05, + "loss": 0.8534, + "step": 3055 + }, + { + "epoch": 0.47, + "grad_norm": 2.808109444366165, + "learning_rate": 1.1523445557589692e-05, + "loss": 0.8464, + "step": 3056 + }, + { + "epoch": 0.47, + "grad_norm": 2.642707924896552, + "learning_rate": 1.1518545698437262e-05, + "loss": 0.8235, + "step": 3057 + }, + { + "epoch": 0.47, + "grad_norm": 2.793040715733123, + "learning_rate": 1.1513645466068887e-05, + "loss": 0.7877, + "step": 3058 + }, + { + "epoch": 0.47, + "grad_norm": 3.2252924252687065, + "learning_rate": 1.1508744861688912e-05, + "loss": 0.7785, + "step": 3059 + }, + { + "epoch": 0.47, + "grad_norm": 2.9322617811791116, + "learning_rate": 1.150384388650176e-05, + "loss": 0.8891, + "step": 3060 + }, + { + "epoch": 0.47, + "grad_norm": 3.3079267028920536, + "learning_rate": 1.149894254171196e-05, + "loss": 0.8218, + "step": 3061 + }, + { + "epoch": 0.47, + "grad_norm": 2.826653081264617, + "learning_rate": 1.1494040828524122e-05, + "loss": 0.8896, + "step": 3062 + }, + { + "epoch": 0.47, + "grad_norm": 2.499685351473289, + "learning_rate": 1.1489138748142949e-05, + "loss": 0.8141, + "step": 3063 + }, + { + "epoch": 0.47, + "grad_norm": 3.0839843943225453, + "learning_rate": 1.1484236301773239e-05, + "loss": 0.797, + "step": 3064 + }, + { + "epoch": 0.47, + "grad_norm": 2.9392104478446295, + "learning_rate": 1.1479333490619873e-05, + "loss": 0.8057, + "step": 3065 + }, + { + "epoch": 0.47, + "grad_norm": 2.544480005618585, + "learning_rate": 1.147443031588782e-05, + "loss": 0.8691, + "step": 3066 + }, + { + "epoch": 0.47, + "grad_norm": 2.5467653566420805, + "learning_rate": 1.146952677878215e-05, + "loss": 0.7424, + "step": 3067 + }, + { + "epoch": 0.47, + "grad_norm": 2.62107278620751, + "learning_rate": 1.1464622880508012e-05, + "loss": 0.967, + "step": 3068 + }, + { + "epoch": 0.47, + "grad_norm": 6.543148916423793, + "learning_rate": 1.1459718622270648e-05, + "loss": 1.0334, + "step": 3069 + }, + { + "epoch": 0.47, + "grad_norm": 2.4324678600471663, + "learning_rate": 1.1454814005275388e-05, + "loss": 0.8146, + "step": 3070 + }, + { + "epoch": 0.47, + "grad_norm": 2.8998473892388326, + "learning_rate": 1.1449909030727641e-05, + "loss": 0.8289, + "step": 3071 + }, + { + "epoch": 0.47, + "grad_norm": 2.7750117258591485, + "learning_rate": 1.1445003699832922e-05, + "loss": 1.0167, + "step": 3072 + }, + { + "epoch": 0.47, + "grad_norm": 2.8018037931812034, + "learning_rate": 1.144009801379682e-05, + "loss": 0.9312, + "step": 3073 + }, + { + "epoch": 0.47, + "grad_norm": 2.789138018988291, + "learning_rate": 1.1435191973825015e-05, + "loss": 0.9369, + "step": 3074 + }, + { + "epoch": 0.47, + "grad_norm": 2.692334241154601, + "learning_rate": 1.1430285581123278e-05, + "loss": 0.911, + "step": 3075 + }, + { + "epoch": 0.47, + "grad_norm": 2.5145506813955163, + "learning_rate": 1.1425378836897457e-05, + "loss": 0.8242, + "step": 3076 + }, + { + "epoch": 0.47, + "grad_norm": 2.4979403035686945, + "learning_rate": 1.1420471742353491e-05, + "loss": 0.9209, + "step": 3077 + }, + { + "epoch": 0.47, + "grad_norm": 2.502264436899139, + "learning_rate": 1.141556429869741e-05, + "loss": 0.806, + "step": 3078 + }, + { + "epoch": 0.47, + "grad_norm": 2.734742905570955, + "learning_rate": 1.1410656507135328e-05, + "loss": 0.8528, + "step": 3079 + }, + { + "epoch": 0.47, + "grad_norm": 2.722677435806108, + "learning_rate": 1.1405748368873438e-05, + "loss": 0.8267, + "step": 3080 + }, + { + "epoch": 0.47, + "grad_norm": 2.6633849719505664, + "learning_rate": 1.1400839885118026e-05, + "loss": 0.8519, + "step": 3081 + }, + { + "epoch": 0.47, + "grad_norm": 2.831404582758861, + "learning_rate": 1.1395931057075455e-05, + "loss": 0.839, + "step": 3082 + }, + { + "epoch": 0.47, + "grad_norm": 2.7737992784725893, + "learning_rate": 1.1391021885952182e-05, + "loss": 0.8881, + "step": 3083 + }, + { + "epoch": 0.47, + "grad_norm": 5.287806551319888, + "learning_rate": 1.1386112372954745e-05, + "loss": 0.9836, + "step": 3084 + }, + { + "epoch": 0.47, + "grad_norm": 4.687326202298055, + "learning_rate": 1.138120251928976e-05, + "loss": 1.0422, + "step": 3085 + }, + { + "epoch": 0.47, + "grad_norm": 2.5190937625243697, + "learning_rate": 1.137629232616393e-05, + "loss": 0.8285, + "step": 3086 + }, + { + "epoch": 0.47, + "grad_norm": 2.6728531205610087, + "learning_rate": 1.1371381794784051e-05, + "loss": 0.8709, + "step": 3087 + }, + { + "epoch": 0.47, + "grad_norm": 3.1875883834182965, + "learning_rate": 1.1366470926356986e-05, + "loss": 0.8351, + "step": 3088 + }, + { + "epoch": 0.47, + "grad_norm": 2.6343183413218236, + "learning_rate": 1.1361559722089691e-05, + "loss": 0.9105, + "step": 3089 + }, + { + "epoch": 0.47, + "grad_norm": 5.273978479754304, + "learning_rate": 1.1356648183189203e-05, + "loss": 1.0424, + "step": 3090 + }, + { + "epoch": 0.47, + "grad_norm": 2.8158695606170863, + "learning_rate": 1.1351736310862642e-05, + "loss": 0.8838, + "step": 3091 + }, + { + "epoch": 0.47, + "grad_norm": 2.808634999693899, + "learning_rate": 1.1346824106317204e-05, + "loss": 0.7946, + "step": 3092 + }, + { + "epoch": 0.47, + "grad_norm": 2.9091587714578644, + "learning_rate": 1.1341911570760176e-05, + "loss": 0.8636, + "step": 3093 + }, + { + "epoch": 0.47, + "grad_norm": 2.481053395723128, + "learning_rate": 1.1336998705398918e-05, + "loss": 0.8706, + "step": 3094 + }, + { + "epoch": 0.47, + "grad_norm": 2.7924999881066985, + "learning_rate": 1.1332085511440877e-05, + "loss": 0.9168, + "step": 3095 + }, + { + "epoch": 0.47, + "grad_norm": 2.7065320524178937, + "learning_rate": 1.1327171990093574e-05, + "loss": 0.8494, + "step": 3096 + }, + { + "epoch": 0.47, + "grad_norm": 2.798614283817104, + "learning_rate": 1.1322258142564619e-05, + "loss": 0.8357, + "step": 3097 + }, + { + "epoch": 0.47, + "grad_norm": 3.9673905386874635, + "learning_rate": 1.13173439700617e-05, + "loss": 0.9729, + "step": 3098 + }, + { + "epoch": 0.47, + "grad_norm": 2.669600900005444, + "learning_rate": 1.1312429473792576e-05, + "loss": 0.8486, + "step": 3099 + }, + { + "epoch": 0.47, + "grad_norm": 2.820936598288706, + "learning_rate": 1.1307514654965097e-05, + "loss": 0.8991, + "step": 3100 + }, + { + "epoch": 0.47, + "grad_norm": 2.6125223944829936, + "learning_rate": 1.1302599514787186e-05, + "loss": 0.8566, + "step": 3101 + }, + { + "epoch": 0.47, + "grad_norm": 2.739922897046682, + "learning_rate": 1.129768405446685e-05, + "loss": 0.9179, + "step": 3102 + }, + { + "epoch": 0.47, + "grad_norm": 2.7433379542441787, + "learning_rate": 1.1292768275212162e-05, + "loss": 0.8841, + "step": 3103 + }, + { + "epoch": 0.48, + "grad_norm": 2.670627406032204, + "learning_rate": 1.1287852178231295e-05, + "loss": 0.8187, + "step": 3104 + }, + { + "epoch": 0.48, + "grad_norm": 2.9365550668107816, + "learning_rate": 1.1282935764732477e-05, + "loss": 0.9242, + "step": 3105 + }, + { + "epoch": 0.48, + "grad_norm": 2.755246641824546, + "learning_rate": 1.1278019035924032e-05, + "loss": 0.9124, + "step": 3106 + }, + { + "epoch": 0.48, + "grad_norm": 2.7649728926786143, + "learning_rate": 1.1273101993014351e-05, + "loss": 0.782, + "step": 3107 + }, + { + "epoch": 0.48, + "grad_norm": 3.1261831995171527, + "learning_rate": 1.1268184637211905e-05, + "loss": 0.8278, + "step": 3108 + }, + { + "epoch": 0.48, + "grad_norm": 2.6353217053491713, + "learning_rate": 1.1263266969725244e-05, + "loss": 0.893, + "step": 3109 + }, + { + "epoch": 0.48, + "grad_norm": 2.8645760552772144, + "learning_rate": 1.1258348991762994e-05, + "loss": 0.7923, + "step": 3110 + }, + { + "epoch": 0.48, + "grad_norm": 2.891997003857544, + "learning_rate": 1.1253430704533847e-05, + "loss": 0.7817, + "step": 3111 + }, + { + "epoch": 0.48, + "grad_norm": 2.624875915617395, + "learning_rate": 1.124851210924659e-05, + "loss": 0.8476, + "step": 3112 + }, + { + "epoch": 0.48, + "grad_norm": 2.7688792539306153, + "learning_rate": 1.1243593207110073e-05, + "loss": 0.8592, + "step": 3113 + }, + { + "epoch": 0.48, + "grad_norm": 2.7742296596297518, + "learning_rate": 1.1238673999333223e-05, + "loss": 0.8803, + "step": 3114 + }, + { + "epoch": 0.48, + "grad_norm": 2.793489256791983, + "learning_rate": 1.1233754487125043e-05, + "loss": 0.9501, + "step": 3115 + }, + { + "epoch": 0.48, + "grad_norm": 2.813680144743365, + "learning_rate": 1.1228834671694613e-05, + "loss": 0.839, + "step": 3116 + }, + { + "epoch": 0.48, + "grad_norm": 4.916078405373159, + "learning_rate": 1.1223914554251085e-05, + "loss": 1.0373, + "step": 3117 + }, + { + "epoch": 0.48, + "grad_norm": 2.610000965016514, + "learning_rate": 1.1218994136003685e-05, + "loss": 0.8629, + "step": 3118 + }, + { + "epoch": 0.48, + "grad_norm": 2.7625346058741904, + "learning_rate": 1.1214073418161712e-05, + "loss": 0.9373, + "step": 3119 + }, + { + "epoch": 0.48, + "grad_norm": 2.6943411479104284, + "learning_rate": 1.1209152401934546e-05, + "loss": 0.8694, + "step": 3120 + }, + { + "epoch": 0.48, + "grad_norm": 2.861777936032375, + "learning_rate": 1.1204231088531631e-05, + "loss": 0.7413, + "step": 3121 + }, + { + "epoch": 0.48, + "grad_norm": 2.5235310218243376, + "learning_rate": 1.1199309479162489e-05, + "loss": 0.8644, + "step": 3122 + }, + { + "epoch": 0.48, + "grad_norm": 2.497572177741026, + "learning_rate": 1.119438757503671e-05, + "loss": 0.8023, + "step": 3123 + }, + { + "epoch": 0.48, + "grad_norm": 2.6581250101609863, + "learning_rate": 1.1189465377363964e-05, + "loss": 0.8173, + "step": 3124 + }, + { + "epoch": 0.48, + "grad_norm": 2.7763787999992227, + "learning_rate": 1.118454288735399e-05, + "loss": 0.9271, + "step": 3125 + }, + { + "epoch": 0.48, + "grad_norm": 2.5574303042021675, + "learning_rate": 1.1179620106216597e-05, + "loss": 0.9474, + "step": 3126 + }, + { + "epoch": 0.48, + "grad_norm": 2.5624182985693587, + "learning_rate": 1.117469703516166e-05, + "loss": 0.8598, + "step": 3127 + }, + { + "epoch": 0.48, + "grad_norm": 2.3757353182751983, + "learning_rate": 1.1169773675399144e-05, + "loss": 0.8019, + "step": 3128 + }, + { + "epoch": 0.48, + "grad_norm": 2.690347384996987, + "learning_rate": 1.1164850028139063e-05, + "loss": 0.8593, + "step": 3129 + }, + { + "epoch": 0.48, + "grad_norm": 2.4945781694298197, + "learning_rate": 1.1159926094591514e-05, + "loss": 0.8506, + "step": 3130 + }, + { + "epoch": 0.48, + "grad_norm": 2.655067345590805, + "learning_rate": 1.1155001875966663e-05, + "loss": 0.894, + "step": 3131 + }, + { + "epoch": 0.48, + "grad_norm": 2.5126207449004463, + "learning_rate": 1.1150077373474745e-05, + "loss": 0.7716, + "step": 3132 + }, + { + "epoch": 0.48, + "grad_norm": 2.3273236917279556, + "learning_rate": 1.1145152588326063e-05, + "loss": 0.75, + "step": 3133 + }, + { + "epoch": 0.48, + "grad_norm": 2.7025550289889817, + "learning_rate": 1.1140227521730988e-05, + "loss": 0.7937, + "step": 3134 + }, + { + "epoch": 0.48, + "grad_norm": 2.742824070165681, + "learning_rate": 1.1135302174899971e-05, + "loss": 0.8877, + "step": 3135 + }, + { + "epoch": 0.48, + "grad_norm": 3.237256074327542, + "learning_rate": 1.113037654904352e-05, + "loss": 0.8987, + "step": 3136 + }, + { + "epoch": 0.48, + "grad_norm": 2.9438627721310087, + "learning_rate": 1.1125450645372218e-05, + "loss": 0.8522, + "step": 3137 + }, + { + "epoch": 0.48, + "grad_norm": 2.886165850380071, + "learning_rate": 1.1120524465096706e-05, + "loss": 0.8938, + "step": 3138 + }, + { + "epoch": 0.48, + "grad_norm": 2.656321602028388, + "learning_rate": 1.1115598009427712e-05, + "loss": 0.8773, + "step": 3139 + }, + { + "epoch": 0.48, + "grad_norm": 2.6816869315729543, + "learning_rate": 1.1110671279576014e-05, + "loss": 0.8806, + "step": 3140 + }, + { + "epoch": 0.48, + "grad_norm": 2.7861332807618555, + "learning_rate": 1.1105744276752464e-05, + "loss": 0.8534, + "step": 3141 + }, + { + "epoch": 0.48, + "grad_norm": 2.7415889420632036, + "learning_rate": 1.1100817002167983e-05, + "loss": 0.8204, + "step": 3142 + }, + { + "epoch": 0.48, + "grad_norm": 2.853777558312234, + "learning_rate": 1.1095889457033557e-05, + "loss": 0.8472, + "step": 3143 + }, + { + "epoch": 0.48, + "grad_norm": 2.679333044964318, + "learning_rate": 1.1090961642560238e-05, + "loss": 0.8814, + "step": 3144 + }, + { + "epoch": 0.48, + "grad_norm": 2.712063692602172, + "learning_rate": 1.1086033559959143e-05, + "loss": 0.8651, + "step": 3145 + }, + { + "epoch": 0.48, + "grad_norm": 2.901893289890703, + "learning_rate": 1.1081105210441458e-05, + "loss": 0.9463, + "step": 3146 + }, + { + "epoch": 0.48, + "grad_norm": 2.622942948873725, + "learning_rate": 1.1076176595218438e-05, + "loss": 0.8437, + "step": 3147 + }, + { + "epoch": 0.48, + "grad_norm": 2.784225144697118, + "learning_rate": 1.1071247715501387e-05, + "loss": 0.9422, + "step": 3148 + }, + { + "epoch": 0.48, + "grad_norm": 2.784905531348584, + "learning_rate": 1.1066318572501695e-05, + "loss": 0.857, + "step": 3149 + }, + { + "epoch": 0.48, + "grad_norm": 2.614670266049605, + "learning_rate": 1.1061389167430804e-05, + "loss": 0.899, + "step": 3150 + }, + { + "epoch": 0.48, + "grad_norm": 2.786304158975111, + "learning_rate": 1.1056459501500223e-05, + "loss": 0.9484, + "step": 3151 + }, + { + "epoch": 0.48, + "grad_norm": 2.7894077314963335, + "learning_rate": 1.1051529575921525e-05, + "loss": 0.9577, + "step": 3152 + }, + { + "epoch": 0.48, + "grad_norm": 2.621899779821202, + "learning_rate": 1.1046599391906347e-05, + "loss": 0.7899, + "step": 3153 + }, + { + "epoch": 0.48, + "grad_norm": 2.7178037657407033, + "learning_rate": 1.1041668950666395e-05, + "loss": 0.7923, + "step": 3154 + }, + { + "epoch": 0.48, + "grad_norm": 2.387761205410121, + "learning_rate": 1.1036738253413431e-05, + "loss": 0.7847, + "step": 3155 + }, + { + "epoch": 0.48, + "grad_norm": 2.6422778659156134, + "learning_rate": 1.1031807301359273e-05, + "loss": 0.9249, + "step": 3156 + }, + { + "epoch": 0.48, + "grad_norm": 2.9058694607373634, + "learning_rate": 1.1026876095715825e-05, + "loss": 0.8803, + "step": 3157 + }, + { + "epoch": 0.48, + "grad_norm": 5.628653013302643, + "learning_rate": 1.1021944637695032e-05, + "loss": 0.9861, + "step": 3158 + }, + { + "epoch": 0.48, + "grad_norm": 2.639775493358449, + "learning_rate": 1.1017012928508905e-05, + "loss": 0.8768, + "step": 3159 + }, + { + "epoch": 0.48, + "grad_norm": 2.6560270557217853, + "learning_rate": 1.1012080969369527e-05, + "loss": 0.8731, + "step": 3160 + }, + { + "epoch": 0.48, + "grad_norm": 2.9183755912857343, + "learning_rate": 1.1007148761489031e-05, + "loss": 0.8827, + "step": 3161 + }, + { + "epoch": 0.48, + "grad_norm": 2.6162305768841576, + "learning_rate": 1.1002216306079616e-05, + "loss": 0.8166, + "step": 3162 + }, + { + "epoch": 0.48, + "grad_norm": 2.580769173010447, + "learning_rate": 1.099728360435354e-05, + "loss": 0.9338, + "step": 3163 + }, + { + "epoch": 0.48, + "grad_norm": 2.548212107677192, + "learning_rate": 1.0992350657523123e-05, + "loss": 0.7659, + "step": 3164 + }, + { + "epoch": 0.48, + "grad_norm": 3.048637681072861, + "learning_rate": 1.0987417466800749e-05, + "loss": 0.9659, + "step": 3165 + }, + { + "epoch": 0.48, + "grad_norm": 2.6843483103040544, + "learning_rate": 1.0982484033398855e-05, + "loss": 0.8631, + "step": 3166 + }, + { + "epoch": 0.48, + "grad_norm": 2.6588237759332394, + "learning_rate": 1.0977550358529935e-05, + "loss": 0.8909, + "step": 3167 + }, + { + "epoch": 0.48, + "grad_norm": 2.8436832837871253, + "learning_rate": 1.0972616443406558e-05, + "loss": 0.834, + "step": 3168 + }, + { + "epoch": 0.49, + "grad_norm": 2.8045928596793903, + "learning_rate": 1.0967682289241337e-05, + "loss": 0.8808, + "step": 3169 + }, + { + "epoch": 0.49, + "grad_norm": 2.7498769351854677, + "learning_rate": 1.0962747897246949e-05, + "loss": 0.8705, + "step": 3170 + }, + { + "epoch": 0.49, + "grad_norm": 2.709629558405918, + "learning_rate": 1.0957813268636127e-05, + "loss": 0.8877, + "step": 3171 + }, + { + "epoch": 0.49, + "grad_norm": 2.6672565570216595, + "learning_rate": 1.0952878404621667e-05, + "loss": 0.9211, + "step": 3172 + }, + { + "epoch": 0.49, + "grad_norm": 2.855523920099627, + "learning_rate": 1.0947943306416422e-05, + "loss": 0.834, + "step": 3173 + }, + { + "epoch": 0.49, + "grad_norm": 2.8752440568494277, + "learning_rate": 1.0943007975233296e-05, + "loss": 0.8943, + "step": 3174 + }, + { + "epoch": 0.49, + "grad_norm": 2.7796180474907732, + "learning_rate": 1.0938072412285257e-05, + "loss": 0.9756, + "step": 3175 + }, + { + "epoch": 0.49, + "grad_norm": 2.637847864052865, + "learning_rate": 1.093313661878533e-05, + "loss": 0.843, + "step": 3176 + }, + { + "epoch": 0.49, + "grad_norm": 2.516870968925292, + "learning_rate": 1.0928200595946594e-05, + "loss": 0.8225, + "step": 3177 + }, + { + "epoch": 0.49, + "grad_norm": 2.8040107938246868, + "learning_rate": 1.092326434498218e-05, + "loss": 0.8971, + "step": 3178 + }, + { + "epoch": 0.49, + "grad_norm": 6.312917984262526, + "learning_rate": 1.0918327867105284e-05, + "loss": 1.0743, + "step": 3179 + }, + { + "epoch": 0.49, + "grad_norm": 2.6661378147021946, + "learning_rate": 1.0913391163529158e-05, + "loss": 0.8194, + "step": 3180 + }, + { + "epoch": 0.49, + "grad_norm": 2.923249564548845, + "learning_rate": 1.0908454235467099e-05, + "loss": 0.9235, + "step": 3181 + }, + { + "epoch": 0.49, + "grad_norm": 4.124631933822971, + "learning_rate": 1.0903517084132469e-05, + "loss": 0.7906, + "step": 3182 + }, + { + "epoch": 0.49, + "grad_norm": 2.6608958546046586, + "learning_rate": 1.0898579710738675e-05, + "loss": 0.9203, + "step": 3183 + }, + { + "epoch": 0.49, + "grad_norm": 2.737613006287347, + "learning_rate": 1.0893642116499194e-05, + "loss": 0.95, + "step": 3184 + }, + { + "epoch": 0.49, + "grad_norm": 2.804888270338999, + "learning_rate": 1.0888704302627542e-05, + "loss": 0.8623, + "step": 3185 + }, + { + "epoch": 0.49, + "grad_norm": 2.710517749370958, + "learning_rate": 1.0883766270337297e-05, + "loss": 0.7938, + "step": 3186 + }, + { + "epoch": 0.49, + "grad_norm": 2.846565399790556, + "learning_rate": 1.0878828020842091e-05, + "loss": 0.8408, + "step": 3187 + }, + { + "epoch": 0.49, + "grad_norm": 3.2049410040769426, + "learning_rate": 1.0873889555355606e-05, + "loss": 0.845, + "step": 3188 + }, + { + "epoch": 0.49, + "grad_norm": 2.9466179850540923, + "learning_rate": 1.0868950875091573e-05, + "loss": 0.8936, + "step": 3189 + }, + { + "epoch": 0.49, + "grad_norm": 2.6737815841660186, + "learning_rate": 1.0864011981263786e-05, + "loss": 0.9218, + "step": 3190 + }, + { + "epoch": 0.49, + "grad_norm": 2.620826306048308, + "learning_rate": 1.085907287508609e-05, + "loss": 0.9462, + "step": 3191 + }, + { + "epoch": 0.49, + "grad_norm": 4.778997897619199, + "learning_rate": 1.0854133557772373e-05, + "loss": 0.9397, + "step": 3192 + }, + { + "epoch": 0.49, + "grad_norm": 2.5247511860554677, + "learning_rate": 1.0849194030536583e-05, + "loss": 0.9028, + "step": 3193 + }, + { + "epoch": 0.49, + "grad_norm": 2.4814349353822736, + "learning_rate": 1.0844254294592716e-05, + "loss": 0.7551, + "step": 3194 + }, + { + "epoch": 0.49, + "grad_norm": 2.5717710565669205, + "learning_rate": 1.0839314351154821e-05, + "loss": 0.8836, + "step": 3195 + }, + { + "epoch": 0.49, + "grad_norm": 3.061981208422528, + "learning_rate": 1.0834374201436996e-05, + "loss": 0.8326, + "step": 3196 + }, + { + "epoch": 0.49, + "grad_norm": 4.710682198564873, + "learning_rate": 1.0829433846653397e-05, + "loss": 0.9966, + "step": 3197 + }, + { + "epoch": 0.49, + "grad_norm": 2.631961606425523, + "learning_rate": 1.082449328801822e-05, + "loss": 0.9213, + "step": 3198 + }, + { + "epoch": 0.49, + "grad_norm": 2.718721534981861, + "learning_rate": 1.0819552526745716e-05, + "loss": 0.7187, + "step": 3199 + }, + { + "epoch": 0.49, + "grad_norm": 2.6937766875932736, + "learning_rate": 1.0814611564050186e-05, + "loss": 0.8894, + "step": 3200 + }, + { + "epoch": 0.49, + "grad_norm": 2.8889819332593887, + "learning_rate": 1.080967040114598e-05, + "loss": 0.8874, + "step": 3201 + }, + { + "epoch": 0.49, + "grad_norm": 2.7937136356082792, + "learning_rate": 1.08047290392475e-05, + "loss": 0.8424, + "step": 3202 + }, + { + "epoch": 0.49, + "grad_norm": 2.644207184621602, + "learning_rate": 1.0799787479569188e-05, + "loss": 0.8399, + "step": 3203 + }, + { + "epoch": 0.49, + "grad_norm": 3.3400959435695334, + "learning_rate": 1.0794845723325544e-05, + "loss": 0.8781, + "step": 3204 + }, + { + "epoch": 0.49, + "grad_norm": 2.9983667609443976, + "learning_rate": 1.0789903771731118e-05, + "loss": 0.8109, + "step": 3205 + }, + { + "epoch": 0.49, + "grad_norm": 2.617913805844195, + "learning_rate": 1.0784961626000497e-05, + "loss": 0.831, + "step": 3206 + }, + { + "epoch": 0.49, + "grad_norm": 2.676909320940646, + "learning_rate": 1.0780019287348321e-05, + "loss": 0.9348, + "step": 3207 + }, + { + "epoch": 0.49, + "grad_norm": 2.4113877770088146, + "learning_rate": 1.0775076756989281e-05, + "loss": 0.7715, + "step": 3208 + }, + { + "epoch": 0.49, + "grad_norm": 2.873965751785654, + "learning_rate": 1.0770134036138114e-05, + "loss": 0.863, + "step": 3209 + }, + { + "epoch": 0.49, + "grad_norm": 4.234018267681649, + "learning_rate": 1.07651911260096e-05, + "loss": 1.0551, + "step": 3210 + }, + { + "epoch": 0.49, + "grad_norm": 2.601399561591246, + "learning_rate": 1.0760248027818566e-05, + "loss": 0.8906, + "step": 3211 + }, + { + "epoch": 0.49, + "grad_norm": 2.5780137439885213, + "learning_rate": 1.0755304742779891e-05, + "loss": 0.8582, + "step": 3212 + }, + { + "epoch": 0.49, + "grad_norm": 2.7272475315349687, + "learning_rate": 1.0750361272108492e-05, + "loss": 0.9181, + "step": 3213 + }, + { + "epoch": 0.49, + "grad_norm": 3.0256202385156685, + "learning_rate": 1.0745417617019336e-05, + "loss": 0.9568, + "step": 3214 + }, + { + "epoch": 0.49, + "grad_norm": 2.515863450716289, + "learning_rate": 1.0740473778727436e-05, + "loss": 0.7738, + "step": 3215 + }, + { + "epoch": 0.49, + "grad_norm": 2.787365827288811, + "learning_rate": 1.0735529758447851e-05, + "loss": 0.9147, + "step": 3216 + }, + { + "epoch": 0.49, + "grad_norm": 2.687163583453802, + "learning_rate": 1.0730585557395682e-05, + "loss": 0.9083, + "step": 3217 + }, + { + "epoch": 0.49, + "grad_norm": 2.498617602623131, + "learning_rate": 1.0725641176786066e-05, + "loss": 0.8464, + "step": 3218 + }, + { + "epoch": 0.49, + "grad_norm": 2.669957505128777, + "learning_rate": 1.0720696617834203e-05, + "loss": 0.8533, + "step": 3219 + }, + { + "epoch": 0.49, + "grad_norm": 2.7615180189226725, + "learning_rate": 1.0715751881755322e-05, + "loss": 0.8023, + "step": 3220 + }, + { + "epoch": 0.49, + "grad_norm": 2.84954465106346, + "learning_rate": 1.0710806969764708e-05, + "loss": 0.7787, + "step": 3221 + }, + { + "epoch": 0.49, + "grad_norm": 2.761029530875673, + "learning_rate": 1.0705861883077676e-05, + "loss": 0.8775, + "step": 3222 + }, + { + "epoch": 0.49, + "grad_norm": 2.7200088736275156, + "learning_rate": 1.0700916622909584e-05, + "loss": 0.8918, + "step": 3223 + }, + { + "epoch": 0.49, + "grad_norm": 4.580459203264464, + "learning_rate": 1.069597119047585e-05, + "loss": 0.979, + "step": 3224 + }, + { + "epoch": 0.49, + "grad_norm": 2.784799576334272, + "learning_rate": 1.0691025586991913e-05, + "loss": 0.8962, + "step": 3225 + }, + { + "epoch": 0.49, + "grad_norm": 2.716320206142077, + "learning_rate": 1.0686079813673266e-05, + "loss": 0.8454, + "step": 3226 + }, + { + "epoch": 0.49, + "grad_norm": 2.78252057186699, + "learning_rate": 1.0681133871735447e-05, + "loss": 0.7963, + "step": 3227 + }, + { + "epoch": 0.49, + "grad_norm": 2.797761111042213, + "learning_rate": 1.0676187762394024e-05, + "loss": 0.8693, + "step": 3228 + }, + { + "epoch": 0.49, + "grad_norm": 2.6328034403208016, + "learning_rate": 1.0671241486864612e-05, + "loss": 0.831, + "step": 3229 + }, + { + "epoch": 0.49, + "grad_norm": 2.509790499903019, + "learning_rate": 1.0666295046362866e-05, + "loss": 0.8735, + "step": 3230 + }, + { + "epoch": 0.49, + "grad_norm": 2.559415932905526, + "learning_rate": 1.0661348442104488e-05, + "loss": 0.8074, + "step": 3231 + }, + { + "epoch": 0.49, + "grad_norm": 2.7346927684237206, + "learning_rate": 1.0656401675305213e-05, + "loss": 0.8779, + "step": 3232 + }, + { + "epoch": 0.49, + "grad_norm": 2.790203708689786, + "learning_rate": 1.0651454747180814e-05, + "loss": 0.8612, + "step": 3233 + }, + { + "epoch": 0.5, + "grad_norm": 2.570354502414417, + "learning_rate": 1.0646507658947107e-05, + "loss": 0.7987, + "step": 3234 + }, + { + "epoch": 0.5, + "grad_norm": 2.864626432776611, + "learning_rate": 1.0641560411819949e-05, + "loss": 0.8281, + "step": 3235 + }, + { + "epoch": 0.5, + "grad_norm": 2.703277856066407, + "learning_rate": 1.0636613007015237e-05, + "loss": 0.8933, + "step": 3236 + }, + { + "epoch": 0.5, + "grad_norm": 2.5346887053544522, + "learning_rate": 1.0631665445748903e-05, + "loss": 0.8432, + "step": 3237 + }, + { + "epoch": 0.5, + "grad_norm": 2.6720590127577113, + "learning_rate": 1.0626717729236916e-05, + "loss": 0.8069, + "step": 3238 + }, + { + "epoch": 0.5, + "grad_norm": 2.696786972098412, + "learning_rate": 1.062176985869529e-05, + "loss": 0.8801, + "step": 3239 + }, + { + "epoch": 0.5, + "grad_norm": 2.697229729382102, + "learning_rate": 1.061682183534007e-05, + "loss": 0.9402, + "step": 3240 + }, + { + "epoch": 0.5, + "grad_norm": 2.948636608691428, + "learning_rate": 1.0611873660387342e-05, + "loss": 0.8494, + "step": 3241 + }, + { + "epoch": 0.5, + "grad_norm": 2.8446098507092406, + "learning_rate": 1.0606925335053227e-05, + "loss": 0.8413, + "step": 3242 + }, + { + "epoch": 0.5, + "grad_norm": 2.728881060896302, + "learning_rate": 1.060197686055389e-05, + "loss": 0.9709, + "step": 3243 + }, + { + "epoch": 0.5, + "grad_norm": 2.7895855154963085, + "learning_rate": 1.0597028238105524e-05, + "loss": 0.7959, + "step": 3244 + }, + { + "epoch": 0.5, + "grad_norm": 2.6499197839840463, + "learning_rate": 1.0592079468924359e-05, + "loss": 0.7914, + "step": 3245 + }, + { + "epoch": 0.5, + "grad_norm": 4.911507768646488, + "learning_rate": 1.0587130554226665e-05, + "loss": 0.9737, + "step": 3246 + }, + { + "epoch": 0.5, + "grad_norm": 2.900046966280705, + "learning_rate": 1.0582181495228751e-05, + "loss": 0.9709, + "step": 3247 + }, + { + "epoch": 0.5, + "grad_norm": 2.590115515221143, + "learning_rate": 1.0577232293146951e-05, + "loss": 0.9294, + "step": 3248 + }, + { + "epoch": 0.5, + "grad_norm": 2.5526063817207385, + "learning_rate": 1.0572282949197646e-05, + "loss": 0.8423, + "step": 3249 + }, + { + "epoch": 0.5, + "grad_norm": 2.53629469601658, + "learning_rate": 1.0567333464597238e-05, + "loss": 0.9556, + "step": 3250 + }, + { + "epoch": 0.5, + "grad_norm": 5.980707695112363, + "learning_rate": 1.0562383840562179e-05, + "loss": 0.9552, + "step": 3251 + }, + { + "epoch": 0.5, + "grad_norm": 2.495283589020335, + "learning_rate": 1.0557434078308941e-05, + "loss": 0.7907, + "step": 3252 + }, + { + "epoch": 0.5, + "grad_norm": 2.518120801724238, + "learning_rate": 1.0552484179054041e-05, + "loss": 0.8885, + "step": 3253 + }, + { + "epoch": 0.5, + "grad_norm": 2.7614692883101126, + "learning_rate": 1.0547534144014027e-05, + "loss": 0.8517, + "step": 3254 + }, + { + "epoch": 0.5, + "grad_norm": 2.8019226736605187, + "learning_rate": 1.0542583974405476e-05, + "loss": 0.9192, + "step": 3255 + }, + { + "epoch": 0.5, + "grad_norm": 2.79675855043032, + "learning_rate": 1.0537633671445002e-05, + "loss": 0.8881, + "step": 3256 + }, + { + "epoch": 0.5, + "grad_norm": 2.265933060436887, + "learning_rate": 1.0532683236349248e-05, + "loss": 0.7393, + "step": 3257 + }, + { + "epoch": 0.5, + "grad_norm": 2.526929479088268, + "learning_rate": 1.0527732670334897e-05, + "loss": 0.8796, + "step": 3258 + }, + { + "epoch": 0.5, + "grad_norm": 2.520864599696683, + "learning_rate": 1.0522781974618652e-05, + "loss": 0.8469, + "step": 3259 + }, + { + "epoch": 0.5, + "grad_norm": 2.7165796819432604, + "learning_rate": 1.0517831150417264e-05, + "loss": 0.8224, + "step": 3260 + }, + { + "epoch": 0.5, + "grad_norm": 2.6270000648683722, + "learning_rate": 1.0512880198947501e-05, + "loss": 0.938, + "step": 3261 + }, + { + "epoch": 0.5, + "grad_norm": 2.79444465784981, + "learning_rate": 1.050792912142617e-05, + "loss": 0.7442, + "step": 3262 + }, + { + "epoch": 0.5, + "grad_norm": 2.5324615943017106, + "learning_rate": 1.0502977919070106e-05, + "loss": 0.8856, + "step": 3263 + }, + { + "epoch": 0.5, + "grad_norm": 2.533648583070449, + "learning_rate": 1.0498026593096174e-05, + "loss": 0.8138, + "step": 3264 + }, + { + "epoch": 0.5, + "grad_norm": 2.483789225555165, + "learning_rate": 1.0493075144721274e-05, + "loss": 0.7558, + "step": 3265 + }, + { + "epoch": 0.5, + "grad_norm": 2.8208179904872566, + "learning_rate": 1.0488123575162332e-05, + "loss": 0.8044, + "step": 3266 + }, + { + "epoch": 0.5, + "grad_norm": 2.618894893257103, + "learning_rate": 1.0483171885636307e-05, + "loss": 0.8982, + "step": 3267 + }, + { + "epoch": 0.5, + "grad_norm": 2.8409111924604673, + "learning_rate": 1.0478220077360184e-05, + "loss": 0.7764, + "step": 3268 + }, + { + "epoch": 0.5, + "grad_norm": 3.6090076484711986, + "learning_rate": 1.0473268151550977e-05, + "loss": 0.9362, + "step": 3269 + }, + { + "epoch": 0.5, + "grad_norm": 2.9134663249316786, + "learning_rate": 1.0468316109425732e-05, + "loss": 0.8487, + "step": 3270 + }, + { + "epoch": 0.5, + "grad_norm": 2.5389919301057446, + "learning_rate": 1.046336395220152e-05, + "loss": 0.8479, + "step": 3271 + }, + { + "epoch": 0.5, + "grad_norm": 2.4828818279246376, + "learning_rate": 1.0458411681095444e-05, + "loss": 0.8475, + "step": 3272 + }, + { + "epoch": 0.5, + "grad_norm": 2.8052594263529405, + "learning_rate": 1.0453459297324638e-05, + "loss": 0.7965, + "step": 3273 + }, + { + "epoch": 0.5, + "grad_norm": 6.018704108526814, + "learning_rate": 1.0448506802106248e-05, + "loss": 1.0924, + "step": 3274 + }, + { + "epoch": 0.5, + "grad_norm": 2.7527598038819567, + "learning_rate": 1.0443554196657468e-05, + "loss": 0.908, + "step": 3275 + }, + { + "epoch": 0.5, + "grad_norm": 3.1159386946264753, + "learning_rate": 1.0438601482195507e-05, + "loss": 0.9319, + "step": 3276 + }, + { + "epoch": 0.5, + "grad_norm": 2.705038646126299, + "learning_rate": 1.0433648659937604e-05, + "loss": 0.813, + "step": 3277 + }, + { + "epoch": 0.5, + "grad_norm": 2.5206348247401427, + "learning_rate": 1.042869573110102e-05, + "loss": 0.7875, + "step": 3278 + }, + { + "epoch": 0.5, + "grad_norm": 2.7917041949509933, + "learning_rate": 1.0423742696903047e-05, + "loss": 0.9462, + "step": 3279 + }, + { + "epoch": 0.5, + "grad_norm": 2.6746082283942822, + "learning_rate": 1.0418789558561009e-05, + "loss": 0.9436, + "step": 3280 + }, + { + "epoch": 0.5, + "grad_norm": 2.5656588357619428, + "learning_rate": 1.0413836317292237e-05, + "loss": 0.9148, + "step": 3281 + }, + { + "epoch": 0.5, + "grad_norm": 2.6118866244099777, + "learning_rate": 1.0408882974314107e-05, + "loss": 0.9189, + "step": 3282 + }, + { + "epoch": 0.5, + "grad_norm": 2.6684671051310556, + "learning_rate": 1.040392953084401e-05, + "loss": 0.8033, + "step": 3283 + }, + { + "epoch": 0.5, + "grad_norm": 2.501298470832651, + "learning_rate": 1.0398975988099364e-05, + "loss": 0.8298, + "step": 3284 + }, + { + "epoch": 0.5, + "grad_norm": 2.5541503907350083, + "learning_rate": 1.0394022347297607e-05, + "loss": 0.8786, + "step": 3285 + }, + { + "epoch": 0.5, + "grad_norm": 2.5594560871040892, + "learning_rate": 1.038906860965621e-05, + "loss": 0.8968, + "step": 3286 + }, + { + "epoch": 0.5, + "grad_norm": 2.5151461823220593, + "learning_rate": 1.038411477639266e-05, + "loss": 0.9096, + "step": 3287 + }, + { + "epoch": 0.5, + "grad_norm": 2.636485823466242, + "learning_rate": 1.037916084872447e-05, + "loss": 0.8021, + "step": 3288 + }, + { + "epoch": 0.5, + "grad_norm": 2.72865389979779, + "learning_rate": 1.0374206827869177e-05, + "loss": 0.907, + "step": 3289 + }, + { + "epoch": 0.5, + "grad_norm": 2.818026224530828, + "learning_rate": 1.0369252715044343e-05, + "loss": 0.8863, + "step": 3290 + }, + { + "epoch": 0.5, + "grad_norm": 2.7254428761840406, + "learning_rate": 1.0364298511467548e-05, + "loss": 0.88, + "step": 3291 + }, + { + "epoch": 0.5, + "grad_norm": 2.728241616850513, + "learning_rate": 1.0359344218356393e-05, + "loss": 0.8195, + "step": 3292 + }, + { + "epoch": 0.5, + "grad_norm": 2.8484344836013813, + "learning_rate": 1.0354389836928507e-05, + "loss": 0.9687, + "step": 3293 + }, + { + "epoch": 0.5, + "grad_norm": 3.096231522927287, + "learning_rate": 1.0349435368401541e-05, + "loss": 0.8959, + "step": 3294 + }, + { + "epoch": 0.5, + "grad_norm": 2.6238474886319425, + "learning_rate": 1.0344480813993163e-05, + "loss": 0.8663, + "step": 3295 + }, + { + "epoch": 0.5, + "grad_norm": 2.785385869115943, + "learning_rate": 1.033952617492106e-05, + "loss": 0.9265, + "step": 3296 + }, + { + "epoch": 0.5, + "grad_norm": 2.4841009237509755, + "learning_rate": 1.0334571452402943e-05, + "loss": 0.8481, + "step": 3297 + }, + { + "epoch": 0.5, + "grad_norm": 2.6596124256370652, + "learning_rate": 1.032961664765655e-05, + "loss": 0.8183, + "step": 3298 + }, + { + "epoch": 0.5, + "grad_norm": 2.4437143863687227, + "learning_rate": 1.0324661761899629e-05, + "loss": 0.8173, + "step": 3299 + }, + { + "epoch": 0.51, + "grad_norm": 3.29109457912766, + "learning_rate": 1.0319706796349954e-05, + "loss": 0.9644, + "step": 3300 + }, + { + "epoch": 0.51, + "grad_norm": 2.797304793482052, + "learning_rate": 1.0314751752225311e-05, + "loss": 0.8918, + "step": 3301 + }, + { + "epoch": 0.51, + "grad_norm": 2.7597018438887027, + "learning_rate": 1.0309796630743518e-05, + "loss": 0.819, + "step": 3302 + }, + { + "epoch": 0.51, + "grad_norm": 2.3860829479408974, + "learning_rate": 1.0304841433122399e-05, + "loss": 0.9725, + "step": 3303 + }, + { + "epoch": 0.51, + "grad_norm": 2.6324701321499036, + "learning_rate": 1.0299886160579806e-05, + "loss": 0.8367, + "step": 3304 + }, + { + "epoch": 0.51, + "grad_norm": 2.501387059647868, + "learning_rate": 1.0294930814333605e-05, + "loss": 0.7974, + "step": 3305 + }, + { + "epoch": 0.51, + "grad_norm": 2.525680580078367, + "learning_rate": 1.0289975395601686e-05, + "loss": 0.7649, + "step": 3306 + }, + { + "epoch": 0.51, + "grad_norm": 2.686467602375458, + "learning_rate": 1.0285019905601943e-05, + "loss": 0.8886, + "step": 3307 + }, + { + "epoch": 0.51, + "grad_norm": 3.1918554685564837, + "learning_rate": 1.02800643455523e-05, + "loss": 0.7807, + "step": 3308 + }, + { + "epoch": 0.51, + "grad_norm": 2.636561913205046, + "learning_rate": 1.0275108716670698e-05, + "loss": 0.7962, + "step": 3309 + }, + { + "epoch": 0.51, + "grad_norm": 2.719254863507505, + "learning_rate": 1.0270153020175092e-05, + "loss": 1.0061, + "step": 3310 + }, + { + "epoch": 0.51, + "grad_norm": 2.801691612655321, + "learning_rate": 1.0265197257283444e-05, + "loss": 0.8736, + "step": 3311 + }, + { + "epoch": 0.51, + "grad_norm": 2.590671703849368, + "learning_rate": 1.0260241429213754e-05, + "loss": 0.9129, + "step": 3312 + }, + { + "epoch": 0.51, + "grad_norm": 2.8841881720350333, + "learning_rate": 1.0255285537184016e-05, + "loss": 0.8225, + "step": 3313 + }, + { + "epoch": 0.51, + "grad_norm": 2.4984456373558555, + "learning_rate": 1.0250329582412253e-05, + "loss": 0.8641, + "step": 3314 + }, + { + "epoch": 0.51, + "grad_norm": 2.596338224358061, + "learning_rate": 1.0245373566116496e-05, + "loss": 0.8932, + "step": 3315 + }, + { + "epoch": 0.51, + "grad_norm": 2.7568003722125938, + "learning_rate": 1.0240417489514802e-05, + "loss": 0.8756, + "step": 3316 + }, + { + "epoch": 0.51, + "grad_norm": 2.866798445477397, + "learning_rate": 1.0235461353825234e-05, + "loss": 0.8654, + "step": 3317 + }, + { + "epoch": 0.51, + "grad_norm": 2.5433943971852275, + "learning_rate": 1.0230505160265867e-05, + "loss": 0.8851, + "step": 3318 + }, + { + "epoch": 0.51, + "grad_norm": 2.9431924246520564, + "learning_rate": 1.0225548910054794e-05, + "loss": 0.8239, + "step": 3319 + }, + { + "epoch": 0.51, + "grad_norm": 2.745923605151662, + "learning_rate": 1.0220592604410127e-05, + "loss": 0.8627, + "step": 3320 + }, + { + "epoch": 0.51, + "grad_norm": 6.277014146147613, + "learning_rate": 1.0215636244549985e-05, + "loss": 0.9909, + "step": 3321 + }, + { + "epoch": 0.51, + "grad_norm": 2.9053879625025125, + "learning_rate": 1.02106798316925e-05, + "loss": 0.8434, + "step": 3322 + }, + { + "epoch": 0.51, + "grad_norm": 2.584237482121651, + "learning_rate": 1.0205723367055821e-05, + "loss": 0.842, + "step": 3323 + }, + { + "epoch": 0.51, + "grad_norm": 2.8707112582385177, + "learning_rate": 1.0200766851858112e-05, + "loss": 0.935, + "step": 3324 + }, + { + "epoch": 0.51, + "grad_norm": 2.447159641817951, + "learning_rate": 1.0195810287317539e-05, + "loss": 0.8213, + "step": 3325 + }, + { + "epoch": 0.51, + "grad_norm": 3.651398982692408, + "learning_rate": 1.0190853674652289e-05, + "loss": 0.9345, + "step": 3326 + }, + { + "epoch": 0.51, + "grad_norm": 3.411584643818777, + "learning_rate": 1.0185897015080555e-05, + "loss": 0.8791, + "step": 3327 + }, + { + "epoch": 0.51, + "grad_norm": 2.728514157055375, + "learning_rate": 1.0180940309820553e-05, + "loss": 0.8527, + "step": 3328 + }, + { + "epoch": 0.51, + "grad_norm": 2.3547298609445724, + "learning_rate": 1.0175983560090496e-05, + "loss": 0.8094, + "step": 3329 + }, + { + "epoch": 0.51, + "grad_norm": 2.439730352013072, + "learning_rate": 1.0171026767108617e-05, + "loss": 0.7593, + "step": 3330 + }, + { + "epoch": 0.51, + "grad_norm": 3.0534143692632862, + "learning_rate": 1.0166069932093152e-05, + "loss": 0.8036, + "step": 3331 + }, + { + "epoch": 0.51, + "grad_norm": 2.683965074654162, + "learning_rate": 1.016111305626236e-05, + "loss": 0.8081, + "step": 3332 + }, + { + "epoch": 0.51, + "grad_norm": 2.57505920815486, + "learning_rate": 1.0156156140834492e-05, + "loss": 0.753, + "step": 3333 + }, + { + "epoch": 0.51, + "grad_norm": 4.283608619024065, + "learning_rate": 1.0151199187027828e-05, + "loss": 0.9922, + "step": 3334 + }, + { + "epoch": 0.51, + "grad_norm": 2.7688976110505235, + "learning_rate": 1.0146242196060646e-05, + "loss": 0.8464, + "step": 3335 + }, + { + "epoch": 0.51, + "grad_norm": 2.68577175813711, + "learning_rate": 1.0141285169151229e-05, + "loss": 0.7952, + "step": 3336 + }, + { + "epoch": 0.51, + "grad_norm": 2.562544645505176, + "learning_rate": 1.0136328107517881e-05, + "loss": 0.9107, + "step": 3337 + }, + { + "epoch": 0.51, + "grad_norm": 2.6052512511645904, + "learning_rate": 1.0131371012378907e-05, + "loss": 0.7702, + "step": 3338 + }, + { + "epoch": 0.51, + "grad_norm": 3.0419214898098548, + "learning_rate": 1.0126413884952626e-05, + "loss": 0.7645, + "step": 3339 + }, + { + "epoch": 0.51, + "grad_norm": 2.671771314701333, + "learning_rate": 1.0121456726457357e-05, + "loss": 0.8947, + "step": 3340 + }, + { + "epoch": 0.51, + "grad_norm": 2.650762861555099, + "learning_rate": 1.0116499538111428e-05, + "loss": 0.8634, + "step": 3341 + }, + { + "epoch": 0.51, + "grad_norm": 2.634411025008088, + "learning_rate": 1.0111542321133182e-05, + "loss": 0.8083, + "step": 3342 + }, + { + "epoch": 0.51, + "grad_norm": 4.41310547661801, + "learning_rate": 1.010658507674096e-05, + "loss": 0.9767, + "step": 3343 + }, + { + "epoch": 0.51, + "grad_norm": 2.568605596401079, + "learning_rate": 1.0101627806153117e-05, + "loss": 0.8827, + "step": 3344 + }, + { + "epoch": 0.51, + "grad_norm": 2.7319944186757406, + "learning_rate": 1.0096670510588009e-05, + "loss": 0.8224, + "step": 3345 + }, + { + "epoch": 0.51, + "grad_norm": 2.283551340145881, + "learning_rate": 1.0091713191264001e-05, + "loss": 0.7559, + "step": 3346 + }, + { + "epoch": 0.51, + "grad_norm": 4.012626492995786, + "learning_rate": 1.0086755849399464e-05, + "loss": 0.986, + "step": 3347 + }, + { + "epoch": 0.51, + "grad_norm": 2.686251388531931, + "learning_rate": 1.008179848621277e-05, + "loss": 0.8781, + "step": 3348 + }, + { + "epoch": 0.51, + "grad_norm": 2.4624808391304565, + "learning_rate": 1.0076841102922301e-05, + "loss": 0.8124, + "step": 3349 + }, + { + "epoch": 0.51, + "grad_norm": 2.6461052882685365, + "learning_rate": 1.0071883700746448e-05, + "loss": 0.8288, + "step": 3350 + }, + { + "epoch": 0.51, + "grad_norm": 2.712048135899838, + "learning_rate": 1.0066926280903598e-05, + "loss": 0.8369, + "step": 3351 + }, + { + "epoch": 0.51, + "grad_norm": 2.5896984185944394, + "learning_rate": 1.0061968844612143e-05, + "loss": 0.8532, + "step": 3352 + }, + { + "epoch": 0.51, + "grad_norm": 2.8039264113702904, + "learning_rate": 1.0057011393090481e-05, + "loss": 0.8418, + "step": 3353 + }, + { + "epoch": 0.51, + "grad_norm": 2.930467198579045, + "learning_rate": 1.0052053927557022e-05, + "loss": 0.8305, + "step": 3354 + }, + { + "epoch": 0.51, + "grad_norm": 3.02410853561508, + "learning_rate": 1.0047096449230164e-05, + "loss": 0.8893, + "step": 3355 + }, + { + "epoch": 0.51, + "grad_norm": 2.6144880865805216, + "learning_rate": 1.0042138959328322e-05, + "loss": 0.7572, + "step": 3356 + }, + { + "epoch": 0.51, + "grad_norm": 2.7977719436946544, + "learning_rate": 1.0037181459069905e-05, + "loss": 0.8333, + "step": 3357 + }, + { + "epoch": 0.51, + "grad_norm": 2.5403140670311344, + "learning_rate": 1.003222394967333e-05, + "loss": 0.7677, + "step": 3358 + }, + { + "epoch": 0.51, + "grad_norm": 2.6608801644930726, + "learning_rate": 1.0027266432357007e-05, + "loss": 0.924, + "step": 3359 + }, + { + "epoch": 0.51, + "grad_norm": 2.704651179422154, + "learning_rate": 1.0022308908339365e-05, + "loss": 0.7022, + "step": 3360 + }, + { + "epoch": 0.51, + "grad_norm": 2.679460239629454, + "learning_rate": 1.0017351378838817e-05, + "loss": 0.8915, + "step": 3361 + }, + { + "epoch": 0.51, + "grad_norm": 2.498793390857166, + "learning_rate": 1.0012393845073787e-05, + "loss": 0.7512, + "step": 3362 + }, + { + "epoch": 0.51, + "grad_norm": 2.6430015156102837, + "learning_rate": 1.0007436308262696e-05, + "loss": 0.8217, + "step": 3363 + }, + { + "epoch": 0.51, + "grad_norm": 2.735772542706487, + "learning_rate": 1.000247876962397e-05, + "loss": 0.8334, + "step": 3364 + }, + { + "epoch": 0.52, + "grad_norm": 2.515510191667527, + "learning_rate": 9.997521230376032e-06, + "loss": 0.8312, + "step": 3365 + }, + { + "epoch": 0.52, + "grad_norm": 2.8658668233468925, + "learning_rate": 9.992563691737304e-06, + "loss": 0.9067, + "step": 3366 + }, + { + "epoch": 0.52, + "grad_norm": 2.837582352635219, + "learning_rate": 9.987606154926214e-06, + "loss": 0.7849, + "step": 3367 + }, + { + "epoch": 0.52, + "grad_norm": 2.7723264928964144, + "learning_rate": 9.982648621161188e-06, + "loss": 0.8556, + "step": 3368 + }, + { + "epoch": 0.52, + "grad_norm": 2.7701920524217827, + "learning_rate": 9.977691091660637e-06, + "loss": 0.7989, + "step": 3369 + }, + { + "epoch": 0.52, + "grad_norm": 2.7434900183215976, + "learning_rate": 9.972733567642994e-06, + "loss": 0.8515, + "step": 3370 + }, + { + "epoch": 0.52, + "grad_norm": 2.8262306070225014, + "learning_rate": 9.967776050326675e-06, + "loss": 0.768, + "step": 3371 + }, + { + "epoch": 0.52, + "grad_norm": 2.630803167653262, + "learning_rate": 9.962818540930095e-06, + "loss": 0.8, + "step": 3372 + }, + { + "epoch": 0.52, + "grad_norm": 2.7316271034593185, + "learning_rate": 9.95786104067168e-06, + "loss": 0.8565, + "step": 3373 + }, + { + "epoch": 0.52, + "grad_norm": 2.5927251961593463, + "learning_rate": 9.952903550769837e-06, + "loss": 0.843, + "step": 3374 + }, + { + "epoch": 0.52, + "grad_norm": 2.682418900705461, + "learning_rate": 9.947946072442982e-06, + "loss": 0.9163, + "step": 3375 + }, + { + "epoch": 0.52, + "grad_norm": 2.4891777647767173, + "learning_rate": 9.942988606909522e-06, + "loss": 0.7926, + "step": 3376 + }, + { + "epoch": 0.52, + "grad_norm": 2.7278232479343947, + "learning_rate": 9.938031155387859e-06, + "loss": 0.8517, + "step": 3377 + }, + { + "epoch": 0.52, + "grad_norm": 2.6922445372108488, + "learning_rate": 9.933073719096406e-06, + "loss": 0.916, + "step": 3378 + }, + { + "epoch": 0.52, + "grad_norm": 2.562382246736542, + "learning_rate": 9.928116299253553e-06, + "loss": 0.8839, + "step": 3379 + }, + { + "epoch": 0.52, + "grad_norm": 2.4537029707439313, + "learning_rate": 9.923158897077699e-06, + "loss": 0.8054, + "step": 3380 + }, + { + "epoch": 0.52, + "grad_norm": 3.3236126145506613, + "learning_rate": 9.918201513787233e-06, + "loss": 0.7271, + "step": 3381 + }, + { + "epoch": 0.52, + "grad_norm": 3.055213047874377, + "learning_rate": 9.913244150600541e-06, + "loss": 0.9046, + "step": 3382 + }, + { + "epoch": 0.52, + "grad_norm": 2.553739553514491, + "learning_rate": 9.908286808735999e-06, + "loss": 0.8065, + "step": 3383 + }, + { + "epoch": 0.52, + "grad_norm": 2.4245662109666766, + "learning_rate": 9.903329489411993e-06, + "loss": 0.8129, + "step": 3384 + }, + { + "epoch": 0.52, + "grad_norm": 2.660906783044096, + "learning_rate": 9.898372193846887e-06, + "loss": 0.8541, + "step": 3385 + }, + { + "epoch": 0.52, + "grad_norm": 2.578729689514895, + "learning_rate": 9.893414923259042e-06, + "loss": 0.8421, + "step": 3386 + }, + { + "epoch": 0.52, + "grad_norm": 2.5078486683767416, + "learning_rate": 9.888457678866823e-06, + "loss": 0.8087, + "step": 3387 + }, + { + "epoch": 0.52, + "grad_norm": 2.521637554958834, + "learning_rate": 9.883500461888573e-06, + "loss": 0.8557, + "step": 3388 + }, + { + "epoch": 0.52, + "grad_norm": 2.6795766617286967, + "learning_rate": 9.878543273542648e-06, + "loss": 0.7181, + "step": 3389 + }, + { + "epoch": 0.52, + "grad_norm": 2.498689771309547, + "learning_rate": 9.873586115047377e-06, + "loss": 0.8819, + "step": 3390 + }, + { + "epoch": 0.52, + "grad_norm": 2.577184398412776, + "learning_rate": 9.868628987621095e-06, + "loss": 0.8566, + "step": 3391 + }, + { + "epoch": 0.52, + "grad_norm": 2.793311417654129, + "learning_rate": 9.863671892482122e-06, + "loss": 0.8169, + "step": 3392 + }, + { + "epoch": 0.52, + "grad_norm": 2.620997389101896, + "learning_rate": 9.858714830848776e-06, + "loss": 0.9171, + "step": 3393 + }, + { + "epoch": 0.52, + "grad_norm": 2.778207542389812, + "learning_rate": 9.853757803939358e-06, + "loss": 0.8429, + "step": 3394 + }, + { + "epoch": 0.52, + "grad_norm": 2.9562677095301786, + "learning_rate": 9.848800812972175e-06, + "loss": 0.8696, + "step": 3395 + }, + { + "epoch": 0.52, + "grad_norm": 2.599642093006926, + "learning_rate": 9.84384385916551e-06, + "loss": 0.7961, + "step": 3396 + }, + { + "epoch": 0.52, + "grad_norm": 3.0071841120945737, + "learning_rate": 9.838886943737645e-06, + "loss": 0.8398, + "step": 3397 + }, + { + "epoch": 0.52, + "grad_norm": 2.696431412561743, + "learning_rate": 9.833930067906851e-06, + "loss": 0.8396, + "step": 3398 + }, + { + "epoch": 0.52, + "grad_norm": 2.430701029582184, + "learning_rate": 9.828973232891385e-06, + "loss": 0.7366, + "step": 3399 + }, + { + "epoch": 0.52, + "grad_norm": 2.8167533231043294, + "learning_rate": 9.824016439909505e-06, + "loss": 0.8484, + "step": 3400 + }, + { + "epoch": 0.52, + "grad_norm": 2.687160062145975, + "learning_rate": 9.81905969017945e-06, + "loss": 0.8009, + "step": 3401 + }, + { + "epoch": 0.52, + "grad_norm": 2.4484114257757734, + "learning_rate": 9.814102984919445e-06, + "loss": 0.7761, + "step": 3402 + }, + { + "epoch": 0.52, + "grad_norm": 2.6496492282970485, + "learning_rate": 9.809146325347716e-06, + "loss": 0.7687, + "step": 3403 + }, + { + "epoch": 0.52, + "grad_norm": 4.864677266471541, + "learning_rate": 9.804189712682466e-06, + "loss": 1.023, + "step": 3404 + }, + { + "epoch": 0.52, + "grad_norm": 2.7989202592935984, + "learning_rate": 9.79923314814189e-06, + "loss": 0.8775, + "step": 3405 + }, + { + "epoch": 0.52, + "grad_norm": 2.586169913513614, + "learning_rate": 9.79427663294418e-06, + "loss": 0.7965, + "step": 3406 + }, + { + "epoch": 0.52, + "grad_norm": 2.8333347061539187, + "learning_rate": 9.7893201683075e-06, + "loss": 0.9712, + "step": 3407 + }, + { + "epoch": 0.52, + "grad_norm": 2.719996695889003, + "learning_rate": 9.784363755450018e-06, + "loss": 0.9177, + "step": 3408 + }, + { + "epoch": 0.52, + "grad_norm": 2.4773553077282444, + "learning_rate": 9.779407395589876e-06, + "loss": 0.7608, + "step": 3409 + }, + { + "epoch": 0.52, + "grad_norm": 2.5475236772585563, + "learning_rate": 9.774451089945206e-06, + "loss": 0.7846, + "step": 3410 + }, + { + "epoch": 0.52, + "grad_norm": 3.0188556226064445, + "learning_rate": 9.769494839734136e-06, + "loss": 0.8426, + "step": 3411 + }, + { + "epoch": 0.52, + "grad_norm": 4.254714206692356, + "learning_rate": 9.764538646174771e-06, + "loss": 0.9804, + "step": 3412 + }, + { + "epoch": 0.52, + "grad_norm": 2.827330030766981, + "learning_rate": 9.759582510485198e-06, + "loss": 0.857, + "step": 3413 + }, + { + "epoch": 0.52, + "grad_norm": 2.4734544365088404, + "learning_rate": 9.754626433883506e-06, + "loss": 0.8474, + "step": 3414 + }, + { + "epoch": 0.52, + "grad_norm": 2.704908642099613, + "learning_rate": 9.749670417587753e-06, + "loss": 0.9602, + "step": 3415 + }, + { + "epoch": 0.52, + "grad_norm": 2.6888669635440077, + "learning_rate": 9.744714462815987e-06, + "loss": 0.8605, + "step": 3416 + }, + { + "epoch": 0.52, + "grad_norm": 2.709281617887562, + "learning_rate": 9.739758570786251e-06, + "loss": 0.8596, + "step": 3417 + }, + { + "epoch": 0.52, + "grad_norm": 2.7199414147145395, + "learning_rate": 9.734802742716556e-06, + "loss": 0.7955, + "step": 3418 + }, + { + "epoch": 0.52, + "grad_norm": 2.509186081480435, + "learning_rate": 9.729846979824913e-06, + "loss": 0.8612, + "step": 3419 + }, + { + "epoch": 0.52, + "grad_norm": 2.9166514156562298, + "learning_rate": 9.724891283329305e-06, + "loss": 0.9359, + "step": 3420 + }, + { + "epoch": 0.52, + "grad_norm": 2.496906473025068, + "learning_rate": 9.7199356544477e-06, + "loss": 0.7673, + "step": 3421 + }, + { + "epoch": 0.52, + "grad_norm": 2.5023038128266286, + "learning_rate": 9.714980094398059e-06, + "loss": 0.8332, + "step": 3422 + }, + { + "epoch": 0.52, + "grad_norm": 2.75657816335714, + "learning_rate": 9.710024604398317e-06, + "loss": 0.8968, + "step": 3423 + }, + { + "epoch": 0.52, + "grad_norm": 2.5561294321392594, + "learning_rate": 9.705069185666396e-06, + "loss": 0.7787, + "step": 3424 + }, + { + "epoch": 0.52, + "grad_norm": 3.1037707547457294, + "learning_rate": 9.700113839420197e-06, + "loss": 0.8882, + "step": 3425 + }, + { + "epoch": 0.52, + "grad_norm": 2.4952606751734274, + "learning_rate": 9.695158566877606e-06, + "loss": 0.9268, + "step": 3426 + }, + { + "epoch": 0.52, + "grad_norm": 6.026747964805295, + "learning_rate": 9.690203369256486e-06, + "loss": 0.9835, + "step": 3427 + }, + { + "epoch": 0.52, + "grad_norm": 2.5480816986475294, + "learning_rate": 9.685248247774692e-06, + "loss": 0.7946, + "step": 3428 + }, + { + "epoch": 0.52, + "grad_norm": 2.604296116527149, + "learning_rate": 9.68029320365005e-06, + "loss": 0.8441, + "step": 3429 + }, + { + "epoch": 0.53, + "grad_norm": 2.510085849702647, + "learning_rate": 9.675338238100375e-06, + "loss": 0.7499, + "step": 3430 + }, + { + "epoch": 0.53, + "grad_norm": 2.9373262344980984, + "learning_rate": 9.670383352343454e-06, + "loss": 0.9004, + "step": 3431 + }, + { + "epoch": 0.53, + "grad_norm": 2.6469345532827955, + "learning_rate": 9.665428547597057e-06, + "loss": 0.8273, + "step": 3432 + }, + { + "epoch": 0.53, + "grad_norm": 2.761930481958474, + "learning_rate": 9.660473825078944e-06, + "loss": 0.819, + "step": 3433 + }, + { + "epoch": 0.53, + "grad_norm": 2.4498718953719414, + "learning_rate": 9.655519186006842e-06, + "loss": 0.7963, + "step": 3434 + }, + { + "epoch": 0.53, + "grad_norm": 2.786495910453177, + "learning_rate": 9.65056463159846e-06, + "loss": 0.913, + "step": 3435 + }, + { + "epoch": 0.53, + "grad_norm": 2.53808454970735, + "learning_rate": 9.645610163071495e-06, + "loss": 0.82, + "step": 3436 + }, + { + "epoch": 0.53, + "grad_norm": 2.5015219428230586, + "learning_rate": 9.640655781643612e-06, + "loss": 0.7981, + "step": 3437 + }, + { + "epoch": 0.53, + "grad_norm": 2.4680609160616767, + "learning_rate": 9.635701488532455e-06, + "loss": 0.7857, + "step": 3438 + }, + { + "epoch": 0.53, + "grad_norm": 3.990846500890265, + "learning_rate": 9.63074728495566e-06, + "loss": 0.9372, + "step": 3439 + }, + { + "epoch": 0.53, + "grad_norm": 2.9446715414792775, + "learning_rate": 9.625793172130825e-06, + "loss": 0.7747, + "step": 3440 + }, + { + "epoch": 0.53, + "grad_norm": 2.483984216486534, + "learning_rate": 9.620839151275534e-06, + "loss": 0.8843, + "step": 3441 + }, + { + "epoch": 0.53, + "grad_norm": 2.7169965735793693, + "learning_rate": 9.615885223607345e-06, + "loss": 0.8942, + "step": 3442 + }, + { + "epoch": 0.53, + "grad_norm": 2.544915450697256, + "learning_rate": 9.610931390343792e-06, + "loss": 0.8571, + "step": 3443 + }, + { + "epoch": 0.53, + "grad_norm": 2.6189570146790913, + "learning_rate": 9.605977652702394e-06, + "loss": 0.8119, + "step": 3444 + }, + { + "epoch": 0.53, + "grad_norm": 2.771286909908849, + "learning_rate": 9.60102401190064e-06, + "loss": 0.8558, + "step": 3445 + }, + { + "epoch": 0.53, + "grad_norm": 2.627183298351705, + "learning_rate": 9.596070469155992e-06, + "loss": 0.8693, + "step": 3446 + }, + { + "epoch": 0.53, + "grad_norm": 2.602025072939709, + "learning_rate": 9.591117025685897e-06, + "loss": 0.8599, + "step": 3447 + }, + { + "epoch": 0.53, + "grad_norm": 2.7213601358474806, + "learning_rate": 9.586163682707768e-06, + "loss": 0.754, + "step": 3448 + }, + { + "epoch": 0.53, + "grad_norm": 2.7410310874423023, + "learning_rate": 9.581210441438994e-06, + "loss": 0.8447, + "step": 3449 + }, + { + "epoch": 0.53, + "grad_norm": 2.44157798300347, + "learning_rate": 9.576257303096955e-06, + "loss": 0.8248, + "step": 3450 + }, + { + "epoch": 0.53, + "grad_norm": 2.6606986031824222, + "learning_rate": 9.571304268898983e-06, + "loss": 0.99, + "step": 3451 + }, + { + "epoch": 0.53, + "grad_norm": 2.703515781558748, + "learning_rate": 9.566351340062401e-06, + "loss": 0.7711, + "step": 3452 + }, + { + "epoch": 0.53, + "grad_norm": 2.727709716865192, + "learning_rate": 9.561398517804498e-06, + "loss": 0.7937, + "step": 3453 + }, + { + "epoch": 0.53, + "grad_norm": 2.634863441280792, + "learning_rate": 9.556445803342532e-06, + "loss": 0.836, + "step": 3454 + }, + { + "epoch": 0.53, + "grad_norm": 6.148624164260505, + "learning_rate": 9.551493197893755e-06, + "loss": 0.9873, + "step": 3455 + }, + { + "epoch": 0.53, + "grad_norm": 2.8217903157217106, + "learning_rate": 9.546540702675369e-06, + "loss": 0.841, + "step": 3456 + }, + { + "epoch": 0.53, + "grad_norm": 2.667756818733098, + "learning_rate": 9.541588318904558e-06, + "loss": 0.8399, + "step": 3457 + }, + { + "epoch": 0.53, + "grad_norm": 6.8915142027475165, + "learning_rate": 9.536636047798484e-06, + "loss": 0.8674, + "step": 3458 + }, + { + "epoch": 0.53, + "grad_norm": 2.5080794197051826, + "learning_rate": 9.531683890574275e-06, + "loss": 0.7672, + "step": 3459 + }, + { + "epoch": 0.53, + "grad_norm": 2.8576179806465003, + "learning_rate": 9.526731848449025e-06, + "loss": 0.8458, + "step": 3460 + }, + { + "epoch": 0.53, + "grad_norm": 2.7890894029086764, + "learning_rate": 9.52177992263982e-06, + "loss": 0.8804, + "step": 3461 + }, + { + "epoch": 0.53, + "grad_norm": 2.6054412774153697, + "learning_rate": 9.516828114363695e-06, + "loss": 0.7928, + "step": 3462 + }, + { + "epoch": 0.53, + "grad_norm": 2.510783295864765, + "learning_rate": 9.51187642483767e-06, + "loss": 0.9286, + "step": 3463 + }, + { + "epoch": 0.53, + "grad_norm": 2.600019031689135, + "learning_rate": 9.50692485527873e-06, + "loss": 0.7392, + "step": 3464 + }, + { + "epoch": 0.53, + "grad_norm": 2.6241823466708336, + "learning_rate": 9.501973406903827e-06, + "loss": 0.8606, + "step": 3465 + }, + { + "epoch": 0.53, + "grad_norm": 9.190637646959537, + "learning_rate": 9.497022080929898e-06, + "loss": 0.9424, + "step": 3466 + }, + { + "epoch": 0.53, + "grad_norm": 2.564623714774583, + "learning_rate": 9.492070878573835e-06, + "loss": 0.8429, + "step": 3467 + }, + { + "epoch": 0.53, + "grad_norm": 2.668379840486614, + "learning_rate": 9.487119801052502e-06, + "loss": 0.8477, + "step": 3468 + }, + { + "epoch": 0.53, + "grad_norm": 2.569726689880901, + "learning_rate": 9.48216884958274e-06, + "loss": 0.7003, + "step": 3469 + }, + { + "epoch": 0.53, + "grad_norm": 2.5774518981436936, + "learning_rate": 9.477218025381351e-06, + "loss": 0.7733, + "step": 3470 + }, + { + "epoch": 0.53, + "grad_norm": 2.8184807477848963, + "learning_rate": 9.472267329665107e-06, + "loss": 0.7704, + "step": 3471 + }, + { + "epoch": 0.53, + "grad_norm": 2.489332514227551, + "learning_rate": 9.467316763650755e-06, + "loss": 0.8022, + "step": 3472 + }, + { + "epoch": 0.53, + "grad_norm": 2.741348982843612, + "learning_rate": 9.462366328555e-06, + "loss": 0.8633, + "step": 3473 + }, + { + "epoch": 0.53, + "grad_norm": 2.8905647730408908, + "learning_rate": 9.457416025594528e-06, + "loss": 0.9357, + "step": 3474 + }, + { + "epoch": 0.53, + "grad_norm": 2.951366616887644, + "learning_rate": 9.452465855985978e-06, + "loss": 0.8145, + "step": 3475 + }, + { + "epoch": 0.53, + "grad_norm": 3.4285873693290116, + "learning_rate": 9.447515820945959e-06, + "loss": 0.7857, + "step": 3476 + }, + { + "epoch": 0.53, + "grad_norm": 2.815876716660041, + "learning_rate": 9.442565921691062e-06, + "loss": 0.7452, + "step": 3477 + }, + { + "epoch": 0.53, + "grad_norm": 2.5491022711639046, + "learning_rate": 9.437616159437828e-06, + "loss": 0.8215, + "step": 3478 + }, + { + "epoch": 0.53, + "grad_norm": 2.5599892020044384, + "learning_rate": 9.432666535402764e-06, + "loss": 0.7299, + "step": 3479 + }, + { + "epoch": 0.53, + "grad_norm": 2.7040859098807526, + "learning_rate": 9.427717050802359e-06, + "loss": 0.811, + "step": 3480 + }, + { + "epoch": 0.53, + "grad_norm": 2.9344148037173694, + "learning_rate": 9.42276770685305e-06, + "loss": 0.9246, + "step": 3481 + }, + { + "epoch": 0.53, + "grad_norm": 5.906786834299944, + "learning_rate": 9.41781850477125e-06, + "loss": 0.9404, + "step": 3482 + }, + { + "epoch": 0.53, + "grad_norm": 2.566431527013668, + "learning_rate": 9.412869445773338e-06, + "loss": 0.8971, + "step": 3483 + }, + { + "epoch": 0.53, + "grad_norm": 2.8506725057017905, + "learning_rate": 9.407920531075641e-06, + "loss": 0.8048, + "step": 3484 + }, + { + "epoch": 0.53, + "grad_norm": 4.215499929654718, + "learning_rate": 9.40297176189448e-06, + "loss": 0.9818, + "step": 3485 + }, + { + "epoch": 0.53, + "grad_norm": 2.8119702972456717, + "learning_rate": 9.398023139446113e-06, + "loss": 0.8954, + "step": 3486 + }, + { + "epoch": 0.53, + "grad_norm": 2.8743448207774582, + "learning_rate": 9.393074664946773e-06, + "loss": 0.9275, + "step": 3487 + }, + { + "epoch": 0.53, + "grad_norm": 2.6264244278472773, + "learning_rate": 9.388126339612661e-06, + "loss": 0.8055, + "step": 3488 + }, + { + "epoch": 0.53, + "grad_norm": 2.786429666020273, + "learning_rate": 9.383178164659935e-06, + "loss": 0.7912, + "step": 3489 + }, + { + "epoch": 0.53, + "grad_norm": 2.96515087237739, + "learning_rate": 9.378230141304711e-06, + "loss": 0.9173, + "step": 3490 + }, + { + "epoch": 0.53, + "grad_norm": 2.685269953318423, + "learning_rate": 9.373282270763087e-06, + "loss": 0.8394, + "step": 3491 + }, + { + "epoch": 0.53, + "grad_norm": 2.8070326631129587, + "learning_rate": 9.368334554251099e-06, + "loss": 0.9085, + "step": 3492 + }, + { + "epoch": 0.53, + "grad_norm": 2.79038954283755, + "learning_rate": 9.363386992984765e-06, + "loss": 0.8801, + "step": 3493 + }, + { + "epoch": 0.53, + "grad_norm": 2.9430808628170833, + "learning_rate": 9.358439588180053e-06, + "loss": 0.8405, + "step": 3494 + }, + { + "epoch": 0.53, + "grad_norm": 2.6887706037474057, + "learning_rate": 9.353492341052894e-06, + "loss": 0.7708, + "step": 3495 + }, + { + "epoch": 0.54, + "grad_norm": 2.506287273436747, + "learning_rate": 9.34854525281919e-06, + "loss": 0.8898, + "step": 3496 + }, + { + "epoch": 0.54, + "grad_norm": 2.798243828822919, + "learning_rate": 9.34359832469479e-06, + "loss": 0.8658, + "step": 3497 + }, + { + "epoch": 0.54, + "grad_norm": 2.8401541321532227, + "learning_rate": 9.338651557895513e-06, + "loss": 0.8637, + "step": 3498 + }, + { + "epoch": 0.54, + "grad_norm": 2.4414548026642198, + "learning_rate": 9.333704953637135e-06, + "loss": 0.8658, + "step": 3499 + }, + { + "epoch": 0.54, + "grad_norm": 2.6143105878878985, + "learning_rate": 9.328758513135393e-06, + "loss": 0.7415, + "step": 3500 + }, + { + "epoch": 0.54, + "grad_norm": 2.509734906422458, + "learning_rate": 9.323812237605977e-06, + "loss": 0.8148, + "step": 3501 + }, + { + "epoch": 0.54, + "grad_norm": 2.55941722209386, + "learning_rate": 9.318866128264556e-06, + "loss": 0.7698, + "step": 3502 + }, + { + "epoch": 0.54, + "grad_norm": 2.6536660992690133, + "learning_rate": 9.313920186326734e-06, + "loss": 0.7173, + "step": 3503 + }, + { + "epoch": 0.54, + "grad_norm": 2.5321252514402866, + "learning_rate": 9.30897441300809e-06, + "loss": 0.8794, + "step": 3504 + }, + { + "epoch": 0.54, + "grad_norm": 2.9748037974758677, + "learning_rate": 9.304028809524154e-06, + "loss": 0.9203, + "step": 3505 + }, + { + "epoch": 0.54, + "grad_norm": 2.852371008150815, + "learning_rate": 9.299083377090415e-06, + "loss": 0.811, + "step": 3506 + }, + { + "epoch": 0.54, + "grad_norm": 2.8329209503983774, + "learning_rate": 9.294138116922328e-06, + "loss": 0.8725, + "step": 3507 + }, + { + "epoch": 0.54, + "grad_norm": 2.556917873688633, + "learning_rate": 9.289193030235293e-06, + "loss": 0.8832, + "step": 3508 + }, + { + "epoch": 0.54, + "grad_norm": 2.7490077769808483, + "learning_rate": 9.284248118244676e-06, + "loss": 0.8685, + "step": 3509 + }, + { + "epoch": 0.54, + "grad_norm": 2.821699231197627, + "learning_rate": 9.2793033821658e-06, + "loss": 0.8822, + "step": 3510 + }, + { + "epoch": 0.54, + "grad_norm": 2.62568208135973, + "learning_rate": 9.274358823213938e-06, + "loss": 0.8229, + "step": 3511 + }, + { + "epoch": 0.54, + "grad_norm": 3.3221816368186157, + "learning_rate": 9.269414442604324e-06, + "loss": 0.9029, + "step": 3512 + }, + { + "epoch": 0.54, + "grad_norm": 2.5729726401385427, + "learning_rate": 9.264470241552152e-06, + "loss": 0.7594, + "step": 3513 + }, + { + "epoch": 0.54, + "grad_norm": 2.613962094609536, + "learning_rate": 9.259526221272564e-06, + "loss": 0.8595, + "step": 3514 + }, + { + "epoch": 0.54, + "grad_norm": 2.7485831111727084, + "learning_rate": 9.254582382980667e-06, + "loss": 0.7865, + "step": 3515 + }, + { + "epoch": 0.54, + "grad_norm": 2.622037998037098, + "learning_rate": 9.249638727891513e-06, + "loss": 0.9312, + "step": 3516 + }, + { + "epoch": 0.54, + "grad_norm": 2.4585507034727847, + "learning_rate": 9.24469525722011e-06, + "loss": 0.7725, + "step": 3517 + }, + { + "epoch": 0.54, + "grad_norm": 2.491230089556136, + "learning_rate": 9.239751972181435e-06, + "loss": 0.8587, + "step": 3518 + }, + { + "epoch": 0.54, + "grad_norm": 2.6129819112402437, + "learning_rate": 9.234808873990405e-06, + "loss": 0.8502, + "step": 3519 + }, + { + "epoch": 0.54, + "grad_norm": 2.5048705499962733, + "learning_rate": 9.229865963861888e-06, + "loss": 0.7968, + "step": 3520 + }, + { + "epoch": 0.54, + "grad_norm": 2.6486800302992606, + "learning_rate": 9.224923243010722e-06, + "loss": 0.8275, + "step": 3521 + }, + { + "epoch": 0.54, + "grad_norm": 2.7180029217975923, + "learning_rate": 9.219980712651684e-06, + "loss": 0.7334, + "step": 3522 + }, + { + "epoch": 0.54, + "grad_norm": 2.600823388647472, + "learning_rate": 9.215038373999507e-06, + "loss": 0.8259, + "step": 3523 + }, + { + "epoch": 0.54, + "grad_norm": 2.842235272446629, + "learning_rate": 9.210096228268885e-06, + "loss": 0.8671, + "step": 3524 + }, + { + "epoch": 0.54, + "grad_norm": 4.692287898296294, + "learning_rate": 9.205154276674456e-06, + "loss": 0.9264, + "step": 3525 + }, + { + "epoch": 0.54, + "grad_norm": 2.682372073940632, + "learning_rate": 9.200212520430814e-06, + "loss": 0.8054, + "step": 3526 + }, + { + "epoch": 0.54, + "grad_norm": 4.928798676688956, + "learning_rate": 9.195270960752505e-06, + "loss": 0.998, + "step": 3527 + }, + { + "epoch": 0.54, + "grad_norm": 2.5156680957421016, + "learning_rate": 9.19032959885402e-06, + "loss": 0.7873, + "step": 3528 + }, + { + "epoch": 0.54, + "grad_norm": 2.643942704387546, + "learning_rate": 9.185388435949815e-06, + "loss": 0.7548, + "step": 3529 + }, + { + "epoch": 0.54, + "grad_norm": 2.65169439065476, + "learning_rate": 9.180447473254289e-06, + "loss": 0.8838, + "step": 3530 + }, + { + "epoch": 0.54, + "grad_norm": 2.6752622485002098, + "learning_rate": 9.175506711981782e-06, + "loss": 0.8896, + "step": 3531 + }, + { + "epoch": 0.54, + "grad_norm": 2.7589924845247444, + "learning_rate": 9.170566153346606e-06, + "loss": 0.8725, + "step": 3532 + }, + { + "epoch": 0.54, + "grad_norm": 2.506078803726627, + "learning_rate": 9.165625798563007e-06, + "loss": 0.7382, + "step": 3533 + }, + { + "epoch": 0.54, + "grad_norm": 2.53520520564263, + "learning_rate": 9.160685648845182e-06, + "loss": 0.8571, + "step": 3534 + }, + { + "epoch": 0.54, + "grad_norm": 2.607123422142902, + "learning_rate": 9.155745705407288e-06, + "loss": 0.9119, + "step": 3535 + }, + { + "epoch": 0.54, + "grad_norm": 2.5822010527861434, + "learning_rate": 9.15080596946342e-06, + "loss": 0.7983, + "step": 3536 + }, + { + "epoch": 0.54, + "grad_norm": 2.5685810562247813, + "learning_rate": 9.145866442227632e-06, + "loss": 0.8518, + "step": 3537 + }, + { + "epoch": 0.54, + "grad_norm": 2.535089245635708, + "learning_rate": 9.140927124913915e-06, + "loss": 0.6993, + "step": 3538 + }, + { + "epoch": 0.54, + "grad_norm": 2.7257069900543813, + "learning_rate": 9.135988018736214e-06, + "loss": 0.8749, + "step": 3539 + }, + { + "epoch": 0.54, + "grad_norm": 2.711857668052274, + "learning_rate": 9.13104912490843e-06, + "loss": 0.833, + "step": 3540 + }, + { + "epoch": 0.54, + "grad_norm": 2.5288441724978767, + "learning_rate": 9.1261104446444e-06, + "loss": 0.806, + "step": 3541 + }, + { + "epoch": 0.54, + "grad_norm": 2.9774721456587407, + "learning_rate": 9.121171979157912e-06, + "loss": 0.9085, + "step": 3542 + }, + { + "epoch": 0.54, + "grad_norm": 2.5387614093810194, + "learning_rate": 9.116233729662705e-06, + "loss": 0.784, + "step": 3543 + }, + { + "epoch": 0.54, + "grad_norm": 2.9467484789677543, + "learning_rate": 9.111295697372463e-06, + "loss": 0.9633, + "step": 3544 + }, + { + "epoch": 0.54, + "grad_norm": 2.9757437755654497, + "learning_rate": 9.106357883500808e-06, + "loss": 0.8032, + "step": 3545 + }, + { + "epoch": 0.54, + "grad_norm": 2.5998054749396022, + "learning_rate": 9.101420289261327e-06, + "loss": 0.8322, + "step": 3546 + }, + { + "epoch": 0.54, + "grad_norm": 2.7217799386307995, + "learning_rate": 9.096482915867535e-06, + "loss": 0.9016, + "step": 3547 + }, + { + "epoch": 0.54, + "grad_norm": 2.7475901842215342, + "learning_rate": 9.091545764532905e-06, + "loss": 0.8422, + "step": 3548 + }, + { + "epoch": 0.54, + "grad_norm": 2.904223424761576, + "learning_rate": 9.086608836470847e-06, + "loss": 0.8544, + "step": 3549 + }, + { + "epoch": 0.54, + "grad_norm": 2.8385256278455, + "learning_rate": 9.081672132894716e-06, + "loss": 0.8998, + "step": 3550 + }, + { + "epoch": 0.54, + "grad_norm": 6.538160224033064, + "learning_rate": 9.076735655017822e-06, + "loss": 1.0688, + "step": 3551 + }, + { + "epoch": 0.54, + "grad_norm": 2.6527168081088623, + "learning_rate": 9.071799404053412e-06, + "loss": 0.8223, + "step": 3552 + }, + { + "epoch": 0.54, + "grad_norm": 2.9676891853864023, + "learning_rate": 9.066863381214672e-06, + "loss": 0.8399, + "step": 3553 + }, + { + "epoch": 0.54, + "grad_norm": 2.685845974353761, + "learning_rate": 9.061927587714747e-06, + "loss": 0.9156, + "step": 3554 + }, + { + "epoch": 0.54, + "grad_norm": 2.724001222133191, + "learning_rate": 9.056992024766706e-06, + "loss": 0.8723, + "step": 3555 + }, + { + "epoch": 0.54, + "grad_norm": 2.4897993153457327, + "learning_rate": 9.05205669358358e-06, + "loss": 0.7244, + "step": 3556 + }, + { + "epoch": 0.54, + "grad_norm": 2.6163874330228825, + "learning_rate": 9.047121595378335e-06, + "loss": 0.9575, + "step": 3557 + }, + { + "epoch": 0.54, + "grad_norm": 2.471137288002054, + "learning_rate": 9.042186731363876e-06, + "loss": 0.8606, + "step": 3558 + }, + { + "epoch": 0.54, + "grad_norm": 2.8295381734389005, + "learning_rate": 9.037252102753056e-06, + "loss": 0.866, + "step": 3559 + }, + { + "epoch": 0.54, + "grad_norm": 2.5518594589083032, + "learning_rate": 9.032317710758668e-06, + "loss": 0.7769, + "step": 3560 + }, + { + "epoch": 0.55, + "grad_norm": 2.6965298501742114, + "learning_rate": 9.027383556593443e-06, + "loss": 0.818, + "step": 3561 + }, + { + "epoch": 0.55, + "grad_norm": 2.536115670826026, + "learning_rate": 9.022449641470066e-06, + "loss": 0.7124, + "step": 3562 + }, + { + "epoch": 0.55, + "grad_norm": 2.588450203478518, + "learning_rate": 9.017515966601152e-06, + "loss": 0.972, + "step": 3563 + }, + { + "epoch": 0.55, + "grad_norm": 3.7909023955013934, + "learning_rate": 9.012582533199254e-06, + "loss": 0.9889, + "step": 3564 + }, + { + "epoch": 0.55, + "grad_norm": 2.6482008686874416, + "learning_rate": 9.00764934247688e-06, + "loss": 0.8659, + "step": 3565 + }, + { + "epoch": 0.55, + "grad_norm": 2.8874019494064798, + "learning_rate": 9.002716395646462e-06, + "loss": 0.8602, + "step": 3566 + }, + { + "epoch": 0.55, + "grad_norm": 2.6378105278298722, + "learning_rate": 8.997783693920387e-06, + "loss": 0.8591, + "step": 3567 + }, + { + "epoch": 0.55, + "grad_norm": 2.6620268667766327, + "learning_rate": 8.992851238510972e-06, + "loss": 0.8449, + "step": 3568 + }, + { + "epoch": 0.55, + "grad_norm": 2.633541484655814, + "learning_rate": 8.987919030630474e-06, + "loss": 0.8495, + "step": 3569 + }, + { + "epoch": 0.55, + "grad_norm": 2.397040476582493, + "learning_rate": 8.982987071491097e-06, + "loss": 0.8061, + "step": 3570 + }, + { + "epoch": 0.55, + "grad_norm": 2.5219829545734362, + "learning_rate": 8.978055362304974e-06, + "loss": 0.8393, + "step": 3571 + }, + { + "epoch": 0.55, + "grad_norm": 2.586610187878668, + "learning_rate": 8.973123904284175e-06, + "loss": 0.7636, + "step": 3572 + }, + { + "epoch": 0.55, + "grad_norm": 2.7685865321112684, + "learning_rate": 8.968192698640728e-06, + "loss": 0.8546, + "step": 3573 + }, + { + "epoch": 0.55, + "grad_norm": 2.8152562626440734, + "learning_rate": 8.963261746586576e-06, + "loss": 0.8472, + "step": 3574 + }, + { + "epoch": 0.55, + "grad_norm": 2.470179452309494, + "learning_rate": 8.958331049333608e-06, + "loss": 0.8264, + "step": 3575 + }, + { + "epoch": 0.55, + "grad_norm": 2.5236760418062016, + "learning_rate": 8.953400608093655e-06, + "loss": 0.8112, + "step": 3576 + }, + { + "epoch": 0.55, + "grad_norm": 3.1871526471365152, + "learning_rate": 8.948470424078477e-06, + "loss": 0.8764, + "step": 3577 + }, + { + "epoch": 0.55, + "grad_norm": 4.390788855245002, + "learning_rate": 8.94354049849978e-06, + "loss": 1.0057, + "step": 3578 + }, + { + "epoch": 0.55, + "grad_norm": 2.7210093186803923, + "learning_rate": 8.9386108325692e-06, + "loss": 0.9305, + "step": 3579 + }, + { + "epoch": 0.55, + "grad_norm": 2.9855834421533496, + "learning_rate": 8.933681427498308e-06, + "loss": 0.9511, + "step": 3580 + }, + { + "epoch": 0.55, + "grad_norm": 2.7666441197229914, + "learning_rate": 8.928752284498616e-06, + "loss": 0.8362, + "step": 3581 + }, + { + "epoch": 0.55, + "grad_norm": 2.751965748661896, + "learning_rate": 8.923823404781569e-06, + "loss": 0.8808, + "step": 3582 + }, + { + "epoch": 0.55, + "grad_norm": 2.576050038453813, + "learning_rate": 8.91889478955854e-06, + "loss": 0.7936, + "step": 3583 + }, + { + "epoch": 0.55, + "grad_norm": 2.8137578824285945, + "learning_rate": 8.913966440040858e-06, + "loss": 0.8241, + "step": 3584 + }, + { + "epoch": 0.55, + "grad_norm": 2.6778875907500406, + "learning_rate": 8.909038357439767e-06, + "loss": 0.7843, + "step": 3585 + }, + { + "epoch": 0.55, + "grad_norm": 2.668733297014157, + "learning_rate": 8.904110542966446e-06, + "loss": 0.7914, + "step": 3586 + }, + { + "epoch": 0.55, + "grad_norm": 2.6923499450062893, + "learning_rate": 8.89918299783202e-06, + "loss": 0.8458, + "step": 3587 + }, + { + "epoch": 0.55, + "grad_norm": 2.632803767921514, + "learning_rate": 8.894255723247536e-06, + "loss": 0.8983, + "step": 3588 + }, + { + "epoch": 0.55, + "grad_norm": 2.5401591385553277, + "learning_rate": 8.88932872042399e-06, + "loss": 0.7862, + "step": 3589 + }, + { + "epoch": 0.55, + "grad_norm": 2.7332741048875278, + "learning_rate": 8.884401990572293e-06, + "loss": 0.9888, + "step": 3590 + }, + { + "epoch": 0.55, + "grad_norm": 2.7439162578949094, + "learning_rate": 8.879475534903292e-06, + "loss": 0.8091, + "step": 3591 + }, + { + "epoch": 0.55, + "grad_norm": 3.2872806339100715, + "learning_rate": 8.874549354627786e-06, + "loss": 0.9352, + "step": 3592 + }, + { + "epoch": 0.55, + "grad_norm": 2.7685688420362307, + "learning_rate": 8.869623450956484e-06, + "loss": 0.8611, + "step": 3593 + }, + { + "epoch": 0.55, + "grad_norm": 2.736941628903649, + "learning_rate": 8.86469782510003e-06, + "loss": 0.7929, + "step": 3594 + }, + { + "epoch": 0.55, + "grad_norm": 2.58569671282389, + "learning_rate": 8.859772478269013e-06, + "loss": 0.8211, + "step": 3595 + }, + { + "epoch": 0.55, + "grad_norm": 3.025888336572811, + "learning_rate": 8.854847411673944e-06, + "loss": 0.8808, + "step": 3596 + }, + { + "epoch": 0.55, + "grad_norm": 2.774503519864151, + "learning_rate": 8.849922626525258e-06, + "loss": 0.8471, + "step": 3597 + }, + { + "epoch": 0.55, + "grad_norm": 2.652018012156596, + "learning_rate": 8.844998124033339e-06, + "loss": 0.8309, + "step": 3598 + }, + { + "epoch": 0.55, + "grad_norm": 2.7737368141511816, + "learning_rate": 8.840073905408488e-06, + "loss": 0.797, + "step": 3599 + }, + { + "epoch": 0.55, + "grad_norm": 2.631941056407336, + "learning_rate": 8.83514997186094e-06, + "loss": 0.7881, + "step": 3600 + }, + { + "epoch": 0.55, + "grad_norm": 3.2951018055617, + "learning_rate": 8.83022632460086e-06, + "loss": 0.808, + "step": 3601 + }, + { + "epoch": 0.55, + "grad_norm": 2.754968213283891, + "learning_rate": 8.825302964838337e-06, + "loss": 0.8158, + "step": 3602 + }, + { + "epoch": 0.55, + "grad_norm": 2.7676551447009947, + "learning_rate": 8.820379893783406e-06, + "loss": 0.7856, + "step": 3603 + }, + { + "epoch": 0.55, + "grad_norm": 2.3558550948620947, + "learning_rate": 8.815457112646012e-06, + "loss": 0.8038, + "step": 3604 + }, + { + "epoch": 0.55, + "grad_norm": 2.9201439057338137, + "learning_rate": 8.810534622636035e-06, + "loss": 0.8423, + "step": 3605 + }, + { + "epoch": 0.55, + "grad_norm": 4.485876965646072, + "learning_rate": 8.805612424963293e-06, + "loss": 0.9396, + "step": 3606 + }, + { + "epoch": 0.55, + "grad_norm": 3.05468597846747, + "learning_rate": 8.800690520837516e-06, + "loss": 0.8926, + "step": 3607 + }, + { + "epoch": 0.55, + "grad_norm": 2.6860278468379533, + "learning_rate": 8.79576891146837e-06, + "loss": 0.8396, + "step": 3608 + }, + { + "epoch": 0.55, + "grad_norm": 2.4171544778269194, + "learning_rate": 8.790847598065457e-06, + "loss": 0.8286, + "step": 3609 + }, + { + "epoch": 0.55, + "grad_norm": 3.84210486687714, + "learning_rate": 8.785926581838288e-06, + "loss": 0.9437, + "step": 3610 + }, + { + "epoch": 0.55, + "grad_norm": 3.015621670490844, + "learning_rate": 8.781005863996318e-06, + "loss": 0.9002, + "step": 3611 + }, + { + "epoch": 0.55, + "grad_norm": 2.6779526247346066, + "learning_rate": 8.77608544574892e-06, + "loss": 0.8747, + "step": 3612 + }, + { + "epoch": 0.55, + "grad_norm": 3.650536560154308, + "learning_rate": 8.771165328305387e-06, + "loss": 0.8957, + "step": 3613 + }, + { + "epoch": 0.55, + "grad_norm": 2.6917203323536785, + "learning_rate": 8.766245512874959e-06, + "loss": 0.886, + "step": 3614 + }, + { + "epoch": 0.55, + "grad_norm": 2.6663991698263834, + "learning_rate": 8.76132600066678e-06, + "loss": 0.7626, + "step": 3615 + }, + { + "epoch": 0.55, + "grad_norm": 2.620435279245936, + "learning_rate": 8.75640679288993e-06, + "loss": 0.7998, + "step": 3616 + }, + { + "epoch": 0.55, + "grad_norm": 2.506585621530046, + "learning_rate": 8.751487890753414e-06, + "loss": 0.8461, + "step": 3617 + }, + { + "epoch": 0.55, + "grad_norm": 2.427248465159573, + "learning_rate": 8.746569295466158e-06, + "loss": 0.854, + "step": 3618 + }, + { + "epoch": 0.55, + "grad_norm": 2.886733992698023, + "learning_rate": 8.741651008237012e-06, + "loss": 0.9957, + "step": 3619 + }, + { + "epoch": 0.55, + "grad_norm": 2.5762624317434146, + "learning_rate": 8.73673303027476e-06, + "loss": 0.893, + "step": 3620 + }, + { + "epoch": 0.55, + "grad_norm": 3.0155415002945727, + "learning_rate": 8.731815362788097e-06, + "loss": 0.8848, + "step": 3621 + }, + { + "epoch": 0.55, + "grad_norm": 3.077168323117745, + "learning_rate": 8.72689800698565e-06, + "loss": 0.8263, + "step": 3622 + }, + { + "epoch": 0.55, + "grad_norm": 2.7555288356242706, + "learning_rate": 8.721980964075971e-06, + "loss": 0.7863, + "step": 3623 + }, + { + "epoch": 0.55, + "grad_norm": 2.5669626707976776, + "learning_rate": 8.717064235267523e-06, + "loss": 0.8099, + "step": 3624 + }, + { + "epoch": 0.55, + "grad_norm": 2.9058792524652364, + "learning_rate": 8.712147821768708e-06, + "loss": 0.8589, + "step": 3625 + }, + { + "epoch": 0.56, + "grad_norm": 2.5965076505939697, + "learning_rate": 8.70723172478784e-06, + "loss": 0.8579, + "step": 3626 + }, + { + "epoch": 0.56, + "grad_norm": 2.8274905558414374, + "learning_rate": 8.702315945533156e-06, + "loss": 0.8621, + "step": 3627 + }, + { + "epoch": 0.56, + "grad_norm": 2.630215743420575, + "learning_rate": 8.697400485212816e-06, + "loss": 0.9039, + "step": 3628 + }, + { + "epoch": 0.56, + "grad_norm": 2.6932115439415667, + "learning_rate": 8.692485345034903e-06, + "loss": 0.8897, + "step": 3629 + }, + { + "epoch": 0.56, + "grad_norm": 2.9440595153618623, + "learning_rate": 8.687570526207425e-06, + "loss": 0.8714, + "step": 3630 + }, + { + "epoch": 0.56, + "grad_norm": 2.603288212077659, + "learning_rate": 8.682656029938304e-06, + "loss": 0.8508, + "step": 3631 + }, + { + "epoch": 0.56, + "grad_norm": 2.677093356005969, + "learning_rate": 8.67774185743538e-06, + "loss": 0.8141, + "step": 3632 + }, + { + "epoch": 0.56, + "grad_norm": 2.3662069721673933, + "learning_rate": 8.672828009906427e-06, + "loss": 0.8724, + "step": 3633 + }, + { + "epoch": 0.56, + "grad_norm": 2.53071609090638, + "learning_rate": 8.667914488559128e-06, + "loss": 0.8556, + "step": 3634 + }, + { + "epoch": 0.56, + "grad_norm": 2.7326135410952004, + "learning_rate": 8.663001294601082e-06, + "loss": 0.789, + "step": 3635 + }, + { + "epoch": 0.56, + "grad_norm": 5.10132310788602, + "learning_rate": 8.658088429239826e-06, + "loss": 1.0455, + "step": 3636 + }, + { + "epoch": 0.56, + "grad_norm": 2.87349936884276, + "learning_rate": 8.653175893682798e-06, + "loss": 0.8204, + "step": 3637 + }, + { + "epoch": 0.56, + "grad_norm": 2.763286521179671, + "learning_rate": 8.648263689137361e-06, + "loss": 0.8436, + "step": 3638 + }, + { + "epoch": 0.56, + "grad_norm": 2.6273730030188003, + "learning_rate": 8.643351816810798e-06, + "loss": 0.7951, + "step": 3639 + }, + { + "epoch": 0.56, + "grad_norm": 2.917254214247303, + "learning_rate": 8.638440277910308e-06, + "loss": 0.8058, + "step": 3640 + }, + { + "epoch": 0.56, + "grad_norm": 2.3002616303651755, + "learning_rate": 8.633529073643015e-06, + "loss": 0.7621, + "step": 3641 + }, + { + "epoch": 0.56, + "grad_norm": 2.413059539401685, + "learning_rate": 8.628618205215952e-06, + "loss": 0.8709, + "step": 3642 + }, + { + "epoch": 0.56, + "grad_norm": 2.5414190420762237, + "learning_rate": 8.62370767383607e-06, + "loss": 0.8097, + "step": 3643 + }, + { + "epoch": 0.56, + "grad_norm": 2.481571621621111, + "learning_rate": 8.618797480710244e-06, + "loss": 0.8251, + "step": 3644 + }, + { + "epoch": 0.56, + "grad_norm": 2.5172550773810687, + "learning_rate": 8.613887627045259e-06, + "loss": 0.9102, + "step": 3645 + }, + { + "epoch": 0.56, + "grad_norm": 2.670664180733619, + "learning_rate": 8.608978114047818e-06, + "loss": 0.8178, + "step": 3646 + }, + { + "epoch": 0.56, + "grad_norm": 2.606139471469613, + "learning_rate": 8.604068942924546e-06, + "loss": 0.8516, + "step": 3647 + }, + { + "epoch": 0.56, + "grad_norm": 2.82664466148121, + "learning_rate": 8.599160114881979e-06, + "loss": 0.9226, + "step": 3648 + }, + { + "epoch": 0.56, + "grad_norm": 2.523005606895564, + "learning_rate": 8.594251631126566e-06, + "loss": 0.8452, + "step": 3649 + }, + { + "epoch": 0.56, + "grad_norm": 6.5205110535151665, + "learning_rate": 8.589343492864677e-06, + "loss": 0.9898, + "step": 3650 + }, + { + "epoch": 0.56, + "grad_norm": 2.9766710161932304, + "learning_rate": 8.58443570130259e-06, + "loss": 0.7819, + "step": 3651 + }, + { + "epoch": 0.56, + "grad_norm": 2.6300474432151995, + "learning_rate": 8.579528257646512e-06, + "loss": 0.8238, + "step": 3652 + }, + { + "epoch": 0.56, + "grad_norm": 2.6781706303509085, + "learning_rate": 8.574621163102548e-06, + "loss": 0.8303, + "step": 3653 + }, + { + "epoch": 0.56, + "grad_norm": 2.8478976294309364, + "learning_rate": 8.569714418876726e-06, + "loss": 0.8185, + "step": 3654 + }, + { + "epoch": 0.56, + "grad_norm": 2.675348904783975, + "learning_rate": 8.564808026174987e-06, + "loss": 0.8694, + "step": 3655 + }, + { + "epoch": 0.56, + "grad_norm": 2.645107465382493, + "learning_rate": 8.559901986203183e-06, + "loss": 0.8213, + "step": 3656 + }, + { + "epoch": 0.56, + "grad_norm": 2.7247460217826753, + "learning_rate": 8.554996300167078e-06, + "loss": 0.7998, + "step": 3657 + }, + { + "epoch": 0.56, + "grad_norm": 2.5975898264618977, + "learning_rate": 8.55009096927236e-06, + "loss": 0.8683, + "step": 3658 + }, + { + "epoch": 0.56, + "grad_norm": 2.768347542686713, + "learning_rate": 8.545185994724619e-06, + "loss": 0.8646, + "step": 3659 + }, + { + "epoch": 0.56, + "grad_norm": 2.524499498032838, + "learning_rate": 8.540281377729355e-06, + "loss": 0.9274, + "step": 3660 + }, + { + "epoch": 0.56, + "grad_norm": 2.5152003835677985, + "learning_rate": 8.535377119491992e-06, + "loss": 0.7848, + "step": 3661 + }, + { + "epoch": 0.56, + "grad_norm": 11.956498767697864, + "learning_rate": 8.53047322121785e-06, + "loss": 0.9453, + "step": 3662 + }, + { + "epoch": 0.56, + "grad_norm": 2.6256844278947464, + "learning_rate": 8.525569684112181e-06, + "loss": 0.6936, + "step": 3663 + }, + { + "epoch": 0.56, + "grad_norm": 2.80453341383359, + "learning_rate": 8.520666509380132e-06, + "loss": 0.9192, + "step": 3664 + }, + { + "epoch": 0.56, + "grad_norm": 2.6025782439130096, + "learning_rate": 8.515763698226764e-06, + "loss": 0.9011, + "step": 3665 + }, + { + "epoch": 0.56, + "grad_norm": 2.6216848423824293, + "learning_rate": 8.510861251857053e-06, + "loss": 0.7981, + "step": 3666 + }, + { + "epoch": 0.56, + "grad_norm": 2.526482661731994, + "learning_rate": 8.505959171475883e-06, + "loss": 0.7283, + "step": 3667 + }, + { + "epoch": 0.56, + "grad_norm": 2.840409081076734, + "learning_rate": 8.501057458288041e-06, + "loss": 0.8971, + "step": 3668 + }, + { + "epoch": 0.56, + "grad_norm": 2.6405701512694044, + "learning_rate": 8.496156113498242e-06, + "loss": 0.8458, + "step": 3669 + }, + { + "epoch": 0.56, + "grad_norm": 2.647772783988904, + "learning_rate": 8.491255138311093e-06, + "loss": 0.8854, + "step": 3670 + }, + { + "epoch": 0.56, + "grad_norm": 2.5549512183080996, + "learning_rate": 8.486354533931114e-06, + "loss": 0.7682, + "step": 3671 + }, + { + "epoch": 0.56, + "grad_norm": 2.539104496539669, + "learning_rate": 8.481454301562741e-06, + "loss": 0.7779, + "step": 3672 + }, + { + "epoch": 0.56, + "grad_norm": 3.0747522525924325, + "learning_rate": 8.476554442410308e-06, + "loss": 0.8221, + "step": 3673 + }, + { + "epoch": 0.56, + "grad_norm": 2.833827224853693, + "learning_rate": 8.47165495767807e-06, + "loss": 0.8259, + "step": 3674 + }, + { + "epoch": 0.56, + "grad_norm": 4.3111799161124615, + "learning_rate": 8.46675584857018e-06, + "loss": 0.9928, + "step": 3675 + }, + { + "epoch": 0.56, + "grad_norm": 2.6904811253228873, + "learning_rate": 8.461857116290698e-06, + "loss": 0.8276, + "step": 3676 + }, + { + "epoch": 0.56, + "grad_norm": 2.789560243636047, + "learning_rate": 8.456958762043602e-06, + "loss": 0.7791, + "step": 3677 + }, + { + "epoch": 0.56, + "grad_norm": 2.7711953502100717, + "learning_rate": 8.452060787032762e-06, + "loss": 0.9128, + "step": 3678 + }, + { + "epoch": 0.56, + "grad_norm": 2.7597117529170894, + "learning_rate": 8.447163192461967e-06, + "loss": 0.7911, + "step": 3679 + }, + { + "epoch": 0.56, + "grad_norm": 2.758978197325863, + "learning_rate": 8.44226597953491e-06, + "loss": 0.8669, + "step": 3680 + }, + { + "epoch": 0.56, + "grad_norm": 2.561596737247688, + "learning_rate": 8.437369149455188e-06, + "loss": 0.8433, + "step": 3681 + }, + { + "epoch": 0.56, + "grad_norm": 4.7707246140287785, + "learning_rate": 8.432472703426301e-06, + "loss": 1.0088, + "step": 3682 + }, + { + "epoch": 0.56, + "grad_norm": 2.5503950734906424, + "learning_rate": 8.427576642651664e-06, + "loss": 0.8231, + "step": 3683 + }, + { + "epoch": 0.56, + "grad_norm": 2.6727623912158047, + "learning_rate": 8.422680968334583e-06, + "loss": 0.8313, + "step": 3684 + }, + { + "epoch": 0.56, + "grad_norm": 2.737454750508601, + "learning_rate": 8.417785681678286e-06, + "loss": 0.8674, + "step": 3685 + }, + { + "epoch": 0.56, + "grad_norm": 2.6719477754679586, + "learning_rate": 8.412890783885894e-06, + "loss": 0.9723, + "step": 3686 + }, + { + "epoch": 0.56, + "grad_norm": 2.642320424254053, + "learning_rate": 8.40799627616043e-06, + "loss": 0.8246, + "step": 3687 + }, + { + "epoch": 0.56, + "grad_norm": 2.709922300257362, + "learning_rate": 8.40310215970484e-06, + "loss": 0.8561, + "step": 3688 + }, + { + "epoch": 0.56, + "grad_norm": 2.666770069476004, + "learning_rate": 8.398208435721948e-06, + "loss": 0.7958, + "step": 3689 + }, + { + "epoch": 0.56, + "grad_norm": 2.7526148496402754, + "learning_rate": 8.393315105414496e-06, + "loss": 0.8502, + "step": 3690 + }, + { + "epoch": 0.56, + "grad_norm": 2.6659138491305767, + "learning_rate": 8.388422169985133e-06, + "loss": 0.7236, + "step": 3691 + }, + { + "epoch": 0.57, + "grad_norm": 2.5439661145398333, + "learning_rate": 8.383529630636403e-06, + "loss": 0.7353, + "step": 3692 + }, + { + "epoch": 0.57, + "grad_norm": 2.724577989752663, + "learning_rate": 8.378637488570753e-06, + "loss": 0.8559, + "step": 3693 + }, + { + "epoch": 0.57, + "grad_norm": 2.5940339598271605, + "learning_rate": 8.373745744990536e-06, + "loss": 0.9183, + "step": 3694 + }, + { + "epoch": 0.57, + "grad_norm": 2.4445109767260633, + "learning_rate": 8.368854401098001e-06, + "loss": 0.8698, + "step": 3695 + }, + { + "epoch": 0.57, + "grad_norm": 2.5851633527893316, + "learning_rate": 8.363963458095312e-06, + "loss": 0.9235, + "step": 3696 + }, + { + "epoch": 0.57, + "grad_norm": 2.58465589391059, + "learning_rate": 8.35907291718452e-06, + "loss": 0.8486, + "step": 3697 + }, + { + "epoch": 0.57, + "grad_norm": 2.4882994084186425, + "learning_rate": 8.35418277956758e-06, + "loss": 0.7139, + "step": 3698 + }, + { + "epoch": 0.57, + "grad_norm": 2.8000076559062457, + "learning_rate": 8.34929304644636e-06, + "loss": 0.8835, + "step": 3699 + }, + { + "epoch": 0.57, + "grad_norm": 2.770086107774786, + "learning_rate": 8.34440371902261e-06, + "loss": 0.8634, + "step": 3700 + }, + { + "epoch": 0.57, + "grad_norm": 2.4079058389435675, + "learning_rate": 8.33951479849799e-06, + "loss": 0.7542, + "step": 3701 + }, + { + "epoch": 0.57, + "grad_norm": 2.848969415151501, + "learning_rate": 8.334626286074068e-06, + "loss": 0.8225, + "step": 3702 + }, + { + "epoch": 0.57, + "grad_norm": 2.7592930972688183, + "learning_rate": 8.329738182952294e-06, + "loss": 0.8484, + "step": 3703 + }, + { + "epoch": 0.57, + "grad_norm": 2.6074480161638633, + "learning_rate": 8.32485049033403e-06, + "loss": 0.7946, + "step": 3704 + }, + { + "epoch": 0.57, + "grad_norm": 2.7716119180727934, + "learning_rate": 8.319963209420538e-06, + "loss": 0.944, + "step": 3705 + }, + { + "epoch": 0.57, + "grad_norm": 2.8029282681731065, + "learning_rate": 8.315076341412966e-06, + "loss": 0.7907, + "step": 3706 + }, + { + "epoch": 0.57, + "grad_norm": 2.6892629916864452, + "learning_rate": 8.310189887512376e-06, + "loss": 0.9146, + "step": 3707 + }, + { + "epoch": 0.57, + "grad_norm": 2.6734084490871024, + "learning_rate": 8.305303848919721e-06, + "loss": 0.7844, + "step": 3708 + }, + { + "epoch": 0.57, + "grad_norm": 2.539063798010986, + "learning_rate": 8.300418226835845e-06, + "loss": 0.8068, + "step": 3709 + }, + { + "epoch": 0.57, + "grad_norm": 3.9846429225063944, + "learning_rate": 8.295533022461509e-06, + "loss": 0.9278, + "step": 3710 + }, + { + "epoch": 0.57, + "grad_norm": 2.6602592369524305, + "learning_rate": 8.29064823699735e-06, + "loss": 0.8223, + "step": 3711 + }, + { + "epoch": 0.57, + "grad_norm": 2.416254247890502, + "learning_rate": 8.285763871643913e-06, + "loss": 0.9225, + "step": 3712 + }, + { + "epoch": 0.57, + "grad_norm": 2.9016898319075652, + "learning_rate": 8.28087992760164e-06, + "loss": 0.8301, + "step": 3713 + }, + { + "epoch": 0.57, + "grad_norm": 2.7287937427591302, + "learning_rate": 8.275996406070862e-06, + "loss": 0.8018, + "step": 3714 + }, + { + "epoch": 0.57, + "grad_norm": 2.5050043115033396, + "learning_rate": 8.27111330825182e-06, + "loss": 0.8171, + "step": 3715 + }, + { + "epoch": 0.57, + "grad_norm": 2.6532678696623444, + "learning_rate": 8.266230635344639e-06, + "loss": 0.8022, + "step": 3716 + }, + { + "epoch": 0.57, + "grad_norm": 2.561582659958214, + "learning_rate": 8.261348388549339e-06, + "loss": 0.8913, + "step": 3717 + }, + { + "epoch": 0.57, + "grad_norm": 3.179169065907958, + "learning_rate": 8.256466569065848e-06, + "loss": 0.8957, + "step": 3718 + }, + { + "epoch": 0.57, + "grad_norm": 2.7464665614377495, + "learning_rate": 8.251585178093973e-06, + "loss": 0.869, + "step": 3719 + }, + { + "epoch": 0.57, + "grad_norm": 2.5740469266074557, + "learning_rate": 8.24670421683342e-06, + "loss": 0.7636, + "step": 3720 + }, + { + "epoch": 0.57, + "grad_norm": 2.625686248280815, + "learning_rate": 8.241823686483803e-06, + "loss": 0.9046, + "step": 3721 + }, + { + "epoch": 0.57, + "grad_norm": 2.662924210522219, + "learning_rate": 8.236943588244614e-06, + "loss": 0.7854, + "step": 3722 + }, + { + "epoch": 0.57, + "grad_norm": 3.002836458193394, + "learning_rate": 8.232063923315241e-06, + "loss": 0.7868, + "step": 3723 + }, + { + "epoch": 0.57, + "grad_norm": 3.045339883343718, + "learning_rate": 8.227184692894975e-06, + "loss": 0.7397, + "step": 3724 + }, + { + "epoch": 0.57, + "grad_norm": 2.628061506312594, + "learning_rate": 8.222305898182988e-06, + "loss": 0.9184, + "step": 3725 + }, + { + "epoch": 0.57, + "grad_norm": 2.741478080492712, + "learning_rate": 8.217427540378356e-06, + "loss": 0.7048, + "step": 3726 + }, + { + "epoch": 0.57, + "grad_norm": 2.432295343245831, + "learning_rate": 8.212549620680042e-06, + "loss": 0.8698, + "step": 3727 + }, + { + "epoch": 0.57, + "grad_norm": 2.6309359477330854, + "learning_rate": 8.207672140286897e-06, + "loss": 0.801, + "step": 3728 + }, + { + "epoch": 0.57, + "grad_norm": 2.5958707826515677, + "learning_rate": 8.202795100397675e-06, + "loss": 0.8624, + "step": 3729 + }, + { + "epoch": 0.57, + "grad_norm": 2.463907049171007, + "learning_rate": 8.197918502211012e-06, + "loss": 0.7411, + "step": 3730 + }, + { + "epoch": 0.57, + "grad_norm": 2.7184611770503855, + "learning_rate": 8.193042346925436e-06, + "loss": 0.8144, + "step": 3731 + }, + { + "epoch": 0.57, + "grad_norm": 2.597451160371377, + "learning_rate": 8.188166635739378e-06, + "loss": 0.8072, + "step": 3732 + }, + { + "epoch": 0.57, + "grad_norm": 2.4920600266531894, + "learning_rate": 8.183291369851147e-06, + "loss": 0.7994, + "step": 3733 + }, + { + "epoch": 0.57, + "grad_norm": 2.836702439590227, + "learning_rate": 8.178416550458942e-06, + "loss": 0.8462, + "step": 3734 + }, + { + "epoch": 0.57, + "grad_norm": 2.4212101587552, + "learning_rate": 8.173542178760863e-06, + "loss": 0.853, + "step": 3735 + }, + { + "epoch": 0.57, + "grad_norm": 2.477224609844448, + "learning_rate": 8.168668255954887e-06, + "loss": 0.8548, + "step": 3736 + }, + { + "epoch": 0.57, + "grad_norm": 2.78458703917749, + "learning_rate": 8.163794783238896e-06, + "loss": 0.8783, + "step": 3737 + }, + { + "epoch": 0.57, + "grad_norm": 4.066389736990901, + "learning_rate": 8.158921761810652e-06, + "loss": 0.9896, + "step": 3738 + }, + { + "epoch": 0.57, + "grad_norm": 2.6268166088866214, + "learning_rate": 8.154049192867799e-06, + "loss": 0.8734, + "step": 3739 + }, + { + "epoch": 0.57, + "grad_norm": 2.6637608901708685, + "learning_rate": 8.149177077607885e-06, + "loss": 0.9008, + "step": 3740 + }, + { + "epoch": 0.57, + "grad_norm": 2.8074823775308575, + "learning_rate": 8.144305417228338e-06, + "loss": 0.9087, + "step": 3741 + }, + { + "epoch": 0.57, + "grad_norm": 2.5991639822857464, + "learning_rate": 8.13943421292647e-06, + "loss": 0.8284, + "step": 3742 + }, + { + "epoch": 0.57, + "grad_norm": 2.3865845077404044, + "learning_rate": 8.134563465899494e-06, + "loss": 0.5926, + "step": 3743 + }, + { + "epoch": 0.57, + "grad_norm": 3.0143104439623296, + "learning_rate": 8.129693177344501e-06, + "loss": 0.7634, + "step": 3744 + }, + { + "epoch": 0.57, + "grad_norm": 3.311530155343317, + "learning_rate": 8.124823348458469e-06, + "loss": 0.7781, + "step": 3745 + }, + { + "epoch": 0.57, + "grad_norm": 2.8112785294356226, + "learning_rate": 8.119953980438268e-06, + "loss": 0.8852, + "step": 3746 + }, + { + "epoch": 0.57, + "grad_norm": 4.7930743638010975, + "learning_rate": 8.115085074480646e-06, + "loss": 0.9101, + "step": 3747 + }, + { + "epoch": 0.57, + "grad_norm": 2.4819200007879494, + "learning_rate": 8.110216631782252e-06, + "loss": 0.7507, + "step": 3748 + }, + { + "epoch": 0.57, + "grad_norm": 2.5773649983789944, + "learning_rate": 8.10534865353961e-06, + "loss": 0.9403, + "step": 3749 + }, + { + "epoch": 0.57, + "grad_norm": 2.9531625641283825, + "learning_rate": 8.10048114094913e-06, + "loss": 0.8998, + "step": 3750 + }, + { + "epoch": 0.57, + "grad_norm": 2.3980166625417954, + "learning_rate": 8.095614095207114e-06, + "loss": 0.8242, + "step": 3751 + }, + { + "epoch": 0.57, + "grad_norm": 2.6056645544160584, + "learning_rate": 8.09074751750974e-06, + "loss": 0.798, + "step": 3752 + }, + { + "epoch": 0.57, + "grad_norm": 2.463826485778903, + "learning_rate": 8.085881409053077e-06, + "loss": 0.7636, + "step": 3753 + }, + { + "epoch": 0.57, + "grad_norm": 2.9734517602914123, + "learning_rate": 8.081015771033084e-06, + "loss": 0.6716, + "step": 3754 + }, + { + "epoch": 0.57, + "grad_norm": 2.6689993987748344, + "learning_rate": 8.076150604645596e-06, + "loss": 0.7986, + "step": 3755 + }, + { + "epoch": 0.57, + "grad_norm": 3.8901860718142007, + "learning_rate": 8.07128591108633e-06, + "loss": 0.9522, + "step": 3756 + }, + { + "epoch": 0.58, + "grad_norm": 2.691387601662654, + "learning_rate": 8.066421691550895e-06, + "loss": 0.8825, + "step": 3757 + }, + { + "epoch": 0.58, + "grad_norm": 2.521901884415322, + "learning_rate": 8.061557947234776e-06, + "loss": 0.8862, + "step": 3758 + }, + { + "epoch": 0.58, + "grad_norm": 2.492459607271112, + "learning_rate": 8.056694679333352e-06, + "loss": 0.7939, + "step": 3759 + }, + { + "epoch": 0.58, + "grad_norm": 2.661612562114557, + "learning_rate": 8.051831889041874e-06, + "loss": 0.9144, + "step": 3760 + }, + { + "epoch": 0.58, + "grad_norm": 2.706362129998038, + "learning_rate": 8.046969577555476e-06, + "loss": 0.8429, + "step": 3761 + }, + { + "epoch": 0.58, + "grad_norm": 2.472977941116828, + "learning_rate": 8.042107746069186e-06, + "loss": 0.8602, + "step": 3762 + }, + { + "epoch": 0.58, + "grad_norm": 2.754451211879156, + "learning_rate": 8.037246395777899e-06, + "loss": 0.9238, + "step": 3763 + }, + { + "epoch": 0.58, + "grad_norm": 2.481373688892135, + "learning_rate": 8.032385527876395e-06, + "loss": 0.8886, + "step": 3764 + }, + { + "epoch": 0.58, + "grad_norm": 2.5611895981930513, + "learning_rate": 8.02752514355935e-06, + "loss": 0.8657, + "step": 3765 + }, + { + "epoch": 0.58, + "grad_norm": 2.8296881650687014, + "learning_rate": 8.022665244021305e-06, + "loss": 0.8609, + "step": 3766 + }, + { + "epoch": 0.58, + "grad_norm": 2.661495351257953, + "learning_rate": 8.017805830456682e-06, + "loss": 0.9261, + "step": 3767 + }, + { + "epoch": 0.58, + "grad_norm": 2.661809405239875, + "learning_rate": 8.012946904059798e-06, + "loss": 0.7899, + "step": 3768 + }, + { + "epoch": 0.58, + "grad_norm": 2.558394657590361, + "learning_rate": 8.00808846602483e-06, + "loss": 0.8469, + "step": 3769 + }, + { + "epoch": 0.58, + "grad_norm": 2.8327158347851484, + "learning_rate": 8.003230517545857e-06, + "loss": 0.7845, + "step": 3770 + }, + { + "epoch": 0.58, + "grad_norm": 2.4525659365371824, + "learning_rate": 7.998373059816822e-06, + "loss": 0.7724, + "step": 3771 + }, + { + "epoch": 0.58, + "grad_norm": 2.603495304223118, + "learning_rate": 7.993516094031548e-06, + "loss": 0.9337, + "step": 3772 + }, + { + "epoch": 0.58, + "grad_norm": 2.6672418683358936, + "learning_rate": 7.988659621383746e-06, + "loss": 0.842, + "step": 3773 + }, + { + "epoch": 0.58, + "grad_norm": 2.877866056470712, + "learning_rate": 7.983803643067e-06, + "loss": 0.8995, + "step": 3774 + }, + { + "epoch": 0.58, + "grad_norm": 2.6378115325206313, + "learning_rate": 7.978948160274767e-06, + "loss": 0.8989, + "step": 3775 + }, + { + "epoch": 0.58, + "grad_norm": 2.9815579099601783, + "learning_rate": 7.974093174200397e-06, + "loss": 0.8495, + "step": 3776 + }, + { + "epoch": 0.58, + "grad_norm": 3.132213870518248, + "learning_rate": 7.96923868603711e-06, + "loss": 0.8843, + "step": 3777 + }, + { + "epoch": 0.58, + "grad_norm": 2.412788545789461, + "learning_rate": 7.964384696977992e-06, + "loss": 0.7961, + "step": 3778 + }, + { + "epoch": 0.58, + "grad_norm": 2.370076785003066, + "learning_rate": 7.959531208216028e-06, + "loss": 0.7651, + "step": 3779 + }, + { + "epoch": 0.58, + "grad_norm": 2.5746154436580557, + "learning_rate": 7.954678220944064e-06, + "loss": 0.878, + "step": 3780 + }, + { + "epoch": 0.58, + "grad_norm": 2.6759180343235527, + "learning_rate": 7.949825736354833e-06, + "loss": 0.8155, + "step": 3781 + }, + { + "epoch": 0.58, + "grad_norm": 2.6891425243501357, + "learning_rate": 7.944973755640937e-06, + "loss": 0.9401, + "step": 3782 + }, + { + "epoch": 0.58, + "grad_norm": 2.516111779906683, + "learning_rate": 7.940122279994853e-06, + "loss": 0.9423, + "step": 3783 + }, + { + "epoch": 0.58, + "grad_norm": 2.7621339373286307, + "learning_rate": 7.935271310608946e-06, + "loss": 0.9382, + "step": 3784 + }, + { + "epoch": 0.58, + "grad_norm": 2.619415577360085, + "learning_rate": 7.930420848675442e-06, + "loss": 0.8197, + "step": 3785 + }, + { + "epoch": 0.58, + "grad_norm": 4.245032498999451, + "learning_rate": 7.925570895386443e-06, + "loss": 0.9284, + "step": 3786 + }, + { + "epoch": 0.58, + "grad_norm": 2.700494938164556, + "learning_rate": 7.920721451933946e-06, + "loss": 0.952, + "step": 3787 + }, + { + "epoch": 0.58, + "grad_norm": 2.534211144265974, + "learning_rate": 7.915872519509798e-06, + "loss": 0.9005, + "step": 3788 + }, + { + "epoch": 0.58, + "grad_norm": 2.960644175712867, + "learning_rate": 7.911024099305733e-06, + "loss": 0.8329, + "step": 3789 + }, + { + "epoch": 0.58, + "grad_norm": 2.742678404898385, + "learning_rate": 7.90617619251336e-06, + "loss": 0.8715, + "step": 3790 + }, + { + "epoch": 0.58, + "grad_norm": 2.627606307215524, + "learning_rate": 7.901328800324149e-06, + "loss": 0.7791, + "step": 3791 + }, + { + "epoch": 0.58, + "grad_norm": 2.4158599573343573, + "learning_rate": 7.896481923929464e-06, + "loss": 0.7315, + "step": 3792 + }, + { + "epoch": 0.58, + "grad_norm": 2.4810470882863247, + "learning_rate": 7.891635564520527e-06, + "loss": 0.7382, + "step": 3793 + }, + { + "epoch": 0.58, + "grad_norm": 2.510165859368989, + "learning_rate": 7.886789723288436e-06, + "loss": 0.7769, + "step": 3794 + }, + { + "epoch": 0.58, + "grad_norm": 2.5818045931789033, + "learning_rate": 7.881944401424164e-06, + "loss": 0.7249, + "step": 3795 + }, + { + "epoch": 0.58, + "grad_norm": 2.7965746120569066, + "learning_rate": 7.877099600118556e-06, + "loss": 0.8596, + "step": 3796 + }, + { + "epoch": 0.58, + "grad_norm": 2.868160776338824, + "learning_rate": 7.872255320562324e-06, + "loss": 0.8343, + "step": 3797 + }, + { + "epoch": 0.58, + "grad_norm": 2.8908538497696297, + "learning_rate": 7.867411563946061e-06, + "loss": 0.9137, + "step": 3798 + }, + { + "epoch": 0.58, + "grad_norm": 2.675812256480429, + "learning_rate": 7.862568331460224e-06, + "loss": 0.8901, + "step": 3799 + }, + { + "epoch": 0.58, + "grad_norm": 2.7616648182460937, + "learning_rate": 7.857725624295147e-06, + "loss": 0.8093, + "step": 3800 + }, + { + "epoch": 0.58, + "grad_norm": 2.7560563302006402, + "learning_rate": 7.85288344364103e-06, + "loss": 0.7587, + "step": 3801 + }, + { + "epoch": 0.58, + "grad_norm": 2.902987278257522, + "learning_rate": 7.848041790687938e-06, + "loss": 0.809, + "step": 3802 + }, + { + "epoch": 0.58, + "grad_norm": 3.9658013555750475, + "learning_rate": 7.843200666625827e-06, + "loss": 0.9063, + "step": 3803 + }, + { + "epoch": 0.58, + "grad_norm": 2.7647583871656245, + "learning_rate": 7.838360072644502e-06, + "loss": 0.7302, + "step": 3804 + }, + { + "epoch": 0.58, + "grad_norm": 2.6531480332408846, + "learning_rate": 7.833520009933642e-06, + "loss": 0.8479, + "step": 3805 + }, + { + "epoch": 0.58, + "grad_norm": 2.4018411002880904, + "learning_rate": 7.828680479682804e-06, + "loss": 0.7875, + "step": 3806 + }, + { + "epoch": 0.58, + "grad_norm": 2.6037494271984802, + "learning_rate": 7.823841483081408e-06, + "loss": 0.8453, + "step": 3807 + }, + { + "epoch": 0.58, + "grad_norm": 2.5441498531562448, + "learning_rate": 7.81900302131874e-06, + "loss": 0.8459, + "step": 3808 + }, + { + "epoch": 0.58, + "grad_norm": 2.4488141970955777, + "learning_rate": 7.814165095583963e-06, + "loss": 0.7218, + "step": 3809 + }, + { + "epoch": 0.58, + "grad_norm": 2.697496931767143, + "learning_rate": 7.809327707066098e-06, + "loss": 0.875, + "step": 3810 + }, + { + "epoch": 0.58, + "grad_norm": 2.7787096557710593, + "learning_rate": 7.80449085695405e-06, + "loss": 0.9643, + "step": 3811 + }, + { + "epoch": 0.58, + "grad_norm": 2.997625552294042, + "learning_rate": 7.799654546436572e-06, + "loss": 0.9041, + "step": 3812 + }, + { + "epoch": 0.58, + "grad_norm": 2.5589065401606454, + "learning_rate": 7.79481877670229e-06, + "loss": 0.7814, + "step": 3813 + }, + { + "epoch": 0.58, + "grad_norm": 2.7670004774038652, + "learning_rate": 7.78998354893971e-06, + "loss": 0.7938, + "step": 3814 + }, + { + "epoch": 0.58, + "grad_norm": 2.5168899500141206, + "learning_rate": 7.785148864337192e-06, + "loss": 0.8931, + "step": 3815 + }, + { + "epoch": 0.58, + "grad_norm": 2.812391141474085, + "learning_rate": 7.780314724082963e-06, + "loss": 0.9063, + "step": 3816 + }, + { + "epoch": 0.58, + "grad_norm": 2.719624742077618, + "learning_rate": 7.775481129365126e-06, + "loss": 0.7571, + "step": 3817 + }, + { + "epoch": 0.58, + "grad_norm": 2.641408766865852, + "learning_rate": 7.770648081371636e-06, + "loss": 0.8086, + "step": 3818 + }, + { + "epoch": 0.58, + "grad_norm": 2.5413481785443994, + "learning_rate": 7.76581558129032e-06, + "loss": 0.9068, + "step": 3819 + }, + { + "epoch": 0.58, + "grad_norm": 2.567326515183025, + "learning_rate": 7.760983630308877e-06, + "loss": 0.8058, + "step": 3820 + }, + { + "epoch": 0.58, + "grad_norm": 2.895640457109586, + "learning_rate": 7.756152229614858e-06, + "loss": 0.7857, + "step": 3821 + }, + { + "epoch": 0.59, + "grad_norm": 2.705430270171675, + "learning_rate": 7.751321380395696e-06, + "loss": 0.7997, + "step": 3822 + }, + { + "epoch": 0.59, + "grad_norm": 2.3761036254213566, + "learning_rate": 7.74649108383867e-06, + "loss": 0.7441, + "step": 3823 + }, + { + "epoch": 0.59, + "grad_norm": 2.7108377611394814, + "learning_rate": 7.741661341130931e-06, + "loss": 0.8042, + "step": 3824 + }, + { + "epoch": 0.59, + "grad_norm": 2.76401109437764, + "learning_rate": 7.736832153459501e-06, + "loss": 0.7993, + "step": 3825 + }, + { + "epoch": 0.59, + "grad_norm": 2.6007382840432243, + "learning_rate": 7.732003522011255e-06, + "loss": 0.9539, + "step": 3826 + }, + { + "epoch": 0.59, + "grad_norm": 2.8750454249231185, + "learning_rate": 7.727175447972928e-06, + "loss": 0.7493, + "step": 3827 + }, + { + "epoch": 0.59, + "grad_norm": 2.7782634958747026, + "learning_rate": 7.72234793253114e-06, + "loss": 0.8482, + "step": 3828 + }, + { + "epoch": 0.59, + "grad_norm": 2.4242192072212174, + "learning_rate": 7.717520976872347e-06, + "loss": 0.7813, + "step": 3829 + }, + { + "epoch": 0.59, + "grad_norm": 2.8572436094490086, + "learning_rate": 7.712694582182885e-06, + "loss": 0.9205, + "step": 3830 + }, + { + "epoch": 0.59, + "grad_norm": 2.8229708482287035, + "learning_rate": 7.707868749648945e-06, + "loss": 0.8363, + "step": 3831 + }, + { + "epoch": 0.59, + "grad_norm": 2.5074469640257666, + "learning_rate": 7.703043480456576e-06, + "loss": 0.807, + "step": 3832 + }, + { + "epoch": 0.59, + "grad_norm": 2.6569233219377844, + "learning_rate": 7.698218775791704e-06, + "loss": 0.7093, + "step": 3833 + }, + { + "epoch": 0.59, + "grad_norm": 2.63041037066242, + "learning_rate": 7.693394636840102e-06, + "loss": 0.9002, + "step": 3834 + }, + { + "epoch": 0.59, + "grad_norm": 4.1206808271085595, + "learning_rate": 7.688571064787404e-06, + "loss": 0.9588, + "step": 3835 + }, + { + "epoch": 0.59, + "grad_norm": 2.6908652608398955, + "learning_rate": 7.683748060819116e-06, + "loss": 0.853, + "step": 3836 + }, + { + "epoch": 0.59, + "grad_norm": 2.64015780770857, + "learning_rate": 7.67892562612059e-06, + "loss": 0.7764, + "step": 3837 + }, + { + "epoch": 0.59, + "grad_norm": 3.161301991487249, + "learning_rate": 7.674103761877045e-06, + "loss": 0.8251, + "step": 3838 + }, + { + "epoch": 0.59, + "grad_norm": 2.627704700289266, + "learning_rate": 7.669282469273566e-06, + "loss": 0.8344, + "step": 3839 + }, + { + "epoch": 0.59, + "grad_norm": 2.7454896666097164, + "learning_rate": 7.664461749495092e-06, + "loss": 0.8359, + "step": 3840 + }, + { + "epoch": 0.59, + "grad_norm": 2.714463524406809, + "learning_rate": 7.659641603726413e-06, + "loss": 0.748, + "step": 3841 + }, + { + "epoch": 0.59, + "grad_norm": 2.788609811735884, + "learning_rate": 7.654822033152192e-06, + "loss": 0.8835, + "step": 3842 + }, + { + "epoch": 0.59, + "grad_norm": 2.72491491081617, + "learning_rate": 7.650003038956939e-06, + "loss": 0.9077, + "step": 3843 + }, + { + "epoch": 0.59, + "grad_norm": 2.6085472328025125, + "learning_rate": 7.645184622325034e-06, + "loss": 0.872, + "step": 3844 + }, + { + "epoch": 0.59, + "grad_norm": 2.813389893202622, + "learning_rate": 7.640366784440707e-06, + "loss": 0.8672, + "step": 3845 + }, + { + "epoch": 0.59, + "grad_norm": 2.326141636602756, + "learning_rate": 7.635549526488042e-06, + "loss": 0.7961, + "step": 3846 + }, + { + "epoch": 0.59, + "grad_norm": 2.6334060331516205, + "learning_rate": 7.630732849650993e-06, + "loss": 0.8555, + "step": 3847 + }, + { + "epoch": 0.59, + "grad_norm": 2.5144279864015444, + "learning_rate": 7.625916755113361e-06, + "loss": 0.8252, + "step": 3848 + }, + { + "epoch": 0.59, + "grad_norm": 2.674159071621899, + "learning_rate": 7.621101244058803e-06, + "loss": 0.8819, + "step": 3849 + }, + { + "epoch": 0.59, + "grad_norm": 2.7674109736473715, + "learning_rate": 7.616286317670845e-06, + "loss": 0.7824, + "step": 3850 + }, + { + "epoch": 0.59, + "grad_norm": 2.7562266107062277, + "learning_rate": 7.611471977132855e-06, + "loss": 0.888, + "step": 3851 + }, + { + "epoch": 0.59, + "grad_norm": 2.5461987223959257, + "learning_rate": 7.606658223628063e-06, + "loss": 0.8751, + "step": 3852 + }, + { + "epoch": 0.59, + "grad_norm": 2.4483598091432293, + "learning_rate": 7.601845058339557e-06, + "loss": 0.8321, + "step": 3853 + }, + { + "epoch": 0.59, + "grad_norm": 2.4859420768204363, + "learning_rate": 7.5970324824502736e-06, + "loss": 0.8101, + "step": 3854 + }, + { + "epoch": 0.59, + "grad_norm": 3.020109170851338, + "learning_rate": 7.5922204971430165e-06, + "loss": 0.8826, + "step": 3855 + }, + { + "epoch": 0.59, + "grad_norm": 2.692579456461489, + "learning_rate": 7.587409103600433e-06, + "loss": 0.7842, + "step": 3856 + }, + { + "epoch": 0.59, + "grad_norm": 2.8599508800622364, + "learning_rate": 7.582598303005026e-06, + "loss": 0.8273, + "step": 3857 + }, + { + "epoch": 0.59, + "grad_norm": 2.3605072679986843, + "learning_rate": 7.577788096539158e-06, + "loss": 0.8066, + "step": 3858 + }, + { + "epoch": 0.59, + "grad_norm": 2.609277078943832, + "learning_rate": 7.572978485385044e-06, + "loss": 0.8033, + "step": 3859 + }, + { + "epoch": 0.59, + "grad_norm": 4.195124199704877, + "learning_rate": 7.5681694707247445e-06, + "loss": 0.9971, + "step": 3860 + }, + { + "epoch": 0.59, + "grad_norm": 2.482502952468572, + "learning_rate": 7.563361053740191e-06, + "loss": 0.7986, + "step": 3861 + }, + { + "epoch": 0.59, + "grad_norm": 2.574590700574185, + "learning_rate": 7.558553235613151e-06, + "loss": 0.825, + "step": 3862 + }, + { + "epoch": 0.59, + "grad_norm": 2.5775304906123413, + "learning_rate": 7.553746017525253e-06, + "loss": 0.8978, + "step": 3863 + }, + { + "epoch": 0.59, + "grad_norm": 2.9492898868537, + "learning_rate": 7.548939400657977e-06, + "loss": 0.8328, + "step": 3864 + }, + { + "epoch": 0.59, + "grad_norm": 2.71613382329485, + "learning_rate": 7.544133386192649e-06, + "loss": 0.7306, + "step": 3865 + }, + { + "epoch": 0.59, + "grad_norm": 2.7268270443678637, + "learning_rate": 7.539327975310463e-06, + "loss": 0.8832, + "step": 3866 + }, + { + "epoch": 0.59, + "grad_norm": 2.5607710626551627, + "learning_rate": 7.534523169192447e-06, + "loss": 0.7932, + "step": 3867 + }, + { + "epoch": 0.59, + "grad_norm": 2.699562397322179, + "learning_rate": 7.529718969019488e-06, + "loss": 0.7144, + "step": 3868 + }, + { + "epoch": 0.59, + "grad_norm": 2.570287154377048, + "learning_rate": 7.524915375972327e-06, + "loss": 0.9042, + "step": 3869 + }, + { + "epoch": 0.59, + "grad_norm": 2.7691816343881577, + "learning_rate": 7.520112391231549e-06, + "loss": 0.9343, + "step": 3870 + }, + { + "epoch": 0.59, + "grad_norm": 2.6067248356622845, + "learning_rate": 7.515310015977591e-06, + "loss": 0.7428, + "step": 3871 + }, + { + "epoch": 0.59, + "grad_norm": 2.6809416644963053, + "learning_rate": 7.510508251390749e-06, + "loss": 0.7999, + "step": 3872 + }, + { + "epoch": 0.59, + "grad_norm": 2.6413952753527137, + "learning_rate": 7.505707098651157e-06, + "loss": 0.7502, + "step": 3873 + }, + { + "epoch": 0.59, + "grad_norm": 2.3759380993681276, + "learning_rate": 7.500906558938806e-06, + "loss": 0.7932, + "step": 3874 + }, + { + "epoch": 0.59, + "grad_norm": 2.605686384220005, + "learning_rate": 7.496106633433535e-06, + "loss": 0.9078, + "step": 3875 + }, + { + "epoch": 0.59, + "grad_norm": 2.4652523513354687, + "learning_rate": 7.4913073233150236e-06, + "loss": 0.8745, + "step": 3876 + }, + { + "epoch": 0.59, + "grad_norm": 2.7138063748947827, + "learning_rate": 7.486508629762818e-06, + "loss": 0.848, + "step": 3877 + }, + { + "epoch": 0.59, + "grad_norm": 2.508036618474075, + "learning_rate": 7.481710553956296e-06, + "loss": 0.7674, + "step": 3878 + }, + { + "epoch": 0.59, + "grad_norm": 2.770740350268557, + "learning_rate": 7.476913097074692e-06, + "loss": 0.8912, + "step": 3879 + }, + { + "epoch": 0.59, + "grad_norm": 2.6922695290496668, + "learning_rate": 7.472116260297087e-06, + "loss": 0.8769, + "step": 3880 + }, + { + "epoch": 0.59, + "grad_norm": 2.57309161479025, + "learning_rate": 7.467320044802409e-06, + "loss": 0.796, + "step": 3881 + }, + { + "epoch": 0.59, + "grad_norm": 2.512958056132519, + "learning_rate": 7.462524451769426e-06, + "loss": 0.9029, + "step": 3882 + }, + { + "epoch": 0.59, + "grad_norm": 2.6277879501170536, + "learning_rate": 7.457729482376772e-06, + "loss": 0.7873, + "step": 3883 + }, + { + "epoch": 0.59, + "grad_norm": 2.765976568443896, + "learning_rate": 7.4529351378029094e-06, + "loss": 0.9712, + "step": 3884 + }, + { + "epoch": 0.59, + "grad_norm": 2.4778448264754562, + "learning_rate": 7.4481414192261556e-06, + "loss": 0.7683, + "step": 3885 + }, + { + "epoch": 0.59, + "grad_norm": 2.607179107319063, + "learning_rate": 7.443348327824673e-06, + "loss": 0.8287, + "step": 3886 + }, + { + "epoch": 0.59, + "grad_norm": 2.707889718434232, + "learning_rate": 7.438555864776462e-06, + "loss": 0.808, + "step": 3887 + }, + { + "epoch": 0.6, + "grad_norm": 3.9849966740053584, + "learning_rate": 7.433764031259385e-06, + "loss": 0.9122, + "step": 3888 + }, + { + "epoch": 0.6, + "grad_norm": 3.112460889317166, + "learning_rate": 7.428972828451138e-06, + "loss": 0.7092, + "step": 3889 + }, + { + "epoch": 0.6, + "grad_norm": 2.7988184084042227, + "learning_rate": 7.424182257529258e-06, + "loss": 0.8475, + "step": 3890 + }, + { + "epoch": 0.6, + "grad_norm": 2.7428963787567104, + "learning_rate": 7.419392319671142e-06, + "loss": 0.7922, + "step": 3891 + }, + { + "epoch": 0.6, + "grad_norm": 4.100613249299371, + "learning_rate": 7.414603016054016e-06, + "loss": 0.968, + "step": 3892 + }, + { + "epoch": 0.6, + "grad_norm": 2.6502811402975506, + "learning_rate": 7.4098143478549555e-06, + "loss": 0.9379, + "step": 3893 + }, + { + "epoch": 0.6, + "grad_norm": 2.6669734031481216, + "learning_rate": 7.405026316250887e-06, + "loss": 0.7728, + "step": 3894 + }, + { + "epoch": 0.6, + "grad_norm": 2.606265333083008, + "learning_rate": 7.40023892241857e-06, + "loss": 0.7656, + "step": 3895 + }, + { + "epoch": 0.6, + "grad_norm": 2.4323996483300623, + "learning_rate": 7.395452167534615e-06, + "loss": 0.8158, + "step": 3896 + }, + { + "epoch": 0.6, + "grad_norm": 2.649985215785405, + "learning_rate": 7.39066605277547e-06, + "loss": 0.8632, + "step": 3897 + }, + { + "epoch": 0.6, + "grad_norm": 2.6220263882415966, + "learning_rate": 7.385880579317424e-06, + "loss": 0.7514, + "step": 3898 + }, + { + "epoch": 0.6, + "grad_norm": 2.9149752455005213, + "learning_rate": 7.381095748336618e-06, + "loss": 0.7993, + "step": 3899 + }, + { + "epoch": 0.6, + "grad_norm": 2.6720254849645264, + "learning_rate": 7.3763115610090284e-06, + "loss": 0.9737, + "step": 3900 + }, + { + "epoch": 0.6, + "grad_norm": 2.431364392785707, + "learning_rate": 7.371528018510472e-06, + "loss": 0.84, + "step": 3901 + }, + { + "epoch": 0.6, + "grad_norm": 2.6650297510276664, + "learning_rate": 7.3667451220166104e-06, + "loss": 0.791, + "step": 3902 + }, + { + "epoch": 0.6, + "grad_norm": 2.452822733776375, + "learning_rate": 7.361962872702947e-06, + "loss": 0.7925, + "step": 3903 + }, + { + "epoch": 0.6, + "grad_norm": 2.377872769290526, + "learning_rate": 7.357181271744818e-06, + "loss": 0.767, + "step": 3904 + }, + { + "epoch": 0.6, + "grad_norm": 2.7052139163110382, + "learning_rate": 7.352400320317417e-06, + "loss": 0.8582, + "step": 3905 + }, + { + "epoch": 0.6, + "grad_norm": 2.551578609697552, + "learning_rate": 7.347620019595758e-06, + "loss": 0.8198, + "step": 3906 + }, + { + "epoch": 0.6, + "grad_norm": 2.491058196271869, + "learning_rate": 7.342840370754714e-06, + "loss": 0.7494, + "step": 3907 + }, + { + "epoch": 0.6, + "grad_norm": 2.3744720682912193, + "learning_rate": 7.338061374968984e-06, + "loss": 0.7923, + "step": 3908 + }, + { + "epoch": 0.6, + "grad_norm": 2.498751007169318, + "learning_rate": 7.333283033413106e-06, + "loss": 0.863, + "step": 3909 + }, + { + "epoch": 0.6, + "grad_norm": 2.6822308660893808, + "learning_rate": 7.328505347261471e-06, + "loss": 0.8063, + "step": 3910 + }, + { + "epoch": 0.6, + "grad_norm": 2.701553992183195, + "learning_rate": 7.323728317688296e-06, + "loss": 0.7296, + "step": 3911 + }, + { + "epoch": 0.6, + "grad_norm": 2.6067250965264033, + "learning_rate": 7.318951945867643e-06, + "loss": 0.8595, + "step": 3912 + }, + { + "epoch": 0.6, + "grad_norm": 2.646105279440585, + "learning_rate": 7.314176232973409e-06, + "loss": 0.8204, + "step": 3913 + }, + { + "epoch": 0.6, + "grad_norm": 2.472573274215426, + "learning_rate": 7.30940118017933e-06, + "loss": 0.7515, + "step": 3914 + }, + { + "epoch": 0.6, + "grad_norm": 2.889346213895278, + "learning_rate": 7.304626788658976e-06, + "loss": 0.8521, + "step": 3915 + }, + { + "epoch": 0.6, + "grad_norm": 2.8143134429258736, + "learning_rate": 7.299853059585768e-06, + "loss": 0.8683, + "step": 3916 + }, + { + "epoch": 0.6, + "grad_norm": 3.359919074066636, + "learning_rate": 7.295079994132947e-06, + "loss": 0.9095, + "step": 3917 + }, + { + "epoch": 0.6, + "grad_norm": 2.3986782808370823, + "learning_rate": 7.290307593473603e-06, + "loss": 0.7398, + "step": 3918 + }, + { + "epoch": 0.6, + "grad_norm": 2.7038658777691236, + "learning_rate": 7.2855358587806554e-06, + "loss": 0.8169, + "step": 3919 + }, + { + "epoch": 0.6, + "grad_norm": 2.5069386708867407, + "learning_rate": 7.28076479122686e-06, + "loss": 0.8501, + "step": 3920 + }, + { + "epoch": 0.6, + "grad_norm": 2.517053657035505, + "learning_rate": 7.27599439198482e-06, + "loss": 0.7897, + "step": 3921 + }, + { + "epoch": 0.6, + "grad_norm": 2.8360269490317953, + "learning_rate": 7.2712246622269615e-06, + "loss": 0.8144, + "step": 3922 + }, + { + "epoch": 0.6, + "grad_norm": 2.9258561023362706, + "learning_rate": 7.266455603125546e-06, + "loss": 0.8818, + "step": 3923 + }, + { + "epoch": 0.6, + "grad_norm": 2.6528039617127424, + "learning_rate": 7.261687215852681e-06, + "loss": 0.7645, + "step": 3924 + }, + { + "epoch": 0.6, + "grad_norm": 2.935343617366732, + "learning_rate": 7.2569195015803e-06, + "loss": 0.8296, + "step": 3925 + }, + { + "epoch": 0.6, + "grad_norm": 2.4898009977984827, + "learning_rate": 7.252152461480166e-06, + "loss": 0.7976, + "step": 3926 + }, + { + "epoch": 0.6, + "grad_norm": 4.576448374761486, + "learning_rate": 7.247386096723899e-06, + "loss": 0.9198, + "step": 3927 + }, + { + "epoch": 0.6, + "grad_norm": 2.387055416933314, + "learning_rate": 7.2426204084829225e-06, + "loss": 0.7474, + "step": 3928 + }, + { + "epoch": 0.6, + "grad_norm": 2.6562445306220006, + "learning_rate": 7.237855397928522e-06, + "loss": 0.939, + "step": 3929 + }, + { + "epoch": 0.6, + "grad_norm": 2.4856046326947054, + "learning_rate": 7.233091066231796e-06, + "loss": 0.7783, + "step": 3930 + }, + { + "epoch": 0.6, + "grad_norm": 2.5341068263640323, + "learning_rate": 7.228327414563683e-06, + "loss": 0.8302, + "step": 3931 + }, + { + "epoch": 0.6, + "grad_norm": 2.6776321393981934, + "learning_rate": 7.223564444094959e-06, + "loss": 0.7617, + "step": 3932 + }, + { + "epoch": 0.6, + "grad_norm": 2.6597687482328407, + "learning_rate": 7.218802155996227e-06, + "loss": 0.8918, + "step": 3933 + }, + { + "epoch": 0.6, + "grad_norm": 3.0271167434689525, + "learning_rate": 7.214040551437919e-06, + "loss": 0.9506, + "step": 3934 + }, + { + "epoch": 0.6, + "grad_norm": 2.5823896739606136, + "learning_rate": 7.209279631590313e-06, + "loss": 0.7003, + "step": 3935 + }, + { + "epoch": 0.6, + "grad_norm": 2.748517455966857, + "learning_rate": 7.204519397623502e-06, + "loss": 0.8306, + "step": 3936 + }, + { + "epoch": 0.6, + "grad_norm": 2.612978903513092, + "learning_rate": 7.199759850707424e-06, + "loss": 0.7482, + "step": 3937 + }, + { + "epoch": 0.6, + "grad_norm": 2.6809929088899294, + "learning_rate": 7.195000992011838e-06, + "loss": 0.7985, + "step": 3938 + }, + { + "epoch": 0.6, + "grad_norm": 2.5825987471777534, + "learning_rate": 7.190242822706335e-06, + "loss": 0.7688, + "step": 3939 + }, + { + "epoch": 0.6, + "grad_norm": 2.5087445915228046, + "learning_rate": 7.185485343960348e-06, + "loss": 0.7756, + "step": 3940 + }, + { + "epoch": 0.6, + "grad_norm": 2.6229532585798303, + "learning_rate": 7.180728556943128e-06, + "loss": 0.8737, + "step": 3941 + }, + { + "epoch": 0.6, + "grad_norm": 2.8277366555711994, + "learning_rate": 7.175972462823757e-06, + "loss": 0.739, + "step": 3942 + }, + { + "epoch": 0.6, + "grad_norm": 3.6299431073851434, + "learning_rate": 7.171217062771153e-06, + "loss": 0.899, + "step": 3943 + }, + { + "epoch": 0.6, + "grad_norm": 2.7129768358506614, + "learning_rate": 7.16646235795406e-06, + "loss": 0.8835, + "step": 3944 + }, + { + "epoch": 0.6, + "grad_norm": 2.5883181574655696, + "learning_rate": 7.161708349541044e-06, + "loss": 0.8523, + "step": 3945 + }, + { + "epoch": 0.6, + "grad_norm": 2.5426023684011723, + "learning_rate": 7.156955038700519e-06, + "loss": 0.7801, + "step": 3946 + }, + { + "epoch": 0.6, + "grad_norm": 3.0473689365131404, + "learning_rate": 7.1522024266007065e-06, + "loss": 0.793, + "step": 3947 + }, + { + "epoch": 0.6, + "grad_norm": 2.7815288430966674, + "learning_rate": 7.14745051440967e-06, + "loss": 0.9378, + "step": 3948 + }, + { + "epoch": 0.6, + "grad_norm": 2.978236151092738, + "learning_rate": 7.1426993032952926e-06, + "loss": 0.9328, + "step": 3949 + }, + { + "epoch": 0.6, + "grad_norm": 2.7000413945650714, + "learning_rate": 7.137948794425289e-06, + "loss": 0.8611, + "step": 3950 + }, + { + "epoch": 0.6, + "grad_norm": 2.851811931723268, + "learning_rate": 7.133198988967205e-06, + "loss": 0.835, + "step": 3951 + }, + { + "epoch": 0.6, + "grad_norm": 2.785889191122522, + "learning_rate": 7.128449888088409e-06, + "loss": 0.7782, + "step": 3952 + }, + { + "epoch": 0.61, + "grad_norm": 2.7193290149705818, + "learning_rate": 7.12370149295609e-06, + "loss": 0.8046, + "step": 3953 + }, + { + "epoch": 0.61, + "grad_norm": 2.643197344893586, + "learning_rate": 7.118953804737278e-06, + "loss": 0.888, + "step": 3954 + }, + { + "epoch": 0.61, + "grad_norm": 2.64274182536652, + "learning_rate": 7.1142068245988216e-06, + "loss": 0.8475, + "step": 3955 + }, + { + "epoch": 0.61, + "grad_norm": 2.509853365664716, + "learning_rate": 7.109460553707386e-06, + "loss": 0.8149, + "step": 3956 + }, + { + "epoch": 0.61, + "grad_norm": 2.403526176543108, + "learning_rate": 7.104714993229482e-06, + "loss": 0.6655, + "step": 3957 + }, + { + "epoch": 0.61, + "grad_norm": 2.7434615498012906, + "learning_rate": 7.0999701443314295e-06, + "loss": 0.7385, + "step": 3958 + }, + { + "epoch": 0.61, + "grad_norm": 2.697601809689983, + "learning_rate": 7.095226008179384e-06, + "loss": 0.7553, + "step": 3959 + }, + { + "epoch": 0.61, + "grad_norm": 5.203386630427518, + "learning_rate": 7.090482585939318e-06, + "loss": 0.9734, + "step": 3960 + }, + { + "epoch": 0.61, + "grad_norm": 2.8165695481632227, + "learning_rate": 7.085739878777027e-06, + "loss": 0.8349, + "step": 3961 + }, + { + "epoch": 0.61, + "grad_norm": 4.609275504845507, + "learning_rate": 7.080997887858145e-06, + "loss": 0.9396, + "step": 3962 + }, + { + "epoch": 0.61, + "grad_norm": 2.8280813499025252, + "learning_rate": 7.076256614348116e-06, + "loss": 0.8813, + "step": 3963 + }, + { + "epoch": 0.61, + "grad_norm": 2.3852433919170135, + "learning_rate": 7.07151605941221e-06, + "loss": 0.741, + "step": 3964 + }, + { + "epoch": 0.61, + "grad_norm": 2.288825295359347, + "learning_rate": 7.066776224215526e-06, + "loss": 0.6835, + "step": 3965 + }, + { + "epoch": 0.61, + "grad_norm": 2.628680125563155, + "learning_rate": 7.06203710992298e-06, + "loss": 0.8354, + "step": 3966 + }, + { + "epoch": 0.61, + "grad_norm": 2.643593615500088, + "learning_rate": 7.05729871769931e-06, + "loss": 0.8535, + "step": 3967 + }, + { + "epoch": 0.61, + "grad_norm": 2.676690991946433, + "learning_rate": 7.052561048709089e-06, + "loss": 0.8382, + "step": 3968 + }, + { + "epoch": 0.61, + "grad_norm": 2.4738463303615092, + "learning_rate": 7.0478241041166964e-06, + "loss": 0.8537, + "step": 3969 + }, + { + "epoch": 0.61, + "grad_norm": 2.7570416706638268, + "learning_rate": 7.043087885086343e-06, + "loss": 0.8763, + "step": 3970 + }, + { + "epoch": 0.61, + "grad_norm": 2.927495685980187, + "learning_rate": 7.038352392782057e-06, + "loss": 0.7795, + "step": 3971 + }, + { + "epoch": 0.61, + "grad_norm": 2.6282299744327693, + "learning_rate": 7.033617628367688e-06, + "loss": 0.8111, + "step": 3972 + }, + { + "epoch": 0.61, + "grad_norm": 2.4653497111160387, + "learning_rate": 7.028883593006914e-06, + "loss": 0.8788, + "step": 3973 + }, + { + "epoch": 0.61, + "grad_norm": 2.521042858506162, + "learning_rate": 7.024150287863225e-06, + "loss": 0.8709, + "step": 3974 + }, + { + "epoch": 0.61, + "grad_norm": 2.931058675577534, + "learning_rate": 7.019417714099933e-06, + "loss": 0.9379, + "step": 3975 + }, + { + "epoch": 0.61, + "grad_norm": 2.8664809412726044, + "learning_rate": 7.014685872880175e-06, + "loss": 0.7604, + "step": 3976 + }, + { + "epoch": 0.61, + "grad_norm": 2.683922110588336, + "learning_rate": 7.009954765366902e-06, + "loss": 0.8586, + "step": 3977 + }, + { + "epoch": 0.61, + "grad_norm": 2.467799967600729, + "learning_rate": 7.005224392722886e-06, + "loss": 0.7699, + "step": 3978 + }, + { + "epoch": 0.61, + "grad_norm": 2.5448159972295774, + "learning_rate": 7.000494756110726e-06, + "loss": 0.8426, + "step": 3979 + }, + { + "epoch": 0.61, + "grad_norm": 2.6894278394301554, + "learning_rate": 6.995765856692829e-06, + "loss": 0.8512, + "step": 3980 + }, + { + "epoch": 0.61, + "grad_norm": 2.53178759299369, + "learning_rate": 6.991037695631429e-06, + "loss": 0.8591, + "step": 3981 + }, + { + "epoch": 0.61, + "grad_norm": 2.718043311800521, + "learning_rate": 6.986310274088574e-06, + "loss": 0.8791, + "step": 3982 + }, + { + "epoch": 0.61, + "grad_norm": 2.573133728510007, + "learning_rate": 6.981583593226129e-06, + "loss": 0.773, + "step": 3983 + }, + { + "epoch": 0.61, + "grad_norm": 2.3568678578730933, + "learning_rate": 6.976857654205784e-06, + "loss": 0.7653, + "step": 3984 + }, + { + "epoch": 0.61, + "grad_norm": 2.69196387885202, + "learning_rate": 6.972132458189041e-06, + "loss": 0.844, + "step": 3985 + }, + { + "epoch": 0.61, + "grad_norm": 2.619016705936952, + "learning_rate": 6.96740800633722e-06, + "loss": 0.8745, + "step": 3986 + }, + { + "epoch": 0.61, + "grad_norm": 2.5620443105698874, + "learning_rate": 6.962684299811458e-06, + "loss": 0.8514, + "step": 3987 + }, + { + "epoch": 0.61, + "grad_norm": 2.693222913880473, + "learning_rate": 6.957961339772712e-06, + "loss": 0.898, + "step": 3988 + }, + { + "epoch": 0.61, + "grad_norm": 2.8891906623178865, + "learning_rate": 6.953239127381747e-06, + "loss": 0.8755, + "step": 3989 + }, + { + "epoch": 0.61, + "grad_norm": 2.519827989426567, + "learning_rate": 6.948517663799159e-06, + "loss": 0.8378, + "step": 3990 + }, + { + "epoch": 0.61, + "grad_norm": 2.6792360398650534, + "learning_rate": 6.943796950185344e-06, + "loss": 0.727, + "step": 3991 + }, + { + "epoch": 0.61, + "grad_norm": 3.323976172597204, + "learning_rate": 6.939076987700528e-06, + "loss": 0.826, + "step": 3992 + }, + { + "epoch": 0.61, + "grad_norm": 2.8847629787119793, + "learning_rate": 6.934357777504741e-06, + "loss": 0.8186, + "step": 3993 + }, + { + "epoch": 0.61, + "grad_norm": 2.5969932789141064, + "learning_rate": 6.929639320757829e-06, + "loss": 0.7931, + "step": 3994 + }, + { + "epoch": 0.61, + "grad_norm": 3.013839101459728, + "learning_rate": 6.924921618619465e-06, + "loss": 0.8448, + "step": 3995 + }, + { + "epoch": 0.61, + "grad_norm": 2.8119454609352896, + "learning_rate": 6.920204672249122e-06, + "loss": 0.8568, + "step": 3996 + }, + { + "epoch": 0.61, + "grad_norm": 2.555707248166815, + "learning_rate": 6.915488482806094e-06, + "loss": 0.7189, + "step": 3997 + }, + { + "epoch": 0.61, + "grad_norm": 2.626316012005687, + "learning_rate": 6.9107730514494905e-06, + "loss": 0.891, + "step": 3998 + }, + { + "epoch": 0.61, + "grad_norm": 2.6351268057180492, + "learning_rate": 6.906058379338229e-06, + "loss": 0.8713, + "step": 3999 + }, + { + "epoch": 0.61, + "grad_norm": 2.8765134930506817, + "learning_rate": 6.901344467631041e-06, + "loss": 0.8725, + "step": 4000 + }, + { + "epoch": 0.61, + "grad_norm": 2.423697084064312, + "learning_rate": 6.896631317486484e-06, + "loss": 0.7581, + "step": 4001 + }, + { + "epoch": 0.61, + "grad_norm": 2.773687716759275, + "learning_rate": 6.891918930062908e-06, + "loss": 0.7716, + "step": 4002 + }, + { + "epoch": 0.61, + "grad_norm": 3.0364915828207826, + "learning_rate": 6.887207306518493e-06, + "loss": 0.753, + "step": 4003 + }, + { + "epoch": 0.61, + "grad_norm": 2.421734376856012, + "learning_rate": 6.8824964480112185e-06, + "loss": 0.8384, + "step": 4004 + }, + { + "epoch": 0.61, + "grad_norm": 2.8184012050044376, + "learning_rate": 6.877786355698881e-06, + "loss": 0.8738, + "step": 4005 + }, + { + "epoch": 0.61, + "grad_norm": 2.7558547785075453, + "learning_rate": 6.873077030739095e-06, + "loss": 0.7818, + "step": 4006 + }, + { + "epoch": 0.61, + "grad_norm": 2.495768634000435, + "learning_rate": 6.868368474289278e-06, + "loss": 0.8092, + "step": 4007 + }, + { + "epoch": 0.61, + "grad_norm": 3.026842608734279, + "learning_rate": 6.8636606875066556e-06, + "loss": 0.8685, + "step": 4008 + }, + { + "epoch": 0.61, + "grad_norm": 2.495028706880457, + "learning_rate": 6.8589536715482786e-06, + "loss": 0.7991, + "step": 4009 + }, + { + "epoch": 0.61, + "grad_norm": 3.109487577384096, + "learning_rate": 6.8542474275709906e-06, + "loss": 0.8021, + "step": 4010 + }, + { + "epoch": 0.61, + "grad_norm": 2.5455720562557658, + "learning_rate": 6.8495419567314625e-06, + "loss": 0.8722, + "step": 4011 + }, + { + "epoch": 0.61, + "grad_norm": 2.513268664427139, + "learning_rate": 6.844837260186164e-06, + "loss": 0.8785, + "step": 4012 + }, + { + "epoch": 0.61, + "grad_norm": 3.050419928001916, + "learning_rate": 6.840133339091375e-06, + "loss": 0.7724, + "step": 4013 + }, + { + "epoch": 0.61, + "grad_norm": 2.6074989364288426, + "learning_rate": 6.835430194603191e-06, + "loss": 0.8731, + "step": 4014 + }, + { + "epoch": 0.61, + "grad_norm": 2.43468203557067, + "learning_rate": 6.8307278278775125e-06, + "loss": 0.9025, + "step": 4015 + }, + { + "epoch": 0.61, + "grad_norm": 2.824045907126482, + "learning_rate": 6.826026240070043e-06, + "loss": 0.7948, + "step": 4016 + }, + { + "epoch": 0.61, + "grad_norm": 2.730762641642708, + "learning_rate": 6.821325432336311e-06, + "loss": 0.812, + "step": 4017 + }, + { + "epoch": 0.62, + "grad_norm": 2.531912929558859, + "learning_rate": 6.816625405831638e-06, + "loss": 0.7354, + "step": 4018 + }, + { + "epoch": 0.62, + "grad_norm": 2.572901191167967, + "learning_rate": 6.811926161711157e-06, + "loss": 0.8102, + "step": 4019 + }, + { + "epoch": 0.62, + "grad_norm": 2.726521586567872, + "learning_rate": 6.807227701129816e-06, + "loss": 0.8618, + "step": 4020 + }, + { + "epoch": 0.62, + "grad_norm": 2.634553428080092, + "learning_rate": 6.802530025242356e-06, + "loss": 0.8396, + "step": 4021 + }, + { + "epoch": 0.62, + "grad_norm": 2.7548342410377984, + "learning_rate": 6.797833135203345e-06, + "loss": 0.8199, + "step": 4022 + }, + { + "epoch": 0.62, + "grad_norm": 2.6846160687305054, + "learning_rate": 6.79313703216714e-06, + "loss": 0.8959, + "step": 4023 + }, + { + "epoch": 0.62, + "grad_norm": 2.8043730901195687, + "learning_rate": 6.7884417172879104e-06, + "loss": 0.7931, + "step": 4024 + }, + { + "epoch": 0.62, + "grad_norm": 2.545970678283305, + "learning_rate": 6.783747191719637e-06, + "loss": 0.7749, + "step": 4025 + }, + { + "epoch": 0.62, + "grad_norm": 2.5451689723358326, + "learning_rate": 6.779053456616102e-06, + "loss": 0.7443, + "step": 4026 + }, + { + "epoch": 0.62, + "grad_norm": 4.529972058083067, + "learning_rate": 6.774360513130888e-06, + "loss": 0.975, + "step": 4027 + }, + { + "epoch": 0.62, + "grad_norm": 2.611937243351216, + "learning_rate": 6.769668362417397e-06, + "loss": 0.9137, + "step": 4028 + }, + { + "epoch": 0.62, + "grad_norm": 2.644050252900844, + "learning_rate": 6.7649770056288245e-06, + "loss": 0.8149, + "step": 4029 + }, + { + "epoch": 0.62, + "grad_norm": 2.504733855409512, + "learning_rate": 6.760286443918172e-06, + "loss": 0.8504, + "step": 4030 + }, + { + "epoch": 0.62, + "grad_norm": 3.8987650988567344, + "learning_rate": 6.755596678438253e-06, + "loss": 0.9102, + "step": 4031 + }, + { + "epoch": 0.62, + "grad_norm": 2.6816926813216684, + "learning_rate": 6.750907710341674e-06, + "loss": 0.777, + "step": 4032 + }, + { + "epoch": 0.62, + "grad_norm": 2.568770879329872, + "learning_rate": 6.746219540780859e-06, + "loss": 0.8053, + "step": 4033 + }, + { + "epoch": 0.62, + "grad_norm": 2.6798556604625077, + "learning_rate": 6.7415321709080254e-06, + "loss": 0.9544, + "step": 4034 + }, + { + "epoch": 0.62, + "grad_norm": 2.558759654519992, + "learning_rate": 6.736845601875195e-06, + "loss": 0.7916, + "step": 4035 + }, + { + "epoch": 0.62, + "grad_norm": 2.85191964706347, + "learning_rate": 6.7321598348342e-06, + "loss": 0.7775, + "step": 4036 + }, + { + "epoch": 0.62, + "grad_norm": 2.699746268096858, + "learning_rate": 6.727474870936667e-06, + "loss": 0.8829, + "step": 4037 + }, + { + "epoch": 0.62, + "grad_norm": 2.6224212365035386, + "learning_rate": 6.722790711334024e-06, + "loss": 0.7747, + "step": 4038 + }, + { + "epoch": 0.62, + "grad_norm": 2.891059389978791, + "learning_rate": 6.7181073571775165e-06, + "loss": 0.7886, + "step": 4039 + }, + { + "epoch": 0.62, + "grad_norm": 2.623580531520144, + "learning_rate": 6.713424809618176e-06, + "loss": 0.8556, + "step": 4040 + }, + { + "epoch": 0.62, + "grad_norm": 2.6496413746498084, + "learning_rate": 6.708743069806839e-06, + "loss": 0.8466, + "step": 4041 + }, + { + "epoch": 0.62, + "grad_norm": 2.9920543365646664, + "learning_rate": 6.704062138894154e-06, + "loss": 0.7497, + "step": 4042 + }, + { + "epoch": 0.62, + "grad_norm": 2.4311107806698398, + "learning_rate": 6.699382018030551e-06, + "loss": 0.8919, + "step": 4043 + }, + { + "epoch": 0.62, + "grad_norm": 2.668935722922524, + "learning_rate": 6.694702708366283e-06, + "loss": 0.8383, + "step": 4044 + }, + { + "epoch": 0.62, + "grad_norm": 2.6708486158887257, + "learning_rate": 6.690024211051389e-06, + "loss": 0.8418, + "step": 4045 + }, + { + "epoch": 0.62, + "grad_norm": 2.611173432896564, + "learning_rate": 6.685346527235707e-06, + "loss": 0.778, + "step": 4046 + }, + { + "epoch": 0.62, + "grad_norm": 2.724855065721752, + "learning_rate": 6.68066965806889e-06, + "loss": 0.8278, + "step": 4047 + }, + { + "epoch": 0.62, + "grad_norm": 2.749989817656343, + "learning_rate": 6.67599360470038e-06, + "loss": 0.7917, + "step": 4048 + }, + { + "epoch": 0.62, + "grad_norm": 2.516759813149189, + "learning_rate": 6.6713183682794104e-06, + "loss": 0.7597, + "step": 4049 + }, + { + "epoch": 0.62, + "grad_norm": 2.700800674394037, + "learning_rate": 6.666643949955033e-06, + "loss": 0.9328, + "step": 4050 + }, + { + "epoch": 0.62, + "grad_norm": 2.3983122261201006, + "learning_rate": 6.6619703508760855e-06, + "loss": 0.7261, + "step": 4051 + }, + { + "epoch": 0.62, + "grad_norm": 2.748672614839632, + "learning_rate": 6.657297572191202e-06, + "loss": 0.8521, + "step": 4052 + }, + { + "epoch": 0.62, + "grad_norm": 2.6261395166600905, + "learning_rate": 6.652625615048831e-06, + "loss": 0.8024, + "step": 4053 + }, + { + "epoch": 0.62, + "grad_norm": 2.4759735826351665, + "learning_rate": 6.6479544805972e-06, + "loss": 0.776, + "step": 4054 + }, + { + "epoch": 0.62, + "grad_norm": 2.506862194790913, + "learning_rate": 6.64328416998435e-06, + "loss": 0.7892, + "step": 4055 + }, + { + "epoch": 0.62, + "grad_norm": 2.525522899555866, + "learning_rate": 6.6386146843581075e-06, + "loss": 0.76, + "step": 4056 + }, + { + "epoch": 0.62, + "grad_norm": 2.90535564554283, + "learning_rate": 6.633946024866098e-06, + "loss": 0.8547, + "step": 4057 + }, + { + "epoch": 0.62, + "grad_norm": 2.8295321791625, + "learning_rate": 6.629278192655755e-06, + "loss": 0.8736, + "step": 4058 + }, + { + "epoch": 0.62, + "grad_norm": 2.8648454263022014, + "learning_rate": 6.624611188874297e-06, + "loss": 0.8359, + "step": 4059 + }, + { + "epoch": 0.62, + "grad_norm": 2.4725522208039368, + "learning_rate": 6.619945014668741e-06, + "loss": 0.829, + "step": 4060 + }, + { + "epoch": 0.62, + "grad_norm": 2.924624596128136, + "learning_rate": 6.615279671185904e-06, + "loss": 0.783, + "step": 4061 + }, + { + "epoch": 0.62, + "grad_norm": 2.308448959924426, + "learning_rate": 6.6106151595723955e-06, + "loss": 0.6837, + "step": 4062 + }, + { + "epoch": 0.62, + "grad_norm": 2.7063686985335793, + "learning_rate": 6.605951480974618e-06, + "loss": 0.7703, + "step": 4063 + }, + { + "epoch": 0.62, + "grad_norm": 2.6589391223667045, + "learning_rate": 6.601288636538782e-06, + "loss": 0.8667, + "step": 4064 + }, + { + "epoch": 0.62, + "grad_norm": 2.544162074879167, + "learning_rate": 6.596626627410876e-06, + "loss": 0.7775, + "step": 4065 + }, + { + "epoch": 0.62, + "grad_norm": 2.7523919460415818, + "learning_rate": 6.591965454736695e-06, + "loss": 0.8147, + "step": 4066 + }, + { + "epoch": 0.62, + "grad_norm": 4.008385669960899, + "learning_rate": 6.587305119661824e-06, + "loss": 0.9105, + "step": 4067 + }, + { + "epoch": 0.62, + "grad_norm": 2.769938732126488, + "learning_rate": 6.582645623331638e-06, + "loss": 0.8769, + "step": 4068 + }, + { + "epoch": 0.62, + "grad_norm": 2.67656946337048, + "learning_rate": 6.577986966891319e-06, + "loss": 0.8576, + "step": 4069 + }, + { + "epoch": 0.62, + "grad_norm": 2.917024733414259, + "learning_rate": 6.57332915148583e-06, + "loss": 0.6818, + "step": 4070 + }, + { + "epoch": 0.62, + "grad_norm": 2.5314665259462976, + "learning_rate": 6.568672178259927e-06, + "loss": 0.7612, + "step": 4071 + }, + { + "epoch": 0.62, + "grad_norm": 2.767389303924322, + "learning_rate": 6.564016048358171e-06, + "loss": 0.7915, + "step": 4072 + }, + { + "epoch": 0.62, + "grad_norm": 2.5813872988695876, + "learning_rate": 6.559360762924905e-06, + "loss": 0.8387, + "step": 4073 + }, + { + "epoch": 0.62, + "grad_norm": 2.626198012687205, + "learning_rate": 6.5547063231042616e-06, + "loss": 0.8293, + "step": 4074 + }, + { + "epoch": 0.62, + "grad_norm": 2.616346063221364, + "learning_rate": 6.550052730040182e-06, + "loss": 0.904, + "step": 4075 + }, + { + "epoch": 0.62, + "grad_norm": 2.554419912580466, + "learning_rate": 6.545399984876382e-06, + "loss": 0.8675, + "step": 4076 + }, + { + "epoch": 0.62, + "grad_norm": 2.5640788502246457, + "learning_rate": 6.540748088756379e-06, + "loss": 0.7613, + "step": 4077 + }, + { + "epoch": 0.62, + "grad_norm": 2.704161443937231, + "learning_rate": 6.536097042823478e-06, + "loss": 0.8895, + "step": 4078 + }, + { + "epoch": 0.62, + "grad_norm": 2.6358909379667366, + "learning_rate": 6.53144684822077e-06, + "loss": 0.7766, + "step": 4079 + }, + { + "epoch": 0.62, + "grad_norm": 2.941858378301113, + "learning_rate": 6.5267975060911534e-06, + "loss": 0.8079, + "step": 4080 + }, + { + "epoch": 0.62, + "grad_norm": 2.515359174202313, + "learning_rate": 6.5221490175772996e-06, + "loss": 0.7194, + "step": 4081 + }, + { + "epoch": 0.62, + "grad_norm": 2.586250039707266, + "learning_rate": 6.517501383821675e-06, + "loss": 0.7522, + "step": 4082 + }, + { + "epoch": 0.62, + "grad_norm": 2.6503763756279386, + "learning_rate": 6.512854605966542e-06, + "loss": 0.8073, + "step": 4083 + }, + { + "epoch": 0.63, + "grad_norm": 2.6301952442230596, + "learning_rate": 6.508208685153943e-06, + "loss": 0.8574, + "step": 4084 + }, + { + "epoch": 0.63, + "grad_norm": 2.6460918372899784, + "learning_rate": 6.5035636225257206e-06, + "loss": 0.8026, + "step": 4085 + }, + { + "epoch": 0.63, + "grad_norm": 2.6467874751339475, + "learning_rate": 6.4989194192235e-06, + "loss": 0.7865, + "step": 4086 + }, + { + "epoch": 0.63, + "grad_norm": 2.53926613183767, + "learning_rate": 6.4942760763886906e-06, + "loss": 0.8668, + "step": 4087 + }, + { + "epoch": 0.63, + "grad_norm": 2.4622835484736094, + "learning_rate": 6.489633595162503e-06, + "loss": 0.8309, + "step": 4088 + }, + { + "epoch": 0.63, + "grad_norm": 2.7848796523663166, + "learning_rate": 6.484991976685927e-06, + "loss": 0.8824, + "step": 4089 + }, + { + "epoch": 0.63, + "grad_norm": 3.014091740841856, + "learning_rate": 6.4803512220997366e-06, + "loss": 0.9, + "step": 4090 + }, + { + "epoch": 0.63, + "grad_norm": 2.4966799499927443, + "learning_rate": 6.475711332544506e-06, + "loss": 0.8105, + "step": 4091 + }, + { + "epoch": 0.63, + "grad_norm": 2.56373881046293, + "learning_rate": 6.471072309160589e-06, + "loss": 0.8666, + "step": 4092 + }, + { + "epoch": 0.63, + "grad_norm": 3.102359688208209, + "learning_rate": 6.466434153088124e-06, + "loss": 0.8941, + "step": 4093 + }, + { + "epoch": 0.63, + "grad_norm": 2.506941106145849, + "learning_rate": 6.461796865467043e-06, + "loss": 0.8177, + "step": 4094 + }, + { + "epoch": 0.63, + "grad_norm": 2.446841206056606, + "learning_rate": 6.457160447437055e-06, + "loss": 0.8387, + "step": 4095 + }, + { + "epoch": 0.63, + "grad_norm": 3.039755545522044, + "learning_rate": 6.452524900137671e-06, + "loss": 0.9508, + "step": 4096 + }, + { + "epoch": 0.63, + "grad_norm": 2.960153109907913, + "learning_rate": 6.447890224708175e-06, + "loss": 0.8863, + "step": 4097 + }, + { + "epoch": 0.63, + "grad_norm": 3.8572295562041097, + "learning_rate": 6.443256422287636e-06, + "loss": 0.8931, + "step": 4098 + }, + { + "epoch": 0.63, + "grad_norm": 2.5975817640776717, + "learning_rate": 6.4386234940149175e-06, + "loss": 0.7323, + "step": 4099 + }, + { + "epoch": 0.63, + "grad_norm": 2.585156872911483, + "learning_rate": 6.433991441028662e-06, + "loss": 0.8293, + "step": 4100 + }, + { + "epoch": 0.63, + "grad_norm": 2.649456417121931, + "learning_rate": 6.429360264467295e-06, + "loss": 0.8892, + "step": 4101 + }, + { + "epoch": 0.63, + "grad_norm": 2.8639255281877456, + "learning_rate": 6.424729965469035e-06, + "loss": 0.9024, + "step": 4102 + }, + { + "epoch": 0.63, + "grad_norm": 2.780003960122886, + "learning_rate": 6.420100545171878e-06, + "loss": 0.8657, + "step": 4103 + }, + { + "epoch": 0.63, + "grad_norm": 2.509960135382255, + "learning_rate": 6.4154720047136036e-06, + "loss": 0.7157, + "step": 4104 + }, + { + "epoch": 0.63, + "grad_norm": 2.5979157879958343, + "learning_rate": 6.4108443452317795e-06, + "loss": 0.8317, + "step": 4105 + }, + { + "epoch": 0.63, + "grad_norm": 2.4868039343347768, + "learning_rate": 6.40621756786375e-06, + "loss": 0.8063, + "step": 4106 + }, + { + "epoch": 0.63, + "grad_norm": 3.2123728570860846, + "learning_rate": 6.401591673746654e-06, + "loss": 0.7474, + "step": 4107 + }, + { + "epoch": 0.63, + "grad_norm": 2.8195349629081465, + "learning_rate": 6.396966664017406e-06, + "loss": 0.8592, + "step": 4108 + }, + { + "epoch": 0.63, + "grad_norm": 2.74887243369275, + "learning_rate": 6.392342539812697e-06, + "loss": 0.8349, + "step": 4109 + }, + { + "epoch": 0.63, + "grad_norm": 2.414885397862542, + "learning_rate": 6.387719302269013e-06, + "loss": 0.8437, + "step": 4110 + }, + { + "epoch": 0.63, + "grad_norm": 2.8044942835656865, + "learning_rate": 6.383096952522616e-06, + "loss": 0.8438, + "step": 4111 + }, + { + "epoch": 0.63, + "grad_norm": 2.4437840839634037, + "learning_rate": 6.378475491709543e-06, + "loss": 0.8465, + "step": 4112 + }, + { + "epoch": 0.63, + "grad_norm": 2.5651394312215334, + "learning_rate": 6.373854920965629e-06, + "loss": 0.8432, + "step": 4113 + }, + { + "epoch": 0.63, + "grad_norm": 2.4733310108162367, + "learning_rate": 6.369235241426475e-06, + "loss": 0.7228, + "step": 4114 + }, + { + "epoch": 0.63, + "grad_norm": 3.0439512584691006, + "learning_rate": 6.3646164542274705e-06, + "loss": 0.7518, + "step": 4115 + }, + { + "epoch": 0.63, + "grad_norm": 2.9206726657813906, + "learning_rate": 6.359998560503784e-06, + "loss": 0.7668, + "step": 4116 + }, + { + "epoch": 0.63, + "grad_norm": 2.985452574642286, + "learning_rate": 6.355381561390359e-06, + "loss": 0.7956, + "step": 4117 + }, + { + "epoch": 0.63, + "grad_norm": 2.550473017214655, + "learning_rate": 6.350765458021935e-06, + "loss": 0.7408, + "step": 4118 + }, + { + "epoch": 0.63, + "grad_norm": 2.5381264489602375, + "learning_rate": 6.346150251533013e-06, + "loss": 0.8889, + "step": 4119 + }, + { + "epoch": 0.63, + "grad_norm": 2.575632216288596, + "learning_rate": 6.341535943057882e-06, + "loss": 0.8932, + "step": 4120 + }, + { + "epoch": 0.63, + "grad_norm": 2.4305923233576925, + "learning_rate": 6.336922533730611e-06, + "loss": 0.7149, + "step": 4121 + }, + { + "epoch": 0.63, + "grad_norm": 2.542358361284761, + "learning_rate": 6.332310024685049e-06, + "loss": 0.7948, + "step": 4122 + }, + { + "epoch": 0.63, + "grad_norm": 2.8751522589498832, + "learning_rate": 6.327698417054814e-06, + "loss": 0.8883, + "step": 4123 + }, + { + "epoch": 0.63, + "grad_norm": 2.4987312069013172, + "learning_rate": 6.3230877119733184e-06, + "loss": 0.8417, + "step": 4124 + }, + { + "epoch": 0.63, + "grad_norm": 2.626332516673871, + "learning_rate": 6.318477910573739e-06, + "loss": 0.8392, + "step": 4125 + }, + { + "epoch": 0.63, + "grad_norm": 2.884330720549634, + "learning_rate": 6.313869013989037e-06, + "loss": 0.8784, + "step": 4126 + }, + { + "epoch": 0.63, + "grad_norm": 2.433399590467831, + "learning_rate": 6.309261023351951e-06, + "loss": 0.7685, + "step": 4127 + }, + { + "epoch": 0.63, + "grad_norm": 4.234724727523963, + "learning_rate": 6.304653939794991e-06, + "loss": 0.9656, + "step": 4128 + }, + { + "epoch": 0.63, + "grad_norm": 3.7665147719725556, + "learning_rate": 6.300047764450456e-06, + "loss": 0.8758, + "step": 4129 + }, + { + "epoch": 0.63, + "grad_norm": 2.6819728229640902, + "learning_rate": 6.295442498450413e-06, + "loss": 0.8419, + "step": 4130 + }, + { + "epoch": 0.63, + "grad_norm": 2.405193789764089, + "learning_rate": 6.290838142926702e-06, + "loss": 0.7792, + "step": 4131 + }, + { + "epoch": 0.63, + "grad_norm": 2.5564651239366065, + "learning_rate": 6.286234699010951e-06, + "loss": 0.7418, + "step": 4132 + }, + { + "epoch": 0.63, + "grad_norm": 2.547961536655596, + "learning_rate": 6.281632167834555e-06, + "loss": 0.8122, + "step": 4133 + }, + { + "epoch": 0.63, + "grad_norm": 2.579679050497923, + "learning_rate": 6.277030550528681e-06, + "loss": 0.8219, + "step": 4134 + }, + { + "epoch": 0.63, + "grad_norm": 2.6577361052780657, + "learning_rate": 6.2724298482242885e-06, + "loss": 0.843, + "step": 4135 + }, + { + "epoch": 0.63, + "grad_norm": 2.796823419353427, + "learning_rate": 6.267830062052095e-06, + "loss": 0.8328, + "step": 4136 + }, + { + "epoch": 0.63, + "grad_norm": 2.675985550773138, + "learning_rate": 6.263231193142598e-06, + "loss": 0.7838, + "step": 4137 + }, + { + "epoch": 0.63, + "grad_norm": 2.5707725044542515, + "learning_rate": 6.258633242626073e-06, + "loss": 0.8262, + "step": 4138 + }, + { + "epoch": 0.63, + "grad_norm": 2.813092623388354, + "learning_rate": 6.254036211632563e-06, + "loss": 0.8425, + "step": 4139 + }, + { + "epoch": 0.63, + "grad_norm": 2.568730615011392, + "learning_rate": 6.249440101291898e-06, + "loss": 0.7452, + "step": 4140 + }, + { + "epoch": 0.63, + "grad_norm": 4.755377671537752, + "learning_rate": 6.244844912733667e-06, + "loss": 0.8829, + "step": 4141 + }, + { + "epoch": 0.63, + "grad_norm": 2.558947372504722, + "learning_rate": 6.240250647087236e-06, + "loss": 0.897, + "step": 4142 + }, + { + "epoch": 0.63, + "grad_norm": 2.745447153058319, + "learning_rate": 6.235657305481752e-06, + "loss": 0.8688, + "step": 4143 + }, + { + "epoch": 0.63, + "grad_norm": 2.647907903816932, + "learning_rate": 6.23106488904613e-06, + "loss": 0.8379, + "step": 4144 + }, + { + "epoch": 0.63, + "grad_norm": 2.790647314327554, + "learning_rate": 6.2264733989090475e-06, + "loss": 0.8742, + "step": 4145 + }, + { + "epoch": 0.63, + "grad_norm": 2.9750857386672114, + "learning_rate": 6.221882836198977e-06, + "loss": 0.8778, + "step": 4146 + }, + { + "epoch": 0.63, + "grad_norm": 2.3547014311971215, + "learning_rate": 6.217293202044143e-06, + "loss": 0.6963, + "step": 4147 + }, + { + "epoch": 0.63, + "grad_norm": 2.4512617810217496, + "learning_rate": 6.212704497572548e-06, + "loss": 0.7892, + "step": 4148 + }, + { + "epoch": 0.64, + "grad_norm": 2.5369909191742073, + "learning_rate": 6.208116723911971e-06, + "loss": 0.7219, + "step": 4149 + }, + { + "epoch": 0.64, + "grad_norm": 2.536979432046566, + "learning_rate": 6.203529882189951e-06, + "loss": 0.7174, + "step": 4150 + }, + { + "epoch": 0.64, + "grad_norm": 2.452686560261575, + "learning_rate": 6.198943973533816e-06, + "loss": 0.7855, + "step": 4151 + }, + { + "epoch": 0.64, + "grad_norm": 3.6527193305604295, + "learning_rate": 6.194358999070646e-06, + "loss": 0.7499, + "step": 4152 + }, + { + "epoch": 0.64, + "grad_norm": 2.3863825447074682, + "learning_rate": 6.189774959927297e-06, + "loss": 0.744, + "step": 4153 + }, + { + "epoch": 0.64, + "grad_norm": 2.5502961600790366, + "learning_rate": 6.185191857230408e-06, + "loss": 0.7702, + "step": 4154 + }, + { + "epoch": 0.64, + "grad_norm": 2.777791413090332, + "learning_rate": 6.180609692106368e-06, + "loss": 0.7513, + "step": 4155 + }, + { + "epoch": 0.64, + "grad_norm": 2.7429295672784546, + "learning_rate": 6.176028465681343e-06, + "loss": 0.7781, + "step": 4156 + }, + { + "epoch": 0.64, + "grad_norm": 2.583623335809288, + "learning_rate": 6.1714481790812765e-06, + "loss": 0.7991, + "step": 4157 + }, + { + "epoch": 0.64, + "grad_norm": 2.5019251528214976, + "learning_rate": 6.166868833431869e-06, + "loss": 0.7802, + "step": 4158 + }, + { + "epoch": 0.64, + "grad_norm": 2.46943361864758, + "learning_rate": 6.162290429858602e-06, + "loss": 0.8392, + "step": 4159 + }, + { + "epoch": 0.64, + "grad_norm": 2.697621406379056, + "learning_rate": 6.157712969486716e-06, + "loss": 0.7135, + "step": 4160 + }, + { + "epoch": 0.64, + "grad_norm": 2.5361851349014426, + "learning_rate": 6.153136453441216e-06, + "loss": 0.7468, + "step": 4161 + }, + { + "epoch": 0.64, + "grad_norm": 2.777669008265395, + "learning_rate": 6.14856088284689e-06, + "loss": 0.7782, + "step": 4162 + }, + { + "epoch": 0.64, + "grad_norm": 2.676393263557243, + "learning_rate": 6.14398625882828e-06, + "loss": 0.8292, + "step": 4163 + }, + { + "epoch": 0.64, + "grad_norm": 2.629987864288778, + "learning_rate": 6.1394125825097005e-06, + "loss": 0.7771, + "step": 4164 + }, + { + "epoch": 0.64, + "grad_norm": 2.3706226588091357, + "learning_rate": 6.134839855015235e-06, + "loss": 0.7614, + "step": 4165 + }, + { + "epoch": 0.64, + "grad_norm": 2.9096277892102456, + "learning_rate": 6.1302680774687325e-06, + "loss": 0.8288, + "step": 4166 + }, + { + "epoch": 0.64, + "grad_norm": 2.266472116130424, + "learning_rate": 6.125697250993804e-06, + "loss": 0.617, + "step": 4167 + }, + { + "epoch": 0.64, + "grad_norm": 2.546047394461752, + "learning_rate": 6.1211273767138336e-06, + "loss": 0.8736, + "step": 4168 + }, + { + "epoch": 0.64, + "grad_norm": 2.7568097786899677, + "learning_rate": 6.1165584557519634e-06, + "loss": 0.7315, + "step": 4169 + }, + { + "epoch": 0.64, + "grad_norm": 2.5725431683106645, + "learning_rate": 6.111990489231114e-06, + "loss": 0.7479, + "step": 4170 + }, + { + "epoch": 0.64, + "grad_norm": 2.418434465044092, + "learning_rate": 6.1074234782739576e-06, + "loss": 0.8278, + "step": 4171 + }, + { + "epoch": 0.64, + "grad_norm": 2.6811260628803524, + "learning_rate": 6.102857424002937e-06, + "loss": 0.8043, + "step": 4172 + }, + { + "epoch": 0.64, + "grad_norm": 2.7969088357041114, + "learning_rate": 6.098292327540265e-06, + "loss": 0.8, + "step": 4173 + }, + { + "epoch": 0.64, + "grad_norm": 2.8678984331762813, + "learning_rate": 6.093728190007912e-06, + "loss": 0.8598, + "step": 4174 + }, + { + "epoch": 0.64, + "grad_norm": 2.7141740857093932, + "learning_rate": 6.089165012527609e-06, + "loss": 0.7699, + "step": 4175 + }, + { + "epoch": 0.64, + "grad_norm": 2.7721712895307693, + "learning_rate": 6.084602796220866e-06, + "loss": 0.8596, + "step": 4176 + }, + { + "epoch": 0.64, + "grad_norm": 2.534281013021705, + "learning_rate": 6.080041542208946e-06, + "loss": 0.7828, + "step": 4177 + }, + { + "epoch": 0.64, + "grad_norm": 2.551375290088987, + "learning_rate": 6.075481251612873e-06, + "loss": 0.7576, + "step": 4178 + }, + { + "epoch": 0.64, + "grad_norm": 2.8120490143121186, + "learning_rate": 6.0709219255534424e-06, + "loss": 0.8315, + "step": 4179 + }, + { + "epoch": 0.64, + "grad_norm": 3.2619929727898964, + "learning_rate": 6.066363565151203e-06, + "loss": 0.8645, + "step": 4180 + }, + { + "epoch": 0.64, + "grad_norm": 2.5383095550716197, + "learning_rate": 6.061806171526482e-06, + "loss": 0.7716, + "step": 4181 + }, + { + "epoch": 0.64, + "grad_norm": 2.4670004389518843, + "learning_rate": 6.0572497457993515e-06, + "loss": 0.7649, + "step": 4182 + }, + { + "epoch": 0.64, + "grad_norm": 2.2463743452604414, + "learning_rate": 6.052694289089655e-06, + "loss": 0.7303, + "step": 4183 + }, + { + "epoch": 0.64, + "grad_norm": 2.8256884332350682, + "learning_rate": 6.048139802516997e-06, + "loss": 0.7508, + "step": 4184 + }, + { + "epoch": 0.64, + "grad_norm": 2.6277561858184866, + "learning_rate": 6.04358628720074e-06, + "loss": 0.7525, + "step": 4185 + }, + { + "epoch": 0.64, + "grad_norm": 2.788573749816516, + "learning_rate": 6.039033744260009e-06, + "loss": 0.8504, + "step": 4186 + }, + { + "epoch": 0.64, + "grad_norm": 2.6760899491206636, + "learning_rate": 6.034482174813698e-06, + "loss": 0.8448, + "step": 4187 + }, + { + "epoch": 0.64, + "grad_norm": 2.7226554524499753, + "learning_rate": 6.0299315799804524e-06, + "loss": 0.8442, + "step": 4188 + }, + { + "epoch": 0.64, + "grad_norm": 2.5518099787008737, + "learning_rate": 6.025381960878675e-06, + "loss": 0.8306, + "step": 4189 + }, + { + "epoch": 0.64, + "grad_norm": 2.696138047928778, + "learning_rate": 6.020833318626544e-06, + "loss": 0.8902, + "step": 4190 + }, + { + "epoch": 0.64, + "grad_norm": 2.3993432050044063, + "learning_rate": 6.016285654341978e-06, + "loss": 0.7706, + "step": 4191 + }, + { + "epoch": 0.64, + "grad_norm": 3.7765750937001203, + "learning_rate": 6.011738969142676e-06, + "loss": 0.8679, + "step": 4192 + }, + { + "epoch": 0.64, + "grad_norm": 2.859266254500037, + "learning_rate": 6.0071932641460784e-06, + "loss": 0.8554, + "step": 4193 + }, + { + "epoch": 0.64, + "grad_norm": 2.6019050994444024, + "learning_rate": 6.002648540469394e-06, + "loss": 0.7164, + "step": 4194 + }, + { + "epoch": 0.64, + "grad_norm": 2.7344950289561343, + "learning_rate": 5.9981047992295895e-06, + "loss": 0.8349, + "step": 4195 + }, + { + "epoch": 0.64, + "grad_norm": 2.612788730546887, + "learning_rate": 5.993562041543388e-06, + "loss": 0.8315, + "step": 4196 + }, + { + "epoch": 0.64, + "grad_norm": 2.664873642699137, + "learning_rate": 5.989020268527268e-06, + "loss": 0.8662, + "step": 4197 + }, + { + "epoch": 0.64, + "grad_norm": 2.7090021448831867, + "learning_rate": 5.9844794812974784e-06, + "loss": 0.8077, + "step": 4198 + }, + { + "epoch": 0.64, + "grad_norm": 2.8042750369301563, + "learning_rate": 5.979939680970012e-06, + "loss": 0.8289, + "step": 4199 + }, + { + "epoch": 0.64, + "grad_norm": 2.897760226237868, + "learning_rate": 5.975400868660624e-06, + "loss": 0.845, + "step": 4200 + }, + { + "epoch": 0.64, + "grad_norm": 2.8252430374625055, + "learning_rate": 5.970863045484829e-06, + "loss": 0.8849, + "step": 4201 + }, + { + "epoch": 0.64, + "grad_norm": 2.4574717176968277, + "learning_rate": 5.966326212557892e-06, + "loss": 0.9231, + "step": 4202 + }, + { + "epoch": 0.64, + "grad_norm": 2.5705754869443163, + "learning_rate": 5.9617903709948485e-06, + "loss": 0.8147, + "step": 4203 + }, + { + "epoch": 0.64, + "grad_norm": 2.559653301159418, + "learning_rate": 5.957255521910477e-06, + "loss": 0.8086, + "step": 4204 + }, + { + "epoch": 0.64, + "grad_norm": 2.6067710988929536, + "learning_rate": 5.952721666419311e-06, + "loss": 0.7963, + "step": 4205 + }, + { + "epoch": 0.64, + "grad_norm": 2.7621380129077964, + "learning_rate": 5.948188805635652e-06, + "loss": 0.8049, + "step": 4206 + }, + { + "epoch": 0.64, + "grad_norm": 2.652662783063158, + "learning_rate": 5.9436569406735475e-06, + "loss": 0.798, + "step": 4207 + }, + { + "epoch": 0.64, + "grad_norm": 2.742562653994274, + "learning_rate": 5.939126072646798e-06, + "loss": 0.8252, + "step": 4208 + }, + { + "epoch": 0.64, + "grad_norm": 2.4686756050614465, + "learning_rate": 5.934596202668973e-06, + "loss": 0.7727, + "step": 4209 + }, + { + "epoch": 0.64, + "grad_norm": 2.5693938881507408, + "learning_rate": 5.930067331853382e-06, + "loss": 0.8081, + "step": 4210 + }, + { + "epoch": 0.64, + "grad_norm": 2.6406554355059035, + "learning_rate": 5.925539461313095e-06, + "loss": 0.6952, + "step": 4211 + }, + { + "epoch": 0.64, + "grad_norm": 2.567357727216333, + "learning_rate": 5.921012592160936e-06, + "loss": 0.7968, + "step": 4212 + }, + { + "epoch": 0.64, + "grad_norm": 3.549508744267877, + "learning_rate": 5.916486725509479e-06, + "loss": 0.9364, + "step": 4213 + }, + { + "epoch": 0.65, + "grad_norm": 2.653890717917521, + "learning_rate": 5.911961862471063e-06, + "loss": 0.8733, + "step": 4214 + }, + { + "epoch": 0.65, + "grad_norm": 2.5635645587348996, + "learning_rate": 5.907438004157767e-06, + "loss": 0.8015, + "step": 4215 + }, + { + "epoch": 0.65, + "grad_norm": 2.901723645052626, + "learning_rate": 5.902915151681427e-06, + "loss": 0.8542, + "step": 4216 + }, + { + "epoch": 0.65, + "grad_norm": 2.630135346802989, + "learning_rate": 5.89839330615364e-06, + "loss": 0.7924, + "step": 4217 + }, + { + "epoch": 0.65, + "grad_norm": 2.9543974638939074, + "learning_rate": 5.893872468685743e-06, + "loss": 0.8472, + "step": 4218 + }, + { + "epoch": 0.65, + "grad_norm": 2.608700762294027, + "learning_rate": 5.889352640388828e-06, + "loss": 0.7086, + "step": 4219 + }, + { + "epoch": 0.65, + "grad_norm": 2.6845836414520825, + "learning_rate": 5.884833822373751e-06, + "loss": 0.8024, + "step": 4220 + }, + { + "epoch": 0.65, + "grad_norm": 2.712954926371659, + "learning_rate": 5.880316015751106e-06, + "loss": 0.819, + "step": 4221 + }, + { + "epoch": 0.65, + "grad_norm": 2.601280457477493, + "learning_rate": 5.875799221631242e-06, + "loss": 0.7722, + "step": 4222 + }, + { + "epoch": 0.65, + "grad_norm": 2.523625947183782, + "learning_rate": 5.871283441124264e-06, + "loss": 0.8626, + "step": 4223 + }, + { + "epoch": 0.65, + "grad_norm": 2.4552347893262985, + "learning_rate": 5.866768675340018e-06, + "loss": 0.7041, + "step": 4224 + }, + { + "epoch": 0.65, + "grad_norm": 3.52178337670741, + "learning_rate": 5.8622549253881135e-06, + "loss": 0.8436, + "step": 4225 + }, + { + "epoch": 0.65, + "grad_norm": 2.7843047744222957, + "learning_rate": 5.8577421923779025e-06, + "loss": 0.7545, + "step": 4226 + }, + { + "epoch": 0.65, + "grad_norm": 2.7334770181269517, + "learning_rate": 5.853230477418483e-06, + "loss": 0.9143, + "step": 4227 + }, + { + "epoch": 0.65, + "grad_norm": 2.551804593077235, + "learning_rate": 5.8487197816187145e-06, + "loss": 0.7694, + "step": 4228 + }, + { + "epoch": 0.65, + "grad_norm": 3.827479143157379, + "learning_rate": 5.844210106087198e-06, + "loss": 0.8784, + "step": 4229 + }, + { + "epoch": 0.65, + "grad_norm": 2.6859822414158914, + "learning_rate": 5.839701451932278e-06, + "loss": 0.7843, + "step": 4230 + }, + { + "epoch": 0.65, + "grad_norm": 2.5468246556889764, + "learning_rate": 5.8351938202620666e-06, + "loss": 0.862, + "step": 4231 + }, + { + "epoch": 0.65, + "grad_norm": 4.308901391630756, + "learning_rate": 5.830687212184407e-06, + "loss": 0.8985, + "step": 4232 + }, + { + "epoch": 0.65, + "grad_norm": 2.3919305019885573, + "learning_rate": 5.826181628806893e-06, + "loss": 0.7759, + "step": 4233 + }, + { + "epoch": 0.65, + "grad_norm": 2.795221305372727, + "learning_rate": 5.821677071236881e-06, + "loss": 0.8936, + "step": 4234 + }, + { + "epoch": 0.65, + "grad_norm": 2.543397606862117, + "learning_rate": 5.817173540581459e-06, + "loss": 0.8577, + "step": 4235 + }, + { + "epoch": 0.65, + "grad_norm": 2.5323267733398285, + "learning_rate": 5.812671037947468e-06, + "loss": 0.7667, + "step": 4236 + }, + { + "epoch": 0.65, + "grad_norm": 2.6574236218151848, + "learning_rate": 5.8081695644415e-06, + "loss": 0.8143, + "step": 4237 + }, + { + "epoch": 0.65, + "grad_norm": 2.912775437447401, + "learning_rate": 5.803669121169883e-06, + "loss": 0.8918, + "step": 4238 + }, + { + "epoch": 0.65, + "grad_norm": 2.939122520427922, + "learning_rate": 5.79916970923871e-06, + "loss": 0.858, + "step": 4239 + }, + { + "epoch": 0.65, + "grad_norm": 2.4747736151679516, + "learning_rate": 5.7946713297538045e-06, + "loss": 0.8165, + "step": 4240 + }, + { + "epoch": 0.65, + "grad_norm": 3.640788425357784, + "learning_rate": 5.790173983820741e-06, + "loss": 0.8805, + "step": 4241 + }, + { + "epoch": 0.65, + "grad_norm": 2.806348940640807, + "learning_rate": 5.785677672544847e-06, + "loss": 0.844, + "step": 4242 + }, + { + "epoch": 0.65, + "grad_norm": 2.5707488108651746, + "learning_rate": 5.781182397031182e-06, + "loss": 0.8061, + "step": 4243 + }, + { + "epoch": 0.65, + "grad_norm": 2.638873894062458, + "learning_rate": 5.776688158384565e-06, + "loss": 0.9218, + "step": 4244 + }, + { + "epoch": 0.65, + "grad_norm": 2.8972966930258943, + "learning_rate": 5.772194957709553e-06, + "loss": 0.7464, + "step": 4245 + }, + { + "epoch": 0.65, + "grad_norm": 2.686284381777874, + "learning_rate": 5.767702796110448e-06, + "loss": 0.8202, + "step": 4246 + }, + { + "epoch": 0.65, + "grad_norm": 2.605172358461725, + "learning_rate": 5.763211674691296e-06, + "loss": 0.8269, + "step": 4247 + }, + { + "epoch": 0.65, + "grad_norm": 2.539411710086652, + "learning_rate": 5.758721594555887e-06, + "loss": 0.8579, + "step": 4248 + }, + { + "epoch": 0.65, + "grad_norm": 2.4711589723884613, + "learning_rate": 5.7542325568077576e-06, + "loss": 0.7988, + "step": 4249 + }, + { + "epoch": 0.65, + "grad_norm": 2.605839636785204, + "learning_rate": 5.749744562550191e-06, + "loss": 0.8018, + "step": 4250 + }, + { + "epoch": 0.65, + "grad_norm": 2.68865790641465, + "learning_rate": 5.745257612886209e-06, + "loss": 0.8699, + "step": 4251 + }, + { + "epoch": 0.65, + "grad_norm": 2.4119373086322318, + "learning_rate": 5.740771708918573e-06, + "loss": 0.7347, + "step": 4252 + }, + { + "epoch": 0.65, + "grad_norm": 2.5149040499716757, + "learning_rate": 5.7362868517498e-06, + "loss": 0.8381, + "step": 4253 + }, + { + "epoch": 0.65, + "grad_norm": 3.575269057160359, + "learning_rate": 5.731803042482135e-06, + "loss": 0.8648, + "step": 4254 + }, + { + "epoch": 0.65, + "grad_norm": 2.492363871218477, + "learning_rate": 5.72732028221758e-06, + "loss": 0.7981, + "step": 4255 + }, + { + "epoch": 0.65, + "grad_norm": 2.6274570081759014, + "learning_rate": 5.722838572057867e-06, + "loss": 0.6948, + "step": 4256 + }, + { + "epoch": 0.65, + "grad_norm": 2.722018640380298, + "learning_rate": 5.718357913104477e-06, + "loss": 0.7619, + "step": 4257 + }, + { + "epoch": 0.65, + "grad_norm": 2.6070857482325307, + "learning_rate": 5.713878306458626e-06, + "loss": 0.8415, + "step": 4258 + }, + { + "epoch": 0.65, + "grad_norm": 2.731178106222968, + "learning_rate": 5.709399753221282e-06, + "loss": 0.8122, + "step": 4259 + }, + { + "epoch": 0.65, + "grad_norm": 2.5202536126008734, + "learning_rate": 5.704922254493139e-06, + "loss": 0.9007, + "step": 4260 + }, + { + "epoch": 0.65, + "grad_norm": 2.763827526804397, + "learning_rate": 5.7004458113746485e-06, + "loss": 0.7899, + "step": 4261 + }, + { + "epoch": 0.65, + "grad_norm": 2.9383225693445243, + "learning_rate": 5.695970424965993e-06, + "loss": 0.7951, + "step": 4262 + }, + { + "epoch": 0.65, + "grad_norm": 2.673618096203085, + "learning_rate": 5.691496096367093e-06, + "loss": 0.7874, + "step": 4263 + }, + { + "epoch": 0.65, + "grad_norm": 2.4528289976047604, + "learning_rate": 5.687022826677619e-06, + "loss": 0.7333, + "step": 4264 + }, + { + "epoch": 0.65, + "grad_norm": 2.434025160199743, + "learning_rate": 5.682550616996968e-06, + "loss": 0.7751, + "step": 4265 + }, + { + "epoch": 0.65, + "grad_norm": 2.5805781831193646, + "learning_rate": 5.678079468424293e-06, + "loss": 0.8292, + "step": 4266 + }, + { + "epoch": 0.65, + "grad_norm": 2.535643452437688, + "learning_rate": 5.67360938205847e-06, + "loss": 0.7846, + "step": 4267 + }, + { + "epoch": 0.65, + "grad_norm": 2.7760047936406242, + "learning_rate": 5.66914035899812e-06, + "loss": 0.7979, + "step": 4268 + }, + { + "epoch": 0.65, + "grad_norm": 2.3859097315091726, + "learning_rate": 5.664672400341614e-06, + "loss": 0.7801, + "step": 4269 + }, + { + "epoch": 0.65, + "grad_norm": 2.8159117598444467, + "learning_rate": 5.6602055071870395e-06, + "loss": 0.7675, + "step": 4270 + }, + { + "epoch": 0.65, + "grad_norm": 2.617294328890024, + "learning_rate": 5.655739680632233e-06, + "loss": 0.7799, + "step": 4271 + }, + { + "epoch": 0.65, + "grad_norm": 2.5763635450190776, + "learning_rate": 5.651274921774777e-06, + "loss": 0.8318, + "step": 4272 + }, + { + "epoch": 0.65, + "grad_norm": 2.7449717860806904, + "learning_rate": 5.646811231711982e-06, + "loss": 0.7591, + "step": 4273 + }, + { + "epoch": 0.65, + "grad_norm": 2.627656108484001, + "learning_rate": 5.642348611540892e-06, + "loss": 0.7857, + "step": 4274 + }, + { + "epoch": 0.65, + "grad_norm": 2.545063061864533, + "learning_rate": 5.637887062358302e-06, + "loss": 0.8196, + "step": 4275 + }, + { + "epoch": 0.65, + "grad_norm": 2.482595594212583, + "learning_rate": 5.633426585260728e-06, + "loss": 0.8116, + "step": 4276 + }, + { + "epoch": 0.65, + "grad_norm": 3.047445613173143, + "learning_rate": 5.6289671813444376e-06, + "loss": 0.8021, + "step": 4277 + }, + { + "epoch": 0.65, + "grad_norm": 2.422904459979712, + "learning_rate": 5.624508851705426e-06, + "loss": 0.8128, + "step": 4278 + }, + { + "epoch": 0.65, + "grad_norm": 2.8670743746634257, + "learning_rate": 5.620051597439417e-06, + "loss": 0.836, + "step": 4279 + }, + { + "epoch": 0.66, + "grad_norm": 2.473690554907657, + "learning_rate": 5.6155954196418905e-06, + "loss": 0.772, + "step": 4280 + }, + { + "epoch": 0.66, + "grad_norm": 2.6703573989353697, + "learning_rate": 5.6111403194080435e-06, + "loss": 0.8059, + "step": 4281 + }, + { + "epoch": 0.66, + "grad_norm": 2.63961123884444, + "learning_rate": 5.606686297832817e-06, + "loss": 0.7687, + "step": 4282 + }, + { + "epoch": 0.66, + "grad_norm": 2.7593795892751896, + "learning_rate": 5.602233356010883e-06, + "loss": 0.8548, + "step": 4283 + }, + { + "epoch": 0.66, + "grad_norm": 2.516821655894118, + "learning_rate": 5.59778149503665e-06, + "loss": 0.823, + "step": 4284 + }, + { + "epoch": 0.66, + "grad_norm": 3.034175184053461, + "learning_rate": 5.5933307160042575e-06, + "loss": 0.8375, + "step": 4285 + }, + { + "epoch": 0.66, + "grad_norm": 2.609464382963494, + "learning_rate": 5.588881020007588e-06, + "loss": 0.7616, + "step": 4286 + }, + { + "epoch": 0.66, + "grad_norm": 2.5855150647940217, + "learning_rate": 5.584432408140246e-06, + "loss": 0.7481, + "step": 4287 + }, + { + "epoch": 0.66, + "grad_norm": 2.7773608715093907, + "learning_rate": 5.579984881495582e-06, + "loss": 0.7988, + "step": 4288 + }, + { + "epoch": 0.66, + "grad_norm": 2.7223896843575988, + "learning_rate": 5.575538441166671e-06, + "loss": 0.7573, + "step": 4289 + }, + { + "epoch": 0.66, + "grad_norm": 2.690483798746339, + "learning_rate": 5.5710930882463174e-06, + "loss": 0.8885, + "step": 4290 + }, + { + "epoch": 0.66, + "grad_norm": 2.422432484428218, + "learning_rate": 5.566648823827075e-06, + "loss": 0.7379, + "step": 4291 + }, + { + "epoch": 0.66, + "grad_norm": 2.7445270021125245, + "learning_rate": 5.562205649001213e-06, + "loss": 0.7741, + "step": 4292 + }, + { + "epoch": 0.66, + "grad_norm": 2.699609159678072, + "learning_rate": 5.5577635648607396e-06, + "loss": 0.7712, + "step": 4293 + }, + { + "epoch": 0.66, + "grad_norm": 3.683136191034804, + "learning_rate": 5.553322572497395e-06, + "loss": 0.8999, + "step": 4294 + }, + { + "epoch": 0.66, + "grad_norm": 2.63819629445708, + "learning_rate": 5.548882673002651e-06, + "loss": 0.8582, + "step": 4295 + }, + { + "epoch": 0.66, + "grad_norm": 3.4904520900083083, + "learning_rate": 5.544443867467705e-06, + "loss": 0.9049, + "step": 4296 + }, + { + "epoch": 0.66, + "grad_norm": 2.677841058226375, + "learning_rate": 5.5400061569834995e-06, + "loss": 0.7495, + "step": 4297 + }, + { + "epoch": 0.66, + "grad_norm": 2.6850812833894007, + "learning_rate": 5.5355695426406905e-06, + "loss": 0.7755, + "step": 4298 + }, + { + "epoch": 0.66, + "grad_norm": 2.718695969365637, + "learning_rate": 5.531134025529684e-06, + "loss": 0.9263, + "step": 4299 + }, + { + "epoch": 0.66, + "grad_norm": 2.6832942199376215, + "learning_rate": 5.5266996067405995e-06, + "loss": 0.7404, + "step": 4300 + }, + { + "epoch": 0.66, + "grad_norm": 2.5524843916004953, + "learning_rate": 5.522266287363289e-06, + "loss": 0.7853, + "step": 4301 + }, + { + "epoch": 0.66, + "grad_norm": 2.5065546394197145, + "learning_rate": 5.517834068487347e-06, + "loss": 0.8372, + "step": 4302 + }, + { + "epoch": 0.66, + "grad_norm": 2.544350067419898, + "learning_rate": 5.513402951202082e-06, + "loss": 0.7003, + "step": 4303 + }, + { + "epoch": 0.66, + "grad_norm": 2.7065939446365435, + "learning_rate": 5.508972936596542e-06, + "loss": 0.6783, + "step": 4304 + }, + { + "epoch": 0.66, + "grad_norm": 2.5383869242448776, + "learning_rate": 5.504544025759498e-06, + "loss": 0.7375, + "step": 4305 + }, + { + "epoch": 0.66, + "grad_norm": 2.591610208015511, + "learning_rate": 5.500116219779453e-06, + "loss": 0.8246, + "step": 4306 + }, + { + "epoch": 0.66, + "grad_norm": 2.3648144456733933, + "learning_rate": 5.495689519744634e-06, + "loss": 0.752, + "step": 4307 + }, + { + "epoch": 0.66, + "grad_norm": 3.275339281801009, + "learning_rate": 5.491263926743005e-06, + "loss": 0.7565, + "step": 4308 + }, + { + "epoch": 0.66, + "grad_norm": 2.6419310382852195, + "learning_rate": 5.486839441862248e-06, + "loss": 0.7358, + "step": 4309 + }, + { + "epoch": 0.66, + "grad_norm": 2.5891680783499385, + "learning_rate": 5.482416066189783e-06, + "loss": 0.7734, + "step": 4310 + }, + { + "epoch": 0.66, + "grad_norm": 2.8216200785004437, + "learning_rate": 5.477993800812749e-06, + "loss": 0.8772, + "step": 4311 + }, + { + "epoch": 0.66, + "grad_norm": 3.640532241676102, + "learning_rate": 5.473572646818011e-06, + "loss": 0.8411, + "step": 4312 + }, + { + "epoch": 0.66, + "grad_norm": 2.427155653140146, + "learning_rate": 5.4691526052921705e-06, + "loss": 0.7412, + "step": 4313 + }, + { + "epoch": 0.66, + "grad_norm": 2.4904325112756585, + "learning_rate": 5.464733677321548e-06, + "loss": 0.7864, + "step": 4314 + }, + { + "epoch": 0.66, + "grad_norm": 2.8079062192354263, + "learning_rate": 5.460315863992191e-06, + "loss": 0.7866, + "step": 4315 + }, + { + "epoch": 0.66, + "grad_norm": 2.853060485157894, + "learning_rate": 5.455899166389875e-06, + "loss": 0.8018, + "step": 4316 + }, + { + "epoch": 0.66, + "grad_norm": 2.485773737267365, + "learning_rate": 5.451483585600096e-06, + "loss": 0.7539, + "step": 4317 + }, + { + "epoch": 0.66, + "grad_norm": 2.797910829886508, + "learning_rate": 5.447069122708086e-06, + "loss": 0.7892, + "step": 4318 + }, + { + "epoch": 0.66, + "grad_norm": 2.8726475904531275, + "learning_rate": 5.442655778798795e-06, + "loss": 0.7287, + "step": 4319 + }, + { + "epoch": 0.66, + "grad_norm": 2.964910825888747, + "learning_rate": 5.438243554956895e-06, + "loss": 0.9582, + "step": 4320 + }, + { + "epoch": 0.66, + "grad_norm": 2.6561943823597725, + "learning_rate": 5.433832452266794e-06, + "loss": 0.8632, + "step": 4321 + }, + { + "epoch": 0.66, + "grad_norm": 2.544080014332781, + "learning_rate": 5.429422471812612e-06, + "loss": 0.8035, + "step": 4322 + }, + { + "epoch": 0.66, + "grad_norm": 2.826157745342801, + "learning_rate": 5.425013614678197e-06, + "loss": 0.8831, + "step": 4323 + }, + { + "epoch": 0.66, + "grad_norm": 3.4112657605218684, + "learning_rate": 5.4206058819471276e-06, + "loss": 0.797, + "step": 4324 + }, + { + "epoch": 0.66, + "grad_norm": 2.4430557001274273, + "learning_rate": 5.416199274702699e-06, + "loss": 0.7924, + "step": 4325 + }, + { + "epoch": 0.66, + "grad_norm": 2.4328516564445186, + "learning_rate": 5.411793794027931e-06, + "loss": 0.727, + "step": 4326 + }, + { + "epoch": 0.66, + "grad_norm": 2.8558225094747867, + "learning_rate": 5.407389441005569e-06, + "loss": 0.7773, + "step": 4327 + }, + { + "epoch": 0.66, + "grad_norm": 2.5283966333337706, + "learning_rate": 5.402986216718071e-06, + "loss": 0.77, + "step": 4328 + }, + { + "epoch": 0.66, + "grad_norm": 2.728670654557776, + "learning_rate": 5.398584122247639e-06, + "loss": 0.8477, + "step": 4329 + }, + { + "epoch": 0.66, + "grad_norm": 2.8334669043005265, + "learning_rate": 5.394183158676178e-06, + "loss": 0.8122, + "step": 4330 + }, + { + "epoch": 0.66, + "grad_norm": 2.652568907162666, + "learning_rate": 5.389783327085317e-06, + "loss": 0.822, + "step": 4331 + }, + { + "epoch": 0.66, + "grad_norm": 2.3142620945835386, + "learning_rate": 5.38538462855642e-06, + "loss": 0.6978, + "step": 4332 + }, + { + "epoch": 0.66, + "grad_norm": 2.7730169314845914, + "learning_rate": 5.38098706417056e-06, + "loss": 0.807, + "step": 4333 + }, + { + "epoch": 0.66, + "grad_norm": 2.851402696586342, + "learning_rate": 5.376590635008531e-06, + "loss": 0.8454, + "step": 4334 + }, + { + "epoch": 0.66, + "grad_norm": 2.6112912494257206, + "learning_rate": 5.3721953421508585e-06, + "loss": 0.7354, + "step": 4335 + }, + { + "epoch": 0.66, + "grad_norm": 2.68768207215782, + "learning_rate": 5.367801186677779e-06, + "loss": 0.8525, + "step": 4336 + }, + { + "epoch": 0.66, + "grad_norm": 2.348257730715597, + "learning_rate": 5.363408169669253e-06, + "loss": 0.7565, + "step": 4337 + }, + { + "epoch": 0.66, + "grad_norm": 2.5730011308658454, + "learning_rate": 5.359016292204962e-06, + "loss": 0.7148, + "step": 4338 + }, + { + "epoch": 0.66, + "grad_norm": 2.820490788623341, + "learning_rate": 5.354625555364301e-06, + "loss": 0.7555, + "step": 4339 + }, + { + "epoch": 0.66, + "grad_norm": 2.407342488224494, + "learning_rate": 5.3502359602263975e-06, + "loss": 0.7254, + "step": 4340 + }, + { + "epoch": 0.66, + "grad_norm": 2.756876501448233, + "learning_rate": 5.345847507870087e-06, + "loss": 0.7881, + "step": 4341 + }, + { + "epoch": 0.66, + "grad_norm": 2.394704047096753, + "learning_rate": 5.341460199373925e-06, + "loss": 0.7396, + "step": 4342 + }, + { + "epoch": 0.66, + "grad_norm": 9.638217727897471, + "learning_rate": 5.337074035816197e-06, + "loss": 0.9129, + "step": 4343 + }, + { + "epoch": 0.66, + "grad_norm": 2.5261876067230253, + "learning_rate": 5.332689018274892e-06, + "loss": 0.7824, + "step": 4344 + }, + { + "epoch": 0.67, + "grad_norm": 2.95205594094719, + "learning_rate": 5.3283051478277244e-06, + "loss": 0.8592, + "step": 4345 + }, + { + "epoch": 0.67, + "grad_norm": 2.646327886755659, + "learning_rate": 5.323922425552133e-06, + "loss": 0.7524, + "step": 4346 + }, + { + "epoch": 0.67, + "grad_norm": 2.706809301435508, + "learning_rate": 5.319540852525264e-06, + "loss": 0.8686, + "step": 4347 + }, + { + "epoch": 0.67, + "grad_norm": 2.881161427865816, + "learning_rate": 5.315160429823984e-06, + "loss": 0.8364, + "step": 4348 + }, + { + "epoch": 0.67, + "grad_norm": 2.6895405962270083, + "learning_rate": 5.3107811585248806e-06, + "loss": 0.7875, + "step": 4349 + }, + { + "epoch": 0.67, + "grad_norm": 2.628565569216531, + "learning_rate": 5.306403039704249e-06, + "loss": 0.7816, + "step": 4350 + }, + { + "epoch": 0.67, + "grad_norm": 2.6822366932950317, + "learning_rate": 5.302026074438118e-06, + "loss": 0.7017, + "step": 4351 + }, + { + "epoch": 0.67, + "grad_norm": 2.39036593803038, + "learning_rate": 5.297650263802218e-06, + "loss": 0.7353, + "step": 4352 + }, + { + "epoch": 0.67, + "grad_norm": 2.8448153356191828, + "learning_rate": 5.293275608871997e-06, + "loss": 0.8072, + "step": 4353 + }, + { + "epoch": 0.67, + "grad_norm": 2.664536900824033, + "learning_rate": 5.28890211072263e-06, + "loss": 0.8026, + "step": 4354 + }, + { + "epoch": 0.67, + "grad_norm": 2.513815124466424, + "learning_rate": 5.284529770428997e-06, + "loss": 0.7549, + "step": 4355 + }, + { + "epoch": 0.67, + "grad_norm": 2.6869729516841416, + "learning_rate": 5.280158589065691e-06, + "loss": 0.8434, + "step": 4356 + }, + { + "epoch": 0.67, + "grad_norm": 3.658999602038804, + "learning_rate": 5.275788567707036e-06, + "loss": 0.9011, + "step": 4357 + }, + { + "epoch": 0.67, + "grad_norm": 2.625186278881105, + "learning_rate": 5.271419707427056e-06, + "loss": 0.8522, + "step": 4358 + }, + { + "epoch": 0.67, + "grad_norm": 2.8677496677154295, + "learning_rate": 5.267052009299494e-06, + "loss": 0.827, + "step": 4359 + }, + { + "epoch": 0.67, + "grad_norm": 3.046371336945512, + "learning_rate": 5.2626854743978065e-06, + "loss": 0.7203, + "step": 4360 + }, + { + "epoch": 0.67, + "grad_norm": 2.631751934773863, + "learning_rate": 5.258320103795162e-06, + "loss": 0.8268, + "step": 4361 + }, + { + "epoch": 0.67, + "grad_norm": 2.581228394407631, + "learning_rate": 5.253955898564456e-06, + "loss": 0.8345, + "step": 4362 + }, + { + "epoch": 0.67, + "grad_norm": 2.8536806404973762, + "learning_rate": 5.249592859778281e-06, + "loss": 0.9339, + "step": 4363 + }, + { + "epoch": 0.67, + "grad_norm": 3.381748068008239, + "learning_rate": 5.245230988508947e-06, + "loss": 0.8614, + "step": 4364 + }, + { + "epoch": 0.67, + "grad_norm": 2.6552696050449454, + "learning_rate": 5.240870285828489e-06, + "loss": 0.7775, + "step": 4365 + }, + { + "epoch": 0.67, + "grad_norm": 2.5855320318893997, + "learning_rate": 5.236510752808639e-06, + "loss": 0.6907, + "step": 4366 + }, + { + "epoch": 0.67, + "grad_norm": 2.844571555995099, + "learning_rate": 5.232152390520845e-06, + "loss": 0.8022, + "step": 4367 + }, + { + "epoch": 0.67, + "grad_norm": 2.6640972709004926, + "learning_rate": 5.227795200036279e-06, + "loss": 0.7807, + "step": 4368 + }, + { + "epoch": 0.67, + "grad_norm": 2.5939197694825897, + "learning_rate": 5.223439182425809e-06, + "loss": 0.7405, + "step": 4369 + }, + { + "epoch": 0.67, + "grad_norm": 2.4464154437815178, + "learning_rate": 5.219084338760025e-06, + "loss": 0.8124, + "step": 4370 + }, + { + "epoch": 0.67, + "grad_norm": 2.6917914716514706, + "learning_rate": 5.214730670109227e-06, + "loss": 0.8367, + "step": 4371 + }, + { + "epoch": 0.67, + "grad_norm": 2.3822181150948696, + "learning_rate": 5.210378177543416e-06, + "loss": 0.7319, + "step": 4372 + }, + { + "epoch": 0.67, + "grad_norm": 2.6997257949647064, + "learning_rate": 5.206026862132324e-06, + "loss": 0.7314, + "step": 4373 + }, + { + "epoch": 0.67, + "grad_norm": 2.603477960959868, + "learning_rate": 5.201676724945377e-06, + "loss": 0.8084, + "step": 4374 + }, + { + "epoch": 0.67, + "grad_norm": 2.6164521221019186, + "learning_rate": 5.197327767051713e-06, + "loss": 0.7139, + "step": 4375 + }, + { + "epoch": 0.67, + "grad_norm": 2.661036351334042, + "learning_rate": 5.192979989520193e-06, + "loss": 0.7833, + "step": 4376 + }, + { + "epoch": 0.67, + "grad_norm": 2.609577223160548, + "learning_rate": 5.188633393419371e-06, + "loss": 0.8232, + "step": 4377 + }, + { + "epoch": 0.67, + "grad_norm": 2.481068145446453, + "learning_rate": 5.184287979817519e-06, + "loss": 0.7904, + "step": 4378 + }, + { + "epoch": 0.67, + "grad_norm": 2.989034486269615, + "learning_rate": 5.179943749782623e-06, + "loss": 0.803, + "step": 4379 + }, + { + "epoch": 0.67, + "grad_norm": 2.7008982367335648, + "learning_rate": 5.175600704382371e-06, + "loss": 0.804, + "step": 4380 + }, + { + "epoch": 0.67, + "grad_norm": 2.5881479536178684, + "learning_rate": 5.171258844684155e-06, + "loss": 0.8173, + "step": 4381 + }, + { + "epoch": 0.67, + "grad_norm": 2.813293310899786, + "learning_rate": 5.166918171755097e-06, + "loss": 0.7653, + "step": 4382 + }, + { + "epoch": 0.67, + "grad_norm": 2.844039680058116, + "learning_rate": 5.1625786866619955e-06, + "loss": 0.8475, + "step": 4383 + }, + { + "epoch": 0.67, + "grad_norm": 3.2181719756764, + "learning_rate": 5.158240390471385e-06, + "loss": 0.7642, + "step": 4384 + }, + { + "epoch": 0.67, + "grad_norm": 2.606973473588288, + "learning_rate": 5.153903284249495e-06, + "loss": 0.712, + "step": 4385 + }, + { + "epoch": 0.67, + "grad_norm": 3.2000365788149314, + "learning_rate": 5.149567369062261e-06, + "loss": 0.686, + "step": 4386 + }, + { + "epoch": 0.67, + "grad_norm": 3.9467566513323096, + "learning_rate": 5.145232645975336e-06, + "loss": 0.8953, + "step": 4387 + }, + { + "epoch": 0.67, + "grad_norm": 2.7557886814227865, + "learning_rate": 5.140899116054068e-06, + "loss": 0.8869, + "step": 4388 + }, + { + "epoch": 0.67, + "grad_norm": 2.674056263695096, + "learning_rate": 5.136566780363515e-06, + "loss": 0.7777, + "step": 4389 + }, + { + "epoch": 0.67, + "grad_norm": 2.617838450703711, + "learning_rate": 5.1322356399684525e-06, + "loss": 0.8102, + "step": 4390 + }, + { + "epoch": 0.67, + "grad_norm": 2.8643287708523495, + "learning_rate": 5.127905695933343e-06, + "loss": 0.7876, + "step": 4391 + }, + { + "epoch": 0.67, + "grad_norm": 3.567823294869154, + "learning_rate": 5.123576949322375e-06, + "loss": 0.9056, + "step": 4392 + }, + { + "epoch": 0.67, + "grad_norm": 2.687122412429552, + "learning_rate": 5.119249401199428e-06, + "loss": 0.7666, + "step": 4393 + }, + { + "epoch": 0.67, + "grad_norm": 2.5083630162065336, + "learning_rate": 5.114923052628092e-06, + "loss": 0.7502, + "step": 4394 + }, + { + "epoch": 0.67, + "grad_norm": 2.464663395166503, + "learning_rate": 5.110597904671664e-06, + "loss": 0.745, + "step": 4395 + }, + { + "epoch": 0.67, + "grad_norm": 2.8353050509079107, + "learning_rate": 5.106273958393142e-06, + "loss": 0.7786, + "step": 4396 + }, + { + "epoch": 0.67, + "grad_norm": 2.872102247273557, + "learning_rate": 5.101951214855226e-06, + "loss": 0.8282, + "step": 4397 + }, + { + "epoch": 0.67, + "grad_norm": 2.730776359515011, + "learning_rate": 5.097629675120336e-06, + "loss": 0.8991, + "step": 4398 + }, + { + "epoch": 0.67, + "grad_norm": 2.7406506612238153, + "learning_rate": 5.093309340250578e-06, + "loss": 0.8758, + "step": 4399 + }, + { + "epoch": 0.67, + "grad_norm": 2.773308560374021, + "learning_rate": 5.0889902113077695e-06, + "loss": 0.8074, + "step": 4400 + }, + { + "epoch": 0.67, + "grad_norm": 2.845091034005279, + "learning_rate": 5.084672289353435e-06, + "loss": 0.8554, + "step": 4401 + }, + { + "epoch": 0.67, + "grad_norm": 2.7187441930311462, + "learning_rate": 5.080355575448792e-06, + "loss": 0.7727, + "step": 4402 + }, + { + "epoch": 0.67, + "grad_norm": 2.5786635844494543, + "learning_rate": 5.076040070654778e-06, + "loss": 0.8989, + "step": 4403 + }, + { + "epoch": 0.67, + "grad_norm": 3.5742272068838417, + "learning_rate": 5.071725776032015e-06, + "loss": 0.7542, + "step": 4404 + }, + { + "epoch": 0.67, + "grad_norm": 2.389808986839853, + "learning_rate": 5.067412692640839e-06, + "loss": 0.7614, + "step": 4405 + }, + { + "epoch": 0.67, + "grad_norm": 2.754195864365841, + "learning_rate": 5.063100821541281e-06, + "loss": 0.8037, + "step": 4406 + }, + { + "epoch": 0.67, + "grad_norm": 2.867365347124313, + "learning_rate": 5.058790163793083e-06, + "loss": 0.744, + "step": 4407 + }, + { + "epoch": 0.67, + "grad_norm": 2.629332336481846, + "learning_rate": 5.054480720455677e-06, + "loss": 0.7491, + "step": 4408 + }, + { + "epoch": 0.67, + "grad_norm": 2.603839244071808, + "learning_rate": 5.05017249258821e-06, + "loss": 0.9164, + "step": 4409 + }, + { + "epoch": 0.68, + "grad_norm": 2.702141953179618, + "learning_rate": 5.045865481249523e-06, + "loss": 0.784, + "step": 4410 + }, + { + "epoch": 0.68, + "grad_norm": 3.5178397816644646, + "learning_rate": 5.041559687498152e-06, + "loss": 0.8736, + "step": 4411 + }, + { + "epoch": 0.68, + "grad_norm": 2.5196806497253896, + "learning_rate": 5.03725511239235e-06, + "loss": 0.8599, + "step": 4412 + }, + { + "epoch": 0.68, + "grad_norm": 3.4533128431782645, + "learning_rate": 5.03295175699005e-06, + "loss": 0.8331, + "step": 4413 + }, + { + "epoch": 0.68, + "grad_norm": 2.5491905479689083, + "learning_rate": 5.0286496223489075e-06, + "loss": 0.8176, + "step": 4414 + }, + { + "epoch": 0.68, + "grad_norm": 2.667131326142279, + "learning_rate": 5.0243487095262615e-06, + "loss": 0.8368, + "step": 4415 + }, + { + "epoch": 0.68, + "grad_norm": 2.507835981837654, + "learning_rate": 5.020049019579154e-06, + "loss": 0.8017, + "step": 4416 + }, + { + "epoch": 0.68, + "grad_norm": 2.7554033825056803, + "learning_rate": 5.015750553564331e-06, + "loss": 0.8758, + "step": 4417 + }, + { + "epoch": 0.68, + "grad_norm": 2.5901696392138494, + "learning_rate": 5.011453312538233e-06, + "loss": 0.7932, + "step": 4418 + }, + { + "epoch": 0.68, + "grad_norm": 2.6740393141937977, + "learning_rate": 5.007157297556997e-06, + "loss": 0.725, + "step": 4419 + }, + { + "epoch": 0.68, + "grad_norm": 2.805224200929592, + "learning_rate": 5.002862509676471e-06, + "loss": 0.8622, + "step": 4420 + }, + { + "epoch": 0.68, + "grad_norm": 2.675242937946728, + "learning_rate": 4.998568949952192e-06, + "loss": 0.8288, + "step": 4421 + }, + { + "epoch": 0.68, + "grad_norm": 3.6625074354824894, + "learning_rate": 4.99427661943939e-06, + "loss": 0.8326, + "step": 4422 + }, + { + "epoch": 0.68, + "grad_norm": 2.4338348325708035, + "learning_rate": 4.989985519193008e-06, + "loss": 0.7297, + "step": 4423 + }, + { + "epoch": 0.68, + "grad_norm": 2.762201107569858, + "learning_rate": 4.98569565026767e-06, + "loss": 0.8451, + "step": 4424 + }, + { + "epoch": 0.68, + "grad_norm": 2.6072736360156683, + "learning_rate": 4.981407013717714e-06, + "loss": 0.7758, + "step": 4425 + }, + { + "epoch": 0.68, + "grad_norm": 2.758225364934191, + "learning_rate": 4.977119610597163e-06, + "loss": 0.8361, + "step": 4426 + }, + { + "epoch": 0.68, + "grad_norm": 2.542021886033243, + "learning_rate": 4.972833441959739e-06, + "loss": 0.7494, + "step": 4427 + }, + { + "epoch": 0.68, + "grad_norm": 2.8372559940748414, + "learning_rate": 4.968548508858863e-06, + "loss": 0.7811, + "step": 4428 + }, + { + "epoch": 0.68, + "grad_norm": 2.851293790673043, + "learning_rate": 4.964264812347651e-06, + "loss": 0.6002, + "step": 4429 + }, + { + "epoch": 0.68, + "grad_norm": 2.5694384446828313, + "learning_rate": 4.959982353478911e-06, + "loss": 0.7817, + "step": 4430 + }, + { + "epoch": 0.68, + "grad_norm": 2.8666717398217996, + "learning_rate": 4.955701133305162e-06, + "loss": 0.8643, + "step": 4431 + }, + { + "epoch": 0.68, + "grad_norm": 2.521520650821122, + "learning_rate": 4.9514211528786e-06, + "loss": 0.8003, + "step": 4432 + }, + { + "epoch": 0.68, + "grad_norm": 2.361515154255389, + "learning_rate": 4.9471424132511224e-06, + "loss": 0.7376, + "step": 4433 + }, + { + "epoch": 0.68, + "grad_norm": 2.510640568551166, + "learning_rate": 4.942864915474331e-06, + "loss": 0.827, + "step": 4434 + }, + { + "epoch": 0.68, + "grad_norm": 2.53168907265518, + "learning_rate": 4.9385886605995075e-06, + "loss": 0.7962, + "step": 4435 + }, + { + "epoch": 0.68, + "grad_norm": 2.392625479649822, + "learning_rate": 4.934313649677641e-06, + "loss": 0.6972, + "step": 4436 + }, + { + "epoch": 0.68, + "grad_norm": 2.537304145927643, + "learning_rate": 4.9300398837594076e-06, + "loss": 0.771, + "step": 4437 + }, + { + "epoch": 0.68, + "grad_norm": 2.610679280539628, + "learning_rate": 4.925767363895179e-06, + "loss": 0.7721, + "step": 4438 + }, + { + "epoch": 0.68, + "grad_norm": 2.9022231835808507, + "learning_rate": 4.9214960911350185e-06, + "loss": 0.753, + "step": 4439 + }, + { + "epoch": 0.68, + "grad_norm": 2.7496681277602684, + "learning_rate": 4.917226066528689e-06, + "loss": 0.7837, + "step": 4440 + }, + { + "epoch": 0.68, + "grad_norm": 2.6893999122809604, + "learning_rate": 4.912957291125635e-06, + "loss": 0.7454, + "step": 4441 + }, + { + "epoch": 0.68, + "grad_norm": 2.740717636172206, + "learning_rate": 4.908689765975012e-06, + "loss": 0.807, + "step": 4442 + }, + { + "epoch": 0.68, + "grad_norm": 2.4983590928601926, + "learning_rate": 4.904423492125653e-06, + "loss": 0.7373, + "step": 4443 + }, + { + "epoch": 0.68, + "grad_norm": 2.6187129268123734, + "learning_rate": 4.900158470626085e-06, + "loss": 0.7673, + "step": 4444 + }, + { + "epoch": 0.68, + "grad_norm": 2.6988355035420315, + "learning_rate": 4.895894702524538e-06, + "loss": 0.7332, + "step": 4445 + }, + { + "epoch": 0.68, + "grad_norm": 2.802231068626159, + "learning_rate": 4.891632188868921e-06, + "loss": 0.7753, + "step": 4446 + }, + { + "epoch": 0.68, + "grad_norm": 2.4975003336574, + "learning_rate": 4.887370930706845e-06, + "loss": 0.7797, + "step": 4447 + }, + { + "epoch": 0.68, + "grad_norm": 2.9680313023184275, + "learning_rate": 4.8831109290856046e-06, + "loss": 0.8899, + "step": 4448 + }, + { + "epoch": 0.68, + "grad_norm": 2.727326968393698, + "learning_rate": 4.8788521850521904e-06, + "loss": 0.826, + "step": 4449 + }, + { + "epoch": 0.68, + "grad_norm": 2.7302688744437136, + "learning_rate": 4.874594699653281e-06, + "loss": 0.8682, + "step": 4450 + }, + { + "epoch": 0.68, + "grad_norm": 3.492593665785906, + "learning_rate": 4.870338473935246e-06, + "loss": 0.7884, + "step": 4451 + }, + { + "epoch": 0.68, + "grad_norm": 2.5666077831783607, + "learning_rate": 4.866083508944145e-06, + "loss": 0.7734, + "step": 4452 + }, + { + "epoch": 0.68, + "grad_norm": 2.9646458483096367, + "learning_rate": 4.8618298057257355e-06, + "loss": 0.6841, + "step": 4453 + }, + { + "epoch": 0.68, + "grad_norm": 2.566984184996757, + "learning_rate": 4.857577365325452e-06, + "loss": 0.7161, + "step": 4454 + }, + { + "epoch": 0.68, + "grad_norm": 3.576733376569184, + "learning_rate": 4.853326188788425e-06, + "loss": 0.8825, + "step": 4455 + }, + { + "epoch": 0.68, + "grad_norm": 2.8115505630557847, + "learning_rate": 4.849076277159481e-06, + "loss": 0.9294, + "step": 4456 + }, + { + "epoch": 0.68, + "grad_norm": 2.944850469212431, + "learning_rate": 4.844827631483121e-06, + "loss": 0.7999, + "step": 4457 + }, + { + "epoch": 0.68, + "grad_norm": 2.708885597821722, + "learning_rate": 4.840580252803552e-06, + "loss": 0.7353, + "step": 4458 + }, + { + "epoch": 0.68, + "grad_norm": 2.944819532038206, + "learning_rate": 4.836334142164654e-06, + "loss": 0.8249, + "step": 4459 + }, + { + "epoch": 0.68, + "grad_norm": 2.739948538835541, + "learning_rate": 4.832089300610003e-06, + "loss": 0.8552, + "step": 4460 + }, + { + "epoch": 0.68, + "grad_norm": 2.7541977968082025, + "learning_rate": 4.8278457291828625e-06, + "loss": 0.7478, + "step": 4461 + }, + { + "epoch": 0.68, + "grad_norm": 2.765934044481929, + "learning_rate": 4.823603428926185e-06, + "loss": 0.8427, + "step": 4462 + }, + { + "epoch": 0.68, + "grad_norm": 2.62684539168531, + "learning_rate": 4.819362400882602e-06, + "loss": 0.7443, + "step": 4463 + }, + { + "epoch": 0.68, + "grad_norm": 2.666412253891732, + "learning_rate": 4.815122646094448e-06, + "loss": 0.8608, + "step": 4464 + }, + { + "epoch": 0.68, + "grad_norm": 2.58003459090976, + "learning_rate": 4.8108841656037295e-06, + "loss": 0.829, + "step": 4465 + }, + { + "epoch": 0.68, + "grad_norm": 2.540279478810456, + "learning_rate": 4.806646960452151e-06, + "loss": 0.7526, + "step": 4466 + }, + { + "epoch": 0.68, + "grad_norm": 2.5625038092096126, + "learning_rate": 4.802411031681099e-06, + "loss": 0.8165, + "step": 4467 + }, + { + "epoch": 0.68, + "grad_norm": 2.9700586139058855, + "learning_rate": 4.798176380331638e-06, + "loss": 0.7515, + "step": 4468 + }, + { + "epoch": 0.68, + "grad_norm": 2.8773827775527314, + "learning_rate": 4.793943007444536e-06, + "loss": 0.6777, + "step": 4469 + }, + { + "epoch": 0.68, + "grad_norm": 2.6582906590904454, + "learning_rate": 4.789710914060234e-06, + "loss": 0.8083, + "step": 4470 + }, + { + "epoch": 0.68, + "grad_norm": 2.528804350071189, + "learning_rate": 4.785480101218861e-06, + "loss": 0.7438, + "step": 4471 + }, + { + "epoch": 0.68, + "grad_norm": 2.542769406287971, + "learning_rate": 4.781250569960233e-06, + "loss": 0.8081, + "step": 4472 + }, + { + "epoch": 0.68, + "grad_norm": 2.693399220911003, + "learning_rate": 4.777022321323849e-06, + "loss": 0.9128, + "step": 4473 + }, + { + "epoch": 0.68, + "grad_norm": 2.6851606809802746, + "learning_rate": 4.77279535634889e-06, + "loss": 0.8111, + "step": 4474 + }, + { + "epoch": 0.68, + "grad_norm": 2.5435598261018204, + "learning_rate": 4.768569676074235e-06, + "loss": 0.764, + "step": 4475 + }, + { + "epoch": 0.69, + "grad_norm": 2.584009644323771, + "learning_rate": 4.764345281538428e-06, + "loss": 0.7162, + "step": 4476 + }, + { + "epoch": 0.69, + "grad_norm": 2.62466241921889, + "learning_rate": 4.760122173779715e-06, + "loss": 0.8222, + "step": 4477 + }, + { + "epoch": 0.69, + "grad_norm": 2.533256258740026, + "learning_rate": 4.755900353836015e-06, + "loss": 0.7704, + "step": 4478 + }, + { + "epoch": 0.69, + "grad_norm": 3.861712170497619, + "learning_rate": 4.751679822744928e-06, + "loss": 0.8504, + "step": 4479 + }, + { + "epoch": 0.69, + "grad_norm": 2.7018656507970547, + "learning_rate": 4.747460581543749e-06, + "loss": 0.7879, + "step": 4480 + }, + { + "epoch": 0.69, + "grad_norm": 2.7388923120779887, + "learning_rate": 4.743242631269445e-06, + "loss": 0.7889, + "step": 4481 + }, + { + "epoch": 0.69, + "grad_norm": 2.855257683308518, + "learning_rate": 4.739025972958673e-06, + "loss": 0.8472, + "step": 4482 + }, + { + "epoch": 0.69, + "grad_norm": 2.845673836028014, + "learning_rate": 4.734810607647766e-06, + "loss": 0.7232, + "step": 4483 + }, + { + "epoch": 0.69, + "grad_norm": 2.825542516978436, + "learning_rate": 4.730596536372745e-06, + "loss": 0.7726, + "step": 4484 + }, + { + "epoch": 0.69, + "grad_norm": 2.3857235505175716, + "learning_rate": 4.726383760169304e-06, + "loss": 0.7034, + "step": 4485 + }, + { + "epoch": 0.69, + "grad_norm": 2.530643930906867, + "learning_rate": 4.722172280072835e-06, + "loss": 0.7931, + "step": 4486 + }, + { + "epoch": 0.69, + "grad_norm": 2.657113264744651, + "learning_rate": 4.717962097118394e-06, + "loss": 0.8575, + "step": 4487 + }, + { + "epoch": 0.69, + "grad_norm": 2.713785602161177, + "learning_rate": 4.713753212340732e-06, + "loss": 0.8137, + "step": 4488 + }, + { + "epoch": 0.69, + "grad_norm": 2.5858389823910195, + "learning_rate": 4.709545626774273e-06, + "loss": 0.8343, + "step": 4489 + }, + { + "epoch": 0.69, + "grad_norm": 2.556127744178826, + "learning_rate": 4.705339341453119e-06, + "loss": 0.7033, + "step": 4490 + }, + { + "epoch": 0.69, + "grad_norm": 2.370308093136086, + "learning_rate": 4.701134357411065e-06, + "loss": 0.7872, + "step": 4491 + }, + { + "epoch": 0.69, + "grad_norm": 2.740060254795805, + "learning_rate": 4.696930675681571e-06, + "loss": 0.7007, + "step": 4492 + }, + { + "epoch": 0.69, + "grad_norm": 2.632838666024672, + "learning_rate": 4.692728297297785e-06, + "loss": 0.8182, + "step": 4493 + }, + { + "epoch": 0.69, + "grad_norm": 2.703412372670818, + "learning_rate": 4.6885272232925426e-06, + "loss": 0.8383, + "step": 4494 + }, + { + "epoch": 0.69, + "grad_norm": 2.549367641767443, + "learning_rate": 4.68432745469834e-06, + "loss": 0.8128, + "step": 4495 + }, + { + "epoch": 0.69, + "grad_norm": 2.5904592992356466, + "learning_rate": 4.68012899254736e-06, + "loss": 0.7876, + "step": 4496 + }, + { + "epoch": 0.69, + "grad_norm": 2.8401619973861996, + "learning_rate": 4.675931837871477e-06, + "loss": 0.7499, + "step": 4497 + }, + { + "epoch": 0.69, + "grad_norm": 3.0015771959374127, + "learning_rate": 4.671735991702225e-06, + "loss": 0.9088, + "step": 4498 + }, + { + "epoch": 0.69, + "grad_norm": 3.5500184314648617, + "learning_rate": 4.667541455070834e-06, + "loss": 0.7474, + "step": 4499 + }, + { + "epoch": 0.69, + "grad_norm": 2.6213439037770105, + "learning_rate": 4.663348229008199e-06, + "loss": 0.7026, + "step": 4500 + }, + { + "epoch": 0.69, + "grad_norm": 2.6287716051634447, + "learning_rate": 4.659156314544893e-06, + "loss": 0.8177, + "step": 4501 + }, + { + "epoch": 0.69, + "grad_norm": 2.459424953526767, + "learning_rate": 4.654965712711178e-06, + "loss": 0.7863, + "step": 4502 + }, + { + "epoch": 0.69, + "grad_norm": 2.587294325712288, + "learning_rate": 4.6507764245369855e-06, + "loss": 0.8057, + "step": 4503 + }, + { + "epoch": 0.69, + "grad_norm": 3.396969034639195, + "learning_rate": 4.646588451051919e-06, + "loss": 0.7544, + "step": 4504 + }, + { + "epoch": 0.69, + "grad_norm": 2.6668852980000266, + "learning_rate": 4.642401793285271e-06, + "loss": 0.8272, + "step": 4505 + }, + { + "epoch": 0.69, + "grad_norm": 2.5722160264858815, + "learning_rate": 4.6382164522660055e-06, + "loss": 0.7214, + "step": 4506 + }, + { + "epoch": 0.69, + "grad_norm": 2.8202500748321375, + "learning_rate": 4.634032429022758e-06, + "loss": 0.8385, + "step": 4507 + }, + { + "epoch": 0.69, + "grad_norm": 4.265580443189982, + "learning_rate": 4.629849724583846e-06, + "loss": 0.9186, + "step": 4508 + }, + { + "epoch": 0.69, + "grad_norm": 2.7453839692216855, + "learning_rate": 4.625668339977255e-06, + "loss": 0.7932, + "step": 4509 + }, + { + "epoch": 0.69, + "grad_norm": 2.625301799758122, + "learning_rate": 4.621488276230662e-06, + "loss": 0.8853, + "step": 4510 + }, + { + "epoch": 0.69, + "grad_norm": 2.4718008492512764, + "learning_rate": 4.617309534371404e-06, + "loss": 0.7119, + "step": 4511 + }, + { + "epoch": 0.69, + "grad_norm": 2.641255755127557, + "learning_rate": 4.613132115426496e-06, + "loss": 0.7424, + "step": 4512 + }, + { + "epoch": 0.69, + "grad_norm": 2.5223496054941363, + "learning_rate": 4.608956020422638e-06, + "loss": 0.7684, + "step": 4513 + }, + { + "epoch": 0.69, + "grad_norm": 2.561888272311924, + "learning_rate": 4.604781250386191e-06, + "loss": 0.8153, + "step": 4514 + }, + { + "epoch": 0.69, + "grad_norm": 2.4367407854667174, + "learning_rate": 4.600607806343196e-06, + "loss": 0.682, + "step": 4515 + }, + { + "epoch": 0.69, + "grad_norm": 2.7086669415237523, + "learning_rate": 4.596435689319374e-06, + "loss": 0.8494, + "step": 4516 + }, + { + "epoch": 0.69, + "grad_norm": 2.840924918015386, + "learning_rate": 4.59226490034011e-06, + "loss": 0.8094, + "step": 4517 + }, + { + "epoch": 0.69, + "grad_norm": 2.3463652209327797, + "learning_rate": 4.588095440430469e-06, + "loss": 0.7552, + "step": 4518 + }, + { + "epoch": 0.69, + "grad_norm": 2.608067751840558, + "learning_rate": 4.583927310615185e-06, + "loss": 0.8524, + "step": 4519 + }, + { + "epoch": 0.69, + "grad_norm": 2.5728811216388427, + "learning_rate": 4.579760511918666e-06, + "loss": 0.7234, + "step": 4520 + }, + { + "epoch": 0.69, + "grad_norm": 2.688885009168033, + "learning_rate": 4.575595045365e-06, + "loss": 0.8409, + "step": 4521 + }, + { + "epoch": 0.69, + "grad_norm": 2.5940511926032985, + "learning_rate": 4.5714309119779385e-06, + "loss": 0.7412, + "step": 4522 + }, + { + "epoch": 0.69, + "grad_norm": 2.6332325498634135, + "learning_rate": 4.567268112780906e-06, + "loss": 0.8335, + "step": 4523 + }, + { + "epoch": 0.69, + "grad_norm": 2.7237828697539728, + "learning_rate": 4.563106648797008e-06, + "loss": 0.8518, + "step": 4524 + }, + { + "epoch": 0.69, + "grad_norm": 2.5714001553129493, + "learning_rate": 4.558946521049011e-06, + "loss": 0.8423, + "step": 4525 + }, + { + "epoch": 0.69, + "grad_norm": 2.6647391578755824, + "learning_rate": 4.554787730559357e-06, + "loss": 0.8729, + "step": 4526 + }, + { + "epoch": 0.69, + "grad_norm": 4.669061939275215, + "learning_rate": 4.550630278350165e-06, + "loss": 0.9408, + "step": 4527 + }, + { + "epoch": 0.69, + "grad_norm": 2.508578076573883, + "learning_rate": 4.546474165443219e-06, + "loss": 0.8723, + "step": 4528 + }, + { + "epoch": 0.69, + "grad_norm": 2.687921052688467, + "learning_rate": 4.542319392859972e-06, + "loss": 0.7999, + "step": 4529 + }, + { + "epoch": 0.69, + "grad_norm": 2.492568968275793, + "learning_rate": 4.538165961621552e-06, + "loss": 0.7753, + "step": 4530 + }, + { + "epoch": 0.69, + "grad_norm": 2.526572412400771, + "learning_rate": 4.5340138727487505e-06, + "loss": 0.8778, + "step": 4531 + }, + { + "epoch": 0.69, + "grad_norm": 2.872178179667303, + "learning_rate": 4.529863127262045e-06, + "loss": 0.7821, + "step": 4532 + }, + { + "epoch": 0.69, + "grad_norm": 2.6176834783726637, + "learning_rate": 4.525713726181567e-06, + "loss": 0.7875, + "step": 4533 + }, + { + "epoch": 0.69, + "grad_norm": 2.7386225860310667, + "learning_rate": 4.521565670527119e-06, + "loss": 0.896, + "step": 4534 + }, + { + "epoch": 0.69, + "grad_norm": 2.5227923641482493, + "learning_rate": 4.517418961318185e-06, + "loss": 0.8407, + "step": 4535 + }, + { + "epoch": 0.69, + "grad_norm": 2.91481708532559, + "learning_rate": 4.513273599573906e-06, + "loss": 0.853, + "step": 4536 + }, + { + "epoch": 0.69, + "grad_norm": 2.4922437195264537, + "learning_rate": 4.50912958631309e-06, + "loss": 0.7683, + "step": 4537 + }, + { + "epoch": 0.69, + "grad_norm": 2.526874141534352, + "learning_rate": 4.504986922554229e-06, + "loss": 0.7781, + "step": 4538 + }, + { + "epoch": 0.69, + "grad_norm": 2.4559635537854567, + "learning_rate": 4.500845609315468e-06, + "loss": 0.8497, + "step": 4539 + }, + { + "epoch": 0.69, + "grad_norm": 2.524651553011375, + "learning_rate": 4.496705647614628e-06, + "loss": 0.6886, + "step": 4540 + }, + { + "epoch": 0.7, + "grad_norm": 2.8576812336465296, + "learning_rate": 4.492567038469194e-06, + "loss": 0.7955, + "step": 4541 + }, + { + "epoch": 0.7, + "grad_norm": 2.7588167195369295, + "learning_rate": 4.488429782896315e-06, + "loss": 0.7579, + "step": 4542 + }, + { + "epoch": 0.7, + "grad_norm": 2.5805819596962962, + "learning_rate": 4.484293881912823e-06, + "loss": 0.8285, + "step": 4543 + }, + { + "epoch": 0.7, + "grad_norm": 2.5669736328115, + "learning_rate": 4.4801593365352e-06, + "loss": 0.8195, + "step": 4544 + }, + { + "epoch": 0.7, + "grad_norm": 2.645156566188588, + "learning_rate": 4.4760261477796e-06, + "loss": 0.782, + "step": 4545 + }, + { + "epoch": 0.7, + "grad_norm": 2.8599751400315565, + "learning_rate": 4.47189431666185e-06, + "loss": 0.7474, + "step": 4546 + }, + { + "epoch": 0.7, + "grad_norm": 2.618919615293604, + "learning_rate": 4.4677638441974344e-06, + "loss": 0.7622, + "step": 4547 + }, + { + "epoch": 0.7, + "grad_norm": 2.5745283079506494, + "learning_rate": 4.463634731401506e-06, + "loss": 0.7358, + "step": 4548 + }, + { + "epoch": 0.7, + "grad_norm": 2.768223479539422, + "learning_rate": 4.459506979288891e-06, + "loss": 0.8084, + "step": 4549 + }, + { + "epoch": 0.7, + "grad_norm": 2.66595138445493, + "learning_rate": 4.455380588874072e-06, + "loss": 0.859, + "step": 4550 + }, + { + "epoch": 0.7, + "grad_norm": 2.5302475002778815, + "learning_rate": 4.451255561171199e-06, + "loss": 0.8463, + "step": 4551 + }, + { + "epoch": 0.7, + "grad_norm": 2.828214551033672, + "learning_rate": 4.447131897194089e-06, + "loss": 0.8653, + "step": 4552 + }, + { + "epoch": 0.7, + "grad_norm": 2.577556922731538, + "learning_rate": 4.443009597956219e-06, + "loss": 0.8231, + "step": 4553 + }, + { + "epoch": 0.7, + "grad_norm": 2.616616110701205, + "learning_rate": 4.4388886644707415e-06, + "loss": 0.8405, + "step": 4554 + }, + { + "epoch": 0.7, + "grad_norm": 2.6355056781615405, + "learning_rate": 4.434769097750463e-06, + "loss": 0.9045, + "step": 4555 + }, + { + "epoch": 0.7, + "grad_norm": 2.464200613183843, + "learning_rate": 4.4306508988078545e-06, + "loss": 0.7632, + "step": 4556 + }, + { + "epoch": 0.7, + "grad_norm": 2.5987579746298515, + "learning_rate": 4.42653406865506e-06, + "loss": 0.7914, + "step": 4557 + }, + { + "epoch": 0.7, + "grad_norm": 3.396848709039805, + "learning_rate": 4.422418608303878e-06, + "loss": 0.8756, + "step": 4558 + }, + { + "epoch": 0.7, + "grad_norm": 2.564281271524151, + "learning_rate": 4.418304518765768e-06, + "loss": 0.766, + "step": 4559 + }, + { + "epoch": 0.7, + "grad_norm": 2.893019951497073, + "learning_rate": 4.414191801051868e-06, + "loss": 0.9191, + "step": 4560 + }, + { + "epoch": 0.7, + "grad_norm": 2.643477472378628, + "learning_rate": 4.410080456172963e-06, + "loss": 0.7728, + "step": 4561 + }, + { + "epoch": 0.7, + "grad_norm": 3.191959381783351, + "learning_rate": 4.4059704851395066e-06, + "loss": 0.7745, + "step": 4562 + }, + { + "epoch": 0.7, + "grad_norm": 2.7803203795034017, + "learning_rate": 4.401861888961614e-06, + "loss": 0.8424, + "step": 4563 + }, + { + "epoch": 0.7, + "grad_norm": 2.6668926762762557, + "learning_rate": 4.39775466864906e-06, + "loss": 0.8434, + "step": 4564 + }, + { + "epoch": 0.7, + "grad_norm": 3.2259726953810026, + "learning_rate": 4.393648825211292e-06, + "loss": 0.8012, + "step": 4565 + }, + { + "epoch": 0.7, + "grad_norm": 2.485281211765713, + "learning_rate": 4.389544359657406e-06, + "loss": 0.7123, + "step": 4566 + }, + { + "epoch": 0.7, + "grad_norm": 2.7361707651748524, + "learning_rate": 4.385441272996163e-06, + "loss": 0.8163, + "step": 4567 + }, + { + "epoch": 0.7, + "grad_norm": 2.6342215617089617, + "learning_rate": 4.381339566235991e-06, + "loss": 0.7742, + "step": 4568 + }, + { + "epoch": 0.7, + "grad_norm": 3.4953478324533265, + "learning_rate": 4.3772392403849725e-06, + "loss": 0.8489, + "step": 4569 + }, + { + "epoch": 0.7, + "grad_norm": 2.8254277012049007, + "learning_rate": 4.373140296450849e-06, + "loss": 0.7949, + "step": 4570 + }, + { + "epoch": 0.7, + "grad_norm": 2.447166580679297, + "learning_rate": 4.369042735441034e-06, + "loss": 0.6744, + "step": 4571 + }, + { + "epoch": 0.7, + "grad_norm": 2.5148934014001267, + "learning_rate": 4.364946558362587e-06, + "loss": 0.7639, + "step": 4572 + }, + { + "epoch": 0.7, + "grad_norm": 2.631815990263399, + "learning_rate": 4.360851766222236e-06, + "loss": 0.7963, + "step": 4573 + }, + { + "epoch": 0.7, + "grad_norm": 2.7870802311280447, + "learning_rate": 4.356758360026364e-06, + "loss": 0.7633, + "step": 4574 + }, + { + "epoch": 0.7, + "grad_norm": 2.6983783957529743, + "learning_rate": 4.352666340781014e-06, + "loss": 0.7501, + "step": 4575 + }, + { + "epoch": 0.7, + "grad_norm": 2.7473802269485996, + "learning_rate": 4.348575709491895e-06, + "loss": 0.7177, + "step": 4576 + }, + { + "epoch": 0.7, + "grad_norm": 2.7337053133225835, + "learning_rate": 4.344486467164367e-06, + "loss": 0.8403, + "step": 4577 + }, + { + "epoch": 0.7, + "grad_norm": 2.5316406942155516, + "learning_rate": 4.340398614803446e-06, + "loss": 0.7514, + "step": 4578 + }, + { + "epoch": 0.7, + "grad_norm": 2.616088844833526, + "learning_rate": 4.336312153413821e-06, + "loss": 0.735, + "step": 4579 + }, + { + "epoch": 0.7, + "grad_norm": 2.5590860827417563, + "learning_rate": 4.332227083999824e-06, + "loss": 0.7324, + "step": 4580 + }, + { + "epoch": 0.7, + "grad_norm": 2.823463575405153, + "learning_rate": 4.328143407565446e-06, + "loss": 0.8872, + "step": 4581 + }, + { + "epoch": 0.7, + "grad_norm": 2.5845287530717598, + "learning_rate": 4.3240611251143504e-06, + "loss": 0.8051, + "step": 4582 + }, + { + "epoch": 0.7, + "grad_norm": 2.8261552773288603, + "learning_rate": 4.319980237649842e-06, + "loss": 0.8085, + "step": 4583 + }, + { + "epoch": 0.7, + "grad_norm": 2.281298269414887, + "learning_rate": 4.315900746174891e-06, + "loss": 0.7678, + "step": 4584 + }, + { + "epoch": 0.7, + "grad_norm": 2.7115424664869425, + "learning_rate": 4.311822651692117e-06, + "loss": 0.8394, + "step": 4585 + }, + { + "epoch": 0.7, + "grad_norm": 2.7292516435578658, + "learning_rate": 4.307745955203802e-06, + "loss": 0.7851, + "step": 4586 + }, + { + "epoch": 0.7, + "grad_norm": 2.867908725524442, + "learning_rate": 4.30367065771189e-06, + "loss": 0.9014, + "step": 4587 + }, + { + "epoch": 0.7, + "grad_norm": 2.539392293579164, + "learning_rate": 4.299596760217971e-06, + "loss": 0.9133, + "step": 4588 + }, + { + "epoch": 0.7, + "grad_norm": 2.595803782835126, + "learning_rate": 4.295524263723289e-06, + "loss": 0.8473, + "step": 4589 + }, + { + "epoch": 0.7, + "grad_norm": 4.051305067876277, + "learning_rate": 4.291453169228761e-06, + "loss": 0.953, + "step": 4590 + }, + { + "epoch": 0.7, + "grad_norm": 2.556206565324167, + "learning_rate": 4.287383477734941e-06, + "loss": 0.767, + "step": 4591 + }, + { + "epoch": 0.7, + "grad_norm": 3.4954943318160723, + "learning_rate": 4.283315190242043e-06, + "loss": 0.8481, + "step": 4592 + }, + { + "epoch": 0.7, + "grad_norm": 2.58254843562837, + "learning_rate": 4.279248307749942e-06, + "loss": 0.7255, + "step": 4593 + }, + { + "epoch": 0.7, + "grad_norm": 2.438739075744588, + "learning_rate": 4.275182831258165e-06, + "loss": 0.7969, + "step": 4594 + }, + { + "epoch": 0.7, + "grad_norm": 2.7256701612456022, + "learning_rate": 4.2711187617658874e-06, + "loss": 0.7933, + "step": 4595 + }, + { + "epoch": 0.7, + "grad_norm": 2.501946269627395, + "learning_rate": 4.2670561002719465e-06, + "loss": 0.7289, + "step": 4596 + }, + { + "epoch": 0.7, + "grad_norm": 2.6861185679974433, + "learning_rate": 4.262994847774826e-06, + "loss": 0.7463, + "step": 4597 + }, + { + "epoch": 0.7, + "grad_norm": 3.6513030959735073, + "learning_rate": 4.258935005272677e-06, + "loss": 0.86, + "step": 4598 + }, + { + "epoch": 0.7, + "grad_norm": 2.5170003857043337, + "learning_rate": 4.254876573763287e-06, + "loss": 0.8417, + "step": 4599 + }, + { + "epoch": 0.7, + "grad_norm": 2.5795342270973713, + "learning_rate": 4.250819554244104e-06, + "loss": 0.7703, + "step": 4600 + }, + { + "epoch": 0.7, + "grad_norm": 2.2948253468316304, + "learning_rate": 4.2467639477122365e-06, + "loss": 0.7226, + "step": 4601 + }, + { + "epoch": 0.7, + "grad_norm": 2.426464752391824, + "learning_rate": 4.242709755164436e-06, + "loss": 0.7803, + "step": 4602 + }, + { + "epoch": 0.7, + "grad_norm": 2.542434876466958, + "learning_rate": 4.238656977597104e-06, + "loss": 0.7254, + "step": 4603 + }, + { + "epoch": 0.7, + "grad_norm": 2.6932781263047523, + "learning_rate": 4.234605616006309e-06, + "loss": 0.8503, + "step": 4604 + }, + { + "epoch": 0.7, + "grad_norm": 2.596101290817688, + "learning_rate": 4.230555671387752e-06, + "loss": 0.7327, + "step": 4605 + }, + { + "epoch": 0.71, + "grad_norm": 2.517592945930184, + "learning_rate": 4.2265071447368075e-06, + "loss": 0.7414, + "step": 4606 + }, + { + "epoch": 0.71, + "grad_norm": 2.6225552220576747, + "learning_rate": 4.222460037048481e-06, + "loss": 0.7787, + "step": 4607 + }, + { + "epoch": 0.71, + "grad_norm": 3.682718897322715, + "learning_rate": 4.218414349317435e-06, + "loss": 0.8763, + "step": 4608 + }, + { + "epoch": 0.71, + "grad_norm": 2.539578696281544, + "learning_rate": 4.214370082537996e-06, + "loss": 0.7671, + "step": 4609 + }, + { + "epoch": 0.71, + "grad_norm": 2.4552266766915336, + "learning_rate": 4.210327237704127e-06, + "loss": 0.7058, + "step": 4610 + }, + { + "epoch": 0.71, + "grad_norm": 2.450334006455967, + "learning_rate": 4.206285815809442e-06, + "loss": 0.7708, + "step": 4611 + }, + { + "epoch": 0.71, + "grad_norm": 2.5675380169728568, + "learning_rate": 4.202245817847216e-06, + "loss": 0.7293, + "step": 4612 + }, + { + "epoch": 0.71, + "grad_norm": 2.8543763293592983, + "learning_rate": 4.198207244810359e-06, + "loss": 0.7996, + "step": 4613 + }, + { + "epoch": 0.71, + "grad_norm": 2.5847582955676844, + "learning_rate": 4.194170097691449e-06, + "loss": 0.8363, + "step": 4614 + }, + { + "epoch": 0.71, + "grad_norm": 2.607985959941781, + "learning_rate": 4.190134377482697e-06, + "loss": 0.8314, + "step": 4615 + }, + { + "epoch": 0.71, + "grad_norm": 2.5181490067330885, + "learning_rate": 4.186100085175969e-06, + "loss": 0.7393, + "step": 4616 + }, + { + "epoch": 0.71, + "grad_norm": 3.7186874436591073, + "learning_rate": 4.182067221762787e-06, + "loss": 0.7471, + "step": 4617 + }, + { + "epoch": 0.71, + "grad_norm": 2.631842047505717, + "learning_rate": 4.17803578823431e-06, + "loss": 0.8025, + "step": 4618 + }, + { + "epoch": 0.71, + "grad_norm": 2.681078894285222, + "learning_rate": 4.174005785581355e-06, + "loss": 0.8648, + "step": 4619 + }, + { + "epoch": 0.71, + "grad_norm": 2.5036096911340064, + "learning_rate": 4.169977214794381e-06, + "loss": 0.7002, + "step": 4620 + }, + { + "epoch": 0.71, + "grad_norm": 2.4533717993242927, + "learning_rate": 4.165950076863498e-06, + "loss": 0.7863, + "step": 4621 + }, + { + "epoch": 0.71, + "grad_norm": 2.5486747992946803, + "learning_rate": 4.161924372778461e-06, + "loss": 0.8162, + "step": 4622 + }, + { + "epoch": 0.71, + "grad_norm": 2.8385199190995167, + "learning_rate": 4.157900103528681e-06, + "loss": 0.826, + "step": 4623 + }, + { + "epoch": 0.71, + "grad_norm": 2.652735195152087, + "learning_rate": 4.153877270103205e-06, + "loss": 0.8017, + "step": 4624 + }, + { + "epoch": 0.71, + "grad_norm": 2.8035117018749363, + "learning_rate": 4.149855873490739e-06, + "loss": 0.7838, + "step": 4625 + }, + { + "epoch": 0.71, + "grad_norm": 2.8177744943240155, + "learning_rate": 4.145835914679624e-06, + "loss": 0.7602, + "step": 4626 + }, + { + "epoch": 0.71, + "grad_norm": 2.702208093817233, + "learning_rate": 4.141817394657854e-06, + "loss": 0.7792, + "step": 4627 + }, + { + "epoch": 0.71, + "grad_norm": 2.4106613142451736, + "learning_rate": 4.137800314413072e-06, + "loss": 0.7998, + "step": 4628 + }, + { + "epoch": 0.71, + "grad_norm": 2.4824405757189627, + "learning_rate": 4.1337846749325615e-06, + "loss": 0.7364, + "step": 4629 + }, + { + "epoch": 0.71, + "grad_norm": 2.422022193044217, + "learning_rate": 4.129770477203253e-06, + "loss": 0.7391, + "step": 4630 + }, + { + "epoch": 0.71, + "grad_norm": 2.391710828345458, + "learning_rate": 4.125757722211725e-06, + "loss": 0.751, + "step": 4631 + }, + { + "epoch": 0.71, + "grad_norm": 2.650588884023421, + "learning_rate": 4.1217464109442e-06, + "loss": 0.7595, + "step": 4632 + }, + { + "epoch": 0.71, + "grad_norm": 2.5578125265610416, + "learning_rate": 4.11773654438654e-06, + "loss": 0.8018, + "step": 4633 + }, + { + "epoch": 0.71, + "grad_norm": 2.728508450144264, + "learning_rate": 4.113728123524269e-06, + "loss": 0.7575, + "step": 4634 + }, + { + "epoch": 0.71, + "grad_norm": 2.8877626647567696, + "learning_rate": 4.1097211493425335e-06, + "loss": 0.8282, + "step": 4635 + }, + { + "epoch": 0.71, + "grad_norm": 2.6971660585681816, + "learning_rate": 4.105715622826144e-06, + "loss": 0.8925, + "step": 4636 + }, + { + "epoch": 0.71, + "grad_norm": 3.6891685662456877, + "learning_rate": 4.101711544959544e-06, + "loss": 0.8454, + "step": 4637 + }, + { + "epoch": 0.71, + "grad_norm": 2.6052488039847974, + "learning_rate": 4.097708916726818e-06, + "loss": 0.7644, + "step": 4638 + }, + { + "epoch": 0.71, + "grad_norm": 2.500823823477732, + "learning_rate": 4.093707739111709e-06, + "loss": 0.7763, + "step": 4639 + }, + { + "epoch": 0.71, + "grad_norm": 3.1481662271475703, + "learning_rate": 4.0897080130975885e-06, + "loss": 0.7999, + "step": 4640 + }, + { + "epoch": 0.71, + "grad_norm": 2.3819445548136193, + "learning_rate": 4.0857097396674785e-06, + "loss": 0.6348, + "step": 4641 + }, + { + "epoch": 0.71, + "grad_norm": 3.0016153917782913, + "learning_rate": 4.081712919804042e-06, + "loss": 0.8293, + "step": 4642 + }, + { + "epoch": 0.71, + "grad_norm": 3.361916696935901, + "learning_rate": 4.077717554489585e-06, + "loss": 0.8334, + "step": 4643 + }, + { + "epoch": 0.71, + "grad_norm": 3.0327023999462233, + "learning_rate": 4.073723644706052e-06, + "loss": 0.8209, + "step": 4644 + }, + { + "epoch": 0.71, + "grad_norm": 2.7807531192439456, + "learning_rate": 4.069731191435043e-06, + "loss": 0.83, + "step": 4645 + }, + { + "epoch": 0.71, + "grad_norm": 2.6526815820931104, + "learning_rate": 4.065740195657784e-06, + "loss": 0.7878, + "step": 4646 + }, + { + "epoch": 0.71, + "grad_norm": 2.441045340728361, + "learning_rate": 4.061750658355154e-06, + "loss": 0.7812, + "step": 4647 + }, + { + "epoch": 0.71, + "grad_norm": 2.6057736597483467, + "learning_rate": 4.057762580507669e-06, + "loss": 0.8392, + "step": 4648 + }, + { + "epoch": 0.71, + "grad_norm": 2.5718849496433167, + "learning_rate": 4.053775963095482e-06, + "loss": 0.7951, + "step": 4649 + }, + { + "epoch": 0.71, + "grad_norm": 2.9414300092669694, + "learning_rate": 4.049790807098398e-06, + "loss": 0.7309, + "step": 4650 + }, + { + "epoch": 0.71, + "grad_norm": 2.6655740741989957, + "learning_rate": 4.045807113495854e-06, + "loss": 0.7683, + "step": 4651 + }, + { + "epoch": 0.71, + "grad_norm": 2.620533608701822, + "learning_rate": 4.04182488326693e-06, + "loss": 0.7355, + "step": 4652 + }, + { + "epoch": 0.71, + "grad_norm": 2.6881651036970546, + "learning_rate": 4.037844117390346e-06, + "loss": 0.7604, + "step": 4653 + }, + { + "epoch": 0.71, + "grad_norm": 3.1868902215429236, + "learning_rate": 4.033864816844464e-06, + "loss": 0.8145, + "step": 4654 + }, + { + "epoch": 0.71, + "grad_norm": 2.596476042767454, + "learning_rate": 4.029886982607278e-06, + "loss": 0.7789, + "step": 4655 + }, + { + "epoch": 0.71, + "grad_norm": 2.479689038232687, + "learning_rate": 4.025910615656439e-06, + "loss": 0.7502, + "step": 4656 + }, + { + "epoch": 0.71, + "grad_norm": 4.330613762856095, + "learning_rate": 4.021935716969217e-06, + "loss": 0.9225, + "step": 4657 + }, + { + "epoch": 0.71, + "grad_norm": 2.4228114155369878, + "learning_rate": 4.017962287522538e-06, + "loss": 0.7204, + "step": 4658 + }, + { + "epoch": 0.71, + "grad_norm": 2.4409170638890645, + "learning_rate": 4.013990328292956e-06, + "loss": 0.7502, + "step": 4659 + }, + { + "epoch": 0.71, + "grad_norm": 2.702036399975697, + "learning_rate": 4.010019840256665e-06, + "loss": 0.7252, + "step": 4660 + }, + { + "epoch": 0.71, + "grad_norm": 2.7340055852492298, + "learning_rate": 4.006050824389504e-06, + "loss": 0.7979, + "step": 4661 + }, + { + "epoch": 0.71, + "grad_norm": 2.6003021686125054, + "learning_rate": 4.002083281666944e-06, + "loss": 0.8506, + "step": 4662 + }, + { + "epoch": 0.71, + "grad_norm": 2.7024305304131317, + "learning_rate": 3.998117213064096e-06, + "loss": 0.775, + "step": 4663 + }, + { + "epoch": 0.71, + "grad_norm": 2.3818260809227723, + "learning_rate": 3.994152619555708e-06, + "loss": 0.7644, + "step": 4664 + }, + { + "epoch": 0.71, + "grad_norm": 2.4210541149434954, + "learning_rate": 3.990189502116165e-06, + "loss": 0.6938, + "step": 4665 + }, + { + "epoch": 0.71, + "grad_norm": 2.8001810887021468, + "learning_rate": 3.986227861719489e-06, + "loss": 0.7897, + "step": 4666 + }, + { + "epoch": 0.71, + "grad_norm": 2.638390741878354, + "learning_rate": 3.982267699339344e-06, + "loss": 0.8287, + "step": 4667 + }, + { + "epoch": 0.71, + "grad_norm": 2.5404188437109227, + "learning_rate": 3.978309015949022e-06, + "loss": 0.7848, + "step": 4668 + }, + { + "epoch": 0.71, + "grad_norm": 2.725734600082859, + "learning_rate": 3.974351812521462e-06, + "loss": 0.8366, + "step": 4669 + }, + { + "epoch": 0.71, + "grad_norm": 2.6029752762499974, + "learning_rate": 3.970396090029231e-06, + "loss": 0.8097, + "step": 4670 + }, + { + "epoch": 0.71, + "grad_norm": 2.5952620908186947, + "learning_rate": 3.9664418494445325e-06, + "loss": 0.8636, + "step": 4671 + }, + { + "epoch": 0.72, + "grad_norm": 2.7985463975433755, + "learning_rate": 3.9624890917392125e-06, + "loss": 0.877, + "step": 4672 + }, + { + "epoch": 0.72, + "grad_norm": 3.290152520574593, + "learning_rate": 3.958537817884744e-06, + "loss": 0.8129, + "step": 4673 + }, + { + "epoch": 0.72, + "grad_norm": 2.6227573346481288, + "learning_rate": 3.9545880288522435e-06, + "loss": 0.78, + "step": 4674 + }, + { + "epoch": 0.72, + "grad_norm": 3.245561388143318, + "learning_rate": 3.950639725612453e-06, + "loss": 0.827, + "step": 4675 + }, + { + "epoch": 0.72, + "grad_norm": 2.6497248469105936, + "learning_rate": 3.9466929091357585e-06, + "loss": 0.8241, + "step": 4676 + }, + { + "epoch": 0.72, + "grad_norm": 2.7854720270745683, + "learning_rate": 3.942747580392172e-06, + "loss": 0.76, + "step": 4677 + }, + { + "epoch": 0.72, + "grad_norm": 6.453180020069248, + "learning_rate": 3.938803740351351e-06, + "loss": 0.8078, + "step": 4678 + }, + { + "epoch": 0.72, + "grad_norm": 2.537980143611081, + "learning_rate": 3.934861389982576e-06, + "loss": 0.779, + "step": 4679 + }, + { + "epoch": 0.72, + "grad_norm": 3.012175652906596, + "learning_rate": 3.93092053025477e-06, + "loss": 0.7289, + "step": 4680 + }, + { + "epoch": 0.72, + "grad_norm": 2.5317863879725326, + "learning_rate": 3.926981162136485e-06, + "loss": 0.703, + "step": 4681 + }, + { + "epoch": 0.72, + "grad_norm": 2.8787146878385266, + "learning_rate": 3.923043286595902e-06, + "loss": 0.7585, + "step": 4682 + }, + { + "epoch": 0.72, + "grad_norm": 2.7757103864171007, + "learning_rate": 3.919106904600849e-06, + "loss": 0.8586, + "step": 4683 + }, + { + "epoch": 0.72, + "grad_norm": 2.675438143895143, + "learning_rate": 3.915172017118773e-06, + "loss": 0.7801, + "step": 4684 + }, + { + "epoch": 0.72, + "grad_norm": 2.523180775382848, + "learning_rate": 3.91123862511676e-06, + "loss": 0.6941, + "step": 4685 + }, + { + "epoch": 0.72, + "grad_norm": 2.5315978159852053, + "learning_rate": 3.907306729561528e-06, + "loss": 0.8646, + "step": 4686 + }, + { + "epoch": 0.72, + "grad_norm": 2.519907492637914, + "learning_rate": 3.903376331419421e-06, + "loss": 0.6856, + "step": 4687 + }, + { + "epoch": 0.72, + "grad_norm": 2.5523043949939592, + "learning_rate": 3.89944743165643e-06, + "loss": 0.77, + "step": 4688 + }, + { + "epoch": 0.72, + "grad_norm": 2.5257770082553246, + "learning_rate": 3.895520031238163e-06, + "loss": 0.7387, + "step": 4689 + }, + { + "epoch": 0.72, + "grad_norm": 2.7803624676951646, + "learning_rate": 3.891594131129863e-06, + "loss": 0.8689, + "step": 4690 + }, + { + "epoch": 0.72, + "grad_norm": 2.652024181046161, + "learning_rate": 3.8876697322964115e-06, + "loss": 0.8326, + "step": 4691 + }, + { + "epoch": 0.72, + "grad_norm": 2.517253856703637, + "learning_rate": 3.883746835702314e-06, + "loss": 0.6858, + "step": 4692 + }, + { + "epoch": 0.72, + "grad_norm": 2.472269564693469, + "learning_rate": 3.879825442311704e-06, + "loss": 0.7663, + "step": 4693 + }, + { + "epoch": 0.72, + "grad_norm": 2.6853471906503454, + "learning_rate": 3.875905553088357e-06, + "loss": 0.7294, + "step": 4694 + }, + { + "epoch": 0.72, + "grad_norm": 2.796561653655558, + "learning_rate": 3.871987168995668e-06, + "loss": 0.8129, + "step": 4695 + }, + { + "epoch": 0.72, + "grad_norm": 2.6928289478681053, + "learning_rate": 3.868070290996666e-06, + "loss": 0.8477, + "step": 4696 + }, + { + "epoch": 0.72, + "grad_norm": 2.84870512570839, + "learning_rate": 3.86415492005401e-06, + "loss": 0.8197, + "step": 4697 + }, + { + "epoch": 0.72, + "grad_norm": 2.739562097828992, + "learning_rate": 3.860241057129987e-06, + "loss": 0.7791, + "step": 4698 + }, + { + "epoch": 0.72, + "grad_norm": 2.897960792207024, + "learning_rate": 3.856328703186517e-06, + "loss": 0.8865, + "step": 4699 + }, + { + "epoch": 0.72, + "grad_norm": 2.3354076946120386, + "learning_rate": 3.852417859185148e-06, + "loss": 0.6997, + "step": 4700 + }, + { + "epoch": 0.72, + "grad_norm": 2.4224047577876915, + "learning_rate": 3.848508526087049e-06, + "loss": 0.7737, + "step": 4701 + }, + { + "epoch": 0.72, + "grad_norm": 2.7918517638326215, + "learning_rate": 3.844600704853033e-06, + "loss": 0.7162, + "step": 4702 + }, + { + "epoch": 0.72, + "grad_norm": 2.8138739928062635, + "learning_rate": 3.840694396443529e-06, + "loss": 0.8866, + "step": 4703 + }, + { + "epoch": 0.72, + "grad_norm": 2.456079097072732, + "learning_rate": 3.836789601818596e-06, + "loss": 0.8912, + "step": 4704 + }, + { + "epoch": 0.72, + "grad_norm": 2.6774767199549454, + "learning_rate": 3.832886321937927e-06, + "loss": 0.8849, + "step": 4705 + }, + { + "epoch": 0.72, + "grad_norm": 2.5867290705841226, + "learning_rate": 3.828984557760838e-06, + "loss": 0.7677, + "step": 4706 + }, + { + "epoch": 0.72, + "grad_norm": 2.547855076935108, + "learning_rate": 3.825084310246271e-06, + "loss": 0.7806, + "step": 4707 + }, + { + "epoch": 0.72, + "grad_norm": 2.373974068705608, + "learning_rate": 3.8211855803528e-06, + "loss": 0.7564, + "step": 4708 + }, + { + "epoch": 0.72, + "grad_norm": 2.5217411177994697, + "learning_rate": 3.817288369038617e-06, + "loss": 0.8766, + "step": 4709 + }, + { + "epoch": 0.72, + "grad_norm": 2.5738411725657246, + "learning_rate": 3.813392677261557e-06, + "loss": 0.8039, + "step": 4710 + }, + { + "epoch": 0.72, + "grad_norm": 2.7923442043533715, + "learning_rate": 3.8094985059790656e-06, + "loss": 0.7569, + "step": 4711 + }, + { + "epoch": 0.72, + "grad_norm": 2.4916064610001065, + "learning_rate": 3.8056058561482203e-06, + "loss": 0.8459, + "step": 4712 + }, + { + "epoch": 0.72, + "grad_norm": 2.6814032775087977, + "learning_rate": 3.8017147287257294e-06, + "loss": 0.7843, + "step": 4713 + }, + { + "epoch": 0.72, + "grad_norm": 2.7524991249628963, + "learning_rate": 3.7978251246679223e-06, + "loss": 0.7788, + "step": 4714 + }, + { + "epoch": 0.72, + "grad_norm": 3.2629708574261396, + "learning_rate": 3.7939370449307477e-06, + "loss": 0.8294, + "step": 4715 + }, + { + "epoch": 0.72, + "grad_norm": 2.4710880212646478, + "learning_rate": 3.7900504904697964e-06, + "loss": 0.8517, + "step": 4716 + }, + { + "epoch": 0.72, + "grad_norm": 2.693293620784226, + "learning_rate": 3.78616546224027e-06, + "loss": 0.8109, + "step": 4717 + }, + { + "epoch": 0.72, + "grad_norm": 3.02761216391464, + "learning_rate": 3.782281961197e-06, + "loss": 0.8265, + "step": 4718 + }, + { + "epoch": 0.72, + "grad_norm": 2.646484955340347, + "learning_rate": 3.77839998829444e-06, + "loss": 0.7394, + "step": 4719 + }, + { + "epoch": 0.72, + "grad_norm": 2.4578067727484534, + "learning_rate": 3.774519544486669e-06, + "loss": 0.8447, + "step": 4720 + }, + { + "epoch": 0.72, + "grad_norm": 2.6699138109227407, + "learning_rate": 3.7706406307273978e-06, + "loss": 0.8107, + "step": 4721 + }, + { + "epoch": 0.72, + "grad_norm": 2.630772578360136, + "learning_rate": 3.76676324796995e-06, + "loss": 0.7524, + "step": 4722 + }, + { + "epoch": 0.72, + "grad_norm": 2.7318104679393693, + "learning_rate": 3.762887397167274e-06, + "loss": 0.7799, + "step": 4723 + }, + { + "epoch": 0.72, + "grad_norm": 2.762270216428474, + "learning_rate": 3.759013079271955e-06, + "loss": 0.8176, + "step": 4724 + }, + { + "epoch": 0.72, + "grad_norm": 2.4553305532030305, + "learning_rate": 3.7551402952361837e-06, + "loss": 0.728, + "step": 4725 + }, + { + "epoch": 0.72, + "grad_norm": 2.6450089954589697, + "learning_rate": 3.751269046011782e-06, + "loss": 0.8179, + "step": 4726 + }, + { + "epoch": 0.72, + "grad_norm": 2.343328022644456, + "learning_rate": 3.7473993325502e-06, + "loss": 0.789, + "step": 4727 + }, + { + "epoch": 0.72, + "grad_norm": 3.9286072628824007, + "learning_rate": 3.7435311558025013e-06, + "loss": 0.9128, + "step": 4728 + }, + { + "epoch": 0.72, + "grad_norm": 2.879903474320907, + "learning_rate": 3.739664516719371e-06, + "loss": 0.7496, + "step": 4729 + }, + { + "epoch": 0.72, + "grad_norm": 2.4443020986868316, + "learning_rate": 3.735799416251129e-06, + "loss": 0.7422, + "step": 4730 + }, + { + "epoch": 0.72, + "grad_norm": 2.7236021790499807, + "learning_rate": 3.7319358553477036e-06, + "loss": 0.6851, + "step": 4731 + }, + { + "epoch": 0.72, + "grad_norm": 2.419018248039533, + "learning_rate": 3.728073834958652e-06, + "loss": 0.7411, + "step": 4732 + }, + { + "epoch": 0.72, + "grad_norm": 3.024391110209093, + "learning_rate": 3.7242133560331474e-06, + "loss": 0.7883, + "step": 4733 + }, + { + "epoch": 0.72, + "grad_norm": 2.9023134680031646, + "learning_rate": 3.720354419519986e-06, + "loss": 0.768, + "step": 4734 + }, + { + "epoch": 0.72, + "grad_norm": 2.576821852078959, + "learning_rate": 3.7164970263675927e-06, + "loss": 0.65, + "step": 4735 + }, + { + "epoch": 0.72, + "grad_norm": 2.59943955832557, + "learning_rate": 3.7126411775240034e-06, + "loss": 0.8005, + "step": 4736 + }, + { + "epoch": 0.73, + "grad_norm": 2.733894188548558, + "learning_rate": 3.7087868739368727e-06, + "loss": 0.7918, + "step": 4737 + }, + { + "epoch": 0.73, + "grad_norm": 2.5152087590772974, + "learning_rate": 3.704934116553488e-06, + "loss": 0.7243, + "step": 4738 + }, + { + "epoch": 0.73, + "grad_norm": 2.270981881776336, + "learning_rate": 3.7010829063207464e-06, + "loss": 0.6764, + "step": 4739 + }, + { + "epoch": 0.73, + "grad_norm": 3.092183390138597, + "learning_rate": 3.6972332441851633e-06, + "loss": 0.8128, + "step": 4740 + }, + { + "epoch": 0.73, + "grad_norm": 2.6356895097666913, + "learning_rate": 3.6933851310928835e-06, + "loss": 0.7741, + "step": 4741 + }, + { + "epoch": 0.73, + "grad_norm": 2.5035855263563644, + "learning_rate": 3.6895385679896634e-06, + "loss": 0.7324, + "step": 4742 + }, + { + "epoch": 0.73, + "grad_norm": 2.709479031414277, + "learning_rate": 3.6856935558208805e-06, + "loss": 0.7627, + "step": 4743 + }, + { + "epoch": 0.73, + "grad_norm": 2.536976789978986, + "learning_rate": 3.6818500955315295e-06, + "loss": 0.811, + "step": 4744 + }, + { + "epoch": 0.73, + "grad_norm": 2.613504187688428, + "learning_rate": 3.678008188066222e-06, + "loss": 0.794, + "step": 4745 + }, + { + "epoch": 0.73, + "grad_norm": 2.6157031627687535, + "learning_rate": 3.6741678343691987e-06, + "loss": 0.8401, + "step": 4746 + }, + { + "epoch": 0.73, + "grad_norm": 2.667752066641231, + "learning_rate": 3.670329035384308e-06, + "loss": 0.7574, + "step": 4747 + }, + { + "epoch": 0.73, + "grad_norm": 2.542434226381904, + "learning_rate": 3.6664917920550138e-06, + "loss": 0.6986, + "step": 4748 + }, + { + "epoch": 0.73, + "grad_norm": 2.6856041087077585, + "learning_rate": 3.6626561053244102e-06, + "loss": 0.9379, + "step": 4749 + }, + { + "epoch": 0.73, + "grad_norm": 2.4464421333190356, + "learning_rate": 3.6588219761351997e-06, + "loss": 0.765, + "step": 4750 + }, + { + "epoch": 0.73, + "grad_norm": 2.4903426439491416, + "learning_rate": 3.6549894054296987e-06, + "loss": 0.751, + "step": 4751 + }, + { + "epoch": 0.73, + "grad_norm": 2.7515001060158353, + "learning_rate": 3.651158394149852e-06, + "loss": 0.7518, + "step": 4752 + }, + { + "epoch": 0.73, + "grad_norm": 3.0337445733466644, + "learning_rate": 3.6473289432372127e-06, + "loss": 0.7475, + "step": 4753 + }, + { + "epoch": 0.73, + "grad_norm": 2.395247071177547, + "learning_rate": 3.643501053632952e-06, + "loss": 0.6996, + "step": 4754 + }, + { + "epoch": 0.73, + "grad_norm": 2.489066127142281, + "learning_rate": 3.6396747262778565e-06, + "loss": 0.7531, + "step": 4755 + }, + { + "epoch": 0.73, + "grad_norm": 2.840828936358385, + "learning_rate": 3.635849962112329e-06, + "loss": 0.7807, + "step": 4756 + }, + { + "epoch": 0.73, + "grad_norm": 3.4415939387484573, + "learning_rate": 3.6320267620763948e-06, + "loss": 0.8277, + "step": 4757 + }, + { + "epoch": 0.73, + "grad_norm": 2.7229664144214722, + "learning_rate": 3.628205127109685e-06, + "loss": 0.8241, + "step": 4758 + }, + { + "epoch": 0.73, + "grad_norm": 2.5781982638832446, + "learning_rate": 3.6243850581514497e-06, + "loss": 0.825, + "step": 4759 + }, + { + "epoch": 0.73, + "grad_norm": 2.5764834094870714, + "learning_rate": 3.620566556140558e-06, + "loss": 0.8209, + "step": 4760 + }, + { + "epoch": 0.73, + "grad_norm": 2.68885570646689, + "learning_rate": 3.616749622015486e-06, + "loss": 0.6946, + "step": 4761 + }, + { + "epoch": 0.73, + "grad_norm": 2.4970988849394127, + "learning_rate": 3.6129342567143354e-06, + "loss": 0.7678, + "step": 4762 + }, + { + "epoch": 0.73, + "grad_norm": 2.6849650774427007, + "learning_rate": 3.609120461174813e-06, + "loss": 0.7518, + "step": 4763 + }, + { + "epoch": 0.73, + "grad_norm": 2.6180254178226208, + "learning_rate": 3.6053082363342416e-06, + "loss": 0.7673, + "step": 4764 + }, + { + "epoch": 0.73, + "grad_norm": 2.7240023181640893, + "learning_rate": 3.601497583129561e-06, + "loss": 0.8813, + "step": 4765 + }, + { + "epoch": 0.73, + "grad_norm": 2.4256243336055787, + "learning_rate": 3.5976885024973196e-06, + "loss": 0.7399, + "step": 4766 + }, + { + "epoch": 0.73, + "grad_norm": 2.5106095364830523, + "learning_rate": 3.593880995373683e-06, + "loss": 0.6945, + "step": 4767 + }, + { + "epoch": 0.73, + "grad_norm": 3.789038453137751, + "learning_rate": 3.5900750626944346e-06, + "loss": 0.8602, + "step": 4768 + }, + { + "epoch": 0.73, + "grad_norm": 2.7024293451721766, + "learning_rate": 3.586270705394962e-06, + "loss": 0.8042, + "step": 4769 + }, + { + "epoch": 0.73, + "grad_norm": 2.5976861428370888, + "learning_rate": 3.5824679244102677e-06, + "loss": 0.7232, + "step": 4770 + }, + { + "epoch": 0.73, + "grad_norm": 2.71288602620368, + "learning_rate": 3.5786667206749736e-06, + "loss": 0.8084, + "step": 4771 + }, + { + "epoch": 0.73, + "grad_norm": 2.6366829492883608, + "learning_rate": 3.5748670951233043e-06, + "loss": 0.7183, + "step": 4772 + }, + { + "epoch": 0.73, + "grad_norm": 2.804276556323464, + "learning_rate": 3.5710690486891066e-06, + "loss": 0.8149, + "step": 4773 + }, + { + "epoch": 0.73, + "grad_norm": 2.6738106485757993, + "learning_rate": 3.5672725823058297e-06, + "loss": 0.8348, + "step": 4774 + }, + { + "epoch": 0.73, + "grad_norm": 2.6069776651382854, + "learning_rate": 3.56347769690654e-06, + "loss": 0.713, + "step": 4775 + }, + { + "epoch": 0.73, + "grad_norm": 2.6979836985747507, + "learning_rate": 3.5596843934239133e-06, + "loss": 0.7818, + "step": 4776 + }, + { + "epoch": 0.73, + "grad_norm": 2.7598334527062054, + "learning_rate": 3.5558926727902366e-06, + "loss": 0.8516, + "step": 4777 + }, + { + "epoch": 0.73, + "grad_norm": 3.3131812110524224, + "learning_rate": 3.5521025359374074e-06, + "loss": 0.8712, + "step": 4778 + }, + { + "epoch": 0.73, + "grad_norm": 2.4949648400791204, + "learning_rate": 3.548313983796938e-06, + "loss": 0.7782, + "step": 4779 + }, + { + "epoch": 0.73, + "grad_norm": 2.5630884401307656, + "learning_rate": 3.544527017299949e-06, + "loss": 0.8788, + "step": 4780 + }, + { + "epoch": 0.73, + "grad_norm": 2.911939959801433, + "learning_rate": 3.5407416373771643e-06, + "loss": 0.8761, + "step": 4781 + }, + { + "epoch": 0.73, + "grad_norm": 2.742301645711473, + "learning_rate": 3.5369578449589325e-06, + "loss": 0.7713, + "step": 4782 + }, + { + "epoch": 0.73, + "grad_norm": 2.418614212609283, + "learning_rate": 3.533175640975196e-06, + "loss": 0.7763, + "step": 4783 + }, + { + "epoch": 0.73, + "grad_norm": 3.0920357376682004, + "learning_rate": 3.529395026355521e-06, + "loss": 0.8437, + "step": 4784 + }, + { + "epoch": 0.73, + "grad_norm": 2.458293186069364, + "learning_rate": 3.525616002029073e-06, + "loss": 0.7363, + "step": 4785 + }, + { + "epoch": 0.73, + "grad_norm": 2.8596590025593063, + "learning_rate": 3.5218385689246326e-06, + "loss": 0.8724, + "step": 4786 + }, + { + "epoch": 0.73, + "grad_norm": 2.9193255631234507, + "learning_rate": 3.5180627279705835e-06, + "loss": 0.7714, + "step": 4787 + }, + { + "epoch": 0.73, + "grad_norm": 2.79476377638554, + "learning_rate": 3.514288480094924e-06, + "loss": 0.8012, + "step": 4788 + }, + { + "epoch": 0.73, + "grad_norm": 2.6235771755809973, + "learning_rate": 3.5105158262252537e-06, + "loss": 0.8237, + "step": 4789 + }, + { + "epoch": 0.73, + "grad_norm": 2.5787053452761564, + "learning_rate": 3.506744767288792e-06, + "loss": 0.7667, + "step": 4790 + }, + { + "epoch": 0.73, + "grad_norm": 2.6403708336838543, + "learning_rate": 3.502975304212357e-06, + "loss": 0.7564, + "step": 4791 + }, + { + "epoch": 0.73, + "grad_norm": 2.637237157385171, + "learning_rate": 3.499207437922373e-06, + "loss": 0.73, + "step": 4792 + }, + { + "epoch": 0.73, + "grad_norm": 2.5619777184569874, + "learning_rate": 3.495441169344882e-06, + "loss": 0.8118, + "step": 4793 + }, + { + "epoch": 0.73, + "grad_norm": 2.688433925814969, + "learning_rate": 3.491676499405522e-06, + "loss": 0.7487, + "step": 4794 + }, + { + "epoch": 0.73, + "grad_norm": 2.603657981617799, + "learning_rate": 3.487913429029548e-06, + "loss": 0.8036, + "step": 4795 + }, + { + "epoch": 0.73, + "grad_norm": 2.876068212142599, + "learning_rate": 3.484151959141815e-06, + "loss": 0.8381, + "step": 4796 + }, + { + "epoch": 0.73, + "grad_norm": 2.7161938139739186, + "learning_rate": 3.4803920906667865e-06, + "loss": 0.7448, + "step": 4797 + }, + { + "epoch": 0.73, + "grad_norm": 2.4956209199321697, + "learning_rate": 3.4766338245285335e-06, + "loss": 0.6899, + "step": 4798 + }, + { + "epoch": 0.73, + "grad_norm": 2.591375363966042, + "learning_rate": 3.4728771616507317e-06, + "loss": 0.7904, + "step": 4799 + }, + { + "epoch": 0.73, + "grad_norm": 2.7401520195665023, + "learning_rate": 3.46912210295666e-06, + "loss": 0.7921, + "step": 4800 + }, + { + "epoch": 0.73, + "grad_norm": 2.4448228135941834, + "learning_rate": 3.4653686493692128e-06, + "loss": 0.7103, + "step": 4801 + }, + { + "epoch": 0.74, + "grad_norm": 2.962985582453866, + "learning_rate": 3.461616801810882e-06, + "loss": 0.7812, + "step": 4802 + }, + { + "epoch": 0.74, + "grad_norm": 2.5877756856635648, + "learning_rate": 3.457866561203761e-06, + "loss": 0.7902, + "step": 4803 + }, + { + "epoch": 0.74, + "grad_norm": 2.879005707932505, + "learning_rate": 3.4541179284695624e-06, + "loss": 0.8495, + "step": 4804 + }, + { + "epoch": 0.74, + "grad_norm": 2.589230441682058, + "learning_rate": 3.4503709045295874e-06, + "loss": 0.7214, + "step": 4805 + }, + { + "epoch": 0.74, + "grad_norm": 2.599459866748018, + "learning_rate": 3.4466254903047558e-06, + "loss": 0.7829, + "step": 4806 + }, + { + "epoch": 0.74, + "grad_norm": 2.790768839452864, + "learning_rate": 3.4428816867155835e-06, + "loss": 0.7689, + "step": 4807 + }, + { + "epoch": 0.74, + "grad_norm": 3.1540500764058215, + "learning_rate": 3.43913949468219e-06, + "loss": 0.8027, + "step": 4808 + }, + { + "epoch": 0.74, + "grad_norm": 2.816907689724522, + "learning_rate": 3.4353989151243027e-06, + "loss": 0.7883, + "step": 4809 + }, + { + "epoch": 0.74, + "grad_norm": 2.72704029894962, + "learning_rate": 3.431659948961251e-06, + "loss": 0.8028, + "step": 4810 + }, + { + "epoch": 0.74, + "grad_norm": 2.583137093090374, + "learning_rate": 3.4279225971119654e-06, + "loss": 0.7438, + "step": 4811 + }, + { + "epoch": 0.74, + "grad_norm": 2.4804854730280765, + "learning_rate": 3.424186860494987e-06, + "loss": 0.7215, + "step": 4812 + }, + { + "epoch": 0.74, + "grad_norm": 2.658327666872961, + "learning_rate": 3.4204527400284537e-06, + "loss": 0.7776, + "step": 4813 + }, + { + "epoch": 0.74, + "grad_norm": 2.693389347062505, + "learning_rate": 3.4167202366301023e-06, + "loss": 0.703, + "step": 4814 + }, + { + "epoch": 0.74, + "grad_norm": 2.566126480384414, + "learning_rate": 3.4129893512172853e-06, + "loss": 0.8688, + "step": 4815 + }, + { + "epoch": 0.74, + "grad_norm": 2.585160035850379, + "learning_rate": 3.4092600847069423e-06, + "loss": 0.6803, + "step": 4816 + }, + { + "epoch": 0.74, + "grad_norm": 2.6303955696871135, + "learning_rate": 3.4055324380156307e-06, + "loss": 0.8045, + "step": 4817 + }, + { + "epoch": 0.74, + "grad_norm": 2.4160626378845333, + "learning_rate": 3.401806412059496e-06, + "loss": 0.6905, + "step": 4818 + }, + { + "epoch": 0.74, + "grad_norm": 2.441459359441314, + "learning_rate": 3.398082007754292e-06, + "loss": 0.7945, + "step": 4819 + }, + { + "epoch": 0.74, + "grad_norm": 2.625386501191365, + "learning_rate": 3.3943592260153734e-06, + "loss": 0.7157, + "step": 4820 + }, + { + "epoch": 0.74, + "grad_norm": 3.440194805471606, + "learning_rate": 3.3906380677576946e-06, + "loss": 0.8603, + "step": 4821 + }, + { + "epoch": 0.74, + "grad_norm": 2.694702386247108, + "learning_rate": 3.386918533895809e-06, + "loss": 0.7045, + "step": 4822 + }, + { + "epoch": 0.74, + "grad_norm": 3.1686746071454057, + "learning_rate": 3.38320062534388e-06, + "loss": 0.7979, + "step": 4823 + }, + { + "epoch": 0.74, + "grad_norm": 2.4610000489643724, + "learning_rate": 3.379484343015662e-06, + "loss": 0.7281, + "step": 4824 + }, + { + "epoch": 0.74, + "grad_norm": 2.880245536591487, + "learning_rate": 3.3757696878245105e-06, + "loss": 0.7813, + "step": 4825 + }, + { + "epoch": 0.74, + "grad_norm": 2.5397694675753377, + "learning_rate": 3.3720566606833893e-06, + "loss": 0.7543, + "step": 4826 + }, + { + "epoch": 0.74, + "grad_norm": 2.5502516158533943, + "learning_rate": 3.3683452625048508e-06, + "loss": 0.7224, + "step": 4827 + }, + { + "epoch": 0.74, + "grad_norm": 2.6288489855836685, + "learning_rate": 3.3646354942010574e-06, + "loss": 0.8899, + "step": 4828 + }, + { + "epoch": 0.74, + "grad_norm": 2.724002193740201, + "learning_rate": 3.360927356683763e-06, + "loss": 0.7548, + "step": 4829 + }, + { + "epoch": 0.74, + "grad_norm": 2.6394485502907252, + "learning_rate": 3.3572208508643254e-06, + "loss": 0.8039, + "step": 4830 + }, + { + "epoch": 0.74, + "grad_norm": 2.7206984933518865, + "learning_rate": 3.3535159776536998e-06, + "loss": 0.8256, + "step": 4831 + }, + { + "epoch": 0.74, + "grad_norm": 2.5872363477593403, + "learning_rate": 3.349812737962438e-06, + "loss": 0.7648, + "step": 4832 + }, + { + "epoch": 0.74, + "grad_norm": 2.772472674946281, + "learning_rate": 3.34611113270069e-06, + "loss": 0.7739, + "step": 4833 + }, + { + "epoch": 0.74, + "grad_norm": 3.5276411112383186, + "learning_rate": 3.3424111627782153e-06, + "loss": 0.8324, + "step": 4834 + }, + { + "epoch": 0.74, + "grad_norm": 2.738238132481366, + "learning_rate": 3.3387128291043567e-06, + "loss": 0.8597, + "step": 4835 + }, + { + "epoch": 0.74, + "grad_norm": 2.6733033129669286, + "learning_rate": 3.335016132588058e-06, + "loss": 0.8107, + "step": 4836 + }, + { + "epoch": 0.74, + "grad_norm": 2.3693822759565277, + "learning_rate": 3.331321074137872e-06, + "loss": 0.7716, + "step": 4837 + }, + { + "epoch": 0.74, + "grad_norm": 3.4285933704356033, + "learning_rate": 3.3276276546619324e-06, + "loss": 0.7562, + "step": 4838 + }, + { + "epoch": 0.74, + "grad_norm": 2.652023893212776, + "learning_rate": 3.323935875067984e-06, + "loss": 0.7635, + "step": 4839 + }, + { + "epoch": 0.74, + "grad_norm": 2.4578290945488543, + "learning_rate": 3.320245736263361e-06, + "loss": 0.6576, + "step": 4840 + }, + { + "epoch": 0.74, + "grad_norm": 2.7942404086760533, + "learning_rate": 3.3165572391549915e-06, + "loss": 0.838, + "step": 4841 + }, + { + "epoch": 0.74, + "grad_norm": 2.8569696135870015, + "learning_rate": 3.3128703846494115e-06, + "loss": 0.725, + "step": 4842 + }, + { + "epoch": 0.74, + "grad_norm": 2.7900629597578375, + "learning_rate": 3.3091851736527434e-06, + "loss": 0.7868, + "step": 4843 + }, + { + "epoch": 0.74, + "grad_norm": 2.8046779017462407, + "learning_rate": 3.3055016070707103e-06, + "loss": 0.7461, + "step": 4844 + }, + { + "epoch": 0.74, + "grad_norm": 2.8927421334982313, + "learning_rate": 3.301819685808626e-06, + "loss": 0.7093, + "step": 4845 + }, + { + "epoch": 0.74, + "grad_norm": 2.858699181190039, + "learning_rate": 3.2981394107714025e-06, + "loss": 0.8473, + "step": 4846 + }, + { + "epoch": 0.74, + "grad_norm": 2.3621659698589026, + "learning_rate": 3.2944607828635554e-06, + "loss": 0.7192, + "step": 4847 + }, + { + "epoch": 0.74, + "grad_norm": 2.3742739290029724, + "learning_rate": 3.2907838029891835e-06, + "loss": 0.7438, + "step": 4848 + }, + { + "epoch": 0.74, + "grad_norm": 2.606599052695424, + "learning_rate": 3.287108472051982e-06, + "loss": 0.709, + "step": 4849 + }, + { + "epoch": 0.74, + "grad_norm": 2.6729760499150745, + "learning_rate": 3.2834347909552524e-06, + "loss": 0.794, + "step": 4850 + }, + { + "epoch": 0.74, + "grad_norm": 2.90045193616181, + "learning_rate": 3.2797627606018766e-06, + "loss": 0.7958, + "step": 4851 + }, + { + "epoch": 0.74, + "grad_norm": 2.6004243439972416, + "learning_rate": 3.2760923818943356e-06, + "loss": 0.703, + "step": 4852 + }, + { + "epoch": 0.74, + "grad_norm": 2.754123503758659, + "learning_rate": 3.272423655734711e-06, + "loss": 0.7644, + "step": 4853 + }, + { + "epoch": 0.74, + "grad_norm": 2.442792399877945, + "learning_rate": 3.2687565830246704e-06, + "loss": 0.8134, + "step": 4854 + }, + { + "epoch": 0.74, + "grad_norm": 2.7555431485595028, + "learning_rate": 3.265091164665477e-06, + "loss": 0.7632, + "step": 4855 + }, + { + "epoch": 0.74, + "grad_norm": 2.4544350325423854, + "learning_rate": 3.261427401557987e-06, + "loss": 0.7764, + "step": 4856 + }, + { + "epoch": 0.74, + "grad_norm": 2.9536442126066604, + "learning_rate": 3.2577652946026483e-06, + "loss": 0.8453, + "step": 4857 + }, + { + "epoch": 0.74, + "grad_norm": 2.9074987910434906, + "learning_rate": 3.2541048446995104e-06, + "loss": 0.7189, + "step": 4858 + }, + { + "epoch": 0.74, + "grad_norm": 3.387376823817222, + "learning_rate": 3.2504460527482062e-06, + "loss": 0.8473, + "step": 4859 + }, + { + "epoch": 0.74, + "grad_norm": 2.630468030984959, + "learning_rate": 3.246788919647961e-06, + "loss": 0.7018, + "step": 4860 + }, + { + "epoch": 0.74, + "grad_norm": 2.673732710500758, + "learning_rate": 3.2431334462976007e-06, + "loss": 0.7881, + "step": 4861 + }, + { + "epoch": 0.74, + "grad_norm": 2.4794806478398836, + "learning_rate": 3.239479633595536e-06, + "loss": 0.6952, + "step": 4862 + }, + { + "epoch": 0.74, + "grad_norm": 2.7444790121034806, + "learning_rate": 3.2358274824397685e-06, + "loss": 0.7908, + "step": 4863 + }, + { + "epoch": 0.74, + "grad_norm": 2.7718138842390156, + "learning_rate": 3.232176993727901e-06, + "loss": 0.7955, + "step": 4864 + }, + { + "epoch": 0.74, + "grad_norm": 2.496774754790833, + "learning_rate": 3.2285281683571178e-06, + "loss": 0.7177, + "step": 4865 + }, + { + "epoch": 0.74, + "grad_norm": 2.5301033397244064, + "learning_rate": 3.2248810072241974e-06, + "loss": 0.8109, + "step": 4866 + }, + { + "epoch": 0.74, + "grad_norm": 2.926696010840226, + "learning_rate": 3.22123551122551e-06, + "loss": 0.7954, + "step": 4867 + }, + { + "epoch": 0.75, + "grad_norm": 2.8720728648076888, + "learning_rate": 3.2175916812570128e-06, + "loss": 0.8188, + "step": 4868 + }, + { + "epoch": 0.75, + "grad_norm": 2.7452597000369656, + "learning_rate": 3.2139495182142656e-06, + "loss": 0.7985, + "step": 4869 + }, + { + "epoch": 0.75, + "grad_norm": 2.622907404479844, + "learning_rate": 3.2103090229924028e-06, + "loss": 0.8043, + "step": 4870 + }, + { + "epoch": 0.75, + "grad_norm": 2.4188680852947604, + "learning_rate": 3.206670196486156e-06, + "loss": 0.6735, + "step": 4871 + }, + { + "epoch": 0.75, + "grad_norm": 2.4322841023452044, + "learning_rate": 3.203033039589851e-06, + "loss": 0.6663, + "step": 4872 + }, + { + "epoch": 0.75, + "grad_norm": 2.508207235025389, + "learning_rate": 3.1993975531973986e-06, + "loss": 0.7658, + "step": 4873 + }, + { + "epoch": 0.75, + "grad_norm": 2.7555177905513784, + "learning_rate": 3.1957637382022934e-06, + "loss": 0.7667, + "step": 4874 + }, + { + "epoch": 0.75, + "grad_norm": 2.5579291429948676, + "learning_rate": 3.1921315954976317e-06, + "loss": 0.695, + "step": 4875 + }, + { + "epoch": 0.75, + "grad_norm": 2.5855212329014883, + "learning_rate": 3.1885011259760913e-06, + "loss": 0.8272, + "step": 4876 + }, + { + "epoch": 0.75, + "grad_norm": 2.6520622459619694, + "learning_rate": 3.1848723305299377e-06, + "loss": 0.7435, + "step": 4877 + }, + { + "epoch": 0.75, + "grad_norm": 2.6051923472077285, + "learning_rate": 3.1812452100510283e-06, + "loss": 0.769, + "step": 4878 + }, + { + "epoch": 0.75, + "grad_norm": 2.7947194109042437, + "learning_rate": 3.177619765430804e-06, + "loss": 0.8326, + "step": 4879 + }, + { + "epoch": 0.75, + "grad_norm": 2.802329035693689, + "learning_rate": 3.1739959975603028e-06, + "loss": 0.832, + "step": 4880 + }, + { + "epoch": 0.75, + "grad_norm": 2.7431713512017355, + "learning_rate": 3.1703739073301443e-06, + "loss": 0.8379, + "step": 4881 + }, + { + "epoch": 0.75, + "grad_norm": 2.653079167074889, + "learning_rate": 3.166753495630531e-06, + "loss": 0.8242, + "step": 4882 + }, + { + "epoch": 0.75, + "grad_norm": 2.801604761107011, + "learning_rate": 3.1631347633512665e-06, + "loss": 0.7701, + "step": 4883 + }, + { + "epoch": 0.75, + "grad_norm": 2.4733006429141358, + "learning_rate": 3.1595177113817298e-06, + "loss": 0.7112, + "step": 4884 + }, + { + "epoch": 0.75, + "grad_norm": 3.1772595038733957, + "learning_rate": 3.1559023406108892e-06, + "loss": 0.7305, + "step": 4885 + }, + { + "epoch": 0.75, + "grad_norm": 3.225116065554904, + "learning_rate": 3.1522886519273053e-06, + "loss": 0.8426, + "step": 4886 + }, + { + "epoch": 0.75, + "grad_norm": 3.0385149713556774, + "learning_rate": 3.14867664621912e-06, + "loss": 0.7618, + "step": 4887 + }, + { + "epoch": 0.75, + "grad_norm": 2.662748289735663, + "learning_rate": 3.145066324374062e-06, + "loss": 0.7236, + "step": 4888 + }, + { + "epoch": 0.75, + "grad_norm": 3.0617462390536136, + "learning_rate": 3.141457687279448e-06, + "loss": 0.6848, + "step": 4889 + }, + { + "epoch": 0.75, + "grad_norm": 2.49603952714078, + "learning_rate": 3.1378507358221765e-06, + "loss": 0.7738, + "step": 4890 + }, + { + "epoch": 0.75, + "grad_norm": 2.996649925386801, + "learning_rate": 3.1342454708887404e-06, + "loss": 0.8548, + "step": 4891 + }, + { + "epoch": 0.75, + "grad_norm": 2.936472847178242, + "learning_rate": 3.1306418933652105e-06, + "loss": 0.7815, + "step": 4892 + }, + { + "epoch": 0.75, + "grad_norm": 2.7740070586751817, + "learning_rate": 3.127040004137242e-06, + "loss": 0.8209, + "step": 4893 + }, + { + "epoch": 0.75, + "grad_norm": 2.6643704083806368, + "learning_rate": 3.1234398040900836e-06, + "loss": 0.7166, + "step": 4894 + }, + { + "epoch": 0.75, + "grad_norm": 2.4987987550750006, + "learning_rate": 3.119841294108562e-06, + "loss": 0.6859, + "step": 4895 + }, + { + "epoch": 0.75, + "grad_norm": 2.551914873986552, + "learning_rate": 3.116244475077086e-06, + "loss": 0.7637, + "step": 4896 + }, + { + "epoch": 0.75, + "grad_norm": 2.574786283194233, + "learning_rate": 3.112649347879658e-06, + "loss": 0.7759, + "step": 4897 + }, + { + "epoch": 0.75, + "grad_norm": 2.5550201444355434, + "learning_rate": 3.1090559133998576e-06, + "loss": 0.8019, + "step": 4898 + }, + { + "epoch": 0.75, + "grad_norm": 4.54132320674916, + "learning_rate": 3.10546417252085e-06, + "loss": 0.8308, + "step": 4899 + }, + { + "epoch": 0.75, + "grad_norm": 2.981833030314064, + "learning_rate": 3.1018741261253836e-06, + "loss": 0.8591, + "step": 4900 + }, + { + "epoch": 0.75, + "grad_norm": 2.7911752802943677, + "learning_rate": 3.0982857750957895e-06, + "loss": 0.8839, + "step": 4901 + }, + { + "epoch": 0.75, + "grad_norm": 2.7415236410740995, + "learning_rate": 3.0946991203139896e-06, + "loss": 0.7958, + "step": 4902 + }, + { + "epoch": 0.75, + "grad_norm": 2.4625353307618836, + "learning_rate": 3.0911141626614792e-06, + "loss": 0.6676, + "step": 4903 + }, + { + "epoch": 0.75, + "grad_norm": 2.5561719544460404, + "learning_rate": 3.087530903019337e-06, + "loss": 0.8083, + "step": 4904 + }, + { + "epoch": 0.75, + "grad_norm": 2.8144420696732704, + "learning_rate": 3.0839493422682344e-06, + "loss": 0.7585, + "step": 4905 + }, + { + "epoch": 0.75, + "grad_norm": 2.9425670291569945, + "learning_rate": 3.0803694812884167e-06, + "loss": 0.7633, + "step": 4906 + }, + { + "epoch": 0.75, + "grad_norm": 3.743831645079609, + "learning_rate": 3.0767913209597076e-06, + "loss": 0.8813, + "step": 4907 + }, + { + "epoch": 0.75, + "grad_norm": 2.7116331823136397, + "learning_rate": 3.0732148621615266e-06, + "loss": 0.7728, + "step": 4908 + }, + { + "epoch": 0.75, + "grad_norm": 2.474561448824525, + "learning_rate": 3.069640105772864e-06, + "loss": 0.7739, + "step": 4909 + }, + { + "epoch": 0.75, + "grad_norm": 2.4641070171397588, + "learning_rate": 3.066067052672295e-06, + "loss": 0.8748, + "step": 4910 + }, + { + "epoch": 0.75, + "grad_norm": 2.5126521048603783, + "learning_rate": 3.0624957037379733e-06, + "loss": 0.8056, + "step": 4911 + }, + { + "epoch": 0.75, + "grad_norm": 2.646426069683561, + "learning_rate": 3.0589260598476354e-06, + "loss": 0.7136, + "step": 4912 + }, + { + "epoch": 0.75, + "grad_norm": 2.5569589171109284, + "learning_rate": 3.0553581218786053e-06, + "loss": 0.8465, + "step": 4913 + }, + { + "epoch": 0.75, + "grad_norm": 2.383925636701246, + "learning_rate": 3.0517918907077805e-06, + "loss": 0.6981, + "step": 4914 + }, + { + "epoch": 0.75, + "grad_norm": 2.6317329134965637, + "learning_rate": 3.0482273672116347e-06, + "loss": 0.8363, + "step": 4915 + }, + { + "epoch": 0.75, + "grad_norm": 2.599309419546993, + "learning_rate": 3.0446645522662356e-06, + "loss": 0.8145, + "step": 4916 + }, + { + "epoch": 0.75, + "grad_norm": 2.298720627729177, + "learning_rate": 3.041103446747219e-06, + "loss": 0.7698, + "step": 4917 + }, + { + "epoch": 0.75, + "grad_norm": 2.792177265740668, + "learning_rate": 3.037544051529804e-06, + "loss": 0.8006, + "step": 4918 + }, + { + "epoch": 0.75, + "grad_norm": 2.869378994570629, + "learning_rate": 3.0339863674887938e-06, + "loss": 0.6888, + "step": 4919 + }, + { + "epoch": 0.75, + "grad_norm": 2.9143939203489646, + "learning_rate": 3.0304303954985658e-06, + "loss": 0.8653, + "step": 4920 + }, + { + "epoch": 0.75, + "grad_norm": 2.68410731957122, + "learning_rate": 3.026876136433078e-06, + "loss": 0.8332, + "step": 4921 + }, + { + "epoch": 0.75, + "grad_norm": 2.544048861240414, + "learning_rate": 3.0233235911658665e-06, + "loss": 0.7344, + "step": 4922 + }, + { + "epoch": 0.75, + "grad_norm": 3.441477009221491, + "learning_rate": 3.0197727605700457e-06, + "loss": 0.8102, + "step": 4923 + }, + { + "epoch": 0.75, + "grad_norm": 2.6620877456991425, + "learning_rate": 3.016223645518315e-06, + "loss": 0.7534, + "step": 4924 + }, + { + "epoch": 0.75, + "grad_norm": 2.8841099980051874, + "learning_rate": 3.012676246882945e-06, + "loss": 0.8603, + "step": 4925 + }, + { + "epoch": 0.75, + "grad_norm": 2.6406222549052223, + "learning_rate": 3.009130565535784e-06, + "loss": 0.7859, + "step": 4926 + }, + { + "epoch": 0.75, + "grad_norm": 2.779876928459848, + "learning_rate": 3.005586602348266e-06, + "loss": 0.8433, + "step": 4927 + }, + { + "epoch": 0.75, + "grad_norm": 3.1990330226074764, + "learning_rate": 3.002044358191396e-06, + "loss": 0.822, + "step": 4928 + }, + { + "epoch": 0.75, + "grad_norm": 2.653865192531658, + "learning_rate": 2.998503833935754e-06, + "loss": 0.8725, + "step": 4929 + }, + { + "epoch": 0.75, + "grad_norm": 2.7786270658771475, + "learning_rate": 2.9949650304515098e-06, + "loss": 0.8057, + "step": 4930 + }, + { + "epoch": 0.75, + "grad_norm": 2.693592035006558, + "learning_rate": 2.9914279486083963e-06, + "loss": 0.8249, + "step": 4931 + }, + { + "epoch": 0.75, + "grad_norm": 2.6704855809519694, + "learning_rate": 2.9878925892757316e-06, + "loss": 0.868, + "step": 4932 + }, + { + "epoch": 0.76, + "grad_norm": 3.268918586765798, + "learning_rate": 2.9843589533224047e-06, + "loss": 0.8498, + "step": 4933 + }, + { + "epoch": 0.76, + "grad_norm": 2.651243223030544, + "learning_rate": 2.980827041616884e-06, + "loss": 0.796, + "step": 4934 + }, + { + "epoch": 0.76, + "grad_norm": 2.469076464190495, + "learning_rate": 2.97729685502722e-06, + "loss": 0.6783, + "step": 4935 + }, + { + "epoch": 0.76, + "grad_norm": 2.6319493654941577, + "learning_rate": 2.9737683944210293e-06, + "loss": 0.8822, + "step": 4936 + }, + { + "epoch": 0.76, + "grad_norm": 2.586829052141713, + "learning_rate": 2.970241660665506e-06, + "loss": 0.7955, + "step": 4937 + }, + { + "epoch": 0.76, + "grad_norm": 2.6870831720388932, + "learning_rate": 2.9667166546274286e-06, + "loss": 0.9148, + "step": 4938 + }, + { + "epoch": 0.76, + "grad_norm": 2.754533272622671, + "learning_rate": 2.9631933771731414e-06, + "loss": 0.8005, + "step": 4939 + }, + { + "epoch": 0.76, + "grad_norm": 2.5926012760865684, + "learning_rate": 2.9596718291685655e-06, + "loss": 0.9103, + "step": 4940 + }, + { + "epoch": 0.76, + "grad_norm": 2.62728719562524, + "learning_rate": 2.9561520114792032e-06, + "loss": 0.7476, + "step": 4941 + }, + { + "epoch": 0.76, + "grad_norm": 2.62973819085915, + "learning_rate": 2.952633924970121e-06, + "loss": 0.8022, + "step": 4942 + }, + { + "epoch": 0.76, + "grad_norm": 2.8037242977558896, + "learning_rate": 2.949117570505975e-06, + "loss": 0.8509, + "step": 4943 + }, + { + "epoch": 0.76, + "grad_norm": 2.660119092568261, + "learning_rate": 2.9456029489509773e-06, + "loss": 0.8049, + "step": 4944 + }, + { + "epoch": 0.76, + "grad_norm": 2.7454690787999785, + "learning_rate": 2.942090061168925e-06, + "loss": 0.8604, + "step": 4945 + }, + { + "epoch": 0.76, + "grad_norm": 2.7323098990553603, + "learning_rate": 2.938578908023192e-06, + "loss": 0.8264, + "step": 4946 + }, + { + "epoch": 0.76, + "grad_norm": 2.7730572623711573, + "learning_rate": 2.9350694903767185e-06, + "loss": 0.7949, + "step": 4947 + }, + { + "epoch": 0.76, + "grad_norm": 2.961591688622995, + "learning_rate": 2.9315618090920173e-06, + "loss": 0.6742, + "step": 4948 + }, + { + "epoch": 0.76, + "grad_norm": 2.650539798329177, + "learning_rate": 2.9280558650311842e-06, + "loss": 0.7325, + "step": 4949 + }, + { + "epoch": 0.76, + "grad_norm": 2.712772938021309, + "learning_rate": 2.9245516590558796e-06, + "loss": 0.8108, + "step": 4950 + }, + { + "epoch": 0.76, + "grad_norm": 2.613491635699674, + "learning_rate": 2.9210491920273365e-06, + "loss": 0.7417, + "step": 4951 + }, + { + "epoch": 0.76, + "grad_norm": 2.572729005128316, + "learning_rate": 2.917548464806368e-06, + "loss": 0.8002, + "step": 4952 + }, + { + "epoch": 0.76, + "grad_norm": 2.4466386915599383, + "learning_rate": 2.914049478253349e-06, + "loss": 0.766, + "step": 4953 + }, + { + "epoch": 0.76, + "grad_norm": 2.615461090522433, + "learning_rate": 2.910552233228239e-06, + "loss": 0.8398, + "step": 4954 + }, + { + "epoch": 0.76, + "grad_norm": 2.7812802922894964, + "learning_rate": 2.907056730590558e-06, + "loss": 0.8144, + "step": 4955 + }, + { + "epoch": 0.76, + "grad_norm": 2.6422650497261895, + "learning_rate": 2.903562971199405e-06, + "loss": 0.7422, + "step": 4956 + }, + { + "epoch": 0.76, + "grad_norm": 2.6107459832204496, + "learning_rate": 2.9000709559134456e-06, + "loss": 0.7618, + "step": 4957 + }, + { + "epoch": 0.76, + "grad_norm": 2.537852760775243, + "learning_rate": 2.8965806855909207e-06, + "loss": 0.8506, + "step": 4958 + }, + { + "epoch": 0.76, + "grad_norm": 3.247518579755631, + "learning_rate": 2.8930921610896366e-06, + "loss": 0.8605, + "step": 4959 + }, + { + "epoch": 0.76, + "grad_norm": 2.649138417092869, + "learning_rate": 2.8896053832669822e-06, + "loss": 0.7862, + "step": 4960 + }, + { + "epoch": 0.76, + "grad_norm": 2.812294876476703, + "learning_rate": 2.8861203529799052e-06, + "loss": 0.8908, + "step": 4961 + }, + { + "epoch": 0.76, + "grad_norm": 2.4486042948844307, + "learning_rate": 2.8826370710849274e-06, + "loss": 0.7694, + "step": 4962 + }, + { + "epoch": 0.76, + "grad_norm": 2.767682856019066, + "learning_rate": 2.8791555384381466e-06, + "loss": 0.8236, + "step": 4963 + }, + { + "epoch": 0.76, + "grad_norm": 2.853722056482501, + "learning_rate": 2.8756757558952186e-06, + "loss": 0.7457, + "step": 4964 + }, + { + "epoch": 0.76, + "grad_norm": 2.6353513595010014, + "learning_rate": 2.8721977243113854e-06, + "loss": 0.7987, + "step": 4965 + }, + { + "epoch": 0.76, + "grad_norm": 2.9905809231839355, + "learning_rate": 2.8687214445414434e-06, + "loss": 0.7522, + "step": 4966 + }, + { + "epoch": 0.76, + "grad_norm": 2.418550038242517, + "learning_rate": 2.8652469174397667e-06, + "loss": 0.7241, + "step": 4967 + }, + { + "epoch": 0.76, + "grad_norm": 2.4665763541728247, + "learning_rate": 2.8617741438602964e-06, + "loss": 0.736, + "step": 4968 + }, + { + "epoch": 0.76, + "grad_norm": 2.5783211822392196, + "learning_rate": 2.8583031246565417e-06, + "loss": 0.8074, + "step": 4969 + }, + { + "epoch": 0.76, + "grad_norm": 2.7056606174723274, + "learning_rate": 2.8548338606815805e-06, + "loss": 0.7845, + "step": 4970 + }, + { + "epoch": 0.76, + "grad_norm": 2.6595235853664403, + "learning_rate": 2.8513663527880653e-06, + "loss": 0.771, + "step": 4971 + }, + { + "epoch": 0.76, + "grad_norm": 2.66613898311809, + "learning_rate": 2.8479006018282096e-06, + "loss": 0.6243, + "step": 4972 + }, + { + "epoch": 0.76, + "grad_norm": 2.692923874491351, + "learning_rate": 2.8444366086537943e-06, + "loss": 0.8159, + "step": 4973 + }, + { + "epoch": 0.76, + "grad_norm": 2.5376497677560717, + "learning_rate": 2.840974374116179e-06, + "loss": 0.711, + "step": 4974 + }, + { + "epoch": 0.76, + "grad_norm": 2.863142652970658, + "learning_rate": 2.8375138990662766e-06, + "loss": 0.8243, + "step": 4975 + }, + { + "epoch": 0.76, + "grad_norm": 2.488143347726353, + "learning_rate": 2.8340551843545817e-06, + "loss": 0.7433, + "step": 4976 + }, + { + "epoch": 0.76, + "grad_norm": 2.8879241474099566, + "learning_rate": 2.8305982308311453e-06, + "loss": 0.8774, + "step": 4977 + }, + { + "epoch": 0.76, + "grad_norm": 2.5762550206887824, + "learning_rate": 2.827143039345591e-06, + "loss": 0.7337, + "step": 4978 + }, + { + "epoch": 0.76, + "grad_norm": 2.595931442952882, + "learning_rate": 2.823689610747108e-06, + "loss": 0.8634, + "step": 4979 + }, + { + "epoch": 0.76, + "grad_norm": 2.3186033910243693, + "learning_rate": 2.820237945884451e-06, + "loss": 0.7204, + "step": 4980 + }, + { + "epoch": 0.76, + "grad_norm": 2.614217693941196, + "learning_rate": 2.8167880456059394e-06, + "loss": 0.7471, + "step": 4981 + }, + { + "epoch": 0.76, + "grad_norm": 2.4858416594026043, + "learning_rate": 2.8133399107594683e-06, + "loss": 0.7809, + "step": 4982 + }, + { + "epoch": 0.76, + "grad_norm": 2.937018679007705, + "learning_rate": 2.80989354219249e-06, + "loss": 0.7293, + "step": 4983 + }, + { + "epoch": 0.76, + "grad_norm": 2.670884778953924, + "learning_rate": 2.8064489407520225e-06, + "loss": 0.764, + "step": 4984 + }, + { + "epoch": 0.76, + "grad_norm": 2.5856640266782973, + "learning_rate": 2.803006107284657e-06, + "loss": 0.8138, + "step": 4985 + }, + { + "epoch": 0.76, + "grad_norm": 3.225279699315726, + "learning_rate": 2.7995650426365405e-06, + "loss": 0.7173, + "step": 4986 + }, + { + "epoch": 0.76, + "grad_norm": 2.611351350122502, + "learning_rate": 2.7961257476533954e-06, + "loss": 0.7806, + "step": 4987 + }, + { + "epoch": 0.76, + "grad_norm": 2.671165795967523, + "learning_rate": 2.792688223180502e-06, + "loss": 0.8442, + "step": 4988 + }, + { + "epoch": 0.76, + "grad_norm": 3.221238185390751, + "learning_rate": 2.7892524700627053e-06, + "loss": 0.8327, + "step": 4989 + }, + { + "epoch": 0.76, + "grad_norm": 2.7434210137367905, + "learning_rate": 2.7858184891444197e-06, + "loss": 0.7641, + "step": 4990 + }, + { + "epoch": 0.76, + "grad_norm": 2.642891673485018, + "learning_rate": 2.7823862812696203e-06, + "loss": 0.7117, + "step": 4991 + }, + { + "epoch": 0.76, + "grad_norm": 2.713233880054568, + "learning_rate": 2.7789558472818435e-06, + "loss": 0.7222, + "step": 4992 + }, + { + "epoch": 0.76, + "grad_norm": 2.7191943051310097, + "learning_rate": 2.775527188024201e-06, + "loss": 0.8237, + "step": 4993 + }, + { + "epoch": 0.76, + "grad_norm": 2.9476597995929237, + "learning_rate": 2.772100304339355e-06, + "loss": 0.8084, + "step": 4994 + }, + { + "epoch": 0.76, + "grad_norm": 2.9476214832791774, + "learning_rate": 2.7686751970695427e-06, + "loss": 0.7891, + "step": 4995 + }, + { + "epoch": 0.76, + "grad_norm": 3.0800081298559814, + "learning_rate": 2.7652518670565577e-06, + "loss": 0.8109, + "step": 4996 + }, + { + "epoch": 0.76, + "grad_norm": 2.455886649514224, + "learning_rate": 2.7618303151417534e-06, + "loss": 0.6759, + "step": 4997 + }, + { + "epoch": 0.77, + "grad_norm": 2.53600811818245, + "learning_rate": 2.758410542166059e-06, + "loss": 0.8115, + "step": 4998 + }, + { + "epoch": 0.77, + "grad_norm": 2.819024745363265, + "learning_rate": 2.754992548969956e-06, + "loss": 0.8393, + "step": 4999 + }, + { + "epoch": 0.77, + "grad_norm": 2.6405712089148254, + "learning_rate": 2.75157633639349e-06, + "loss": 0.7871, + "step": 5000 + }, + { + "epoch": 0.77, + "grad_norm": 2.689984644938119, + "learning_rate": 2.748161905276271e-06, + "loss": 0.7925, + "step": 5001 + }, + { + "epoch": 0.77, + "grad_norm": 2.5912458575941604, + "learning_rate": 2.7447492564574708e-06, + "loss": 0.8126, + "step": 5002 + }, + { + "epoch": 0.77, + "grad_norm": 2.922850248609202, + "learning_rate": 2.7413383907758183e-06, + "loss": 0.8764, + "step": 5003 + }, + { + "epoch": 0.77, + "grad_norm": 2.9523305387496404, + "learning_rate": 2.737929309069616e-06, + "loss": 0.7544, + "step": 5004 + }, + { + "epoch": 0.77, + "grad_norm": 2.7388750693266837, + "learning_rate": 2.7345220121767136e-06, + "loss": 0.7313, + "step": 5005 + }, + { + "epoch": 0.77, + "grad_norm": 2.5705254543989833, + "learning_rate": 2.7311165009345362e-06, + "loss": 0.6219, + "step": 5006 + }, + { + "epoch": 0.77, + "grad_norm": 2.6715192568830886, + "learning_rate": 2.7277127761800592e-06, + "loss": 0.8486, + "step": 5007 + }, + { + "epoch": 0.77, + "grad_norm": 2.596380510041663, + "learning_rate": 2.7243108387498207e-06, + "loss": 0.7227, + "step": 5008 + }, + { + "epoch": 0.77, + "grad_norm": 2.586844529728933, + "learning_rate": 2.7209106894799253e-06, + "loss": 0.8468, + "step": 5009 + }, + { + "epoch": 0.77, + "grad_norm": 11.099774070134371, + "learning_rate": 2.7175123292060335e-06, + "loss": 0.8458, + "step": 5010 + }, + { + "epoch": 0.77, + "grad_norm": 2.568013495598628, + "learning_rate": 2.714115758763366e-06, + "loss": 0.7795, + "step": 5011 + }, + { + "epoch": 0.77, + "grad_norm": 2.5389843195512642, + "learning_rate": 2.710720978986705e-06, + "loss": 0.8222, + "step": 5012 + }, + { + "epoch": 0.77, + "grad_norm": 2.4481461619079146, + "learning_rate": 2.7073279907103913e-06, + "loss": 0.6732, + "step": 5013 + }, + { + "epoch": 0.77, + "grad_norm": 2.572469747307365, + "learning_rate": 2.703936794768325e-06, + "loss": 0.7647, + "step": 5014 + }, + { + "epoch": 0.77, + "grad_norm": 2.510351138783795, + "learning_rate": 2.7005473919939706e-06, + "loss": 0.7419, + "step": 5015 + }, + { + "epoch": 0.77, + "grad_norm": 2.535945112086226, + "learning_rate": 2.6971597832203434e-06, + "loss": 0.7646, + "step": 5016 + }, + { + "epoch": 0.77, + "grad_norm": 2.7867749494925027, + "learning_rate": 2.693773969280029e-06, + "loss": 0.6355, + "step": 5017 + }, + { + "epoch": 0.77, + "grad_norm": 2.788432665774386, + "learning_rate": 2.6903899510051624e-06, + "loss": 0.913, + "step": 5018 + }, + { + "epoch": 0.77, + "grad_norm": 2.4875282017699964, + "learning_rate": 2.6870077292274366e-06, + "loss": 0.6947, + "step": 5019 + }, + { + "epoch": 0.77, + "grad_norm": 2.75155649877325, + "learning_rate": 2.6836273047781137e-06, + "loss": 0.7729, + "step": 5020 + }, + { + "epoch": 0.77, + "grad_norm": 2.7143198356277995, + "learning_rate": 2.6802486784880044e-06, + "loss": 0.8094, + "step": 5021 + }, + { + "epoch": 0.77, + "grad_norm": 2.6107155069240435, + "learning_rate": 2.676871851187479e-06, + "loss": 0.8053, + "step": 5022 + }, + { + "epoch": 0.77, + "grad_norm": 2.9969143394969984, + "learning_rate": 2.6734968237064686e-06, + "loss": 0.806, + "step": 5023 + }, + { + "epoch": 0.77, + "grad_norm": 2.752371634653211, + "learning_rate": 2.6701235968744587e-06, + "loss": 0.6662, + "step": 5024 + }, + { + "epoch": 0.77, + "grad_norm": 2.552224215391848, + "learning_rate": 2.6667521715204914e-06, + "loss": 0.7168, + "step": 5025 + }, + { + "epoch": 0.77, + "grad_norm": 2.620940328135656, + "learning_rate": 2.6633825484731746e-06, + "loss": 0.8322, + "step": 5026 + }, + { + "epoch": 0.77, + "grad_norm": 5.580152864684537, + "learning_rate": 2.6600147285606625e-06, + "loss": 0.8152, + "step": 5027 + }, + { + "epoch": 0.77, + "grad_norm": 2.3639360061856247, + "learning_rate": 2.6566487126106745e-06, + "loss": 0.6962, + "step": 5028 + }, + { + "epoch": 0.77, + "grad_norm": 2.406455472298094, + "learning_rate": 2.6532845014504814e-06, + "loss": 0.7394, + "step": 5029 + }, + { + "epoch": 0.77, + "grad_norm": 2.6169693952537822, + "learning_rate": 2.6499220959069085e-06, + "loss": 0.7742, + "step": 5030 + }, + { + "epoch": 0.77, + "grad_norm": 2.4927676361698934, + "learning_rate": 2.6465614968063456e-06, + "loss": 0.781, + "step": 5031 + }, + { + "epoch": 0.77, + "grad_norm": 2.62965426461676, + "learning_rate": 2.6432027049747333e-06, + "loss": 0.8843, + "step": 5032 + }, + { + "epoch": 0.77, + "grad_norm": 2.7037687891142763, + "learning_rate": 2.639845721237566e-06, + "loss": 0.8107, + "step": 5033 + }, + { + "epoch": 0.77, + "grad_norm": 2.643649919678502, + "learning_rate": 2.6364905464198987e-06, + "loss": 0.7893, + "step": 5034 + }, + { + "epoch": 0.77, + "grad_norm": 2.691892689329616, + "learning_rate": 2.6331371813463356e-06, + "loss": 0.8265, + "step": 5035 + }, + { + "epoch": 0.77, + "grad_norm": 2.6975452736113454, + "learning_rate": 2.6297856268410406e-06, + "loss": 0.81, + "step": 5036 + }, + { + "epoch": 0.77, + "grad_norm": 2.4158822610707325, + "learning_rate": 2.626435883727735e-06, + "loss": 0.6869, + "step": 5037 + }, + { + "epoch": 0.77, + "grad_norm": 2.4939839844165905, + "learning_rate": 2.623087952829688e-06, + "loss": 0.8316, + "step": 5038 + }, + { + "epoch": 0.77, + "grad_norm": 2.776427207607364, + "learning_rate": 2.619741834969731e-06, + "loss": 0.7877, + "step": 5039 + }, + { + "epoch": 0.77, + "grad_norm": 2.3779010383973707, + "learning_rate": 2.616397530970244e-06, + "loss": 0.6899, + "step": 5040 + }, + { + "epoch": 0.77, + "grad_norm": 2.6083802441754464, + "learning_rate": 2.6130550416531597e-06, + "loss": 0.7778, + "step": 5041 + }, + { + "epoch": 0.77, + "grad_norm": 2.56450784477195, + "learning_rate": 2.609714367839975e-06, + "loss": 0.7963, + "step": 5042 + }, + { + "epoch": 0.77, + "grad_norm": 2.5459301905849654, + "learning_rate": 2.606375510351731e-06, + "loss": 0.7842, + "step": 5043 + }, + { + "epoch": 0.77, + "grad_norm": 2.5217148421897546, + "learning_rate": 2.6030384700090238e-06, + "loss": 0.7401, + "step": 5044 + }, + { + "epoch": 0.77, + "grad_norm": 2.45823283836036, + "learning_rate": 2.599703247632005e-06, + "loss": 0.7311, + "step": 5045 + }, + { + "epoch": 0.77, + "grad_norm": 2.9579669428110598, + "learning_rate": 2.596369844040378e-06, + "loss": 0.7747, + "step": 5046 + }, + { + "epoch": 0.77, + "grad_norm": 2.5750011906269923, + "learning_rate": 2.5930382600533998e-06, + "loss": 0.7482, + "step": 5047 + }, + { + "epoch": 0.77, + "grad_norm": 2.7888629756635206, + "learning_rate": 2.5897084964898835e-06, + "loss": 0.7846, + "step": 5048 + }, + { + "epoch": 0.77, + "grad_norm": 2.5721294503502157, + "learning_rate": 2.586380554168185e-06, + "loss": 0.6925, + "step": 5049 + }, + { + "epoch": 0.77, + "grad_norm": 2.7618620787962493, + "learning_rate": 2.583054433906228e-06, + "loss": 0.7054, + "step": 5050 + }, + { + "epoch": 0.77, + "grad_norm": 2.6276942856050938, + "learning_rate": 2.5797301365214742e-06, + "loss": 0.7645, + "step": 5051 + }, + { + "epoch": 0.77, + "grad_norm": 2.616392198716273, + "learning_rate": 2.576407662830942e-06, + "loss": 0.7362, + "step": 5052 + }, + { + "epoch": 0.77, + "grad_norm": 2.711256026542573, + "learning_rate": 2.5730870136512055e-06, + "loss": 0.7994, + "step": 5053 + }, + { + "epoch": 0.77, + "grad_norm": 2.6991497732690743, + "learning_rate": 2.5697681897983862e-06, + "loss": 0.8072, + "step": 5054 + }, + { + "epoch": 0.77, + "grad_norm": 2.4775791838568586, + "learning_rate": 2.566451192088156e-06, + "loss": 0.7789, + "step": 5055 + }, + { + "epoch": 0.77, + "grad_norm": 8.388324935782896, + "learning_rate": 2.5631360213357425e-06, + "loss": 0.8261, + "step": 5056 + }, + { + "epoch": 0.77, + "grad_norm": 2.514421673855606, + "learning_rate": 2.5598226783559198e-06, + "loss": 0.7355, + "step": 5057 + }, + { + "epoch": 0.77, + "grad_norm": 2.536006040702953, + "learning_rate": 2.5565111639630125e-06, + "loss": 0.6696, + "step": 5058 + }, + { + "epoch": 0.77, + "grad_norm": 2.7285987787744275, + "learning_rate": 2.5532014789709027e-06, + "loss": 0.8219, + "step": 5059 + }, + { + "epoch": 0.77, + "grad_norm": 2.476472443285063, + "learning_rate": 2.549893624193014e-06, + "loss": 0.7924, + "step": 5060 + }, + { + "epoch": 0.77, + "grad_norm": 2.662355526288266, + "learning_rate": 2.5465876004423285e-06, + "loss": 0.7893, + "step": 5061 + }, + { + "epoch": 0.77, + "grad_norm": 2.507656262345981, + "learning_rate": 2.543283408531373e-06, + "loss": 0.7732, + "step": 5062 + }, + { + "epoch": 0.77, + "grad_norm": 2.8257941998664964, + "learning_rate": 2.53998104927222e-06, + "loss": 0.7729, + "step": 5063 + }, + { + "epoch": 0.78, + "grad_norm": 2.7337037395372334, + "learning_rate": 2.5366805234765047e-06, + "loss": 0.7656, + "step": 5064 + }, + { + "epoch": 0.78, + "grad_norm": 2.5500710978463883, + "learning_rate": 2.5333818319554002e-06, + "loss": 0.6465, + "step": 5065 + }, + { + "epoch": 0.78, + "grad_norm": 2.7708922146707167, + "learning_rate": 2.530084975519629e-06, + "loss": 0.8558, + "step": 5066 + }, + { + "epoch": 0.78, + "grad_norm": 2.7554935025214697, + "learning_rate": 2.5267899549794728e-06, + "loss": 0.754, + "step": 5067 + }, + { + "epoch": 0.78, + "grad_norm": 2.4565508163642993, + "learning_rate": 2.523496771144751e-06, + "loss": 0.8833, + "step": 5068 + }, + { + "epoch": 0.78, + "grad_norm": 2.5094279415765417, + "learning_rate": 2.5202054248248362e-06, + "loss": 0.7924, + "step": 5069 + }, + { + "epoch": 0.78, + "grad_norm": 3.172596205277227, + "learning_rate": 2.5169159168286503e-06, + "loss": 0.7781, + "step": 5070 + }, + { + "epoch": 0.78, + "grad_norm": 2.780052244901856, + "learning_rate": 2.513628247964658e-06, + "loss": 0.8596, + "step": 5071 + }, + { + "epoch": 0.78, + "grad_norm": 2.6389858730839517, + "learning_rate": 2.510342419040881e-06, + "loss": 0.7318, + "step": 5072 + }, + { + "epoch": 0.78, + "grad_norm": 3.0811186388492193, + "learning_rate": 2.5070584308648828e-06, + "loss": 0.8587, + "step": 5073 + }, + { + "epoch": 0.78, + "grad_norm": 2.53080728953482, + "learning_rate": 2.503776284243772e-06, + "loss": 0.7536, + "step": 5074 + }, + { + "epoch": 0.78, + "grad_norm": 2.7781051409831026, + "learning_rate": 2.5004959799842133e-06, + "loss": 0.8435, + "step": 5075 + }, + { + "epoch": 0.78, + "grad_norm": 2.7029578113303256, + "learning_rate": 2.497217518892412e-06, + "loss": 0.7928, + "step": 5076 + }, + { + "epoch": 0.78, + "grad_norm": 2.7198966214625666, + "learning_rate": 2.493940901774118e-06, + "loss": 0.7304, + "step": 5077 + }, + { + "epoch": 0.78, + "grad_norm": 3.482025583931032, + "learning_rate": 2.490666129434638e-06, + "loss": 0.828, + "step": 5078 + }, + { + "epoch": 0.78, + "grad_norm": 2.46772119696312, + "learning_rate": 2.4873932026788162e-06, + "loss": 0.7316, + "step": 5079 + }, + { + "epoch": 0.78, + "grad_norm": 2.5961425268325375, + "learning_rate": 2.4841221223110467e-06, + "loss": 0.8223, + "step": 5080 + }, + { + "epoch": 0.78, + "grad_norm": 2.5679783685419544, + "learning_rate": 2.4808528891352677e-06, + "loss": 0.7975, + "step": 5081 + }, + { + "epoch": 0.78, + "grad_norm": 2.7506846436308914, + "learning_rate": 2.4775855039549647e-06, + "loss": 0.7494, + "step": 5082 + }, + { + "epoch": 0.78, + "grad_norm": 2.5964912817122605, + "learning_rate": 2.4743199675731722e-06, + "loss": 0.7492, + "step": 5083 + }, + { + "epoch": 0.78, + "grad_norm": 2.8934879578568884, + "learning_rate": 2.4710562807924664e-06, + "loss": 0.7708, + "step": 5084 + }, + { + "epoch": 0.78, + "grad_norm": 2.6202697209531114, + "learning_rate": 2.4677944444149683e-06, + "loss": 0.7256, + "step": 5085 + }, + { + "epoch": 0.78, + "grad_norm": 2.626432453067587, + "learning_rate": 2.464534459242348e-06, + "loss": 0.8259, + "step": 5086 + }, + { + "epoch": 0.78, + "grad_norm": 2.521069808194475, + "learning_rate": 2.4612763260758187e-06, + "loss": 0.696, + "step": 5087 + }, + { + "epoch": 0.78, + "grad_norm": 2.64835206059012, + "learning_rate": 2.458020045716134e-06, + "loss": 0.7756, + "step": 5088 + }, + { + "epoch": 0.78, + "grad_norm": 2.3416846206287163, + "learning_rate": 2.4547656189636014e-06, + "loss": 0.7253, + "step": 5089 + }, + { + "epoch": 0.78, + "grad_norm": 2.942055576016125, + "learning_rate": 2.451513046618067e-06, + "loss": 0.7859, + "step": 5090 + }, + { + "epoch": 0.78, + "grad_norm": 2.8389802865849383, + "learning_rate": 2.44826232947892e-06, + "loss": 0.8852, + "step": 5091 + }, + { + "epoch": 0.78, + "grad_norm": 2.511118745358438, + "learning_rate": 2.4450134683450957e-06, + "loss": 0.729, + "step": 5092 + }, + { + "epoch": 0.78, + "grad_norm": 3.3110347546272845, + "learning_rate": 2.4417664640150695e-06, + "loss": 0.7976, + "step": 5093 + }, + { + "epoch": 0.78, + "grad_norm": 2.440522966930613, + "learning_rate": 2.4385213172868716e-06, + "loss": 0.7789, + "step": 5094 + }, + { + "epoch": 0.78, + "grad_norm": 2.6106319712037207, + "learning_rate": 2.4352780289580647e-06, + "loss": 0.8299, + "step": 5095 + }, + { + "epoch": 0.78, + "grad_norm": 2.5840976790034924, + "learning_rate": 2.4320365998257543e-06, + "loss": 0.7839, + "step": 5096 + }, + { + "epoch": 0.78, + "grad_norm": 2.68223277349005, + "learning_rate": 2.4287970306865994e-06, + "loss": 0.7534, + "step": 5097 + }, + { + "epoch": 0.78, + "grad_norm": 2.4092525951805706, + "learning_rate": 2.4255593223367923e-06, + "loss": 0.7492, + "step": 5098 + }, + { + "epoch": 0.78, + "grad_norm": 2.6820676394906435, + "learning_rate": 2.4223234755720672e-06, + "loss": 0.7327, + "step": 5099 + }, + { + "epoch": 0.78, + "grad_norm": 2.567354858149662, + "learning_rate": 2.4190894911877105e-06, + "loss": 0.7733, + "step": 5100 + }, + { + "epoch": 0.78, + "grad_norm": 2.647653908124238, + "learning_rate": 2.4158573699785427e-06, + "loss": 0.7113, + "step": 5101 + }, + { + "epoch": 0.78, + "grad_norm": 2.6788491642529397, + "learning_rate": 2.412627112738928e-06, + "loss": 0.7993, + "step": 5102 + }, + { + "epoch": 0.78, + "grad_norm": 2.9280387750202412, + "learning_rate": 2.4093987202627735e-06, + "loss": 0.8553, + "step": 5103 + }, + { + "epoch": 0.78, + "grad_norm": 2.581053853352887, + "learning_rate": 2.4061721933435246e-06, + "loss": 0.8247, + "step": 5104 + }, + { + "epoch": 0.78, + "grad_norm": 2.580506365930907, + "learning_rate": 2.4029475327741758e-06, + "loss": 0.8352, + "step": 5105 + }, + { + "epoch": 0.78, + "grad_norm": 2.711539323105799, + "learning_rate": 2.3997247393472557e-06, + "loss": 0.7786, + "step": 5106 + }, + { + "epoch": 0.78, + "grad_norm": 3.6544888378906997, + "learning_rate": 2.3965038138548346e-06, + "loss": 0.8774, + "step": 5107 + }, + { + "epoch": 0.78, + "grad_norm": 2.781610147223123, + "learning_rate": 2.3932847570885307e-06, + "loss": 0.7689, + "step": 5108 + }, + { + "epoch": 0.78, + "grad_norm": 2.838519577882593, + "learning_rate": 2.390067569839496e-06, + "loss": 0.7351, + "step": 5109 + }, + { + "epoch": 0.78, + "grad_norm": 2.605314160377416, + "learning_rate": 2.3868522528984207e-06, + "loss": 0.7761, + "step": 5110 + }, + { + "epoch": 0.78, + "grad_norm": 2.7188592167400154, + "learning_rate": 2.383638807055545e-06, + "loss": 0.8472, + "step": 5111 + }, + { + "epoch": 0.78, + "grad_norm": 2.5482203378844366, + "learning_rate": 2.380427233100643e-06, + "loss": 0.7605, + "step": 5112 + }, + { + "epoch": 0.78, + "grad_norm": 2.704449992334734, + "learning_rate": 2.3772175318230272e-06, + "loss": 0.8089, + "step": 5113 + }, + { + "epoch": 0.78, + "grad_norm": 2.8402849358160167, + "learning_rate": 2.374009704011554e-06, + "loss": 0.6916, + "step": 5114 + }, + { + "epoch": 0.78, + "grad_norm": 2.864028803777469, + "learning_rate": 2.3708037504546124e-06, + "loss": 0.799, + "step": 5115 + }, + { + "epoch": 0.78, + "grad_norm": 2.501326567246752, + "learning_rate": 2.367599671940144e-06, + "loss": 0.6988, + "step": 5116 + }, + { + "epoch": 0.78, + "grad_norm": 3.2004108035764625, + "learning_rate": 2.364397469255617e-06, + "loss": 0.8027, + "step": 5117 + }, + { + "epoch": 0.78, + "grad_norm": 2.6490640456468046, + "learning_rate": 2.3611971431880407e-06, + "loss": 0.806, + "step": 5118 + }, + { + "epoch": 0.78, + "grad_norm": 2.5612708686624686, + "learning_rate": 2.357998694523972e-06, + "loss": 0.6694, + "step": 5119 + }, + { + "epoch": 0.78, + "grad_norm": 3.0401653824639516, + "learning_rate": 2.3548021240494955e-06, + "loss": 0.7947, + "step": 5120 + }, + { + "epoch": 0.78, + "grad_norm": 2.8307884578484686, + "learning_rate": 2.3516074325502368e-06, + "loss": 0.8022, + "step": 5121 + }, + { + "epoch": 0.78, + "grad_norm": 2.72977536780408, + "learning_rate": 2.348414620811367e-06, + "loss": 0.7399, + "step": 5122 + }, + { + "epoch": 0.78, + "grad_norm": 2.4748006126897417, + "learning_rate": 2.345223689617586e-06, + "loss": 0.6904, + "step": 5123 + }, + { + "epoch": 0.78, + "grad_norm": 2.4064619793053934, + "learning_rate": 2.342034639753137e-06, + "loss": 0.7289, + "step": 5124 + }, + { + "epoch": 0.78, + "grad_norm": 2.9327774960950537, + "learning_rate": 2.3388474720017963e-06, + "loss": 0.8078, + "step": 5125 + }, + { + "epoch": 0.78, + "grad_norm": 2.4273391870497125, + "learning_rate": 2.33566218714688e-06, + "loss": 0.8186, + "step": 5126 + }, + { + "epoch": 0.78, + "grad_norm": 3.339466964529913, + "learning_rate": 2.3324787859712462e-06, + "loss": 0.8604, + "step": 5127 + }, + { + "epoch": 0.78, + "grad_norm": 2.7521085637109732, + "learning_rate": 2.3292972692572833e-06, + "loss": 0.7874, + "step": 5128 + }, + { + "epoch": 0.79, + "grad_norm": 2.7279640075919307, + "learning_rate": 2.3261176377869165e-06, + "loss": 0.7593, + "step": 5129 + }, + { + "epoch": 0.79, + "grad_norm": 2.598691379497955, + "learning_rate": 2.3229398923416136e-06, + "loss": 0.7745, + "step": 5130 + }, + { + "epoch": 0.79, + "grad_norm": 2.519874664868604, + "learning_rate": 2.319764033702375e-06, + "loss": 0.8348, + "step": 5131 + }, + { + "epoch": 0.79, + "grad_norm": 2.601307397164149, + "learning_rate": 2.316590062649734e-06, + "loss": 0.7321, + "step": 5132 + }, + { + "epoch": 0.79, + "grad_norm": 2.4969779020558853, + "learning_rate": 2.313417979963768e-06, + "loss": 0.728, + "step": 5133 + }, + { + "epoch": 0.79, + "grad_norm": 2.650113640353963, + "learning_rate": 2.310247786424086e-06, + "loss": 0.7826, + "step": 5134 + }, + { + "epoch": 0.79, + "grad_norm": 2.5214125575947963, + "learning_rate": 2.3070794828098285e-06, + "loss": 0.7338, + "step": 5135 + }, + { + "epoch": 0.79, + "grad_norm": 2.9428606849207837, + "learning_rate": 2.3039130698996802e-06, + "loss": 0.8268, + "step": 5136 + }, + { + "epoch": 0.79, + "grad_norm": 2.6185692051050182, + "learning_rate": 2.3007485484718505e-06, + "loss": 0.6381, + "step": 5137 + }, + { + "epoch": 0.79, + "grad_norm": 2.6834657323433992, + "learning_rate": 2.297585919304097e-06, + "loss": 0.7596, + "step": 5138 + }, + { + "epoch": 0.79, + "grad_norm": 2.781398509030959, + "learning_rate": 2.294425183173703e-06, + "loss": 0.7946, + "step": 5139 + }, + { + "epoch": 0.79, + "grad_norm": 2.561953880258535, + "learning_rate": 2.2912663408574843e-06, + "loss": 0.787, + "step": 5140 + }, + { + "epoch": 0.79, + "grad_norm": 2.618970065482725, + "learning_rate": 2.288109393131802e-06, + "loss": 0.7402, + "step": 5141 + }, + { + "epoch": 0.79, + "grad_norm": 2.428394606083279, + "learning_rate": 2.2849543407725403e-06, + "loss": 0.6512, + "step": 5142 + }, + { + "epoch": 0.79, + "grad_norm": 2.8219441478736917, + "learning_rate": 2.2818011845551293e-06, + "loss": 0.8186, + "step": 5143 + }, + { + "epoch": 0.79, + "grad_norm": 2.5187641567693984, + "learning_rate": 2.27864992525452e-06, + "loss": 0.7891, + "step": 5144 + }, + { + "epoch": 0.79, + "grad_norm": 2.59796954673457, + "learning_rate": 2.2755005636452067e-06, + "loss": 0.7758, + "step": 5145 + }, + { + "epoch": 0.79, + "grad_norm": 2.652727950464101, + "learning_rate": 2.2723531005012133e-06, + "loss": 0.8253, + "step": 5146 + }, + { + "epoch": 0.79, + "grad_norm": 2.7847114123727943, + "learning_rate": 2.2692075365960974e-06, + "loss": 0.8096, + "step": 5147 + }, + { + "epoch": 0.79, + "grad_norm": 2.692857244476947, + "learning_rate": 2.2660638727029484e-06, + "loss": 0.8111, + "step": 5148 + }, + { + "epoch": 0.79, + "grad_norm": 2.879953323799714, + "learning_rate": 2.2629221095943952e-06, + "loss": 0.7733, + "step": 5149 + }, + { + "epoch": 0.79, + "grad_norm": 2.710345885860965, + "learning_rate": 2.2597822480425934e-06, + "loss": 0.7381, + "step": 5150 + }, + { + "epoch": 0.79, + "grad_norm": 2.532688579438755, + "learning_rate": 2.2566442888192306e-06, + "loss": 0.8801, + "step": 5151 + }, + { + "epoch": 0.79, + "grad_norm": 2.6739871284912713, + "learning_rate": 2.2535082326955325e-06, + "loss": 0.7638, + "step": 5152 + }, + { + "epoch": 0.79, + "grad_norm": 2.6604332292354353, + "learning_rate": 2.2503740804422504e-06, + "loss": 0.7703, + "step": 5153 + }, + { + "epoch": 0.79, + "grad_norm": 3.1129370913861996, + "learning_rate": 2.247241832829675e-06, + "loss": 0.81, + "step": 5154 + }, + { + "epoch": 0.79, + "grad_norm": 2.777618927551345, + "learning_rate": 2.244111490627623e-06, + "loss": 0.8235, + "step": 5155 + }, + { + "epoch": 0.79, + "grad_norm": 2.6995224648073917, + "learning_rate": 2.2409830546054456e-06, + "loss": 0.6747, + "step": 5156 + }, + { + "epoch": 0.79, + "grad_norm": 2.670008968578099, + "learning_rate": 2.2378565255320226e-06, + "loss": 0.8097, + "step": 5157 + }, + { + "epoch": 0.79, + "grad_norm": 2.7697821647266103, + "learning_rate": 2.2347319041757675e-06, + "loss": 0.8285, + "step": 5158 + }, + { + "epoch": 0.79, + "grad_norm": 2.749625518855146, + "learning_rate": 2.2316091913046235e-06, + "loss": 0.713, + "step": 5159 + }, + { + "epoch": 0.79, + "grad_norm": 2.594875868036443, + "learning_rate": 2.2284883876860707e-06, + "loss": 0.7735, + "step": 5160 + }, + { + "epoch": 0.79, + "grad_norm": 2.527292720733303, + "learning_rate": 2.2253694940871106e-06, + "loss": 0.7583, + "step": 5161 + }, + { + "epoch": 0.79, + "grad_norm": 3.417019669741224, + "learning_rate": 2.22225251127428e-06, + "loss": 0.7943, + "step": 5162 + }, + { + "epoch": 0.79, + "grad_norm": 2.642528698366732, + "learning_rate": 2.219137440013649e-06, + "loss": 0.8459, + "step": 5163 + }, + { + "epoch": 0.79, + "grad_norm": 2.5632337289614746, + "learning_rate": 2.2160242810708098e-06, + "loss": 0.8122, + "step": 5164 + }, + { + "epoch": 0.79, + "grad_norm": 2.5843935751636216, + "learning_rate": 2.212913035210895e-06, + "loss": 0.761, + "step": 5165 + }, + { + "epoch": 0.79, + "grad_norm": 2.6664617878227626, + "learning_rate": 2.2098037031985598e-06, + "loss": 0.6748, + "step": 5166 + }, + { + "epoch": 0.79, + "grad_norm": 2.5021842064630033, + "learning_rate": 2.20669628579799e-06, + "loss": 0.7371, + "step": 5167 + }, + { + "epoch": 0.79, + "grad_norm": 2.6612014852259103, + "learning_rate": 2.203590783772902e-06, + "loss": 0.7361, + "step": 5168 + }, + { + "epoch": 0.79, + "grad_norm": 2.562022582459715, + "learning_rate": 2.2004871978865407e-06, + "loss": 0.7548, + "step": 5169 + }, + { + "epoch": 0.79, + "grad_norm": 2.8162698580900045, + "learning_rate": 2.1973855289016767e-06, + "loss": 0.8065, + "step": 5170 + }, + { + "epoch": 0.79, + "grad_norm": 2.667831941143919, + "learning_rate": 2.1942857775806215e-06, + "loss": 0.827, + "step": 5171 + }, + { + "epoch": 0.79, + "grad_norm": 2.4825779833500237, + "learning_rate": 2.1911879446852016e-06, + "loss": 0.7071, + "step": 5172 + }, + { + "epoch": 0.79, + "grad_norm": 2.874618733898814, + "learning_rate": 2.1880920309767763e-06, + "loss": 0.723, + "step": 5173 + }, + { + "epoch": 0.79, + "grad_norm": 2.6996800389012137, + "learning_rate": 2.1849980372162393e-06, + "loss": 0.7636, + "step": 5174 + }, + { + "epoch": 0.79, + "grad_norm": 2.922952745301586, + "learning_rate": 2.1819059641640015e-06, + "loss": 0.7544, + "step": 5175 + }, + { + "epoch": 0.79, + "grad_norm": 2.4509868820621143, + "learning_rate": 2.178815812580015e-06, + "loss": 0.8044, + "step": 5176 + }, + { + "epoch": 0.79, + "grad_norm": 2.7097921123798425, + "learning_rate": 2.175727583223748e-06, + "loss": 0.7816, + "step": 5177 + }, + { + "epoch": 0.79, + "grad_norm": 2.7726244992765974, + "learning_rate": 2.1726412768541984e-06, + "loss": 0.8708, + "step": 5178 + }, + { + "epoch": 0.79, + "grad_norm": 2.646471864402857, + "learning_rate": 2.1695568942298984e-06, + "loss": 0.7968, + "step": 5179 + }, + { + "epoch": 0.79, + "grad_norm": 2.7006362811759965, + "learning_rate": 2.1664744361089042e-06, + "loss": 0.7764, + "step": 5180 + }, + { + "epoch": 0.79, + "grad_norm": 2.4489036380967026, + "learning_rate": 2.1633939032487883e-06, + "loss": 0.7578, + "step": 5181 + }, + { + "epoch": 0.79, + "grad_norm": 2.8565073056265224, + "learning_rate": 2.160315296406669e-06, + "loss": 0.7507, + "step": 5182 + }, + { + "epoch": 0.79, + "grad_norm": 3.033558611119768, + "learning_rate": 2.1572386163391767e-06, + "loss": 0.8208, + "step": 5183 + }, + { + "epoch": 0.79, + "grad_norm": 2.5760594105590156, + "learning_rate": 2.154163863802471e-06, + "loss": 0.7048, + "step": 5184 + }, + { + "epoch": 0.79, + "grad_norm": 2.572431198217354, + "learning_rate": 2.1510910395522454e-06, + "loss": 0.7405, + "step": 5185 + }, + { + "epoch": 0.79, + "grad_norm": 2.68710261224045, + "learning_rate": 2.1480201443437097e-06, + "loss": 0.7985, + "step": 5186 + }, + { + "epoch": 0.79, + "grad_norm": 2.7139550604337064, + "learning_rate": 2.144951178931608e-06, + "loss": 0.7096, + "step": 5187 + }, + { + "epoch": 0.79, + "grad_norm": 2.853358959872608, + "learning_rate": 2.1418841440702032e-06, + "loss": 0.8113, + "step": 5188 + }, + { + "epoch": 0.79, + "grad_norm": 2.6227026638339734, + "learning_rate": 2.1388190405132835e-06, + "loss": 0.7304, + "step": 5189 + }, + { + "epoch": 0.79, + "grad_norm": 2.755493062706885, + "learning_rate": 2.135755869014171e-06, + "loss": 0.8311, + "step": 5190 + }, + { + "epoch": 0.79, + "grad_norm": 3.007649818068204, + "learning_rate": 2.1326946303257055e-06, + "loss": 0.7688, + "step": 5191 + }, + { + "epoch": 0.79, + "grad_norm": 3.3892465626955914, + "learning_rate": 2.1296353252002535e-06, + "loss": 0.8222, + "step": 5192 + }, + { + "epoch": 0.79, + "grad_norm": 2.8379742097250498, + "learning_rate": 2.126577954389706e-06, + "loss": 0.6913, + "step": 5193 + }, + { + "epoch": 0.8, + "grad_norm": 2.603497319834362, + "learning_rate": 2.123522518645478e-06, + "loss": 0.8435, + "step": 5194 + }, + { + "epoch": 0.8, + "grad_norm": 2.653831764050139, + "learning_rate": 2.1204690187185083e-06, + "loss": 0.8059, + "step": 5195 + }, + { + "epoch": 0.8, + "grad_norm": 2.826818041431396, + "learning_rate": 2.1174174553592662e-06, + "loss": 0.8316, + "step": 5196 + }, + { + "epoch": 0.8, + "grad_norm": 2.7569910967979547, + "learning_rate": 2.114367829317737e-06, + "loss": 0.7594, + "step": 5197 + }, + { + "epoch": 0.8, + "grad_norm": 2.6456699660607916, + "learning_rate": 2.111320141343437e-06, + "loss": 0.7005, + "step": 5198 + }, + { + "epoch": 0.8, + "grad_norm": 2.6894829901775865, + "learning_rate": 2.1082743921853986e-06, + "loss": 0.8221, + "step": 5199 + }, + { + "epoch": 0.8, + "grad_norm": 2.4653459998874787, + "learning_rate": 2.1052305825921814e-06, + "loss": 0.7755, + "step": 5200 + }, + { + "epoch": 0.8, + "grad_norm": 2.3475070217631435, + "learning_rate": 2.1021887133118724e-06, + "loss": 0.7037, + "step": 5201 + }, + { + "epoch": 0.8, + "grad_norm": 2.5835424412431998, + "learning_rate": 2.0991487850920744e-06, + "loss": 0.8099, + "step": 5202 + }, + { + "epoch": 0.8, + "grad_norm": 2.529540829600717, + "learning_rate": 2.0961107986799177e-06, + "loss": 0.7818, + "step": 5203 + }, + { + "epoch": 0.8, + "grad_norm": 2.5303647554019335, + "learning_rate": 2.0930747548220544e-06, + "loss": 0.8147, + "step": 5204 + }, + { + "epoch": 0.8, + "grad_norm": 2.680295338295072, + "learning_rate": 2.0900406542646578e-06, + "loss": 0.7119, + "step": 5205 + }, + { + "epoch": 0.8, + "grad_norm": 2.4917329323772948, + "learning_rate": 2.0870084977534234e-06, + "loss": 0.709, + "step": 5206 + }, + { + "epoch": 0.8, + "grad_norm": 2.680142357862478, + "learning_rate": 2.0839782860335744e-06, + "loss": 0.7774, + "step": 5207 + }, + { + "epoch": 0.8, + "grad_norm": 2.7202509491980726, + "learning_rate": 2.0809500198498465e-06, + "loss": 0.7406, + "step": 5208 + }, + { + "epoch": 0.8, + "grad_norm": 2.538821119730595, + "learning_rate": 2.077923699946508e-06, + "loss": 0.6594, + "step": 5209 + }, + { + "epoch": 0.8, + "grad_norm": 2.696771041993024, + "learning_rate": 2.0748993270673413e-06, + "loss": 0.6496, + "step": 5210 + }, + { + "epoch": 0.8, + "grad_norm": 2.9231310393079006, + "learning_rate": 2.0718769019556497e-06, + "loss": 0.7194, + "step": 5211 + }, + { + "epoch": 0.8, + "grad_norm": 2.4163140505938148, + "learning_rate": 2.0688564253542665e-06, + "loss": 0.7744, + "step": 5212 + }, + { + "epoch": 0.8, + "grad_norm": 2.4284110144498947, + "learning_rate": 2.065837898005536e-06, + "loss": 0.7628, + "step": 5213 + }, + { + "epoch": 0.8, + "grad_norm": 2.564184491525239, + "learning_rate": 2.0628213206513283e-06, + "loss": 0.7292, + "step": 5214 + }, + { + "epoch": 0.8, + "grad_norm": 2.5657572703609413, + "learning_rate": 2.059806694033033e-06, + "loss": 0.7039, + "step": 5215 + }, + { + "epoch": 0.8, + "grad_norm": 2.61890696078291, + "learning_rate": 2.056794018891559e-06, + "loss": 0.7752, + "step": 5216 + }, + { + "epoch": 0.8, + "grad_norm": 2.841991160663711, + "learning_rate": 2.053783295967342e-06, + "loss": 0.8468, + "step": 5217 + }, + { + "epoch": 0.8, + "grad_norm": 2.632280940282483, + "learning_rate": 2.050774526000331e-06, + "loss": 0.7935, + "step": 5218 + }, + { + "epoch": 0.8, + "grad_norm": 2.7062111362509427, + "learning_rate": 2.0477677097299944e-06, + "loss": 0.8167, + "step": 5219 + }, + { + "epoch": 0.8, + "grad_norm": 2.6616929561759517, + "learning_rate": 2.0447628478953285e-06, + "loss": 0.8678, + "step": 5220 + }, + { + "epoch": 0.8, + "grad_norm": 2.6295559456205364, + "learning_rate": 2.041759941234842e-06, + "loss": 0.7249, + "step": 5221 + }, + { + "epoch": 0.8, + "grad_norm": 2.854155984098713, + "learning_rate": 2.0387589904865624e-06, + "loss": 0.8524, + "step": 5222 + }, + { + "epoch": 0.8, + "grad_norm": 2.915882148566005, + "learning_rate": 2.035759996388044e-06, + "loss": 0.8829, + "step": 5223 + }, + { + "epoch": 0.8, + "grad_norm": 2.5580168750836223, + "learning_rate": 2.0327629596763522e-06, + "loss": 0.7832, + "step": 5224 + }, + { + "epoch": 0.8, + "grad_norm": 2.667148793752107, + "learning_rate": 2.029767881088076e-06, + "loss": 0.7752, + "step": 5225 + }, + { + "epoch": 0.8, + "grad_norm": 2.485601095948484, + "learning_rate": 2.0267747613593216e-06, + "loss": 0.7068, + "step": 5226 + }, + { + "epoch": 0.8, + "grad_norm": 3.3539907143857755, + "learning_rate": 2.0237836012257094e-06, + "loss": 0.851, + "step": 5227 + }, + { + "epoch": 0.8, + "grad_norm": 3.207716113325043, + "learning_rate": 2.02079440142239e-06, + "loss": 0.7128, + "step": 5228 + }, + { + "epoch": 0.8, + "grad_norm": 2.6476668136581853, + "learning_rate": 2.0178071626840222e-06, + "loss": 0.7198, + "step": 5229 + }, + { + "epoch": 0.8, + "grad_norm": 2.524164666724541, + "learning_rate": 2.014821885744782e-06, + "loss": 0.8026, + "step": 5230 + }, + { + "epoch": 0.8, + "grad_norm": 2.433542763429434, + "learning_rate": 2.0118385713383717e-06, + "loss": 0.6877, + "step": 5231 + }, + { + "epoch": 0.8, + "grad_norm": 2.7388708012854384, + "learning_rate": 2.008857220198004e-06, + "loss": 0.8663, + "step": 5232 + }, + { + "epoch": 0.8, + "grad_norm": 2.546278045195248, + "learning_rate": 2.0058778330564087e-06, + "loss": 0.7761, + "step": 5233 + }, + { + "epoch": 0.8, + "grad_norm": 2.5100886555285506, + "learning_rate": 2.002900410645842e-06, + "loss": 0.745, + "step": 5234 + }, + { + "epoch": 0.8, + "grad_norm": 2.832589902344868, + "learning_rate": 1.999924953698067e-06, + "loss": 0.7755, + "step": 5235 + }, + { + "epoch": 0.8, + "grad_norm": 2.7081162850341634, + "learning_rate": 1.9969514629443676e-06, + "loss": 0.815, + "step": 5236 + }, + { + "epoch": 0.8, + "grad_norm": 2.612861915456119, + "learning_rate": 1.9939799391155447e-06, + "loss": 0.7633, + "step": 5237 + }, + { + "epoch": 0.8, + "grad_norm": 2.968076668129109, + "learning_rate": 1.9910103829419136e-06, + "loss": 0.7578, + "step": 5238 + }, + { + "epoch": 0.8, + "grad_norm": 2.7880471818672863, + "learning_rate": 1.9880427951533123e-06, + "loss": 0.7828, + "step": 5239 + }, + { + "epoch": 0.8, + "grad_norm": 2.5007751028079, + "learning_rate": 1.9850771764790866e-06, + "loss": 0.7722, + "step": 5240 + }, + { + "epoch": 0.8, + "grad_norm": 2.7926472560728772, + "learning_rate": 1.982113527648103e-06, + "loss": 0.7983, + "step": 5241 + }, + { + "epoch": 0.8, + "grad_norm": 2.688729300603576, + "learning_rate": 1.9791518493887464e-06, + "loss": 0.7109, + "step": 5242 + }, + { + "epoch": 0.8, + "grad_norm": 2.526455965144725, + "learning_rate": 1.976192142428912e-06, + "loss": 0.7415, + "step": 5243 + }, + { + "epoch": 0.8, + "grad_norm": 2.361831127463669, + "learning_rate": 1.9732344074960106e-06, + "loss": 0.6652, + "step": 5244 + }, + { + "epoch": 0.8, + "grad_norm": 2.8803546372423576, + "learning_rate": 1.9702786453169753e-06, + "loss": 0.7467, + "step": 5245 + }, + { + "epoch": 0.8, + "grad_norm": 2.606351929515565, + "learning_rate": 1.967324856618247e-06, + "loss": 0.7258, + "step": 5246 + }, + { + "epoch": 0.8, + "grad_norm": 2.5139030510363156, + "learning_rate": 1.9643730421257836e-06, + "loss": 0.714, + "step": 5247 + }, + { + "epoch": 0.8, + "grad_norm": 2.561178712202148, + "learning_rate": 1.961423202565059e-06, + "loss": 0.6892, + "step": 5248 + }, + { + "epoch": 0.8, + "grad_norm": 2.6508261197310112, + "learning_rate": 1.958475338661059e-06, + "loss": 0.6926, + "step": 5249 + }, + { + "epoch": 0.8, + "grad_norm": 3.1385624324100405, + "learning_rate": 1.9555294511382895e-06, + "loss": 0.8052, + "step": 5250 + }, + { + "epoch": 0.8, + "grad_norm": 2.5151851416799134, + "learning_rate": 1.952585540720765e-06, + "loss": 0.7427, + "step": 5251 + }, + { + "epoch": 0.8, + "grad_norm": 2.7096528444167545, + "learning_rate": 1.949643608132015e-06, + "loss": 0.854, + "step": 5252 + }, + { + "epoch": 0.8, + "grad_norm": 2.6073748284771137, + "learning_rate": 1.946703654095087e-06, + "loss": 0.7681, + "step": 5253 + }, + { + "epoch": 0.8, + "grad_norm": 2.647365142933058, + "learning_rate": 1.943765679332539e-06, + "loss": 0.7355, + "step": 5254 + }, + { + "epoch": 0.8, + "grad_norm": 2.554651512314461, + "learning_rate": 1.9408296845664374e-06, + "loss": 0.7792, + "step": 5255 + }, + { + "epoch": 0.8, + "grad_norm": 2.7749366497128714, + "learning_rate": 1.937895670518374e-06, + "loss": 0.7521, + "step": 5256 + }, + { + "epoch": 0.8, + "grad_norm": 2.6113040123040587, + "learning_rate": 1.934963637909445e-06, + "loss": 0.7682, + "step": 5257 + }, + { + "epoch": 0.8, + "grad_norm": 2.68125144170482, + "learning_rate": 1.9320335874602615e-06, + "loss": 0.6771, + "step": 5258 + }, + { + "epoch": 0.8, + "grad_norm": 2.79670004343362, + "learning_rate": 1.929105519890948e-06, + "loss": 0.7837, + "step": 5259 + }, + { + "epoch": 0.81, + "grad_norm": 2.858531857582643, + "learning_rate": 1.9261794359211385e-06, + "loss": 0.797, + "step": 5260 + }, + { + "epoch": 0.81, + "grad_norm": 2.601601436421406, + "learning_rate": 1.9232553362699867e-06, + "loss": 0.8745, + "step": 5261 + }, + { + "epoch": 0.81, + "grad_norm": 2.7155178948988015, + "learning_rate": 1.920333221656152e-06, + "loss": 0.7517, + "step": 5262 + }, + { + "epoch": 0.81, + "grad_norm": 2.71136245899951, + "learning_rate": 1.9174130927978073e-06, + "loss": 0.8722, + "step": 5263 + }, + { + "epoch": 0.81, + "grad_norm": 2.6266359171360882, + "learning_rate": 1.914494950412642e-06, + "loss": 0.769, + "step": 5264 + }, + { + "epoch": 0.81, + "grad_norm": 2.676658527106433, + "learning_rate": 1.9115787952178513e-06, + "loss": 0.8158, + "step": 5265 + }, + { + "epoch": 0.81, + "grad_norm": 2.7060334105432813, + "learning_rate": 1.9086646279301414e-06, + "loss": 0.7251, + "step": 5266 + }, + { + "epoch": 0.81, + "grad_norm": 2.6210489060040496, + "learning_rate": 1.9057524492657386e-06, + "loss": 0.7473, + "step": 5267 + }, + { + "epoch": 0.81, + "grad_norm": 2.662061165888899, + "learning_rate": 1.902842259940373e-06, + "loss": 0.88, + "step": 5268 + }, + { + "epoch": 0.81, + "grad_norm": 2.5082720864633803, + "learning_rate": 1.899934060669285e-06, + "loss": 0.6801, + "step": 5269 + }, + { + "epoch": 0.81, + "grad_norm": 3.0120117717607733, + "learning_rate": 1.8970278521672314e-06, + "loss": 0.7691, + "step": 5270 + }, + { + "epoch": 0.81, + "grad_norm": 2.4061591232928166, + "learning_rate": 1.8941236351484727e-06, + "loss": 0.632, + "step": 5271 + }, + { + "epoch": 0.81, + "grad_norm": 2.5620260182509225, + "learning_rate": 1.891221410326789e-06, + "loss": 0.826, + "step": 5272 + }, + { + "epoch": 0.81, + "grad_norm": 2.6932349967864324, + "learning_rate": 1.888321178415463e-06, + "loss": 0.7042, + "step": 5273 + }, + { + "epoch": 0.81, + "grad_norm": 2.5244052706215725, + "learning_rate": 1.8854229401272883e-06, + "loss": 0.766, + "step": 5274 + }, + { + "epoch": 0.81, + "grad_norm": 2.7826439118860393, + "learning_rate": 1.8825266961745759e-06, + "loss": 0.7974, + "step": 5275 + }, + { + "epoch": 0.81, + "grad_norm": 2.813590114997112, + "learning_rate": 1.8796324472691386e-06, + "loss": 0.7621, + "step": 5276 + }, + { + "epoch": 0.81, + "grad_norm": 2.4679542676635147, + "learning_rate": 1.8767401941222996e-06, + "loss": 0.7581, + "step": 5277 + }, + { + "epoch": 0.81, + "grad_norm": 2.6626177268115923, + "learning_rate": 1.873849937444897e-06, + "loss": 0.6859, + "step": 5278 + }, + { + "epoch": 0.81, + "grad_norm": 2.514550595473596, + "learning_rate": 1.870961677947274e-06, + "loss": 0.7612, + "step": 5279 + }, + { + "epoch": 0.81, + "grad_norm": 2.429325244420133, + "learning_rate": 1.8680754163392821e-06, + "loss": 0.8087, + "step": 5280 + }, + { + "epoch": 0.81, + "grad_norm": 2.78224370282897, + "learning_rate": 1.8651911533302858e-06, + "loss": 0.7911, + "step": 5281 + }, + { + "epoch": 0.81, + "grad_norm": 2.6394580071077534, + "learning_rate": 1.8623088896291508e-06, + "loss": 0.7007, + "step": 5282 + }, + { + "epoch": 0.81, + "grad_norm": 3.08473638196854, + "learning_rate": 1.859428625944264e-06, + "loss": 0.8031, + "step": 5283 + }, + { + "epoch": 0.81, + "grad_norm": 2.6230923065312197, + "learning_rate": 1.8565503629835102e-06, + "loss": 0.7139, + "step": 5284 + }, + { + "epoch": 0.81, + "grad_norm": 2.484455187154463, + "learning_rate": 1.8536741014542825e-06, + "loss": 0.7813, + "step": 5285 + }, + { + "epoch": 0.81, + "grad_norm": 2.9025005355173255, + "learning_rate": 1.850799842063492e-06, + "loss": 0.8687, + "step": 5286 + }, + { + "epoch": 0.81, + "grad_norm": 2.7855494735084583, + "learning_rate": 1.847927585517546e-06, + "loss": 0.8703, + "step": 5287 + }, + { + "epoch": 0.81, + "grad_norm": 2.7774861500730554, + "learning_rate": 1.845057332522364e-06, + "loss": 0.7098, + "step": 5288 + }, + { + "epoch": 0.81, + "grad_norm": 2.5571608259583045, + "learning_rate": 1.8421890837833789e-06, + "loss": 0.7756, + "step": 5289 + }, + { + "epoch": 0.81, + "grad_norm": 2.6508095224438284, + "learning_rate": 1.8393228400055197e-06, + "loss": 0.7547, + "step": 5290 + }, + { + "epoch": 0.81, + "grad_norm": 2.4331685506681624, + "learning_rate": 1.836458601893234e-06, + "loss": 0.6196, + "step": 5291 + }, + { + "epoch": 0.81, + "grad_norm": 2.8057215466078, + "learning_rate": 1.833596370150469e-06, + "loss": 0.8488, + "step": 5292 + }, + { + "epoch": 0.81, + "grad_norm": 2.8768433031219436, + "learning_rate": 1.8307361454806815e-06, + "loss": 0.8106, + "step": 5293 + }, + { + "epoch": 0.81, + "grad_norm": 2.6425605653461894, + "learning_rate": 1.8278779285868332e-06, + "loss": 0.8511, + "step": 5294 + }, + { + "epoch": 0.81, + "grad_norm": 3.2196702056179816, + "learning_rate": 1.8250217201713938e-06, + "loss": 0.7947, + "step": 5295 + }, + { + "epoch": 0.81, + "grad_norm": 2.6762413209747815, + "learning_rate": 1.8221675209363376e-06, + "loss": 0.7019, + "step": 5296 + }, + { + "epoch": 0.81, + "grad_norm": 3.218514063148928, + "learning_rate": 1.8193153315831514e-06, + "loss": 0.7803, + "step": 5297 + }, + { + "epoch": 0.81, + "grad_norm": 2.7580589992515314, + "learning_rate": 1.8164651528128208e-06, + "loss": 0.7302, + "step": 5298 + }, + { + "epoch": 0.81, + "grad_norm": 2.738998097998686, + "learning_rate": 1.8136169853258379e-06, + "loss": 0.7869, + "step": 5299 + }, + { + "epoch": 0.81, + "grad_norm": 2.5623291055732937, + "learning_rate": 1.810770829822206e-06, + "loss": 0.776, + "step": 5300 + }, + { + "epoch": 0.81, + "grad_norm": 3.544255059308854, + "learning_rate": 1.8079266870014266e-06, + "loss": 0.8067, + "step": 5301 + }, + { + "epoch": 0.81, + "grad_norm": 2.6241190831600134, + "learning_rate": 1.8050845575625142e-06, + "loss": 0.6629, + "step": 5302 + }, + { + "epoch": 0.81, + "grad_norm": 2.8966106388304853, + "learning_rate": 1.802244442203983e-06, + "loss": 0.8091, + "step": 5303 + }, + { + "epoch": 0.81, + "grad_norm": 2.536447184938207, + "learning_rate": 1.7994063416238528e-06, + "loss": 0.7255, + "step": 5304 + }, + { + "epoch": 0.81, + "grad_norm": 2.628368067949291, + "learning_rate": 1.7965702565196496e-06, + "loss": 0.7316, + "step": 5305 + }, + { + "epoch": 0.81, + "grad_norm": 2.7213475845824426, + "learning_rate": 1.793736187588404e-06, + "loss": 0.8211, + "step": 5306 + }, + { + "epoch": 0.81, + "grad_norm": 2.5743627911226468, + "learning_rate": 1.7909041355266465e-06, + "loss": 0.7483, + "step": 5307 + }, + { + "epoch": 0.81, + "grad_norm": 2.6514697681954487, + "learning_rate": 1.7880741010304236e-06, + "loss": 0.8514, + "step": 5308 + }, + { + "epoch": 0.81, + "grad_norm": 2.6117066555165516, + "learning_rate": 1.7852460847952745e-06, + "loss": 0.7341, + "step": 5309 + }, + { + "epoch": 0.81, + "grad_norm": 2.7937497971549448, + "learning_rate": 1.7824200875162435e-06, + "loss": 0.8227, + "step": 5310 + }, + { + "epoch": 0.81, + "grad_norm": 2.6601227382781683, + "learning_rate": 1.7795961098878867e-06, + "loss": 0.6883, + "step": 5311 + }, + { + "epoch": 0.81, + "grad_norm": 2.555270181944585, + "learning_rate": 1.7767741526042537e-06, + "loss": 0.7181, + "step": 5312 + }, + { + "epoch": 0.81, + "grad_norm": 2.505665661355078, + "learning_rate": 1.7739542163589074e-06, + "loss": 0.7687, + "step": 5313 + }, + { + "epoch": 0.81, + "grad_norm": 2.898396487815443, + "learning_rate": 1.7711363018449068e-06, + "loss": 0.8078, + "step": 5314 + }, + { + "epoch": 0.81, + "grad_norm": 3.3112582709990637, + "learning_rate": 1.7683204097548157e-06, + "loss": 0.8103, + "step": 5315 + }, + { + "epoch": 0.81, + "grad_norm": 2.6724151435389834, + "learning_rate": 1.7655065407807025e-06, + "loss": 0.7417, + "step": 5316 + }, + { + "epoch": 0.81, + "grad_norm": 2.6001799710561384, + "learning_rate": 1.7626946956141355e-06, + "loss": 0.8023, + "step": 5317 + }, + { + "epoch": 0.81, + "grad_norm": 2.5608651624482253, + "learning_rate": 1.759884874946187e-06, + "loss": 0.7915, + "step": 5318 + }, + { + "epoch": 0.81, + "grad_norm": 3.4450320366094944, + "learning_rate": 1.7570770794674352e-06, + "loss": 0.8027, + "step": 5319 + }, + { + "epoch": 0.81, + "grad_norm": 2.7623894648539227, + "learning_rate": 1.7542713098679564e-06, + "loss": 0.7392, + "step": 5320 + }, + { + "epoch": 0.81, + "grad_norm": 2.7112253465641376, + "learning_rate": 1.7514675668373272e-06, + "loss": 0.7352, + "step": 5321 + }, + { + "epoch": 0.81, + "grad_norm": 2.462287508288741, + "learning_rate": 1.7486658510646337e-06, + "loss": 0.8257, + "step": 5322 + }, + { + "epoch": 0.81, + "grad_norm": 2.3514294846209958, + "learning_rate": 1.7458661632384532e-06, + "loss": 0.7096, + "step": 5323 + }, + { + "epoch": 0.81, + "grad_norm": 2.6405501271234275, + "learning_rate": 1.7430685040468775e-06, + "loss": 0.755, + "step": 5324 + }, + { + "epoch": 0.82, + "grad_norm": 2.8005781186999488, + "learning_rate": 1.7402728741774887e-06, + "loss": 0.681, + "step": 5325 + }, + { + "epoch": 0.82, + "grad_norm": 2.7722066542218786, + "learning_rate": 1.737479274317375e-06, + "loss": 0.7788, + "step": 5326 + }, + { + "epoch": 0.82, + "grad_norm": 2.5659134830313954, + "learning_rate": 1.7346877051531241e-06, + "loss": 0.7842, + "step": 5327 + }, + { + "epoch": 0.82, + "grad_norm": 2.438173108512466, + "learning_rate": 1.731898167370827e-06, + "loss": 0.8067, + "step": 5328 + }, + { + "epoch": 0.82, + "grad_norm": 2.5327512937131007, + "learning_rate": 1.7291106616560693e-06, + "loss": 0.7078, + "step": 5329 + }, + { + "epoch": 0.82, + "grad_norm": 2.836323440661082, + "learning_rate": 1.726325188693948e-06, + "loss": 0.7786, + "step": 5330 + }, + { + "epoch": 0.82, + "grad_norm": 2.800206264294481, + "learning_rate": 1.7235417491690509e-06, + "loss": 0.8309, + "step": 5331 + }, + { + "epoch": 0.82, + "grad_norm": 2.5362830414927586, + "learning_rate": 1.7207603437654674e-06, + "loss": 0.6841, + "step": 5332 + }, + { + "epoch": 0.82, + "grad_norm": 2.426639483725904, + "learning_rate": 1.7179809731667952e-06, + "loss": 0.7774, + "step": 5333 + }, + { + "epoch": 0.82, + "grad_norm": 2.562833126398352, + "learning_rate": 1.7152036380561176e-06, + "loss": 0.7171, + "step": 5334 + }, + { + "epoch": 0.82, + "grad_norm": 2.7560900086185023, + "learning_rate": 1.7124283391160335e-06, + "loss": 0.7865, + "step": 5335 + }, + { + "epoch": 0.82, + "grad_norm": 2.6461379808218592, + "learning_rate": 1.7096550770286302e-06, + "loss": 0.781, + "step": 5336 + }, + { + "epoch": 0.82, + "grad_norm": 2.56642023701555, + "learning_rate": 1.7068838524754961e-06, + "loss": 0.7718, + "step": 5337 + }, + { + "epoch": 0.82, + "grad_norm": 2.5198940553270335, + "learning_rate": 1.7041146661377229e-06, + "loss": 0.8147, + "step": 5338 + }, + { + "epoch": 0.82, + "grad_norm": 3.2246821208943555, + "learning_rate": 1.7013475186958983e-06, + "loss": 0.7905, + "step": 5339 + }, + { + "epoch": 0.82, + "grad_norm": 2.488770000107356, + "learning_rate": 1.6985824108301063e-06, + "loss": 0.7767, + "step": 5340 + }, + { + "epoch": 0.82, + "grad_norm": 2.579072452946949, + "learning_rate": 1.6958193432199377e-06, + "loss": 0.714, + "step": 5341 + }, + { + "epoch": 0.82, + "grad_norm": 2.636564346699858, + "learning_rate": 1.6930583165444759e-06, + "loss": 0.8383, + "step": 5342 + }, + { + "epoch": 0.82, + "grad_norm": 2.763894488128038, + "learning_rate": 1.6902993314823003e-06, + "loss": 0.8249, + "step": 5343 + }, + { + "epoch": 0.82, + "grad_norm": 2.841520959324338, + "learning_rate": 1.6875423887114973e-06, + "loss": 0.7667, + "step": 5344 + }, + { + "epoch": 0.82, + "grad_norm": 2.721729548909902, + "learning_rate": 1.6847874889096404e-06, + "loss": 0.8535, + "step": 5345 + }, + { + "epoch": 0.82, + "grad_norm": 2.353445242020036, + "learning_rate": 1.682034632753813e-06, + "loss": 0.7144, + "step": 5346 + }, + { + "epoch": 0.82, + "grad_norm": 2.86263924929456, + "learning_rate": 1.6792838209205865e-06, + "loss": 0.7944, + "step": 5347 + }, + { + "epoch": 0.82, + "grad_norm": 2.6271226482000403, + "learning_rate": 1.6765350540860336e-06, + "loss": 0.8035, + "step": 5348 + }, + { + "epoch": 0.82, + "grad_norm": 2.7871625156641815, + "learning_rate": 1.6737883329257242e-06, + "loss": 0.7978, + "step": 5349 + }, + { + "epoch": 0.82, + "grad_norm": 2.7585265680058004, + "learning_rate": 1.6710436581147248e-06, + "loss": 0.7069, + "step": 5350 + }, + { + "epoch": 0.82, + "grad_norm": 2.973963568363898, + "learning_rate": 1.6683010303275982e-06, + "loss": 0.7856, + "step": 5351 + }, + { + "epoch": 0.82, + "grad_norm": 2.640924319007717, + "learning_rate": 1.665560450238408e-06, + "loss": 0.7277, + "step": 5352 + }, + { + "epoch": 0.82, + "grad_norm": 2.6543214807904603, + "learning_rate": 1.6628219185207127e-06, + "loss": 0.7371, + "step": 5353 + }, + { + "epoch": 0.82, + "grad_norm": 2.5983459295205256, + "learning_rate": 1.6600854358475615e-06, + "loss": 0.6777, + "step": 5354 + }, + { + "epoch": 0.82, + "grad_norm": 2.5050712357400977, + "learning_rate": 1.6573510028915118e-06, + "loss": 0.7303, + "step": 5355 + }, + { + "epoch": 0.82, + "grad_norm": 2.581829671919241, + "learning_rate": 1.654618620324604e-06, + "loss": 0.8259, + "step": 5356 + }, + { + "epoch": 0.82, + "grad_norm": 2.757472544258648, + "learning_rate": 1.6518882888183874e-06, + "loss": 0.743, + "step": 5357 + }, + { + "epoch": 0.82, + "grad_norm": 2.6302986483984796, + "learning_rate": 1.6491600090438976e-06, + "loss": 0.7763, + "step": 5358 + }, + { + "epoch": 0.82, + "grad_norm": 2.392032692849635, + "learning_rate": 1.646433781671669e-06, + "loss": 0.752, + "step": 5359 + }, + { + "epoch": 0.82, + "grad_norm": 2.4315483130409152, + "learning_rate": 1.643709607371733e-06, + "loss": 0.7202, + "step": 5360 + }, + { + "epoch": 0.82, + "grad_norm": 2.6533577527278807, + "learning_rate": 1.6409874868136132e-06, + "loss": 0.7539, + "step": 5361 + }, + { + "epoch": 0.82, + "grad_norm": 2.7936297349799672, + "learning_rate": 1.6382674206663308e-06, + "loss": 0.867, + "step": 5362 + }, + { + "epoch": 0.82, + "grad_norm": 2.486355890016533, + "learning_rate": 1.6355494095984049e-06, + "loss": 0.6968, + "step": 5363 + }, + { + "epoch": 0.82, + "grad_norm": 2.992793892900273, + "learning_rate": 1.6328334542778423e-06, + "loss": 0.8133, + "step": 5364 + }, + { + "epoch": 0.82, + "grad_norm": 3.0974063076184413, + "learning_rate": 1.6301195553721528e-06, + "loss": 0.7223, + "step": 5365 + }, + { + "epoch": 0.82, + "grad_norm": 2.5535242999454044, + "learning_rate": 1.6274077135483336e-06, + "loss": 0.8412, + "step": 5366 + }, + { + "epoch": 0.82, + "grad_norm": 3.0316245210841513, + "learning_rate": 1.6246979294728793e-06, + "loss": 0.7508, + "step": 5367 + }, + { + "epoch": 0.82, + "grad_norm": 2.7099534979043196, + "learning_rate": 1.6219902038117807e-06, + "loss": 0.7069, + "step": 5368 + }, + { + "epoch": 0.82, + "grad_norm": 3.0971035301260312, + "learning_rate": 1.619284537230521e-06, + "loss": 0.7463, + "step": 5369 + }, + { + "epoch": 0.82, + "grad_norm": 3.4461091608804377, + "learning_rate": 1.6165809303940761e-06, + "loss": 0.8191, + "step": 5370 + }, + { + "epoch": 0.82, + "grad_norm": 2.8631386451165426, + "learning_rate": 1.6138793839669165e-06, + "loss": 0.6966, + "step": 5371 + }, + { + "epoch": 0.82, + "grad_norm": 2.970022929368473, + "learning_rate": 1.6111798986130067e-06, + "loss": 0.7699, + "step": 5372 + }, + { + "epoch": 0.82, + "grad_norm": 2.752622230132209, + "learning_rate": 1.6084824749958017e-06, + "loss": 0.7213, + "step": 5373 + }, + { + "epoch": 0.82, + "grad_norm": 2.689384664158231, + "learning_rate": 1.6057871137782578e-06, + "loss": 0.7925, + "step": 5374 + }, + { + "epoch": 0.82, + "grad_norm": 2.3902986847975325, + "learning_rate": 1.603093815622815e-06, + "loss": 0.7179, + "step": 5375 + }, + { + "epoch": 0.82, + "grad_norm": 2.6156407562968806, + "learning_rate": 1.6004025811914147e-06, + "loss": 0.7825, + "step": 5376 + }, + { + "epoch": 0.82, + "grad_norm": 2.8870697987925333, + "learning_rate": 1.5977134111454839e-06, + "loss": 0.7597, + "step": 5377 + }, + { + "epoch": 0.82, + "grad_norm": 3.5329585787430418, + "learning_rate": 1.5950263061459437e-06, + "loss": 0.8816, + "step": 5378 + }, + { + "epoch": 0.82, + "grad_norm": 2.706232341302241, + "learning_rate": 1.5923412668532135e-06, + "loss": 0.7472, + "step": 5379 + }, + { + "epoch": 0.82, + "grad_norm": 2.538461844289609, + "learning_rate": 1.5896582939271976e-06, + "loss": 0.7767, + "step": 5380 + }, + { + "epoch": 0.82, + "grad_norm": 2.6899633155459206, + "learning_rate": 1.5869773880272964e-06, + "loss": 0.7833, + "step": 5381 + }, + { + "epoch": 0.82, + "grad_norm": 2.6689412056515254, + "learning_rate": 1.584298549812402e-06, + "loss": 0.7906, + "step": 5382 + }, + { + "epoch": 0.82, + "grad_norm": 2.616921439622051, + "learning_rate": 1.5816217799408962e-06, + "loss": 0.7398, + "step": 5383 + }, + { + "epoch": 0.82, + "grad_norm": 2.5860191831094523, + "learning_rate": 1.578947079070652e-06, + "loss": 0.785, + "step": 5384 + }, + { + "epoch": 0.82, + "grad_norm": 2.7409743051749684, + "learning_rate": 1.576274447859041e-06, + "loss": 0.7139, + "step": 5385 + }, + { + "epoch": 0.82, + "grad_norm": 2.5230935232058465, + "learning_rate": 1.5736038869629168e-06, + "loss": 0.7318, + "step": 5386 + }, + { + "epoch": 0.82, + "grad_norm": 2.8231121808138373, + "learning_rate": 1.5709353970386322e-06, + "loss": 0.885, + "step": 5387 + }, + { + "epoch": 0.82, + "grad_norm": 2.539737814038569, + "learning_rate": 1.5682689787420246e-06, + "loss": 0.7765, + "step": 5388 + }, + { + "epoch": 0.82, + "grad_norm": 2.623946125219109, + "learning_rate": 1.5656046327284225e-06, + "loss": 0.7303, + "step": 5389 + }, + { + "epoch": 0.83, + "grad_norm": 2.6049099707804535, + "learning_rate": 1.5629423596526528e-06, + "loss": 0.8136, + "step": 5390 + }, + { + "epoch": 0.83, + "grad_norm": 2.759094162043502, + "learning_rate": 1.5602821601690254e-06, + "loss": 0.7784, + "step": 5391 + }, + { + "epoch": 0.83, + "grad_norm": 2.8019148720081737, + "learning_rate": 1.557624034931342e-06, + "loss": 0.7514, + "step": 5392 + }, + { + "epoch": 0.83, + "grad_norm": 2.559663404527581, + "learning_rate": 1.5549679845928956e-06, + "loss": 0.8213, + "step": 5393 + }, + { + "epoch": 0.83, + "grad_norm": 2.6417069273809766, + "learning_rate": 1.5523140098064692e-06, + "loss": 0.8017, + "step": 5394 + }, + { + "epoch": 0.83, + "grad_norm": 2.8132581020986205, + "learning_rate": 1.5496621112243327e-06, + "loss": 0.842, + "step": 5395 + }, + { + "epoch": 0.83, + "grad_norm": 2.539662982926937, + "learning_rate": 1.547012289498252e-06, + "loss": 0.8017, + "step": 5396 + }, + { + "epoch": 0.83, + "grad_norm": 3.2025744239552973, + "learning_rate": 1.5443645452794754e-06, + "loss": 0.7549, + "step": 5397 + }, + { + "epoch": 0.83, + "grad_norm": 2.7559697768379205, + "learning_rate": 1.5417188792187488e-06, + "loss": 0.803, + "step": 5398 + }, + { + "epoch": 0.83, + "grad_norm": 2.603866972428103, + "learning_rate": 1.539075291966301e-06, + "loss": 0.9145, + "step": 5399 + }, + { + "epoch": 0.83, + "grad_norm": 2.600906822519969, + "learning_rate": 1.536433784171848e-06, + "loss": 0.8172, + "step": 5400 + }, + { + "epoch": 0.83, + "grad_norm": 2.7699414581326964, + "learning_rate": 1.5337943564846035e-06, + "loss": 0.716, + "step": 5401 + }, + { + "epoch": 0.83, + "grad_norm": 2.4063352904234256, + "learning_rate": 1.5311570095532636e-06, + "loss": 0.8076, + "step": 5402 + }, + { + "epoch": 0.83, + "grad_norm": 2.5692469504876687, + "learning_rate": 1.5285217440260092e-06, + "loss": 0.7613, + "step": 5403 + }, + { + "epoch": 0.83, + "grad_norm": 2.572772925076905, + "learning_rate": 1.5258885605505226e-06, + "loss": 0.7561, + "step": 5404 + }, + { + "epoch": 0.83, + "grad_norm": 2.513167661863724, + "learning_rate": 1.5232574597739635e-06, + "loss": 0.7256, + "step": 5405 + }, + { + "epoch": 0.83, + "grad_norm": 2.566421922523046, + "learning_rate": 1.5206284423429773e-06, + "loss": 0.7812, + "step": 5406 + }, + { + "epoch": 0.83, + "grad_norm": 3.3353428043208844, + "learning_rate": 1.5180015089037093e-06, + "loss": 0.8176, + "step": 5407 + }, + { + "epoch": 0.83, + "grad_norm": 2.5207035562280757, + "learning_rate": 1.5153766601017816e-06, + "loss": 0.8464, + "step": 5408 + }, + { + "epoch": 0.83, + "grad_norm": 2.5939553659111687, + "learning_rate": 1.512753896582313e-06, + "loss": 0.7761, + "step": 5409 + }, + { + "epoch": 0.83, + "grad_norm": 2.4308392420491383, + "learning_rate": 1.5101332189899032e-06, + "loss": 0.7191, + "step": 5410 + }, + { + "epoch": 0.83, + "grad_norm": 2.752464101088354, + "learning_rate": 1.507514627968638e-06, + "loss": 0.7498, + "step": 5411 + }, + { + "epoch": 0.83, + "grad_norm": 2.573045601314764, + "learning_rate": 1.5048981241620996e-06, + "loss": 0.7168, + "step": 5412 + }, + { + "epoch": 0.83, + "grad_norm": 3.374466306394915, + "learning_rate": 1.5022837082133479e-06, + "loss": 0.8401, + "step": 5413 + }, + { + "epoch": 0.83, + "grad_norm": 2.908769349069825, + "learning_rate": 1.499671380764931e-06, + "loss": 0.7582, + "step": 5414 + }, + { + "epoch": 0.83, + "grad_norm": 2.5939245885102267, + "learning_rate": 1.49706114245889e-06, + "loss": 0.8097, + "step": 5415 + }, + { + "epoch": 0.83, + "grad_norm": 5.456523353585106, + "learning_rate": 1.494452993936747e-06, + "loss": 0.7621, + "step": 5416 + }, + { + "epoch": 0.83, + "grad_norm": 3.479603609725367, + "learning_rate": 1.4918469358395104e-06, + "loss": 0.8219, + "step": 5417 + }, + { + "epoch": 0.83, + "grad_norm": 2.7942623118175147, + "learning_rate": 1.4892429688076771e-06, + "loss": 0.7696, + "step": 5418 + }, + { + "epoch": 0.83, + "grad_norm": 2.46734362607315, + "learning_rate": 1.486641093481227e-06, + "loss": 0.6862, + "step": 5419 + }, + { + "epoch": 0.83, + "grad_norm": 2.955292126554179, + "learning_rate": 1.4840413104996322e-06, + "loss": 0.7807, + "step": 5420 + }, + { + "epoch": 0.83, + "grad_norm": 2.656829684311352, + "learning_rate": 1.4814436205018435e-06, + "loss": 0.6895, + "step": 5421 + }, + { + "epoch": 0.83, + "grad_norm": 2.379219039208738, + "learning_rate": 1.4788480241262992e-06, + "loss": 0.7099, + "step": 5422 + }, + { + "epoch": 0.83, + "grad_norm": 2.513725790883196, + "learning_rate": 1.4762545220109292e-06, + "loss": 0.6907, + "step": 5423 + }, + { + "epoch": 0.83, + "grad_norm": 3.895226797679898, + "learning_rate": 1.473663114793139e-06, + "loss": 0.7467, + "step": 5424 + }, + { + "epoch": 0.83, + "grad_norm": 2.5764193868923346, + "learning_rate": 1.4710738031098226e-06, + "loss": 0.6766, + "step": 5425 + }, + { + "epoch": 0.83, + "grad_norm": 2.5882262502356967, + "learning_rate": 1.4684865875973663e-06, + "loss": 0.7805, + "step": 5426 + }, + { + "epoch": 0.83, + "grad_norm": 2.5245263727879115, + "learning_rate": 1.4659014688916306e-06, + "loss": 0.7652, + "step": 5427 + }, + { + "epoch": 0.83, + "grad_norm": 2.4390552809494097, + "learning_rate": 1.463318447627966e-06, + "loss": 0.6893, + "step": 5428 + }, + { + "epoch": 0.83, + "grad_norm": 2.4830323556842315, + "learning_rate": 1.460737524441207e-06, + "loss": 0.8335, + "step": 5429 + }, + { + "epoch": 0.83, + "grad_norm": 2.580296423991282, + "learning_rate": 1.4581586999656706e-06, + "loss": 0.7148, + "step": 5430 + }, + { + "epoch": 0.83, + "grad_norm": 2.531672934729575, + "learning_rate": 1.455581974835162e-06, + "loss": 0.8116, + "step": 5431 + }, + { + "epoch": 0.83, + "grad_norm": 3.097529937171779, + "learning_rate": 1.4530073496829667e-06, + "loss": 0.7705, + "step": 5432 + }, + { + "epoch": 0.83, + "grad_norm": 2.462077058668238, + "learning_rate": 1.4504348251418532e-06, + "loss": 0.7132, + "step": 5433 + }, + { + "epoch": 0.83, + "grad_norm": 2.510920740948573, + "learning_rate": 1.4478644018440813e-06, + "loss": 0.7715, + "step": 5434 + }, + { + "epoch": 0.83, + "grad_norm": 2.7183210859443965, + "learning_rate": 1.4452960804213844e-06, + "loss": 0.7191, + "step": 5435 + }, + { + "epoch": 0.83, + "grad_norm": 2.4965139774054697, + "learning_rate": 1.4427298615049834e-06, + "loss": 0.7392, + "step": 5436 + }, + { + "epoch": 0.83, + "grad_norm": 2.4493008215582424, + "learning_rate": 1.440165745725588e-06, + "loss": 0.6751, + "step": 5437 + }, + { + "epoch": 0.83, + "grad_norm": 2.8053401617693123, + "learning_rate": 1.4376037337133818e-06, + "loss": 0.8177, + "step": 5438 + }, + { + "epoch": 0.83, + "grad_norm": 2.6707495812946984, + "learning_rate": 1.435043826098037e-06, + "loss": 0.718, + "step": 5439 + }, + { + "epoch": 0.83, + "grad_norm": 2.766260855397848, + "learning_rate": 1.4324860235087069e-06, + "loss": 0.6813, + "step": 5440 + }, + { + "epoch": 0.83, + "grad_norm": 2.7959908546395873, + "learning_rate": 1.4299303265740238e-06, + "loss": 0.7618, + "step": 5441 + }, + { + "epoch": 0.83, + "grad_norm": 2.5790556035434795, + "learning_rate": 1.4273767359221125e-06, + "loss": 0.7761, + "step": 5442 + }, + { + "epoch": 0.83, + "grad_norm": 2.667404676892974, + "learning_rate": 1.4248252521805716e-06, + "loss": 0.7206, + "step": 5443 + }, + { + "epoch": 0.83, + "grad_norm": 2.9916119664187026, + "learning_rate": 1.422275875976482e-06, + "loss": 0.6933, + "step": 5444 + }, + { + "epoch": 0.83, + "grad_norm": 2.9269245266415003, + "learning_rate": 1.4197286079364125e-06, + "loss": 0.7127, + "step": 5445 + }, + { + "epoch": 0.83, + "grad_norm": 2.7534623467804686, + "learning_rate": 1.4171834486864089e-06, + "loss": 0.7684, + "step": 5446 + }, + { + "epoch": 0.83, + "grad_norm": 2.8876537105947166, + "learning_rate": 1.4146403988519963e-06, + "loss": 0.7838, + "step": 5447 + }, + { + "epoch": 0.83, + "grad_norm": 2.8182121582454123, + "learning_rate": 1.4120994590581916e-06, + "loss": 0.7686, + "step": 5448 + }, + { + "epoch": 0.83, + "grad_norm": 3.082920784480041, + "learning_rate": 1.4095606299294827e-06, + "loss": 0.8127, + "step": 5449 + }, + { + "epoch": 0.83, + "grad_norm": 2.8232693322075004, + "learning_rate": 1.4070239120898433e-06, + "loss": 0.7653, + "step": 5450 + }, + { + "epoch": 0.83, + "grad_norm": 3.1856411989759135, + "learning_rate": 1.4044893061627263e-06, + "loss": 0.8263, + "step": 5451 + }, + { + "epoch": 0.83, + "grad_norm": 2.676050937118328, + "learning_rate": 1.4019568127710659e-06, + "loss": 0.6561, + "step": 5452 + }, + { + "epoch": 0.83, + "grad_norm": 2.6926069696599217, + "learning_rate": 1.399426432537283e-06, + "loss": 0.7164, + "step": 5453 + }, + { + "epoch": 0.83, + "grad_norm": 2.633397695011211, + "learning_rate": 1.3968981660832693e-06, + "loss": 0.7313, + "step": 5454 + }, + { + "epoch": 0.83, + "grad_norm": 2.7503718548301737, + "learning_rate": 1.3943720140304018e-06, + "loss": 0.8687, + "step": 5455 + }, + { + "epoch": 0.84, + "grad_norm": 2.7667013768472226, + "learning_rate": 1.3918479769995418e-06, + "loss": 0.759, + "step": 5456 + }, + { + "epoch": 0.84, + "grad_norm": 2.7654801691226893, + "learning_rate": 1.3893260556110243e-06, + "loss": 0.7636, + "step": 5457 + }, + { + "epoch": 0.84, + "grad_norm": 2.5775808923066905, + "learning_rate": 1.3868062504846646e-06, + "loss": 0.7662, + "step": 5458 + }, + { + "epoch": 0.84, + "grad_norm": 2.6147303801081803, + "learning_rate": 1.384288562239765e-06, + "loss": 0.7888, + "step": 5459 + }, + { + "epoch": 0.84, + "grad_norm": 2.555246259881729, + "learning_rate": 1.3817729914950995e-06, + "loss": 0.7107, + "step": 5460 + }, + { + "epoch": 0.84, + "grad_norm": 2.6749101810783924, + "learning_rate": 1.3792595388689267e-06, + "loss": 0.8295, + "step": 5461 + }, + { + "epoch": 0.84, + "grad_norm": 2.9072465544160893, + "learning_rate": 1.3767482049789804e-06, + "loss": 0.7472, + "step": 5462 + }, + { + "epoch": 0.84, + "grad_norm": 2.7431607296729603, + "learning_rate": 1.3742389904424747e-06, + "loss": 0.7544, + "step": 5463 + }, + { + "epoch": 0.84, + "grad_norm": 2.4523455376569108, + "learning_rate": 1.3717318958761094e-06, + "loss": 0.7079, + "step": 5464 + }, + { + "epoch": 0.84, + "grad_norm": 2.6762219222918024, + "learning_rate": 1.3692269218960553e-06, + "loss": 0.8438, + "step": 5465 + }, + { + "epoch": 0.84, + "grad_norm": 2.5159210232208418, + "learning_rate": 1.3667240691179618e-06, + "loss": 0.8016, + "step": 5466 + }, + { + "epoch": 0.84, + "grad_norm": 2.7143958324723036, + "learning_rate": 1.3642233381569657e-06, + "loss": 0.7513, + "step": 5467 + }, + { + "epoch": 0.84, + "grad_norm": 3.30670046856752, + "learning_rate": 1.3617247296276737e-06, + "loss": 0.805, + "step": 5468 + }, + { + "epoch": 0.84, + "grad_norm": 2.711816364111367, + "learning_rate": 1.3592282441441707e-06, + "loss": 0.6686, + "step": 5469 + }, + { + "epoch": 0.84, + "grad_norm": 2.8587264851624865, + "learning_rate": 1.3567338823200293e-06, + "loss": 0.7559, + "step": 5470 + }, + { + "epoch": 0.84, + "grad_norm": 2.9278007847640493, + "learning_rate": 1.3542416447682893e-06, + "loss": 0.7923, + "step": 5471 + }, + { + "epoch": 0.84, + "grad_norm": 2.784064503676124, + "learning_rate": 1.351751532101473e-06, + "loss": 0.8089, + "step": 5472 + }, + { + "epoch": 0.84, + "grad_norm": 2.469197716587361, + "learning_rate": 1.3492635449315817e-06, + "loss": 0.7145, + "step": 5473 + }, + { + "epoch": 0.84, + "grad_norm": 2.5894330340506713, + "learning_rate": 1.3467776838700896e-06, + "loss": 0.7227, + "step": 5474 + }, + { + "epoch": 0.84, + "grad_norm": 2.471371802458738, + "learning_rate": 1.344293949527956e-06, + "loss": 0.678, + "step": 5475 + }, + { + "epoch": 0.84, + "grad_norm": 2.798773140145835, + "learning_rate": 1.3418123425156115e-06, + "loss": 0.7428, + "step": 5476 + }, + { + "epoch": 0.84, + "grad_norm": 2.5164154651123667, + "learning_rate": 1.3393328634429636e-06, + "loss": 0.7172, + "step": 5477 + }, + { + "epoch": 0.84, + "grad_norm": 3.917342816664016, + "learning_rate": 1.3368555129194016e-06, + "loss": 0.8129, + "step": 5478 + }, + { + "epoch": 0.84, + "grad_norm": 2.638060194697016, + "learning_rate": 1.3343802915537885e-06, + "loss": 0.8024, + "step": 5479 + }, + { + "epoch": 0.84, + "grad_norm": 2.7588261673114265, + "learning_rate": 1.3319071999544607e-06, + "loss": 0.8135, + "step": 5480 + }, + { + "epoch": 0.84, + "grad_norm": 2.551590169459782, + "learning_rate": 1.3294362387292391e-06, + "loss": 0.7706, + "step": 5481 + }, + { + "epoch": 0.84, + "grad_norm": 2.944194600995512, + "learning_rate": 1.3269674084854156e-06, + "loss": 0.8076, + "step": 5482 + }, + { + "epoch": 0.84, + "grad_norm": 2.6657774464142485, + "learning_rate": 1.324500709829759e-06, + "loss": 0.8025, + "step": 5483 + }, + { + "epoch": 0.84, + "grad_norm": 3.0273054989836354, + "learning_rate": 1.3220361433685137e-06, + "loss": 0.8681, + "step": 5484 + }, + { + "epoch": 0.84, + "grad_norm": 3.2634353936725327, + "learning_rate": 1.319573709707399e-06, + "loss": 0.8016, + "step": 5485 + }, + { + "epoch": 0.84, + "grad_norm": 2.773258396954878, + "learning_rate": 1.317113409451618e-06, + "loss": 0.7462, + "step": 5486 + }, + { + "epoch": 0.84, + "grad_norm": 2.4504422966436388, + "learning_rate": 1.31465524320584e-06, + "loss": 0.7048, + "step": 5487 + }, + { + "epoch": 0.84, + "grad_norm": 2.6226489048346395, + "learning_rate": 1.3121992115742122e-06, + "loss": 0.7727, + "step": 5488 + }, + { + "epoch": 0.84, + "grad_norm": 2.499913869641089, + "learning_rate": 1.3097453151603602e-06, + "loss": 0.8013, + "step": 5489 + }, + { + "epoch": 0.84, + "grad_norm": 2.7080732867459343, + "learning_rate": 1.3072935545673836e-06, + "loss": 0.8594, + "step": 5490 + }, + { + "epoch": 0.84, + "grad_norm": 2.6214148178831653, + "learning_rate": 1.3048439303978534e-06, + "loss": 0.8599, + "step": 5491 + }, + { + "epoch": 0.84, + "grad_norm": 2.8228550531261165, + "learning_rate": 1.3023964432538216e-06, + "loss": 0.7535, + "step": 5492 + }, + { + "epoch": 0.84, + "grad_norm": 2.6231774742032004, + "learning_rate": 1.2999510937368109e-06, + "loss": 0.742, + "step": 5493 + }, + { + "epoch": 0.84, + "grad_norm": 2.7395097723549884, + "learning_rate": 1.2975078824478181e-06, + "loss": 0.765, + "step": 5494 + }, + { + "epoch": 0.84, + "grad_norm": 2.5325901699463103, + "learning_rate": 1.2950668099873186e-06, + "loss": 0.8974, + "step": 5495 + }, + { + "epoch": 0.84, + "grad_norm": 2.7576079244145997, + "learning_rate": 1.2926278769552558e-06, + "loss": 0.8176, + "step": 5496 + }, + { + "epoch": 0.84, + "grad_norm": 2.6983001920413723, + "learning_rate": 1.2901910839510557e-06, + "loss": 0.7643, + "step": 5497 + }, + { + "epoch": 0.84, + "grad_norm": 2.7065562167614767, + "learning_rate": 1.2877564315736114e-06, + "loss": 0.8274, + "step": 5498 + }, + { + "epoch": 0.84, + "grad_norm": 2.592440020847409, + "learning_rate": 1.2853239204212908e-06, + "loss": 0.7146, + "step": 5499 + }, + { + "epoch": 0.84, + "grad_norm": 2.6082475382596924, + "learning_rate": 1.2828935510919393e-06, + "loss": 0.6917, + "step": 5500 + }, + { + "epoch": 0.84, + "grad_norm": 2.7716965757094347, + "learning_rate": 1.2804653241828724e-06, + "loss": 0.7834, + "step": 5501 + }, + { + "epoch": 0.84, + "grad_norm": 2.4278847914452375, + "learning_rate": 1.2780392402908793e-06, + "loss": 0.7978, + "step": 5502 + }, + { + "epoch": 0.84, + "grad_norm": 2.6464039784162, + "learning_rate": 1.2756153000122252e-06, + "loss": 0.7255, + "step": 5503 + }, + { + "epoch": 0.84, + "grad_norm": 2.638944099478603, + "learning_rate": 1.2731935039426469e-06, + "loss": 0.744, + "step": 5504 + }, + { + "epoch": 0.84, + "grad_norm": 2.65867797896158, + "learning_rate": 1.2707738526773528e-06, + "loss": 0.7639, + "step": 5505 + }, + { + "epoch": 0.84, + "grad_norm": 2.81615400279156, + "learning_rate": 1.268356346811025e-06, + "loss": 0.9008, + "step": 5506 + }, + { + "epoch": 0.84, + "grad_norm": 2.6510825781269673, + "learning_rate": 1.2659409869378159e-06, + "loss": 0.818, + "step": 5507 + }, + { + "epoch": 0.84, + "grad_norm": 2.866416891237285, + "learning_rate": 1.2635277736513596e-06, + "loss": 0.804, + "step": 5508 + }, + { + "epoch": 0.84, + "grad_norm": 3.0494698466774426, + "learning_rate": 1.2611167075447527e-06, + "loss": 0.7794, + "step": 5509 + }, + { + "epoch": 0.84, + "grad_norm": 2.7386950159586627, + "learning_rate": 1.258707789210566e-06, + "loss": 0.7816, + "step": 5510 + }, + { + "epoch": 0.84, + "grad_norm": 2.785828567783617, + "learning_rate": 1.2563010192408487e-06, + "loss": 0.7821, + "step": 5511 + }, + { + "epoch": 0.84, + "grad_norm": 2.5718903666323296, + "learning_rate": 1.2538963982271135e-06, + "loss": 0.7565, + "step": 5512 + }, + { + "epoch": 0.84, + "grad_norm": 3.124178037199288, + "learning_rate": 1.2514939267603489e-06, + "loss": 0.823, + "step": 5513 + }, + { + "epoch": 0.84, + "grad_norm": 2.7898308110383474, + "learning_rate": 1.2490936054310176e-06, + "loss": 0.7378, + "step": 5514 + }, + { + "epoch": 0.84, + "grad_norm": 2.4373883835923507, + "learning_rate": 1.2466954348290473e-06, + "loss": 0.7807, + "step": 5515 + }, + { + "epoch": 0.84, + "grad_norm": 2.466570363766371, + "learning_rate": 1.244299415543846e-06, + "loss": 0.7284, + "step": 5516 + }, + { + "epoch": 0.84, + "grad_norm": 2.5189153820994323, + "learning_rate": 1.2419055481642873e-06, + "loss": 0.7953, + "step": 5517 + }, + { + "epoch": 0.84, + "grad_norm": 2.6885918862881444, + "learning_rate": 1.2395138332787105e-06, + "loss": 0.7116, + "step": 5518 + }, + { + "epoch": 0.84, + "grad_norm": 2.5023705356369255, + "learning_rate": 1.2371242714749388e-06, + "loss": 0.7292, + "step": 5519 + }, + { + "epoch": 0.84, + "grad_norm": 2.5056392809495147, + "learning_rate": 1.2347368633402578e-06, + "loss": 0.7093, + "step": 5520 + }, + { + "epoch": 0.85, + "grad_norm": 2.6732155135175915, + "learning_rate": 1.2323516094614218e-06, + "loss": 0.8043, + "step": 5521 + }, + { + "epoch": 0.85, + "grad_norm": 2.5131745570401867, + "learning_rate": 1.229968510424665e-06, + "loss": 0.6928, + "step": 5522 + }, + { + "epoch": 0.85, + "grad_norm": 2.3139445552542974, + "learning_rate": 1.2275875668156812e-06, + "loss": 0.7818, + "step": 5523 + }, + { + "epoch": 0.85, + "grad_norm": 2.5637986429478112, + "learning_rate": 1.2252087792196432e-06, + "loss": 0.624, + "step": 5524 + }, + { + "epoch": 0.85, + "grad_norm": 2.720954635132861, + "learning_rate": 1.2228321482211903e-06, + "loss": 0.7613, + "step": 5525 + }, + { + "epoch": 0.85, + "grad_norm": 3.421612728710161, + "learning_rate": 1.2204576744044284e-06, + "loss": 0.802, + "step": 5526 + }, + { + "epoch": 0.85, + "grad_norm": 2.727338927023116, + "learning_rate": 1.2180853583529394e-06, + "loss": 0.826, + "step": 5527 + }, + { + "epoch": 0.85, + "grad_norm": 2.9617344064018982, + "learning_rate": 1.215715200649773e-06, + "loss": 0.7746, + "step": 5528 + }, + { + "epoch": 0.85, + "grad_norm": 2.6211203203398132, + "learning_rate": 1.2133472018774439e-06, + "loss": 0.7474, + "step": 5529 + }, + { + "epoch": 0.85, + "grad_norm": 3.060772179332325, + "learning_rate": 1.2109813626179434e-06, + "loss": 0.7757, + "step": 5530 + }, + { + "epoch": 0.85, + "grad_norm": 2.4640680019112717, + "learning_rate": 1.2086176834527252e-06, + "loss": 0.7388, + "step": 5531 + }, + { + "epoch": 0.85, + "grad_norm": 2.4614625902107177, + "learning_rate": 1.2062561649627158e-06, + "loss": 0.7391, + "step": 5532 + }, + { + "epoch": 0.85, + "grad_norm": 2.6876423721050724, + "learning_rate": 1.203896807728313e-06, + "loss": 0.773, + "step": 5533 + }, + { + "epoch": 0.85, + "grad_norm": 2.7163326211812153, + "learning_rate": 1.2015396123293766e-06, + "loss": 0.8139, + "step": 5534 + }, + { + "epoch": 0.85, + "grad_norm": 2.71148568189279, + "learning_rate": 1.1991845793452438e-06, + "loss": 0.7613, + "step": 5535 + }, + { + "epoch": 0.85, + "grad_norm": 2.751931038453203, + "learning_rate": 1.1968317093547133e-06, + "loss": 0.8681, + "step": 5536 + }, + { + "epoch": 0.85, + "grad_norm": 2.6658751770376754, + "learning_rate": 1.1944810029360532e-06, + "loss": 0.7588, + "step": 5537 + }, + { + "epoch": 0.85, + "grad_norm": 2.668427297252355, + "learning_rate": 1.1921324606670037e-06, + "loss": 0.7865, + "step": 5538 + }, + { + "epoch": 0.85, + "grad_norm": 2.89947410474797, + "learning_rate": 1.1897860831247686e-06, + "loss": 0.7112, + "step": 5539 + }, + { + "epoch": 0.85, + "grad_norm": 2.5563106453481006, + "learning_rate": 1.1874418708860237e-06, + "loss": 0.7953, + "step": 5540 + }, + { + "epoch": 0.85, + "grad_norm": 2.702224895601683, + "learning_rate": 1.1850998245269096e-06, + "loss": 0.8173, + "step": 5541 + }, + { + "epoch": 0.85, + "grad_norm": 2.7219772816758465, + "learning_rate": 1.1827599446230354e-06, + "loss": 0.7415, + "step": 5542 + }, + { + "epoch": 0.85, + "grad_norm": 2.811823473183657, + "learning_rate": 1.180422231749475e-06, + "loss": 0.7129, + "step": 5543 + }, + { + "epoch": 0.85, + "grad_norm": 2.8051088616260356, + "learning_rate": 1.1780866864807795e-06, + "loss": 0.7613, + "step": 5544 + }, + { + "epoch": 0.85, + "grad_norm": 2.516067073289398, + "learning_rate": 1.1757533093909535e-06, + "loss": 0.768, + "step": 5545 + }, + { + "epoch": 0.85, + "grad_norm": 2.5930686110231003, + "learning_rate": 1.1734221010534807e-06, + "loss": 0.7489, + "step": 5546 + }, + { + "epoch": 0.85, + "grad_norm": 2.893175999553963, + "learning_rate": 1.1710930620413053e-06, + "loss": 0.7489, + "step": 5547 + }, + { + "epoch": 0.85, + "grad_norm": 2.8849620602373243, + "learning_rate": 1.1687661929268367e-06, + "loss": 0.7634, + "step": 5548 + }, + { + "epoch": 0.85, + "grad_norm": 2.4784255720944017, + "learning_rate": 1.166441494281959e-06, + "loss": 0.7371, + "step": 5549 + }, + { + "epoch": 0.85, + "grad_norm": 2.771587298983736, + "learning_rate": 1.1641189666780151e-06, + "loss": 0.7193, + "step": 5550 + }, + { + "epoch": 0.85, + "grad_norm": 2.8362983127978527, + "learning_rate": 1.161798610685818e-06, + "loss": 0.8037, + "step": 5551 + }, + { + "epoch": 0.85, + "grad_norm": 2.583812039759552, + "learning_rate": 1.1594804268756455e-06, + "loss": 0.6902, + "step": 5552 + }, + { + "epoch": 0.85, + "grad_norm": 2.7400986861722365, + "learning_rate": 1.1571644158172435e-06, + "loss": 0.7665, + "step": 5553 + }, + { + "epoch": 0.85, + "grad_norm": 2.8029871469307004, + "learning_rate": 1.154850578079818e-06, + "loss": 0.7821, + "step": 5554 + }, + { + "epoch": 0.85, + "grad_norm": 2.5630396676092575, + "learning_rate": 1.152538914232052e-06, + "loss": 0.6432, + "step": 5555 + }, + { + "epoch": 0.85, + "grad_norm": 2.754520435755249, + "learning_rate": 1.150229424842082e-06, + "loss": 0.7511, + "step": 5556 + }, + { + "epoch": 0.85, + "grad_norm": 2.668987688707433, + "learning_rate": 1.1479221104775195e-06, + "loss": 0.7309, + "step": 5557 + }, + { + "epoch": 0.85, + "grad_norm": 2.5638068256997326, + "learning_rate": 1.1456169717054378e-06, + "loss": 0.6905, + "step": 5558 + }, + { + "epoch": 0.85, + "grad_norm": 2.690726492060148, + "learning_rate": 1.143314009092371e-06, + "loss": 0.7855, + "step": 5559 + }, + { + "epoch": 0.85, + "grad_norm": 2.4735568575923454, + "learning_rate": 1.141013223204328e-06, + "loss": 0.6855, + "step": 5560 + }, + { + "epoch": 0.85, + "grad_norm": 2.5690760259180982, + "learning_rate": 1.138714614606775e-06, + "loss": 0.7458, + "step": 5561 + }, + { + "epoch": 0.85, + "grad_norm": 2.4077556748136186, + "learning_rate": 1.136418183864646e-06, + "loss": 0.7496, + "step": 5562 + }, + { + "epoch": 0.85, + "grad_norm": 2.468467344759974, + "learning_rate": 1.134123931542339e-06, + "loss": 0.7601, + "step": 5563 + }, + { + "epoch": 0.85, + "grad_norm": 2.7634747107670807, + "learning_rate": 1.1318318582037168e-06, + "loss": 0.7783, + "step": 5564 + }, + { + "epoch": 0.85, + "grad_norm": 2.559302157414664, + "learning_rate": 1.129541964412104e-06, + "loss": 0.756, + "step": 5565 + }, + { + "epoch": 0.85, + "grad_norm": 2.699526488995371, + "learning_rate": 1.1272542507302985e-06, + "loss": 0.7129, + "step": 5566 + }, + { + "epoch": 0.85, + "grad_norm": 2.974066739013522, + "learning_rate": 1.1249687177205493e-06, + "loss": 0.8527, + "step": 5567 + }, + { + "epoch": 0.85, + "grad_norm": 2.9764748728047463, + "learning_rate": 1.1226853659445824e-06, + "loss": 0.7279, + "step": 5568 + }, + { + "epoch": 0.85, + "grad_norm": 2.857166506430615, + "learning_rate": 1.1204041959635791e-06, + "loss": 0.7442, + "step": 5569 + }, + { + "epoch": 0.85, + "grad_norm": 2.896244832002908, + "learning_rate": 1.1181252083381844e-06, + "loss": 0.7165, + "step": 5570 + }, + { + "epoch": 0.85, + "grad_norm": 2.3254418710580094, + "learning_rate": 1.1158484036285134e-06, + "loss": 0.7253, + "step": 5571 + }, + { + "epoch": 0.85, + "grad_norm": 2.4974170064587518, + "learning_rate": 1.1135737823941405e-06, + "loss": 0.7949, + "step": 5572 + }, + { + "epoch": 0.85, + "grad_norm": 2.777471113005515, + "learning_rate": 1.1113013451941024e-06, + "loss": 0.7766, + "step": 5573 + }, + { + "epoch": 0.85, + "grad_norm": 2.921270848027808, + "learning_rate": 1.1090310925869009e-06, + "loss": 0.7718, + "step": 5574 + }, + { + "epoch": 0.85, + "grad_norm": 2.701574184000981, + "learning_rate": 1.1067630251304996e-06, + "loss": 0.8679, + "step": 5575 + }, + { + "epoch": 0.85, + "grad_norm": 2.787188494578665, + "learning_rate": 1.104497143382325e-06, + "loss": 0.7404, + "step": 5576 + }, + { + "epoch": 0.85, + "grad_norm": 2.551177086497416, + "learning_rate": 1.1022334478992702e-06, + "loss": 0.7234, + "step": 5577 + }, + { + "epoch": 0.85, + "grad_norm": 3.232648327249752, + "learning_rate": 1.099971939237685e-06, + "loss": 0.759, + "step": 5578 + }, + { + "epoch": 0.85, + "grad_norm": 2.5073434658427947, + "learning_rate": 1.0977126179533892e-06, + "loss": 0.7839, + "step": 5579 + }, + { + "epoch": 0.85, + "grad_norm": 2.541437023443651, + "learning_rate": 1.0954554846016575e-06, + "loss": 0.7859, + "step": 5580 + }, + { + "epoch": 0.85, + "grad_norm": 2.5804987815371825, + "learning_rate": 1.0932005397372282e-06, + "loss": 0.7573, + "step": 5581 + }, + { + "epoch": 0.85, + "grad_norm": 2.838112090513552, + "learning_rate": 1.090947783914308e-06, + "loss": 0.7422, + "step": 5582 + }, + { + "epoch": 0.85, + "grad_norm": 2.659588045163355, + "learning_rate": 1.0886972176865585e-06, + "loss": 0.7169, + "step": 5583 + }, + { + "epoch": 0.85, + "grad_norm": 2.6839338857706836, + "learning_rate": 1.0864488416071061e-06, + "loss": 0.731, + "step": 5584 + }, + { + "epoch": 0.85, + "grad_norm": 2.3940028914051665, + "learning_rate": 1.084202656228538e-06, + "loss": 0.6844, + "step": 5585 + }, + { + "epoch": 0.86, + "grad_norm": 2.7620213694437195, + "learning_rate": 1.0819586621029043e-06, + "loss": 0.7331, + "step": 5586 + }, + { + "epoch": 0.86, + "grad_norm": 2.774375832645594, + "learning_rate": 1.0797168597817143e-06, + "loss": 0.6923, + "step": 5587 + }, + { + "epoch": 0.86, + "grad_norm": 3.1324940080961343, + "learning_rate": 1.0774772498159424e-06, + "loss": 0.7889, + "step": 5588 + }, + { + "epoch": 0.86, + "grad_norm": 2.439717577235111, + "learning_rate": 1.0752398327560199e-06, + "loss": 0.6448, + "step": 5589 + }, + { + "epoch": 0.86, + "grad_norm": 2.6372831755194044, + "learning_rate": 1.0730046091518442e-06, + "loss": 0.7257, + "step": 5590 + }, + { + "epoch": 0.86, + "grad_norm": 2.809764341873967, + "learning_rate": 1.0707715795527685e-06, + "loss": 0.7913, + "step": 5591 + }, + { + "epoch": 0.86, + "grad_norm": 2.530267851629339, + "learning_rate": 1.0685407445076067e-06, + "loss": 0.6289, + "step": 5592 + }, + { + "epoch": 0.86, + "grad_norm": 2.7837106686303845, + "learning_rate": 1.0663121045646397e-06, + "loss": 0.7535, + "step": 5593 + }, + { + "epoch": 0.86, + "grad_norm": 2.8086295330354774, + "learning_rate": 1.0640856602716021e-06, + "loss": 0.7656, + "step": 5594 + }, + { + "epoch": 0.86, + "grad_norm": 2.4661618844981947, + "learning_rate": 1.0618614121756932e-06, + "loss": 0.7703, + "step": 5595 + }, + { + "epoch": 0.86, + "grad_norm": 2.8436514864476696, + "learning_rate": 1.059639360823569e-06, + "loss": 0.8581, + "step": 5596 + }, + { + "epoch": 0.86, + "grad_norm": 3.331913971552762, + "learning_rate": 1.057419506761347e-06, + "loss": 0.815, + "step": 5597 + }, + { + "epoch": 0.86, + "grad_norm": 2.9949193504323883, + "learning_rate": 1.0552018505346074e-06, + "loss": 0.7474, + "step": 5598 + }, + { + "epoch": 0.86, + "grad_norm": 2.6932560263632497, + "learning_rate": 1.0529863926883865e-06, + "loss": 0.8571, + "step": 5599 + }, + { + "epoch": 0.86, + "grad_norm": 2.72659374994844, + "learning_rate": 1.05077313376718e-06, + "loss": 0.6961, + "step": 5600 + }, + { + "epoch": 0.86, + "grad_norm": 2.386739434798129, + "learning_rate": 1.0485620743149494e-06, + "loss": 0.7768, + "step": 5601 + }, + { + "epoch": 0.86, + "grad_norm": 2.7055649797455272, + "learning_rate": 1.0463532148751076e-06, + "loss": 0.7969, + "step": 5602 + }, + { + "epoch": 0.86, + "grad_norm": 2.753211825418324, + "learning_rate": 1.0441465559905295e-06, + "loss": 0.7847, + "step": 5603 + }, + { + "epoch": 0.86, + "grad_norm": 2.629614530438831, + "learning_rate": 1.0419420982035545e-06, + "loss": 0.7199, + "step": 5604 + }, + { + "epoch": 0.86, + "grad_norm": 2.634367194061076, + "learning_rate": 1.0397398420559724e-06, + "loss": 0.7798, + "step": 5605 + }, + { + "epoch": 0.86, + "grad_norm": 2.7751002146810633, + "learning_rate": 1.037539788089037e-06, + "loss": 0.763, + "step": 5606 + }, + { + "epoch": 0.86, + "grad_norm": 2.6799197719112042, + "learning_rate": 1.0353419368434614e-06, + "loss": 0.7823, + "step": 5607 + }, + { + "epoch": 0.86, + "grad_norm": 2.7524900554080403, + "learning_rate": 1.0331462888594112e-06, + "loss": 0.8375, + "step": 5608 + }, + { + "epoch": 0.86, + "grad_norm": 2.917619249390206, + "learning_rate": 1.0309528446765206e-06, + "loss": 0.7743, + "step": 5609 + }, + { + "epoch": 0.86, + "grad_norm": 2.8815542439790574, + "learning_rate": 1.0287616048338743e-06, + "loss": 0.8678, + "step": 5610 + }, + { + "epoch": 0.86, + "grad_norm": 2.8823740915476574, + "learning_rate": 1.026572569870017e-06, + "loss": 0.7844, + "step": 5611 + }, + { + "epoch": 0.86, + "grad_norm": 2.7691903823345037, + "learning_rate": 1.024385740322954e-06, + "loss": 0.6978, + "step": 5612 + }, + { + "epoch": 0.86, + "grad_norm": 2.748898140798564, + "learning_rate": 1.022201116730145e-06, + "loss": 0.8205, + "step": 5613 + }, + { + "epoch": 0.86, + "grad_norm": 2.502627608195643, + "learning_rate": 1.0200186996285077e-06, + "loss": 0.7322, + "step": 5614 + }, + { + "epoch": 0.86, + "grad_norm": 2.7651574060507675, + "learning_rate": 1.0178384895544235e-06, + "loss": 0.7698, + "step": 5615 + }, + { + "epoch": 0.86, + "grad_norm": 2.6948552555209138, + "learning_rate": 1.0156604870437247e-06, + "loss": 0.8678, + "step": 5616 + }, + { + "epoch": 0.86, + "grad_norm": 2.4570240141927884, + "learning_rate": 1.0134846926317022e-06, + "loss": 0.7441, + "step": 5617 + }, + { + "epoch": 0.86, + "grad_norm": 2.5496315754472625, + "learning_rate": 1.0113111068531068e-06, + "loss": 0.7771, + "step": 5618 + }, + { + "epoch": 0.86, + "grad_norm": 2.4844705154268096, + "learning_rate": 1.0091397302421412e-06, + "loss": 0.6951, + "step": 5619 + }, + { + "epoch": 0.86, + "grad_norm": 2.9074753927062607, + "learning_rate": 1.0069705633324745e-06, + "loss": 0.8395, + "step": 5620 + }, + { + "epoch": 0.86, + "grad_norm": 2.870455798197107, + "learning_rate": 1.004803606657223e-06, + "loss": 0.7325, + "step": 5621 + }, + { + "epoch": 0.86, + "grad_norm": 2.792599450580263, + "learning_rate": 1.0026388607489646e-06, + "loss": 0.7905, + "step": 5622 + }, + { + "epoch": 0.86, + "grad_norm": 3.0586931850734733, + "learning_rate": 1.0004763261397355e-06, + "loss": 0.8382, + "step": 5623 + }, + { + "epoch": 0.86, + "grad_norm": 2.816909936823562, + "learning_rate": 9.98316003361025e-07, + "loss": 0.8835, + "step": 5624 + }, + { + "epoch": 0.86, + "grad_norm": 2.657891923324879, + "learning_rate": 9.961578929437764e-07, + "loss": 0.7731, + "step": 5625 + }, + { + "epoch": 0.86, + "grad_norm": 2.7264248273390295, + "learning_rate": 9.940019954183977e-07, + "loss": 0.8202, + "step": 5626 + }, + { + "epoch": 0.86, + "grad_norm": 2.6457426310497985, + "learning_rate": 9.918483113147447e-07, + "loss": 0.7751, + "step": 5627 + }, + { + "epoch": 0.86, + "grad_norm": 3.0766698726447506, + "learning_rate": 9.896968411621332e-07, + "loss": 0.8389, + "step": 5628 + }, + { + "epoch": 0.86, + "grad_norm": 2.844765688371593, + "learning_rate": 9.87547585489338e-07, + "loss": 0.721, + "step": 5629 + }, + { + "epoch": 0.86, + "grad_norm": 2.4495238218117894, + "learning_rate": 9.854005448245796e-07, + "loss": 0.6981, + "step": 5630 + }, + { + "epoch": 0.86, + "grad_norm": 2.43118087190644, + "learning_rate": 9.832557196955438e-07, + "loss": 0.754, + "step": 5631 + }, + { + "epoch": 0.86, + "grad_norm": 2.7459618919864357, + "learning_rate": 9.811131106293691e-07, + "loss": 0.7378, + "step": 5632 + }, + { + "epoch": 0.86, + "grad_norm": 2.6027613647952124, + "learning_rate": 9.789727181526453e-07, + "loss": 0.767, + "step": 5633 + }, + { + "epoch": 0.86, + "grad_norm": 2.827180784194871, + "learning_rate": 9.76834542791425e-07, + "loss": 0.826, + "step": 5634 + }, + { + "epoch": 0.86, + "grad_norm": 2.2869195172489576, + "learning_rate": 9.746985850712099e-07, + "loss": 0.7602, + "step": 5635 + }, + { + "epoch": 0.86, + "grad_norm": 2.518787706950005, + "learning_rate": 9.725648455169568e-07, + "loss": 0.7141, + "step": 5636 + }, + { + "epoch": 0.86, + "grad_norm": 2.909684582981514, + "learning_rate": 9.704333246530828e-07, + "loss": 0.7257, + "step": 5637 + }, + { + "epoch": 0.86, + "grad_norm": 3.0101097572734803, + "learning_rate": 9.683040230034536e-07, + "loss": 0.7605, + "step": 5638 + }, + { + "epoch": 0.86, + "grad_norm": 2.8883948525240966, + "learning_rate": 9.661769410913913e-07, + "loss": 0.8731, + "step": 5639 + }, + { + "epoch": 0.86, + "grad_norm": 2.799359360157228, + "learning_rate": 9.640520794396746e-07, + "loss": 0.7386, + "step": 5640 + }, + { + "epoch": 0.86, + "grad_norm": 2.515485834092157, + "learning_rate": 9.619294385705336e-07, + "loss": 0.7184, + "step": 5641 + }, + { + "epoch": 0.86, + "grad_norm": 2.3554919634425526, + "learning_rate": 9.598090190056553e-07, + "loss": 0.6601, + "step": 5642 + }, + { + "epoch": 0.86, + "grad_norm": 2.5777835963991556, + "learning_rate": 9.576908212661784e-07, + "loss": 0.7231, + "step": 5643 + }, + { + "epoch": 0.86, + "grad_norm": 2.544230852123378, + "learning_rate": 9.555748458726944e-07, + "loss": 0.7102, + "step": 5644 + }, + { + "epoch": 0.86, + "grad_norm": 2.4889271852604997, + "learning_rate": 9.534610933452548e-07, + "loss": 0.7839, + "step": 5645 + }, + { + "epoch": 0.86, + "grad_norm": 3.011943460171467, + "learning_rate": 9.513495642033599e-07, + "loss": 0.755, + "step": 5646 + }, + { + "epoch": 0.86, + "grad_norm": 2.5838351272639097, + "learning_rate": 9.4924025896596e-07, + "loss": 0.8452, + "step": 5647 + }, + { + "epoch": 0.86, + "grad_norm": 2.513812262444184, + "learning_rate": 9.471331781514681e-07, + "loss": 0.779, + "step": 5648 + }, + { + "epoch": 0.86, + "grad_norm": 2.6039552895971103, + "learning_rate": 9.450283222777445e-07, + "loss": 0.7665, + "step": 5649 + }, + { + "epoch": 0.86, + "grad_norm": 2.5535137404506805, + "learning_rate": 9.429256918621011e-07, + "loss": 0.7957, + "step": 5650 + }, + { + "epoch": 0.86, + "grad_norm": 2.814064035545963, + "learning_rate": 9.408252874213097e-07, + "loss": 0.7728, + "step": 5651 + }, + { + "epoch": 0.87, + "grad_norm": 2.437786969150219, + "learning_rate": 9.387271094715877e-07, + "loss": 0.735, + "step": 5652 + }, + { + "epoch": 0.87, + "grad_norm": 2.7107592789968256, + "learning_rate": 9.366311585286103e-07, + "loss": 0.7294, + "step": 5653 + }, + { + "epoch": 0.87, + "grad_norm": 2.6797509758975924, + "learning_rate": 9.345374351075009e-07, + "loss": 0.8449, + "step": 5654 + }, + { + "epoch": 0.87, + "grad_norm": 2.702157913427348, + "learning_rate": 9.324459397228391e-07, + "loss": 0.7923, + "step": 5655 + }, + { + "epoch": 0.87, + "grad_norm": 2.667032279356962, + "learning_rate": 9.303566728886571e-07, + "loss": 0.7211, + "step": 5656 + }, + { + "epoch": 0.87, + "grad_norm": 2.483166462186134, + "learning_rate": 9.282696351184383e-07, + "loss": 0.6771, + "step": 5657 + }, + { + "epoch": 0.87, + "grad_norm": 2.632341617741301, + "learning_rate": 9.26184826925114e-07, + "loss": 0.7805, + "step": 5658 + }, + { + "epoch": 0.87, + "grad_norm": 2.803600915223199, + "learning_rate": 9.241022488210772e-07, + "loss": 0.7631, + "step": 5659 + }, + { + "epoch": 0.87, + "grad_norm": 2.668583864035917, + "learning_rate": 9.220219013181642e-07, + "loss": 0.6902, + "step": 5660 + }, + { + "epoch": 0.87, + "grad_norm": 2.9654201592454927, + "learning_rate": 9.199437849276649e-07, + "loss": 0.7293, + "step": 5661 + }, + { + "epoch": 0.87, + "grad_norm": 2.4537593166619436, + "learning_rate": 9.178679001603252e-07, + "loss": 0.6717, + "step": 5662 + }, + { + "epoch": 0.87, + "grad_norm": 2.7924386936278873, + "learning_rate": 9.15794247526337e-07, + "loss": 0.8249, + "step": 5663 + }, + { + "epoch": 0.87, + "grad_norm": 2.5150201116445263, + "learning_rate": 9.137228275353471e-07, + "loss": 0.715, + "step": 5664 + }, + { + "epoch": 0.87, + "grad_norm": 2.782277092977421, + "learning_rate": 9.116536406964527e-07, + "loss": 0.8094, + "step": 5665 + }, + { + "epoch": 0.87, + "grad_norm": 2.6310827622808963, + "learning_rate": 9.095866875181991e-07, + "loss": 0.6985, + "step": 5666 + }, + { + "epoch": 0.87, + "grad_norm": 2.7013028558305043, + "learning_rate": 9.07521968508589e-07, + "loss": 0.7311, + "step": 5667 + }, + { + "epoch": 0.87, + "grad_norm": 2.504527699009366, + "learning_rate": 9.054594841750707e-07, + "loss": 0.6948, + "step": 5668 + }, + { + "epoch": 0.87, + "grad_norm": 2.9194448959018002, + "learning_rate": 9.033992350245435e-07, + "loss": 0.8246, + "step": 5669 + }, + { + "epoch": 0.87, + "grad_norm": 2.580593402067048, + "learning_rate": 9.013412215633633e-07, + "loss": 0.7519, + "step": 5670 + }, + { + "epoch": 0.87, + "grad_norm": 2.645194130581552, + "learning_rate": 8.992854442973264e-07, + "loss": 0.7213, + "step": 5671 + }, + { + "epoch": 0.87, + "grad_norm": 2.4402723740062746, + "learning_rate": 8.972319037316901e-07, + "loss": 0.6859, + "step": 5672 + }, + { + "epoch": 0.87, + "grad_norm": 2.6777006267138024, + "learning_rate": 8.95180600371156e-07, + "loss": 0.7776, + "step": 5673 + }, + { + "epoch": 0.87, + "grad_norm": 2.6668439690078825, + "learning_rate": 8.931315347198754e-07, + "loss": 0.7065, + "step": 5674 + }, + { + "epoch": 0.87, + "grad_norm": 2.558904510622878, + "learning_rate": 8.910847072814521e-07, + "loss": 0.8571, + "step": 5675 + }, + { + "epoch": 0.87, + "grad_norm": 2.8898235434369774, + "learning_rate": 8.890401185589393e-07, + "loss": 0.7485, + "step": 5676 + }, + { + "epoch": 0.87, + "grad_norm": 2.6045615465844207, + "learning_rate": 8.86997769054836e-07, + "loss": 0.74, + "step": 5677 + }, + { + "epoch": 0.87, + "grad_norm": 2.5371702561275455, + "learning_rate": 8.849576592710996e-07, + "loss": 0.7178, + "step": 5678 + }, + { + "epoch": 0.87, + "grad_norm": 2.9241583684169945, + "learning_rate": 8.82919789709129e-07, + "loss": 0.7445, + "step": 5679 + }, + { + "epoch": 0.87, + "grad_norm": 2.5870081381342502, + "learning_rate": 8.808841608697749e-07, + "loss": 0.8143, + "step": 5680 + }, + { + "epoch": 0.87, + "grad_norm": 2.6290340495413433, + "learning_rate": 8.788507732533413e-07, + "loss": 0.8404, + "step": 5681 + }, + { + "epoch": 0.87, + "grad_norm": 2.592317293842339, + "learning_rate": 8.768196273595719e-07, + "loss": 0.7048, + "step": 5682 + }, + { + "epoch": 0.87, + "grad_norm": 2.6255917103261073, + "learning_rate": 8.747907236876718e-07, + "loss": 0.8538, + "step": 5683 + }, + { + "epoch": 0.87, + "grad_norm": 2.8929535869985923, + "learning_rate": 8.727640627362854e-07, + "loss": 0.7605, + "step": 5684 + }, + { + "epoch": 0.87, + "grad_norm": 2.5282572880948555, + "learning_rate": 8.707396450035099e-07, + "loss": 0.6847, + "step": 5685 + }, + { + "epoch": 0.87, + "grad_norm": 2.571181113520221, + "learning_rate": 8.687174709868895e-07, + "loss": 0.821, + "step": 5686 + }, + { + "epoch": 0.87, + "grad_norm": 2.640049537712735, + "learning_rate": 8.666975411834188e-07, + "loss": 0.7872, + "step": 5687 + }, + { + "epoch": 0.87, + "grad_norm": 3.15738494350599, + "learning_rate": 8.646798560895376e-07, + "loss": 0.8605, + "step": 5688 + }, + { + "epoch": 0.87, + "grad_norm": 2.660662027497053, + "learning_rate": 8.626644162011399e-07, + "loss": 0.7281, + "step": 5689 + }, + { + "epoch": 0.87, + "grad_norm": 2.6983042556708496, + "learning_rate": 8.606512220135621e-07, + "loss": 0.7855, + "step": 5690 + }, + { + "epoch": 0.87, + "grad_norm": 2.4483957728925616, + "learning_rate": 8.586402740215893e-07, + "loss": 0.7887, + "step": 5691 + }, + { + "epoch": 0.87, + "grad_norm": 2.778073690955736, + "learning_rate": 8.566315727194607e-07, + "loss": 0.8097, + "step": 5692 + }, + { + "epoch": 0.87, + "grad_norm": 2.7771629099321484, + "learning_rate": 8.546251186008536e-07, + "loss": 0.701, + "step": 5693 + }, + { + "epoch": 0.87, + "grad_norm": 2.498328459699538, + "learning_rate": 8.526209121589024e-07, + "loss": 0.7905, + "step": 5694 + }, + { + "epoch": 0.87, + "grad_norm": 2.6098896557905844, + "learning_rate": 8.506189538861831e-07, + "loss": 0.8243, + "step": 5695 + }, + { + "epoch": 0.87, + "grad_norm": 2.677295059627817, + "learning_rate": 8.486192442747221e-07, + "loss": 0.7308, + "step": 5696 + }, + { + "epoch": 0.87, + "grad_norm": 2.5776146690712345, + "learning_rate": 8.466217838159896e-07, + "loss": 0.8068, + "step": 5697 + }, + { + "epoch": 0.87, + "grad_norm": 2.571734748384213, + "learning_rate": 8.446265730009074e-07, + "loss": 0.6534, + "step": 5698 + }, + { + "epoch": 0.87, + "grad_norm": 2.7287346704837434, + "learning_rate": 8.426336123198386e-07, + "loss": 0.7079, + "step": 5699 + }, + { + "epoch": 0.87, + "grad_norm": 2.5595723305614015, + "learning_rate": 8.406429022626028e-07, + "loss": 0.7684, + "step": 5700 + }, + { + "epoch": 0.87, + "grad_norm": 2.519081924016723, + "learning_rate": 8.386544433184573e-07, + "loss": 0.7304, + "step": 5701 + }, + { + "epoch": 0.87, + "grad_norm": 2.6865616162166055, + "learning_rate": 8.36668235976108e-07, + "loss": 0.6706, + "step": 5702 + }, + { + "epoch": 0.87, + "grad_norm": 2.6769547229461383, + "learning_rate": 8.346842807237132e-07, + "loss": 0.799, + "step": 5703 + }, + { + "epoch": 0.87, + "grad_norm": 2.525744181272662, + "learning_rate": 8.327025780488696e-07, + "loss": 0.6805, + "step": 5704 + }, + { + "epoch": 0.87, + "grad_norm": 2.4046255837962276, + "learning_rate": 8.307231284386264e-07, + "loss": 0.7225, + "step": 5705 + }, + { + "epoch": 0.87, + "grad_norm": 2.515854941221048, + "learning_rate": 8.287459323794777e-07, + "loss": 0.661, + "step": 5706 + }, + { + "epoch": 0.87, + "grad_norm": 2.4848884090304417, + "learning_rate": 8.267709903573606e-07, + "loss": 0.7989, + "step": 5707 + }, + { + "epoch": 0.87, + "grad_norm": 2.8341237442734837, + "learning_rate": 8.247983028576612e-07, + "loss": 0.8496, + "step": 5708 + }, + { + "epoch": 0.87, + "grad_norm": 2.7306413767137143, + "learning_rate": 8.228278703652115e-07, + "loss": 0.743, + "step": 5709 + }, + { + "epoch": 0.87, + "grad_norm": 2.7006097693456637, + "learning_rate": 8.208596933642854e-07, + "loss": 0.7519, + "step": 5710 + }, + { + "epoch": 0.87, + "grad_norm": 2.667161566314598, + "learning_rate": 8.188937723386104e-07, + "loss": 0.7382, + "step": 5711 + }, + { + "epoch": 0.87, + "grad_norm": 2.4897380903135375, + "learning_rate": 8.16930107771352e-07, + "loss": 0.6433, + "step": 5712 + }, + { + "epoch": 0.87, + "grad_norm": 2.6686472482951227, + "learning_rate": 8.149687001451223e-07, + "loss": 0.7862, + "step": 5713 + }, + { + "epoch": 0.87, + "grad_norm": 2.6432076739400863, + "learning_rate": 8.130095499419843e-07, + "loss": 0.768, + "step": 5714 + }, + { + "epoch": 0.87, + "grad_norm": 2.623041787253728, + "learning_rate": 8.110526576434386e-07, + "loss": 0.8018, + "step": 5715 + }, + { + "epoch": 0.87, + "grad_norm": 2.361320310110989, + "learning_rate": 8.090980237304369e-07, + "loss": 0.693, + "step": 5716 + }, + { + "epoch": 0.88, + "grad_norm": 2.53061217832916, + "learning_rate": 8.071456486833729e-07, + "loss": 0.7252, + "step": 5717 + }, + { + "epoch": 0.88, + "grad_norm": 2.7445462678102412, + "learning_rate": 8.051955329820849e-07, + "loss": 0.8299, + "step": 5718 + }, + { + "epoch": 0.88, + "grad_norm": 2.678869645619878, + "learning_rate": 8.032476771058572e-07, + "loss": 0.779, + "step": 5719 + }, + { + "epoch": 0.88, + "grad_norm": 2.597783234585979, + "learning_rate": 8.013020815334182e-07, + "loss": 0.7771, + "step": 5720 + }, + { + "epoch": 0.88, + "grad_norm": 2.5709764296981987, + "learning_rate": 7.993587467429387e-07, + "loss": 0.6913, + "step": 5721 + }, + { + "epoch": 0.88, + "grad_norm": 2.8878615808643717, + "learning_rate": 7.974176732120386e-07, + "loss": 0.8386, + "step": 5722 + }, + { + "epoch": 0.88, + "grad_norm": 2.4739802793848904, + "learning_rate": 7.954788614177789e-07, + "loss": 0.7136, + "step": 5723 + }, + { + "epoch": 0.88, + "grad_norm": 2.5254962861324204, + "learning_rate": 7.935423118366625e-07, + "loss": 0.7467, + "step": 5724 + }, + { + "epoch": 0.88, + "grad_norm": 2.59825821177977, + "learning_rate": 7.916080249446434e-07, + "loss": 0.749, + "step": 5725 + }, + { + "epoch": 0.88, + "grad_norm": 2.6911630767002617, + "learning_rate": 7.896760012171101e-07, + "loss": 0.8131, + "step": 5726 + }, + { + "epoch": 0.88, + "grad_norm": 2.7269735173965386, + "learning_rate": 7.87746241128905e-07, + "loss": 0.8272, + "step": 5727 + }, + { + "epoch": 0.88, + "grad_norm": 2.655547149168662, + "learning_rate": 7.858187451543064e-07, + "loss": 0.7408, + "step": 5728 + }, + { + "epoch": 0.88, + "grad_norm": 2.7887679312481746, + "learning_rate": 7.838935137670378e-07, + "loss": 0.8403, + "step": 5729 + }, + { + "epoch": 0.88, + "grad_norm": 2.8238335497241875, + "learning_rate": 7.819705474402695e-07, + "loss": 0.8438, + "step": 5730 + }, + { + "epoch": 0.88, + "grad_norm": 2.7864697800450675, + "learning_rate": 7.800498466466099e-07, + "loss": 0.7503, + "step": 5731 + }, + { + "epoch": 0.88, + "grad_norm": 2.9423775361485385, + "learning_rate": 7.781314118581141e-07, + "loss": 0.6889, + "step": 5732 + }, + { + "epoch": 0.88, + "grad_norm": 2.6177300541736197, + "learning_rate": 7.762152435462821e-07, + "loss": 0.7418, + "step": 5733 + }, + { + "epoch": 0.88, + "grad_norm": 2.5682224575904224, + "learning_rate": 7.743013421820522e-07, + "loss": 0.7338, + "step": 5734 + }, + { + "epoch": 0.88, + "grad_norm": 2.730093806909977, + "learning_rate": 7.723897082358067e-07, + "loss": 0.7525, + "step": 5735 + }, + { + "epoch": 0.88, + "grad_norm": 2.773570026247756, + "learning_rate": 7.704803421773743e-07, + "loss": 0.8018, + "step": 5736 + }, + { + "epoch": 0.88, + "grad_norm": 2.8167438145337145, + "learning_rate": 7.685732444760197e-07, + "loss": 0.749, + "step": 5737 + }, + { + "epoch": 0.88, + "grad_norm": 2.8460419034294295, + "learning_rate": 7.666684156004589e-07, + "loss": 0.7815, + "step": 5738 + }, + { + "epoch": 0.88, + "grad_norm": 2.581997146323194, + "learning_rate": 7.647658560188431e-07, + "loss": 0.6945, + "step": 5739 + }, + { + "epoch": 0.88, + "grad_norm": 2.6421514833054456, + "learning_rate": 7.628655661987661e-07, + "loss": 0.8141, + "step": 5740 + }, + { + "epoch": 0.88, + "grad_norm": 2.500537719209546, + "learning_rate": 7.609675466072719e-07, + "loss": 0.7023, + "step": 5741 + }, + { + "epoch": 0.88, + "grad_norm": 2.4936945926955283, + "learning_rate": 7.590717977108342e-07, + "loss": 0.7819, + "step": 5742 + }, + { + "epoch": 0.88, + "grad_norm": 2.59145708475709, + "learning_rate": 7.571783199753746e-07, + "loss": 0.7877, + "step": 5743 + }, + { + "epoch": 0.88, + "grad_norm": 2.8791756410563267, + "learning_rate": 7.552871138662621e-07, + "loss": 0.8033, + "step": 5744 + }, + { + "epoch": 0.88, + "grad_norm": 2.410861373680256, + "learning_rate": 7.53398179848297e-07, + "loss": 0.6554, + "step": 5745 + }, + { + "epoch": 0.88, + "grad_norm": 2.6788493966132716, + "learning_rate": 7.515115183857302e-07, + "loss": 0.8059, + "step": 5746 + }, + { + "epoch": 0.88, + "grad_norm": 2.509424351655514, + "learning_rate": 7.496271299422498e-07, + "loss": 0.6685, + "step": 5747 + }, + { + "epoch": 0.88, + "grad_norm": 2.487752174672153, + "learning_rate": 7.477450149809818e-07, + "loss": 0.6712, + "step": 5748 + }, + { + "epoch": 0.88, + "grad_norm": 2.8476622128192064, + "learning_rate": 7.458651739645017e-07, + "loss": 0.8459, + "step": 5749 + }, + { + "epoch": 0.88, + "grad_norm": 2.559185180461762, + "learning_rate": 7.439876073548192e-07, + "loss": 0.6805, + "step": 5750 + }, + { + "epoch": 0.88, + "grad_norm": 2.459377863705062, + "learning_rate": 7.421123156133869e-07, + "loss": 0.7042, + "step": 5751 + }, + { + "epoch": 0.88, + "grad_norm": 2.4854631564887613, + "learning_rate": 7.402392992011032e-07, + "loss": 0.7098, + "step": 5752 + }, + { + "epoch": 0.88, + "grad_norm": 2.772999848604892, + "learning_rate": 7.383685585782985e-07, + "loss": 0.8132, + "step": 5753 + }, + { + "epoch": 0.88, + "grad_norm": 3.296150380444605, + "learning_rate": 7.365000942047506e-07, + "loss": 0.8222, + "step": 5754 + }, + { + "epoch": 0.88, + "grad_norm": 3.083231204527326, + "learning_rate": 7.346339065396746e-07, + "loss": 0.8012, + "step": 5755 + }, + { + "epoch": 0.88, + "grad_norm": 2.4491012331318753, + "learning_rate": 7.327699960417256e-07, + "loss": 0.6899, + "step": 5756 + }, + { + "epoch": 0.88, + "grad_norm": 2.7249084898062574, + "learning_rate": 7.30908363169005e-07, + "loss": 0.6925, + "step": 5757 + }, + { + "epoch": 0.88, + "grad_norm": 2.6967437072752967, + "learning_rate": 7.290490083790458e-07, + "loss": 0.7866, + "step": 5758 + }, + { + "epoch": 0.88, + "grad_norm": 2.556881939811135, + "learning_rate": 7.271919321288268e-07, + "loss": 0.7139, + "step": 5759 + }, + { + "epoch": 0.88, + "grad_norm": 2.815172538512507, + "learning_rate": 7.253371348747662e-07, + "loss": 0.8048, + "step": 5760 + }, + { + "epoch": 0.88, + "grad_norm": 2.638440578512759, + "learning_rate": 7.234846170727194e-07, + "loss": 0.7051, + "step": 5761 + }, + { + "epoch": 0.88, + "grad_norm": 2.6966172220810654, + "learning_rate": 7.216343791779834e-07, + "loss": 0.7231, + "step": 5762 + }, + { + "epoch": 0.88, + "grad_norm": 2.5534587002684463, + "learning_rate": 7.197864216452965e-07, + "loss": 0.7466, + "step": 5763 + }, + { + "epoch": 0.88, + "grad_norm": 2.7849363578109165, + "learning_rate": 7.179407449288344e-07, + "loss": 0.7409, + "step": 5764 + }, + { + "epoch": 0.88, + "grad_norm": 2.544837759440299, + "learning_rate": 7.160973494822121e-07, + "loss": 0.6465, + "step": 5765 + }, + { + "epoch": 0.88, + "grad_norm": 2.7231069748329695, + "learning_rate": 7.142562357584836e-07, + "loss": 0.7866, + "step": 5766 + }, + { + "epoch": 0.88, + "grad_norm": 2.546310544712134, + "learning_rate": 7.124174042101428e-07, + "loss": 0.7346, + "step": 5767 + }, + { + "epoch": 0.88, + "grad_norm": 3.0119781270597197, + "learning_rate": 7.105808552891258e-07, + "loss": 0.8738, + "step": 5768 + }, + { + "epoch": 0.88, + "grad_norm": 2.588257545636272, + "learning_rate": 7.087465894468037e-07, + "loss": 0.7377, + "step": 5769 + }, + { + "epoch": 0.88, + "grad_norm": 2.6439528897738858, + "learning_rate": 7.069146071339839e-07, + "loss": 0.8051, + "step": 5770 + }, + { + "epoch": 0.88, + "grad_norm": 2.7572845518844895, + "learning_rate": 7.050849088009216e-07, + "loss": 0.6755, + "step": 5771 + }, + { + "epoch": 0.88, + "grad_norm": 3.5912309063614627, + "learning_rate": 7.032574948973037e-07, + "loss": 0.7551, + "step": 5772 + }, + { + "epoch": 0.88, + "grad_norm": 2.710962518663575, + "learning_rate": 7.014323658722544e-07, + "loss": 0.7734, + "step": 5773 + }, + { + "epoch": 0.88, + "grad_norm": 2.6644565985000748, + "learning_rate": 6.996095221743426e-07, + "loss": 0.7861, + "step": 5774 + }, + { + "epoch": 0.88, + "grad_norm": 2.7841204191393847, + "learning_rate": 6.977889642515711e-07, + "loss": 0.8666, + "step": 5775 + }, + { + "epoch": 0.88, + "grad_norm": 2.4809075380037533, + "learning_rate": 6.959706925513832e-07, + "loss": 0.6904, + "step": 5776 + }, + { + "epoch": 0.88, + "grad_norm": 2.6378329404590213, + "learning_rate": 6.941547075206567e-07, + "loss": 0.7582, + "step": 5777 + }, + { + "epoch": 0.88, + "grad_norm": 2.4058948024512397, + "learning_rate": 6.923410096057093e-07, + "loss": 0.7166, + "step": 5778 + }, + { + "epoch": 0.88, + "grad_norm": 2.7027364478522724, + "learning_rate": 6.905295992522998e-07, + "loss": 0.6934, + "step": 5779 + }, + { + "epoch": 0.88, + "grad_norm": 2.8777912482750647, + "learning_rate": 6.887204769056221e-07, + "loss": 0.8297, + "step": 5780 + }, + { + "epoch": 0.88, + "grad_norm": 3.0010290591989044, + "learning_rate": 6.86913643010304e-07, + "loss": 0.8016, + "step": 5781 + }, + { + "epoch": 0.89, + "grad_norm": 2.8797253529185736, + "learning_rate": 6.851090980104191e-07, + "loss": 0.7898, + "step": 5782 + }, + { + "epoch": 0.89, + "grad_norm": 2.8338261112987286, + "learning_rate": 6.833068423494727e-07, + "loss": 0.8343, + "step": 5783 + }, + { + "epoch": 0.89, + "grad_norm": 2.6323910473659247, + "learning_rate": 6.815068764704047e-07, + "loss": 0.8172, + "step": 5784 + }, + { + "epoch": 0.89, + "grad_norm": 2.596841816940055, + "learning_rate": 6.797092008156026e-07, + "loss": 0.7403, + "step": 5785 + }, + { + "epoch": 0.89, + "grad_norm": 2.738457357878896, + "learning_rate": 6.779138158268806e-07, + "loss": 0.6369, + "step": 5786 + }, + { + "epoch": 0.89, + "grad_norm": 2.915690322939979, + "learning_rate": 6.761207219454957e-07, + "loss": 0.8547, + "step": 5787 + }, + { + "epoch": 0.89, + "grad_norm": 2.537974599329801, + "learning_rate": 6.743299196121389e-07, + "loss": 0.7582, + "step": 5788 + }, + { + "epoch": 0.89, + "grad_norm": 2.707714310242144, + "learning_rate": 6.725414092669391e-07, + "loss": 0.7383, + "step": 5789 + }, + { + "epoch": 0.89, + "grad_norm": 2.5190834121324293, + "learning_rate": 6.707551913494626e-07, + "loss": 0.7306, + "step": 5790 + }, + { + "epoch": 0.89, + "grad_norm": 2.8560257151264334, + "learning_rate": 6.689712662987124e-07, + "loss": 0.7703, + "step": 5791 + }, + { + "epoch": 0.89, + "grad_norm": 2.6256493925464923, + "learning_rate": 6.671896345531248e-07, + "loss": 0.7432, + "step": 5792 + }, + { + "epoch": 0.89, + "grad_norm": 2.4196845841221815, + "learning_rate": 6.654102965505782e-07, + "loss": 0.6261, + "step": 5793 + }, + { + "epoch": 0.89, + "grad_norm": 2.6043213927859536, + "learning_rate": 6.636332527283817e-07, + "loss": 0.7592, + "step": 5794 + }, + { + "epoch": 0.89, + "grad_norm": 2.3460123501037726, + "learning_rate": 6.618585035232828e-07, + "loss": 0.6926, + "step": 5795 + }, + { + "epoch": 0.89, + "grad_norm": 2.5889018853522097, + "learning_rate": 6.600860493714667e-07, + "loss": 0.7117, + "step": 5796 + }, + { + "epoch": 0.89, + "grad_norm": 2.515917583771837, + "learning_rate": 6.583158907085518e-07, + "loss": 0.6722, + "step": 5797 + }, + { + "epoch": 0.89, + "grad_norm": 2.7197011832215168, + "learning_rate": 6.565480279695946e-07, + "loss": 0.7158, + "step": 5798 + }, + { + "epoch": 0.89, + "grad_norm": 3.229504657309361, + "learning_rate": 6.547824615890841e-07, + "loss": 0.7511, + "step": 5799 + }, + { + "epoch": 0.89, + "grad_norm": 2.5513386297995084, + "learning_rate": 6.530191920009465e-07, + "loss": 0.7745, + "step": 5800 + }, + { + "epoch": 0.89, + "grad_norm": 2.616216804463169, + "learning_rate": 6.512582196385475e-07, + "loss": 0.7391, + "step": 5801 + }, + { + "epoch": 0.89, + "grad_norm": 2.9565292094630533, + "learning_rate": 6.49499544934683e-07, + "loss": 0.7664, + "step": 5802 + }, + { + "epoch": 0.89, + "grad_norm": 2.626493868178572, + "learning_rate": 6.477431683215841e-07, + "loss": 0.7836, + "step": 5803 + }, + { + "epoch": 0.89, + "grad_norm": 2.5789068803703685, + "learning_rate": 6.459890902309218e-07, + "loss": 0.6654, + "step": 5804 + }, + { + "epoch": 0.89, + "grad_norm": 2.8827734647635666, + "learning_rate": 6.442373110937994e-07, + "loss": 0.6883, + "step": 5805 + }, + { + "epoch": 0.89, + "grad_norm": 2.7036197267519055, + "learning_rate": 6.424878313407501e-07, + "loss": 0.6766, + "step": 5806 + }, + { + "epoch": 0.89, + "grad_norm": 2.4623104650551375, + "learning_rate": 6.407406514017534e-07, + "loss": 0.6671, + "step": 5807 + }, + { + "epoch": 0.89, + "grad_norm": 2.441650887017995, + "learning_rate": 6.389957717062145e-07, + "loss": 0.5803, + "step": 5808 + }, + { + "epoch": 0.89, + "grad_norm": 2.72535880083023, + "learning_rate": 6.37253192682975e-07, + "loss": 0.7387, + "step": 5809 + }, + { + "epoch": 0.89, + "grad_norm": 2.531639961170924, + "learning_rate": 6.355129147603134e-07, + "loss": 0.7407, + "step": 5810 + }, + { + "epoch": 0.89, + "grad_norm": 3.1542566998202872, + "learning_rate": 6.337749383659386e-07, + "loss": 0.8718, + "step": 5811 + }, + { + "epoch": 0.89, + "grad_norm": 2.5198596314597026, + "learning_rate": 6.32039263926999e-07, + "loss": 0.694, + "step": 5812 + }, + { + "epoch": 0.89, + "grad_norm": 2.5182991329606703, + "learning_rate": 6.303058918700744e-07, + "loss": 0.7802, + "step": 5813 + }, + { + "epoch": 0.89, + "grad_norm": 2.603245498148873, + "learning_rate": 6.285748226211774e-07, + "loss": 0.7041, + "step": 5814 + }, + { + "epoch": 0.89, + "grad_norm": 2.7378847104911825, + "learning_rate": 6.268460566057599e-07, + "loss": 0.7567, + "step": 5815 + }, + { + "epoch": 0.89, + "grad_norm": 2.539181928795849, + "learning_rate": 6.251195942487009e-07, + "loss": 0.7489, + "step": 5816 + }, + { + "epoch": 0.89, + "grad_norm": 2.6082265092629866, + "learning_rate": 6.233954359743155e-07, + "loss": 0.8378, + "step": 5817 + }, + { + "epoch": 0.89, + "grad_norm": 2.479348939551202, + "learning_rate": 6.216735822063569e-07, + "loss": 0.7271, + "step": 5818 + }, + { + "epoch": 0.89, + "grad_norm": 2.6870083550884587, + "learning_rate": 6.199540333680065e-07, + "loss": 0.7784, + "step": 5819 + }, + { + "epoch": 0.89, + "grad_norm": 2.7374816629494965, + "learning_rate": 6.18236789881882e-07, + "loss": 0.796, + "step": 5820 + }, + { + "epoch": 0.89, + "grad_norm": 3.599437310217699, + "learning_rate": 6.165218521700333e-07, + "loss": 0.8368, + "step": 5821 + }, + { + "epoch": 0.89, + "grad_norm": 2.6962671130641436, + "learning_rate": 6.148092206539425e-07, + "loss": 0.6851, + "step": 5822 + }, + { + "epoch": 0.89, + "grad_norm": 2.546788351836191, + "learning_rate": 6.130988957545281e-07, + "loss": 0.7044, + "step": 5823 + }, + { + "epoch": 0.89, + "grad_norm": 2.4294266150794495, + "learning_rate": 6.113908778921407e-07, + "loss": 0.7229, + "step": 5824 + }, + { + "epoch": 0.89, + "grad_norm": 2.5925856274482286, + "learning_rate": 6.0968516748656e-07, + "loss": 0.7794, + "step": 5825 + }, + { + "epoch": 0.89, + "grad_norm": 2.8770495316561764, + "learning_rate": 6.079817649570052e-07, + "loss": 0.7082, + "step": 5826 + }, + { + "epoch": 0.89, + "grad_norm": 2.799481628204912, + "learning_rate": 6.062806707221236e-07, + "loss": 0.769, + "step": 5827 + }, + { + "epoch": 0.89, + "grad_norm": 2.7666018489574338, + "learning_rate": 6.045818851999952e-07, + "loss": 0.7609, + "step": 5828 + }, + { + "epoch": 0.89, + "grad_norm": 2.753622059951933, + "learning_rate": 6.028854088081359e-07, + "loss": 0.7736, + "step": 5829 + }, + { + "epoch": 0.89, + "grad_norm": 2.5700093823746983, + "learning_rate": 6.011912419634924e-07, + "loss": 0.6826, + "step": 5830 + }, + { + "epoch": 0.89, + "grad_norm": 2.8027455356023525, + "learning_rate": 5.994993850824415e-07, + "loss": 0.8633, + "step": 5831 + }, + { + "epoch": 0.89, + "grad_norm": 3.0498463017469133, + "learning_rate": 5.97809838580794e-07, + "loss": 0.7948, + "step": 5832 + }, + { + "epoch": 0.89, + "grad_norm": 2.787764168001535, + "learning_rate": 5.961226028737932e-07, + "loss": 0.7494, + "step": 5833 + }, + { + "epoch": 0.89, + "grad_norm": 3.0044287571788915, + "learning_rate": 5.944376783761164e-07, + "loss": 0.7201, + "step": 5834 + }, + { + "epoch": 0.89, + "grad_norm": 2.6799421290682717, + "learning_rate": 5.927550655018699e-07, + "loss": 0.7481, + "step": 5835 + }, + { + "epoch": 0.89, + "grad_norm": 2.7264929525021024, + "learning_rate": 5.910747646645898e-07, + "loss": 0.8224, + "step": 5836 + }, + { + "epoch": 0.89, + "grad_norm": 2.545838928841274, + "learning_rate": 5.893967762772512e-07, + "loss": 0.7329, + "step": 5837 + }, + { + "epoch": 0.89, + "grad_norm": 2.6500564552598376, + "learning_rate": 5.877211007522555e-07, + "loss": 0.7454, + "step": 5838 + }, + { + "epoch": 0.89, + "grad_norm": 2.65799984455843, + "learning_rate": 5.86047738501433e-07, + "loss": 0.7239, + "step": 5839 + }, + { + "epoch": 0.89, + "grad_norm": 2.6193677259589876, + "learning_rate": 5.843766899360547e-07, + "loss": 0.8434, + "step": 5840 + }, + { + "epoch": 0.89, + "grad_norm": 2.3599943054612464, + "learning_rate": 5.827079554668147e-07, + "loss": 0.7064, + "step": 5841 + }, + { + "epoch": 0.89, + "grad_norm": 2.5828320727422587, + "learning_rate": 5.810415355038413e-07, + "loss": 0.7733, + "step": 5842 + }, + { + "epoch": 0.89, + "grad_norm": 2.954381968066485, + "learning_rate": 5.793774304566946e-07, + "loss": 0.8062, + "step": 5843 + }, + { + "epoch": 0.89, + "grad_norm": 2.5084137036333116, + "learning_rate": 5.777156407343621e-07, + "loss": 0.8645, + "step": 5844 + }, + { + "epoch": 0.89, + "grad_norm": 2.6051884694422123, + "learning_rate": 5.76056166745268e-07, + "loss": 0.8177, + "step": 5845 + }, + { + "epoch": 0.89, + "grad_norm": 2.6013475332208693, + "learning_rate": 5.74399008897265e-07, + "loss": 0.7638, + "step": 5846 + }, + { + "epoch": 0.89, + "grad_norm": 2.5731747532030598, + "learning_rate": 5.72744167597632e-07, + "loss": 0.7762, + "step": 5847 + }, + { + "epoch": 0.9, + "grad_norm": 2.6559945677786647, + "learning_rate": 5.710916432530877e-07, + "loss": 0.7291, + "step": 5848 + }, + { + "epoch": 0.9, + "grad_norm": 2.575724733370775, + "learning_rate": 5.694414362697742e-07, + "loss": 0.8015, + "step": 5849 + }, + { + "epoch": 0.9, + "grad_norm": 2.6141726668382512, + "learning_rate": 5.677935470532636e-07, + "loss": 0.7739, + "step": 5850 + }, + { + "epoch": 0.9, + "grad_norm": 2.869045509165927, + "learning_rate": 5.661479760085642e-07, + "loss": 0.8365, + "step": 5851 + }, + { + "epoch": 0.9, + "grad_norm": 2.579470374575014, + "learning_rate": 5.645047235401091e-07, + "loss": 0.6617, + "step": 5852 + }, + { + "epoch": 0.9, + "grad_norm": 2.665435683114741, + "learning_rate": 5.628637900517652e-07, + "loss": 0.7943, + "step": 5853 + }, + { + "epoch": 0.9, + "grad_norm": 2.737771133863642, + "learning_rate": 5.612251759468301e-07, + "loss": 0.7013, + "step": 5854 + }, + { + "epoch": 0.9, + "grad_norm": 2.6553279117918236, + "learning_rate": 5.595888816280226e-07, + "loss": 0.8019, + "step": 5855 + }, + { + "epoch": 0.9, + "grad_norm": 2.523369735721981, + "learning_rate": 5.579549074975032e-07, + "loss": 0.6509, + "step": 5856 + }, + { + "epoch": 0.9, + "grad_norm": 5.0010206491405205, + "learning_rate": 5.563232539568553e-07, + "loss": 0.8583, + "step": 5857 + }, + { + "epoch": 0.9, + "grad_norm": 2.5704308172659847, + "learning_rate": 5.546939214070923e-07, + "loss": 0.7319, + "step": 5858 + }, + { + "epoch": 0.9, + "grad_norm": 2.9387864037827605, + "learning_rate": 5.530669102486619e-07, + "loss": 0.8983, + "step": 5859 + }, + { + "epoch": 0.9, + "grad_norm": 2.934935762272051, + "learning_rate": 5.514422208814352e-07, + "loss": 0.6828, + "step": 5860 + }, + { + "epoch": 0.9, + "grad_norm": 2.334253343305994, + "learning_rate": 5.49819853704715e-07, + "loss": 0.7281, + "step": 5861 + }, + { + "epoch": 0.9, + "grad_norm": 2.8362699742929722, + "learning_rate": 5.481998091172358e-07, + "loss": 0.7761, + "step": 5862 + }, + { + "epoch": 0.9, + "grad_norm": 2.5752370173710646, + "learning_rate": 5.465820875171557e-07, + "loss": 0.7658, + "step": 5863 + }, + { + "epoch": 0.9, + "grad_norm": 3.7342653154702, + "learning_rate": 5.4496668930207e-07, + "loss": 0.818, + "step": 5864 + }, + { + "epoch": 0.9, + "grad_norm": 2.625860994123965, + "learning_rate": 5.433536148689944e-07, + "loss": 0.7583, + "step": 5865 + }, + { + "epoch": 0.9, + "grad_norm": 2.774371794317327, + "learning_rate": 5.417428646143797e-07, + "loss": 0.773, + "step": 5866 + }, + { + "epoch": 0.9, + "grad_norm": 2.7174080154169933, + "learning_rate": 5.401344389341013e-07, + "loss": 0.7942, + "step": 5867 + }, + { + "epoch": 0.9, + "grad_norm": 2.928065420008455, + "learning_rate": 5.385283382234674e-07, + "loss": 0.7364, + "step": 5868 + }, + { + "epoch": 0.9, + "grad_norm": 3.071811201597634, + "learning_rate": 5.369245628772079e-07, + "loss": 0.72, + "step": 5869 + }, + { + "epoch": 0.9, + "grad_norm": 2.608659449286244, + "learning_rate": 5.35323113289491e-07, + "loss": 0.6901, + "step": 5870 + }, + { + "epoch": 0.9, + "grad_norm": 2.684645889963231, + "learning_rate": 5.337239898539071e-07, + "loss": 0.7628, + "step": 5871 + }, + { + "epoch": 0.9, + "grad_norm": 2.6352813074432464, + "learning_rate": 5.321271929634719e-07, + "loss": 0.8173, + "step": 5872 + }, + { + "epoch": 0.9, + "grad_norm": 2.590851206610053, + "learning_rate": 5.305327230106383e-07, + "loss": 0.6468, + "step": 5873 + }, + { + "epoch": 0.9, + "grad_norm": 2.752933732434142, + "learning_rate": 5.289405803872782e-07, + "loss": 0.7054, + "step": 5874 + }, + { + "epoch": 0.9, + "grad_norm": 3.2451802406056287, + "learning_rate": 5.273507654846999e-07, + "loss": 0.7577, + "step": 5875 + }, + { + "epoch": 0.9, + "grad_norm": 2.853764991107126, + "learning_rate": 5.257632786936328e-07, + "loss": 0.7259, + "step": 5876 + }, + { + "epoch": 0.9, + "grad_norm": 2.4914983750520885, + "learning_rate": 5.241781204042362e-07, + "loss": 0.6737, + "step": 5877 + }, + { + "epoch": 0.9, + "grad_norm": 3.05209228111768, + "learning_rate": 5.225952910060994e-07, + "loss": 0.7027, + "step": 5878 + }, + { + "epoch": 0.9, + "grad_norm": 2.621867794165036, + "learning_rate": 5.210147908882357e-07, + "loss": 0.7497, + "step": 5879 + }, + { + "epoch": 0.9, + "grad_norm": 2.922865468524496, + "learning_rate": 5.194366204390867e-07, + "loss": 0.8475, + "step": 5880 + }, + { + "epoch": 0.9, + "grad_norm": 2.5874957343095515, + "learning_rate": 5.178607800465252e-07, + "loss": 0.6427, + "step": 5881 + }, + { + "epoch": 0.9, + "grad_norm": 2.825060543684903, + "learning_rate": 5.162872700978483e-07, + "loss": 0.762, + "step": 5882 + }, + { + "epoch": 0.9, + "grad_norm": 2.9238862189206114, + "learning_rate": 5.147160909797777e-07, + "loss": 0.8199, + "step": 5883 + }, + { + "epoch": 0.9, + "grad_norm": 3.1311383501295964, + "learning_rate": 5.13147243078469e-07, + "loss": 0.8092, + "step": 5884 + }, + { + "epoch": 0.9, + "grad_norm": 2.8516286192122204, + "learning_rate": 5.11580726779497e-07, + "loss": 0.7055, + "step": 5885 + }, + { + "epoch": 0.9, + "grad_norm": 2.597901538018006, + "learning_rate": 5.100165424678715e-07, + "loss": 0.7527, + "step": 5886 + }, + { + "epoch": 0.9, + "grad_norm": 2.8139780948798947, + "learning_rate": 5.08454690528023e-07, + "loss": 0.7478, + "step": 5887 + }, + { + "epoch": 0.9, + "grad_norm": 2.434325694543393, + "learning_rate": 5.06895171343812e-07, + "loss": 0.7304, + "step": 5888 + }, + { + "epoch": 0.9, + "grad_norm": 2.4674361312600506, + "learning_rate": 5.05337985298523e-07, + "loss": 0.7271, + "step": 5889 + }, + { + "epoch": 0.9, + "grad_norm": 2.642364749860154, + "learning_rate": 5.037831327748699e-07, + "loss": 0.7311, + "step": 5890 + }, + { + "epoch": 0.9, + "grad_norm": 2.688287039097314, + "learning_rate": 5.022306141549893e-07, + "loss": 0.7586, + "step": 5891 + }, + { + "epoch": 0.9, + "grad_norm": 2.4741994629105313, + "learning_rate": 5.006804298204515e-07, + "loss": 0.7872, + "step": 5892 + }, + { + "epoch": 0.9, + "grad_norm": 4.854934089901339, + "learning_rate": 4.991325801522429e-07, + "loss": 0.6969, + "step": 5893 + }, + { + "epoch": 0.9, + "grad_norm": 3.41087337300248, + "learning_rate": 4.975870655307868e-07, + "loss": 0.7949, + "step": 5894 + }, + { + "epoch": 0.9, + "grad_norm": 2.953241328716546, + "learning_rate": 4.96043886335924e-07, + "loss": 0.7946, + "step": 5895 + }, + { + "epoch": 0.9, + "grad_norm": 2.8049058507014983, + "learning_rate": 4.945030429469244e-07, + "loss": 0.7912, + "step": 5896 + }, + { + "epoch": 0.9, + "grad_norm": 2.595414400806504, + "learning_rate": 4.929645357424862e-07, + "loss": 0.7498, + "step": 5897 + }, + { + "epoch": 0.9, + "grad_norm": 2.745635922213741, + "learning_rate": 4.914283651007312e-07, + "loss": 0.7651, + "step": 5898 + }, + { + "epoch": 0.9, + "grad_norm": 2.982896718226123, + "learning_rate": 4.898945313992054e-07, + "loss": 0.7171, + "step": 5899 + }, + { + "epoch": 0.9, + "grad_norm": 2.6219179763701024, + "learning_rate": 4.883630350148827e-07, + "loss": 0.737, + "step": 5900 + }, + { + "epoch": 0.9, + "grad_norm": 2.602707647768999, + "learning_rate": 4.868338763241631e-07, + "loss": 0.7237, + "step": 5901 + }, + { + "epoch": 0.9, + "grad_norm": 2.6003569589861706, + "learning_rate": 4.853070557028672e-07, + "loss": 0.7409, + "step": 5902 + }, + { + "epoch": 0.9, + "grad_norm": 2.5458129925277726, + "learning_rate": 4.837825735262503e-07, + "loss": 0.7459, + "step": 5903 + }, + { + "epoch": 0.9, + "grad_norm": 2.758220418720208, + "learning_rate": 4.822604301689826e-07, + "loss": 0.8263, + "step": 5904 + }, + { + "epoch": 0.9, + "grad_norm": 2.79996513102502, + "learning_rate": 4.807406260051672e-07, + "loss": 0.848, + "step": 5905 + }, + { + "epoch": 0.9, + "grad_norm": 2.7388566308633995, + "learning_rate": 4.792231614083287e-07, + "loss": 0.7703, + "step": 5906 + }, + { + "epoch": 0.9, + "grad_norm": 2.704544888648876, + "learning_rate": 4.777080367514153e-07, + "loss": 0.8292, + "step": 5907 + }, + { + "epoch": 0.9, + "grad_norm": 2.7338348905418997, + "learning_rate": 4.7619525240680475e-07, + "loss": 0.7844, + "step": 5908 + }, + { + "epoch": 0.9, + "grad_norm": 2.545428147432849, + "learning_rate": 4.746848087462963e-07, + "loss": 0.7442, + "step": 5909 + }, + { + "epoch": 0.9, + "grad_norm": 3.751373583389988, + "learning_rate": 4.731767061411141e-07, + "loss": 0.7968, + "step": 5910 + }, + { + "epoch": 0.9, + "grad_norm": 2.632331818799675, + "learning_rate": 4.716709449619084e-07, + "loss": 0.8126, + "step": 5911 + }, + { + "epoch": 0.9, + "grad_norm": 2.397912522997727, + "learning_rate": 4.701675255787519e-07, + "loss": 0.7333, + "step": 5912 + }, + { + "epoch": 0.91, + "grad_norm": 2.8191668230435645, + "learning_rate": 4.686664483611425e-07, + "loss": 0.6911, + "step": 5913 + }, + { + "epoch": 0.91, + "grad_norm": 2.5749939501751276, + "learning_rate": 4.6716771367800507e-07, + "loss": 0.6782, + "step": 5914 + }, + { + "epoch": 0.91, + "grad_norm": 2.6443502932254215, + "learning_rate": 4.656713218976838e-07, + "loss": 0.6271, + "step": 5915 + }, + { + "epoch": 0.91, + "grad_norm": 3.0452845496714342, + "learning_rate": 4.641772733879535e-07, + "loss": 0.8101, + "step": 5916 + }, + { + "epoch": 0.91, + "grad_norm": 3.292735176357161, + "learning_rate": 4.626855685160059e-07, + "loss": 0.8173, + "step": 5917 + }, + { + "epoch": 0.91, + "grad_norm": 2.5951150490455013, + "learning_rate": 4.611962076484611e-07, + "loss": 0.7186, + "step": 5918 + }, + { + "epoch": 0.91, + "grad_norm": 3.0111442088519453, + "learning_rate": 4.5970919115136406e-07, + "loss": 0.7697, + "step": 5919 + }, + { + "epoch": 0.91, + "grad_norm": 2.5973671680062185, + "learning_rate": 4.582245193901802e-07, + "loss": 0.7933, + "step": 5920 + }, + { + "epoch": 0.91, + "grad_norm": 2.475572384849162, + "learning_rate": 4.567421927297999e-07, + "loss": 0.6779, + "step": 5921 + }, + { + "epoch": 0.91, + "grad_norm": 2.4497504442552986, + "learning_rate": 4.5526221153453845e-07, + "loss": 0.7951, + "step": 5922 + }, + { + "epoch": 0.91, + "grad_norm": 3.2379051155154412, + "learning_rate": 4.5378457616813255e-07, + "loss": 0.8106, + "step": 5923 + }, + { + "epoch": 0.91, + "grad_norm": 3.0168271076344393, + "learning_rate": 4.52309286993744e-07, + "loss": 0.8453, + "step": 5924 + }, + { + "epoch": 0.91, + "grad_norm": 2.5571350055712943, + "learning_rate": 4.508363443739583e-07, + "loss": 0.7459, + "step": 5925 + }, + { + "epoch": 0.91, + "grad_norm": 2.422313233053986, + "learning_rate": 4.493657486707814e-07, + "loss": 0.8195, + "step": 5926 + }, + { + "epoch": 0.91, + "grad_norm": 2.6614041261222745, + "learning_rate": 4.478975002456465e-07, + "loss": 0.8475, + "step": 5927 + }, + { + "epoch": 0.91, + "grad_norm": 2.5459306588993655, + "learning_rate": 4.4643159945940816e-07, + "loss": 0.7793, + "step": 5928 + }, + { + "epoch": 0.91, + "grad_norm": 2.638010072274686, + "learning_rate": 4.449680466723416e-07, + "loss": 0.7412, + "step": 5929 + }, + { + "epoch": 0.91, + "grad_norm": 2.6374186218730866, + "learning_rate": 4.435068422441491e-07, + "loss": 0.7863, + "step": 5930 + }, + { + "epoch": 0.91, + "grad_norm": 3.0473504441514527, + "learning_rate": 4.4204798653395334e-07, + "loss": 0.724, + "step": 5931 + }, + { + "epoch": 0.91, + "grad_norm": 2.880335924375563, + "learning_rate": 4.405914799002997e-07, + "loss": 0.8654, + "step": 5932 + }, + { + "epoch": 0.91, + "grad_norm": 2.5988830192409593, + "learning_rate": 4.391373227011564e-07, + "loss": 0.8395, + "step": 5933 + }, + { + "epoch": 0.91, + "grad_norm": 2.8005912873580217, + "learning_rate": 4.376855152939152e-07, + "loss": 0.8489, + "step": 5934 + }, + { + "epoch": 0.91, + "grad_norm": 2.5327179535187696, + "learning_rate": 4.362360580353875e-07, + "loss": 0.7765, + "step": 5935 + }, + { + "epoch": 0.91, + "grad_norm": 3.110555066156692, + "learning_rate": 4.347889512818115e-07, + "loss": 0.7865, + "step": 5936 + }, + { + "epoch": 0.91, + "grad_norm": 2.425327650857889, + "learning_rate": 4.33344195388844e-07, + "loss": 0.7973, + "step": 5937 + }, + { + "epoch": 0.91, + "grad_norm": 2.484576397445717, + "learning_rate": 4.319017907115686e-07, + "loss": 0.6719, + "step": 5938 + }, + { + "epoch": 0.91, + "grad_norm": 2.7141696380110596, + "learning_rate": 4.3046173760448507e-07, + "loss": 0.7373, + "step": 5939 + }, + { + "epoch": 0.91, + "grad_norm": 2.6212725554013523, + "learning_rate": 4.2902403642151704e-07, + "loss": 0.6919, + "step": 5940 + }, + { + "epoch": 0.91, + "grad_norm": 2.622753613256637, + "learning_rate": 4.27588687516014e-07, + "loss": 0.7129, + "step": 5941 + }, + { + "epoch": 0.91, + "grad_norm": 2.7304264084363306, + "learning_rate": 4.2615569124074385e-07, + "loss": 0.7973, + "step": 5942 + }, + { + "epoch": 0.91, + "grad_norm": 2.6718605017803423, + "learning_rate": 4.2472504794789593e-07, + "loss": 0.764, + "step": 5943 + }, + { + "epoch": 0.91, + "grad_norm": 3.0322436728986784, + "learning_rate": 4.232967579890823e-07, + "loss": 0.8199, + "step": 5944 + }, + { + "epoch": 0.91, + "grad_norm": 2.7072193959741755, + "learning_rate": 4.2187082171533665e-07, + "loss": 0.8041, + "step": 5945 + }, + { + "epoch": 0.91, + "grad_norm": 2.8810435881483913, + "learning_rate": 4.204472394771142e-07, + "loss": 0.7923, + "step": 5946 + }, + { + "epoch": 0.91, + "grad_norm": 2.8209460055158027, + "learning_rate": 4.190260116242917e-07, + "loss": 0.5896, + "step": 5947 + }, + { + "epoch": 0.91, + "grad_norm": 2.612444919294644, + "learning_rate": 4.176071385061664e-07, + "loss": 0.7208, + "step": 5948 + }, + { + "epoch": 0.91, + "grad_norm": 2.5928435503810485, + "learning_rate": 4.1619062047145943e-07, + "loss": 0.735, + "step": 5949 + }, + { + "epoch": 0.91, + "grad_norm": 2.7103087781610578, + "learning_rate": 4.1477645786831e-07, + "loss": 0.7629, + "step": 5950 + }, + { + "epoch": 0.91, + "grad_norm": 2.55167539583218, + "learning_rate": 4.1336465104427793e-07, + "loss": 0.6997, + "step": 5951 + }, + { + "epoch": 0.91, + "grad_norm": 2.608092717551468, + "learning_rate": 4.1195520034634896e-07, + "loss": 0.7727, + "step": 5952 + }, + { + "epoch": 0.91, + "grad_norm": 2.578863079954882, + "learning_rate": 4.105481061209249e-07, + "loss": 0.7717, + "step": 5953 + }, + { + "epoch": 0.91, + "grad_norm": 2.6419007520560576, + "learning_rate": 4.091433687138291e-07, + "loss": 0.7056, + "step": 5954 + }, + { + "epoch": 0.91, + "grad_norm": 2.674006217875925, + "learning_rate": 4.0774098847030875e-07, + "loss": 0.8579, + "step": 5955 + }, + { + "epoch": 0.91, + "grad_norm": 2.6653185857072184, + "learning_rate": 4.063409657350281e-07, + "loss": 0.7532, + "step": 5956 + }, + { + "epoch": 0.91, + "grad_norm": 2.695677170700721, + "learning_rate": 4.0494330085207314e-07, + "loss": 0.7022, + "step": 5957 + }, + { + "epoch": 0.91, + "grad_norm": 2.7217815025546916, + "learning_rate": 4.0354799416495227e-07, + "loss": 0.7414, + "step": 5958 + }, + { + "epoch": 0.91, + "grad_norm": 2.6230826008731913, + "learning_rate": 4.0215504601659017e-07, + "loss": 0.7257, + "step": 5959 + }, + { + "epoch": 0.91, + "grad_norm": 3.201265139540046, + "learning_rate": 4.007644567493374e-07, + "loss": 0.8023, + "step": 5960 + }, + { + "epoch": 0.91, + "grad_norm": 2.534269235714635, + "learning_rate": 3.993762267049606e-07, + "loss": 0.6844, + "step": 5961 + }, + { + "epoch": 0.91, + "grad_norm": 2.6808601105724374, + "learning_rate": 3.9799035622464674e-07, + "loss": 0.6956, + "step": 5962 + }, + { + "epoch": 0.91, + "grad_norm": 2.581383954042853, + "learning_rate": 3.9660684564900574e-07, + "loss": 0.7696, + "step": 5963 + }, + { + "epoch": 0.91, + "grad_norm": 2.5849069154868634, + "learning_rate": 3.9522569531806556e-07, + "loss": 0.7603, + "step": 5964 + }, + { + "epoch": 0.91, + "grad_norm": 2.768738229693881, + "learning_rate": 3.9384690557127125e-07, + "loss": 0.6432, + "step": 5965 + }, + { + "epoch": 0.91, + "grad_norm": 2.8027221859258016, + "learning_rate": 3.9247047674749625e-07, + "loss": 0.7137, + "step": 5966 + }, + { + "epoch": 0.91, + "grad_norm": 2.6574074923851816, + "learning_rate": 3.9109640918502333e-07, + "loss": 0.7667, + "step": 5967 + }, + { + "epoch": 0.91, + "grad_norm": 2.7951563356356353, + "learning_rate": 3.897247032215601e-07, + "loss": 0.8096, + "step": 5968 + }, + { + "epoch": 0.91, + "grad_norm": 2.5183737997202704, + "learning_rate": 3.883553591942346e-07, + "loss": 0.812, + "step": 5969 + }, + { + "epoch": 0.91, + "grad_norm": 2.8079909667473477, + "learning_rate": 3.8698837743959325e-07, + "loss": 0.7284, + "step": 5970 + }, + { + "epoch": 0.91, + "grad_norm": 2.68043983615785, + "learning_rate": 3.8562375829360286e-07, + "loss": 0.6751, + "step": 5971 + }, + { + "epoch": 0.91, + "grad_norm": 2.7758583003898907, + "learning_rate": 3.8426150209164624e-07, + "loss": 0.7877, + "step": 5972 + }, + { + "epoch": 0.91, + "grad_norm": 2.7937395372284466, + "learning_rate": 3.8290160916852894e-07, + "loss": 0.8075, + "step": 5973 + }, + { + "epoch": 0.91, + "grad_norm": 2.688516227702249, + "learning_rate": 3.8154407985847466e-07, + "loss": 0.7854, + "step": 5974 + }, + { + "epoch": 0.91, + "grad_norm": 2.628765653663377, + "learning_rate": 3.8018891449512654e-07, + "loss": 0.6539, + "step": 5975 + }, + { + "epoch": 0.91, + "grad_norm": 2.7655961176352757, + "learning_rate": 3.788361134115448e-07, + "loss": 0.7628, + "step": 5976 + }, + { + "epoch": 0.91, + "grad_norm": 2.6225969425848907, + "learning_rate": 3.774856769402113e-07, + "loss": 0.7296, + "step": 5977 + }, + { + "epoch": 0.92, + "grad_norm": 2.691373260121956, + "learning_rate": 3.7613760541302504e-07, + "loss": 0.7735, + "step": 5978 + }, + { + "epoch": 0.92, + "grad_norm": 2.5287047223034285, + "learning_rate": 3.747918991613031e-07, + "loss": 0.7892, + "step": 5979 + }, + { + "epoch": 0.92, + "grad_norm": 2.56744426619113, + "learning_rate": 3.734485585157843e-07, + "loss": 0.7774, + "step": 5980 + }, + { + "epoch": 0.92, + "grad_norm": 2.6214997737584773, + "learning_rate": 3.7210758380662125e-07, + "loss": 0.8006, + "step": 5981 + }, + { + "epoch": 0.92, + "grad_norm": 3.8131875526793033, + "learning_rate": 3.707689753633914e-07, + "loss": 0.7807, + "step": 5982 + }, + { + "epoch": 0.92, + "grad_norm": 2.8500936267310752, + "learning_rate": 3.6943273351508604e-07, + "loss": 0.7625, + "step": 5983 + }, + { + "epoch": 0.92, + "grad_norm": 2.6465562873541058, + "learning_rate": 3.680988585901124e-07, + "loss": 0.7722, + "step": 5984 + }, + { + "epoch": 0.92, + "grad_norm": 2.5574726673727084, + "learning_rate": 3.66767350916305e-07, + "loss": 0.8831, + "step": 5985 + }, + { + "epoch": 0.92, + "grad_norm": 2.782848100485891, + "learning_rate": 3.654382108209087e-07, + "loss": 0.73, + "step": 5986 + }, + { + "epoch": 0.92, + "grad_norm": 2.6344063959839676, + "learning_rate": 3.6411143863058773e-07, + "loss": 0.7311, + "step": 5987 + }, + { + "epoch": 0.92, + "grad_norm": 2.6342752111204453, + "learning_rate": 3.6278703467142684e-07, + "loss": 0.7964, + "step": 5988 + }, + { + "epoch": 0.92, + "grad_norm": 2.6008339547301254, + "learning_rate": 3.6146499926892786e-07, + "loss": 0.7708, + "step": 5989 + }, + { + "epoch": 0.92, + "grad_norm": 2.7388841845665794, + "learning_rate": 3.6014533274800867e-07, + "loss": 0.7886, + "step": 5990 + }, + { + "epoch": 0.92, + "grad_norm": 2.4069664083059545, + "learning_rate": 3.588280354330065e-07, + "loss": 0.6621, + "step": 5991 + }, + { + "epoch": 0.92, + "grad_norm": 2.8839315931631804, + "learning_rate": 3.5751310764767567e-07, + "loss": 0.7736, + "step": 5992 + }, + { + "epoch": 0.92, + "grad_norm": 2.733277195734112, + "learning_rate": 3.56200549715191e-07, + "loss": 0.8026, + "step": 5993 + }, + { + "epoch": 0.92, + "grad_norm": 2.540055857905672, + "learning_rate": 3.5489036195814007e-07, + "loss": 0.6752, + "step": 5994 + }, + { + "epoch": 0.92, + "grad_norm": 2.8417129277346667, + "learning_rate": 3.535825446985297e-07, + "loss": 0.7915, + "step": 5995 + }, + { + "epoch": 0.92, + "grad_norm": 2.5360667168506597, + "learning_rate": 3.522770982577872e-07, + "loss": 0.7955, + "step": 5996 + }, + { + "epoch": 0.92, + "grad_norm": 2.6673994000706913, + "learning_rate": 3.5097402295675373e-07, + "loss": 0.7334, + "step": 5997 + }, + { + "epoch": 0.92, + "grad_norm": 2.472030136770235, + "learning_rate": 3.496733191156876e-07, + "loss": 0.7367, + "step": 5998 + }, + { + "epoch": 0.92, + "grad_norm": 2.6281808987607693, + "learning_rate": 3.483749870542663e-07, + "loss": 0.811, + "step": 5999 + }, + { + "epoch": 0.92, + "grad_norm": 3.084336288946746, + "learning_rate": 3.470790270915836e-07, + "loss": 0.791, + "step": 6000 + }, + { + "epoch": 0.92, + "grad_norm": 2.69540151361276, + "learning_rate": 3.4578543954615017e-07, + "loss": 0.8372, + "step": 6001 + }, + { + "epoch": 0.92, + "grad_norm": 2.6477593650807663, + "learning_rate": 3.444942247358918e-07, + "loss": 0.7617, + "step": 6002 + }, + { + "epoch": 0.92, + "grad_norm": 2.6109865773332137, + "learning_rate": 3.4320538297815454e-07, + "loss": 0.8041, + "step": 6003 + }, + { + "epoch": 0.92, + "grad_norm": 2.564118447432275, + "learning_rate": 3.419189145896995e-07, + "loss": 0.6351, + "step": 6004 + }, + { + "epoch": 0.92, + "grad_norm": 2.7347820160761946, + "learning_rate": 3.4063481988670375e-07, + "loss": 0.85, + "step": 6005 + }, + { + "epoch": 0.92, + "grad_norm": 2.6586665554457767, + "learning_rate": 3.3935309918476155e-07, + "loss": 0.7468, + "step": 6006 + }, + { + "epoch": 0.92, + "grad_norm": 2.6617987820098747, + "learning_rate": 3.3807375279888644e-07, + "loss": 0.7237, + "step": 6007 + }, + { + "epoch": 0.92, + "grad_norm": 2.6600573016886737, + "learning_rate": 3.3679678104350353e-07, + "loss": 0.7518, + "step": 6008 + }, + { + "epoch": 0.92, + "grad_norm": 2.5286339031019405, + "learning_rate": 3.355221842324552e-07, + "loss": 0.7932, + "step": 6009 + }, + { + "epoch": 0.92, + "grad_norm": 3.454421060746707, + "learning_rate": 3.342499626790052e-07, + "loss": 0.8156, + "step": 6010 + }, + { + "epoch": 0.92, + "grad_norm": 2.7733675155642494, + "learning_rate": 3.329801166958291e-07, + "loss": 0.8301, + "step": 6011 + }, + { + "epoch": 0.92, + "grad_norm": 3.1470563228210944, + "learning_rate": 3.317126465950171e-07, + "loss": 0.7118, + "step": 6012 + }, + { + "epoch": 0.92, + "grad_norm": 2.7036527420458616, + "learning_rate": 3.3044755268808013e-07, + "loss": 0.7611, + "step": 6013 + }, + { + "epoch": 0.92, + "grad_norm": 2.697254921913109, + "learning_rate": 3.291848352859406e-07, + "loss": 0.7057, + "step": 6014 + }, + { + "epoch": 0.92, + "grad_norm": 2.5999225248707623, + "learning_rate": 3.279244946989424e-07, + "loss": 0.7057, + "step": 6015 + }, + { + "epoch": 0.92, + "grad_norm": 2.8870135590746515, + "learning_rate": 3.266665312368389e-07, + "loss": 0.6807, + "step": 6016 + }, + { + "epoch": 0.92, + "grad_norm": 2.7602796195978123, + "learning_rate": 3.2541094520880166e-07, + "loss": 0.8041, + "step": 6017 + }, + { + "epoch": 0.92, + "grad_norm": 3.4683812993866825, + "learning_rate": 3.241577369234228e-07, + "loss": 0.8085, + "step": 6018 + }, + { + "epoch": 0.92, + "grad_norm": 2.4856380251395604, + "learning_rate": 3.229069066887014e-07, + "loss": 0.7617, + "step": 6019 + }, + { + "epoch": 0.92, + "grad_norm": 2.4488210107592954, + "learning_rate": 3.2165845481205826e-07, + "loss": 0.7211, + "step": 6020 + }, + { + "epoch": 0.92, + "grad_norm": 2.4634891340354304, + "learning_rate": 3.2041238160032793e-07, + "loss": 0.6962, + "step": 6021 + }, + { + "epoch": 0.92, + "grad_norm": 2.7593153098002845, + "learning_rate": 3.191686873597599e-07, + "loss": 0.7758, + "step": 6022 + }, + { + "epoch": 0.92, + "grad_norm": 2.38830097389013, + "learning_rate": 3.1792737239601965e-07, + "loss": 0.6635, + "step": 6023 + }, + { + "epoch": 0.92, + "grad_norm": 2.7072742284457427, + "learning_rate": 3.166884370141876e-07, + "loss": 0.7502, + "step": 6024 + }, + { + "epoch": 0.92, + "grad_norm": 2.690475824319762, + "learning_rate": 3.1545188151875795e-07, + "loss": 0.7552, + "step": 6025 + }, + { + "epoch": 0.92, + "grad_norm": 2.846891691572372, + "learning_rate": 3.142177062136431e-07, + "loss": 0.707, + "step": 6026 + }, + { + "epoch": 0.92, + "grad_norm": 3.0359662518289827, + "learning_rate": 3.1298591140216827e-07, + "loss": 0.7993, + "step": 6027 + }, + { + "epoch": 0.92, + "grad_norm": 2.746866925967777, + "learning_rate": 3.117564973870735e-07, + "loss": 0.875, + "step": 6028 + }, + { + "epoch": 0.92, + "grad_norm": 2.5842931925443122, + "learning_rate": 3.105294644705148e-07, + "loss": 0.6753, + "step": 6029 + }, + { + "epoch": 0.92, + "grad_norm": 2.7795678238783204, + "learning_rate": 3.093048129540632e-07, + "loss": 0.8573, + "step": 6030 + }, + { + "epoch": 0.92, + "grad_norm": 2.6587604849304656, + "learning_rate": 3.0808254313870133e-07, + "loss": 0.7591, + "step": 6031 + }, + { + "epoch": 0.92, + "grad_norm": 2.6921282826190547, + "learning_rate": 3.068626553248311e-07, + "loss": 0.8027, + "step": 6032 + }, + { + "epoch": 0.92, + "grad_norm": 2.650607136409856, + "learning_rate": 3.0564514981226703e-07, + "loss": 0.7448, + "step": 6033 + }, + { + "epoch": 0.92, + "grad_norm": 2.5959315484385255, + "learning_rate": 3.0443002690023537e-07, + "loss": 0.6463, + "step": 6034 + }, + { + "epoch": 0.92, + "grad_norm": 2.4771076355337986, + "learning_rate": 3.032172868873817e-07, + "loss": 0.7658, + "step": 6035 + }, + { + "epoch": 0.92, + "grad_norm": 2.560831138005353, + "learning_rate": 3.0200693007176097e-07, + "loss": 0.7482, + "step": 6036 + }, + { + "epoch": 0.92, + "grad_norm": 2.6691916248453276, + "learning_rate": 3.0079895675084734e-07, + "loss": 0.7337, + "step": 6037 + }, + { + "epoch": 0.92, + "grad_norm": 2.4902436658315352, + "learning_rate": 2.995933672215257e-07, + "loss": 0.7613, + "step": 6038 + }, + { + "epoch": 0.92, + "grad_norm": 2.4597485662812892, + "learning_rate": 2.983901617800955e-07, + "loss": 0.6547, + "step": 6039 + }, + { + "epoch": 0.92, + "grad_norm": 2.889902140838847, + "learning_rate": 2.971893407222737e-07, + "loss": 0.8037, + "step": 6040 + }, + { + "epoch": 0.92, + "grad_norm": 2.8326105413501033, + "learning_rate": 2.9599090434318523e-07, + "loss": 0.7842, + "step": 6041 + }, + { + "epoch": 0.92, + "grad_norm": 2.6873547730949623, + "learning_rate": 2.947948529373723e-07, + "loss": 0.791, + "step": 6042 + }, + { + "epoch": 0.92, + "grad_norm": 2.812710976846079, + "learning_rate": 2.9360118679879315e-07, + "loss": 0.7425, + "step": 6043 + }, + { + "epoch": 0.93, + "grad_norm": 2.720941580967908, + "learning_rate": 2.9240990622081634e-07, + "loss": 0.7981, + "step": 6044 + }, + { + "epoch": 0.93, + "grad_norm": 3.1581410361795386, + "learning_rate": 2.912210114962244e-07, + "loss": 0.7823, + "step": 6045 + }, + { + "epoch": 0.93, + "grad_norm": 2.656038443141141, + "learning_rate": 2.900345029172158e-07, + "loss": 0.6951, + "step": 6046 + }, + { + "epoch": 0.93, + "grad_norm": 2.7266040236326963, + "learning_rate": 2.888503807753984e-07, + "loss": 0.7339, + "step": 6047 + }, + { + "epoch": 0.93, + "grad_norm": 2.7799392585020386, + "learning_rate": 2.8766864536179937e-07, + "loss": 0.7675, + "step": 6048 + }, + { + "epoch": 0.93, + "grad_norm": 2.737208483644531, + "learning_rate": 2.8648929696685535e-07, + "loss": 0.8165, + "step": 6049 + }, + { + "epoch": 0.93, + "grad_norm": 2.842487845149167, + "learning_rate": 2.853123358804144e-07, + "loss": 0.7047, + "step": 6050 + }, + { + "epoch": 0.93, + "grad_norm": 3.7143236985630113, + "learning_rate": 2.8413776239174404e-07, + "loss": 0.8682, + "step": 6051 + }, + { + "epoch": 0.93, + "grad_norm": 2.556683398994706, + "learning_rate": 2.82965576789519e-07, + "loss": 0.6815, + "step": 6052 + }, + { + "epoch": 0.93, + "grad_norm": 2.9004282615196924, + "learning_rate": 2.81795779361832e-07, + "loss": 0.6865, + "step": 6053 + }, + { + "epoch": 0.93, + "grad_norm": 2.5260827904674867, + "learning_rate": 2.806283703961854e-07, + "loss": 0.8114, + "step": 6054 + }, + { + "epoch": 0.93, + "grad_norm": 2.911421961080375, + "learning_rate": 2.794633501794952e-07, + "loss": 0.7076, + "step": 6055 + }, + { + "epoch": 0.93, + "grad_norm": 2.5387998313691216, + "learning_rate": 2.783007189980902e-07, + "loss": 0.7606, + "step": 6056 + }, + { + "epoch": 0.93, + "grad_norm": 2.45474959202561, + "learning_rate": 2.77140477137714e-07, + "loss": 0.7467, + "step": 6057 + }, + { + "epoch": 0.93, + "grad_norm": 2.966599236143366, + "learning_rate": 2.759826248835196e-07, + "loss": 0.7403, + "step": 6058 + }, + { + "epoch": 0.93, + "grad_norm": 2.4783423814613297, + "learning_rate": 2.74827162520076e-07, + "loss": 0.6726, + "step": 6059 + }, + { + "epoch": 0.93, + "grad_norm": 2.638212372845662, + "learning_rate": 2.7367409033136395e-07, + "loss": 0.7919, + "step": 6060 + }, + { + "epoch": 0.93, + "grad_norm": 2.565673526069858, + "learning_rate": 2.725234086007744e-07, + "loss": 0.8529, + "step": 6061 + }, + { + "epoch": 0.93, + "grad_norm": 2.495147047946822, + "learning_rate": 2.713751176111146e-07, + "loss": 0.6873, + "step": 6062 + }, + { + "epoch": 0.93, + "grad_norm": 2.6994013441494182, + "learning_rate": 2.7022921764459977e-07, + "loss": 0.6993, + "step": 6063 + }, + { + "epoch": 0.93, + "grad_norm": 2.8122848045588733, + "learning_rate": 2.6908570898286355e-07, + "loss": 0.7469, + "step": 6064 + }, + { + "epoch": 0.93, + "grad_norm": 2.593524412271433, + "learning_rate": 2.679445919069457e-07, + "loss": 0.7646, + "step": 6065 + }, + { + "epoch": 0.93, + "grad_norm": 2.6157359363362693, + "learning_rate": 2.668058666973017e-07, + "loss": 0.6645, + "step": 6066 + }, + { + "epoch": 0.93, + "grad_norm": 2.685971547296702, + "learning_rate": 2.6566953363379777e-07, + "loss": 0.7256, + "step": 6067 + }, + { + "epoch": 0.93, + "grad_norm": 3.505877258203053, + "learning_rate": 2.6453559299571276e-07, + "loss": 0.8303, + "step": 6068 + }, + { + "epoch": 0.93, + "grad_norm": 2.510661056965399, + "learning_rate": 2.63404045061737e-07, + "loss": 0.7109, + "step": 6069 + }, + { + "epoch": 0.93, + "grad_norm": 2.650980374114757, + "learning_rate": 2.622748901099759e-07, + "loss": 0.6944, + "step": 6070 + }, + { + "epoch": 0.93, + "grad_norm": 2.705362907402246, + "learning_rate": 2.611481284179407e-07, + "loss": 0.7836, + "step": 6071 + }, + { + "epoch": 0.93, + "grad_norm": 2.7305925750666393, + "learning_rate": 2.6002376026255883e-07, + "loss": 0.7874, + "step": 6072 + }, + { + "epoch": 0.93, + "grad_norm": 2.6614969532801633, + "learning_rate": 2.5890178592016925e-07, + "loss": 0.8704, + "step": 6073 + }, + { + "epoch": 0.93, + "grad_norm": 2.4483769604343886, + "learning_rate": 2.5778220566652025e-07, + "loss": 0.686, + "step": 6074 + }, + { + "epoch": 0.93, + "grad_norm": 2.4934663469292087, + "learning_rate": 2.5666501977677614e-07, + "loss": 0.721, + "step": 6075 + }, + { + "epoch": 0.93, + "grad_norm": 2.459218609307165, + "learning_rate": 2.5555022852550736e-07, + "loss": 0.6552, + "step": 6076 + }, + { + "epoch": 0.93, + "grad_norm": 2.8604168333650875, + "learning_rate": 2.5443783218669804e-07, + "loss": 0.6455, + "step": 6077 + }, + { + "epoch": 0.93, + "grad_norm": 2.5098450736455997, + "learning_rate": 2.5332783103374725e-07, + "loss": 0.6351, + "step": 6078 + }, + { + "epoch": 0.93, + "grad_norm": 2.841494576735548, + "learning_rate": 2.52220225339459e-07, + "loss": 0.7896, + "step": 6079 + }, + { + "epoch": 0.93, + "grad_norm": 2.692905809508813, + "learning_rate": 2.511150153760522e-07, + "loss": 0.7215, + "step": 6080 + }, + { + "epoch": 0.93, + "grad_norm": 2.727313980169833, + "learning_rate": 2.5001220141515736e-07, + "loss": 0.7061, + "step": 6081 + }, + { + "epoch": 0.93, + "grad_norm": 2.5469765476370787, + "learning_rate": 2.489117837278143e-07, + "loss": 0.8545, + "step": 6082 + }, + { + "epoch": 0.93, + "grad_norm": 2.559331431246218, + "learning_rate": 2.4781376258447564e-07, + "loss": 0.754, + "step": 6083 + }, + { + "epoch": 0.93, + "grad_norm": 2.722703605969057, + "learning_rate": 2.4671813825500324e-07, + "loss": 0.6871, + "step": 6084 + }, + { + "epoch": 0.93, + "grad_norm": 2.596003743401093, + "learning_rate": 2.456249110086717e-07, + "loss": 0.6845, + "step": 6085 + }, + { + "epoch": 0.93, + "grad_norm": 3.083109926834688, + "learning_rate": 2.4453408111416497e-07, + "loss": 0.7946, + "step": 6086 + }, + { + "epoch": 0.93, + "grad_norm": 2.7663008724269544, + "learning_rate": 2.4344564883957976e-07, + "loss": 0.7694, + "step": 6087 + }, + { + "epoch": 0.93, + "grad_norm": 2.8746332918626205, + "learning_rate": 2.4235961445241987e-07, + "loss": 0.7514, + "step": 6088 + }, + { + "epoch": 0.93, + "grad_norm": 2.6013942389612517, + "learning_rate": 2.41275978219605e-07, + "loss": 0.6923, + "step": 6089 + }, + { + "epoch": 0.93, + "grad_norm": 2.912273621633104, + "learning_rate": 2.4019474040746004e-07, + "loss": 0.8051, + "step": 6090 + }, + { + "epoch": 0.93, + "grad_norm": 2.7464518435612963, + "learning_rate": 2.391159012817246e-07, + "loss": 0.7467, + "step": 6091 + }, + { + "epoch": 0.93, + "grad_norm": 2.7813044398351425, + "learning_rate": 2.3803946110754649e-07, + "loss": 0.7753, + "step": 6092 + }, + { + "epoch": 0.93, + "grad_norm": 2.518752845522808, + "learning_rate": 2.369654201494842e-07, + "loss": 0.7387, + "step": 6093 + }, + { + "epoch": 0.93, + "grad_norm": 2.725181935975217, + "learning_rate": 2.3589377867150543e-07, + "loss": 0.8623, + "step": 6094 + }, + { + "epoch": 0.93, + "grad_norm": 3.1544796740668186, + "learning_rate": 2.3482453693699282e-07, + "loss": 0.7932, + "step": 6095 + }, + { + "epoch": 0.93, + "grad_norm": 2.4788389266335646, + "learning_rate": 2.3375769520873393e-07, + "loss": 0.7703, + "step": 6096 + }, + { + "epoch": 0.93, + "grad_norm": 2.842756149992874, + "learning_rate": 2.3269325374892903e-07, + "loss": 0.6308, + "step": 6097 + }, + { + "epoch": 0.93, + "grad_norm": 2.5921408232583545, + "learning_rate": 2.3163121281918888e-07, + "loss": 0.7522, + "step": 6098 + }, + { + "epoch": 0.93, + "grad_norm": 2.633161071717052, + "learning_rate": 2.3057157268053133e-07, + "loss": 0.6943, + "step": 6099 + }, + { + "epoch": 0.93, + "grad_norm": 2.552057063916039, + "learning_rate": 2.2951433359338805e-07, + "loss": 0.7337, + "step": 6100 + }, + { + "epoch": 0.93, + "grad_norm": 2.520467762762636, + "learning_rate": 2.2845949581759785e-07, + "loss": 0.7381, + "step": 6101 + }, + { + "epoch": 0.93, + "grad_norm": 2.5453403536853623, + "learning_rate": 2.2740705961241006e-07, + "loss": 0.7681, + "step": 6102 + }, + { + "epoch": 0.93, + "grad_norm": 3.0029521200394957, + "learning_rate": 2.2635702523648552e-07, + "loss": 0.6868, + "step": 6103 + }, + { + "epoch": 0.93, + "grad_norm": 2.8302795535117573, + "learning_rate": 2.253093929478911e-07, + "loss": 0.7434, + "step": 6104 + }, + { + "epoch": 0.93, + "grad_norm": 2.4786546039372634, + "learning_rate": 2.2426416300410537e-07, + "loss": 0.7557, + "step": 6105 + }, + { + "epoch": 0.93, + "grad_norm": 2.48657198357177, + "learning_rate": 2.2322133566201941e-07, + "loss": 0.7145, + "step": 6106 + }, + { + "epoch": 0.93, + "grad_norm": 2.6497695414229825, + "learning_rate": 2.2218091117792718e-07, + "loss": 0.668, + "step": 6107 + }, + { + "epoch": 0.93, + "grad_norm": 2.6722487924038067, + "learning_rate": 2.2114288980753962e-07, + "loss": 0.7089, + "step": 6108 + }, + { + "epoch": 0.94, + "grad_norm": 2.714821583484799, + "learning_rate": 2.2010727180597157e-07, + "loss": 0.7621, + "step": 6109 + }, + { + "epoch": 0.94, + "grad_norm": 2.8942861918137464, + "learning_rate": 2.1907405742774723e-07, + "loss": 0.7278, + "step": 6110 + }, + { + "epoch": 0.94, + "grad_norm": 3.280632946907079, + "learning_rate": 2.1804324692680345e-07, + "loss": 0.7916, + "step": 6111 + }, + { + "epoch": 0.94, + "grad_norm": 2.5931959273453886, + "learning_rate": 2.1701484055648536e-07, + "loss": 0.7815, + "step": 6112 + }, + { + "epoch": 0.94, + "grad_norm": 2.7709121512981065, + "learning_rate": 2.1598883856954523e-07, + "loss": 0.7502, + "step": 6113 + }, + { + "epoch": 0.94, + "grad_norm": 2.9441111783976277, + "learning_rate": 2.1496524121814576e-07, + "loss": 0.8539, + "step": 6114 + }, + { + "epoch": 0.94, + "grad_norm": 2.6898849255122323, + "learning_rate": 2.13944048753858e-07, + "loss": 0.7489, + "step": 6115 + }, + { + "epoch": 0.94, + "grad_norm": 2.9925758163754987, + "learning_rate": 2.1292526142766223e-07, + "loss": 0.7994, + "step": 6116 + }, + { + "epoch": 0.94, + "grad_norm": 2.6208054097627214, + "learning_rate": 2.1190887948994822e-07, + "loss": 0.6947, + "step": 6117 + }, + { + "epoch": 0.94, + "grad_norm": 2.8860097331924224, + "learning_rate": 2.1089490319051497e-07, + "loss": 0.7517, + "step": 6118 + }, + { + "epoch": 0.94, + "grad_norm": 2.49078276280568, + "learning_rate": 2.0988333277856877e-07, + "loss": 0.6892, + "step": 6119 + }, + { + "epoch": 0.94, + "grad_norm": 2.7127390041528097, + "learning_rate": 2.0887416850272514e-07, + "loss": 0.8339, + "step": 6120 + }, + { + "epoch": 0.94, + "grad_norm": 2.7664440908223717, + "learning_rate": 2.078674106110079e-07, + "loss": 0.788, + "step": 6121 + }, + { + "epoch": 0.94, + "grad_norm": 2.7691557943619647, + "learning_rate": 2.068630593508514e-07, + "loss": 0.8135, + "step": 6122 + }, + { + "epoch": 0.94, + "grad_norm": 2.519939135167319, + "learning_rate": 2.05861114969097e-07, + "loss": 0.7414, + "step": 6123 + }, + { + "epoch": 0.94, + "grad_norm": 2.875431563743744, + "learning_rate": 2.0486157771199223e-07, + "loss": 0.7245, + "step": 6124 + }, + { + "epoch": 0.94, + "grad_norm": 2.6094402261215643, + "learning_rate": 2.0386444782519722e-07, + "loss": 0.7991, + "step": 6125 + }, + { + "epoch": 0.94, + "grad_norm": 2.859394466977294, + "learning_rate": 2.0286972555377704e-07, + "loss": 0.748, + "step": 6126 + }, + { + "epoch": 0.94, + "grad_norm": 2.9414054266730982, + "learning_rate": 2.0187741114220837e-07, + "loss": 0.7258, + "step": 6127 + }, + { + "epoch": 0.94, + "grad_norm": 2.6658460640061454, + "learning_rate": 2.0088750483437393e-07, + "loss": 0.7896, + "step": 6128 + }, + { + "epoch": 0.94, + "grad_norm": 2.478030327430569, + "learning_rate": 1.9990000687356348e-07, + "loss": 0.7599, + "step": 6129 + }, + { + "epoch": 0.94, + "grad_norm": 2.772471185791872, + "learning_rate": 1.9891491750247738e-07, + "loss": 0.8028, + "step": 6130 + }, + { + "epoch": 0.94, + "grad_norm": 2.6702866352287553, + "learning_rate": 1.9793223696322306e-07, + "loss": 0.6865, + "step": 6131 + }, + { + "epoch": 0.94, + "grad_norm": 2.640238395629171, + "learning_rate": 1.969519654973151e-07, + "loss": 0.6392, + "step": 6132 + }, + { + "epoch": 0.94, + "grad_norm": 2.5352033004938774, + "learning_rate": 1.9597410334567746e-07, + "loss": 0.7221, + "step": 6133 + }, + { + "epoch": 0.94, + "grad_norm": 2.7111068335551245, + "learning_rate": 1.9499865074864122e-07, + "loss": 0.7778, + "step": 6134 + }, + { + "epoch": 0.94, + "grad_norm": 2.7186845902814403, + "learning_rate": 1.9402560794594572e-07, + "loss": 0.7479, + "step": 6135 + }, + { + "epoch": 0.94, + "grad_norm": 2.6369963760573523, + "learning_rate": 1.9305497517673633e-07, + "loss": 0.7727, + "step": 6136 + }, + { + "epoch": 0.94, + "grad_norm": 2.550199010640346, + "learning_rate": 1.9208675267956666e-07, + "loss": 0.736, + "step": 6137 + }, + { + "epoch": 0.94, + "grad_norm": 2.9158190657987237, + "learning_rate": 1.9112094069240084e-07, + "loss": 0.7635, + "step": 6138 + }, + { + "epoch": 0.94, + "grad_norm": 2.894596711016241, + "learning_rate": 1.9015753945260784e-07, + "loss": 0.8388, + "step": 6139 + }, + { + "epoch": 0.94, + "grad_norm": 2.6370639749811517, + "learning_rate": 1.8919654919696383e-07, + "loss": 0.7775, + "step": 6140 + }, + { + "epoch": 0.94, + "grad_norm": 2.9681643766004524, + "learning_rate": 1.8823797016165435e-07, + "loss": 0.7151, + "step": 6141 + }, + { + "epoch": 0.94, + "grad_norm": 2.7649931358599034, + "learning_rate": 1.872818025822709e-07, + "loss": 0.7196, + "step": 6142 + }, + { + "epoch": 0.94, + "grad_norm": 2.6534712725876854, + "learning_rate": 1.863280466938111e-07, + "loss": 0.7542, + "step": 6143 + }, + { + "epoch": 0.94, + "grad_norm": 2.868580015988256, + "learning_rate": 1.8537670273068298e-07, + "loss": 0.7451, + "step": 6144 + }, + { + "epoch": 0.94, + "grad_norm": 2.6025947830881266, + "learning_rate": 1.844277709267006e-07, + "loss": 0.7171, + "step": 6145 + }, + { + "epoch": 0.94, + "grad_norm": 3.0658603765662704, + "learning_rate": 1.834812515150841e-07, + "loss": 0.7431, + "step": 6146 + }, + { + "epoch": 0.94, + "grad_norm": 2.603293519933533, + "learning_rate": 1.8253714472846184e-07, + "loss": 0.7527, + "step": 6147 + }, + { + "epoch": 0.94, + "grad_norm": 2.71605864580305, + "learning_rate": 1.81595450798866e-07, + "loss": 0.7696, + "step": 6148 + }, + { + "epoch": 0.94, + "grad_norm": 2.537355350931297, + "learning_rate": 1.806561699577436e-07, + "loss": 0.6957, + "step": 6149 + }, + { + "epoch": 0.94, + "grad_norm": 3.0873686115208168, + "learning_rate": 1.7971930243593893e-07, + "loss": 0.7041, + "step": 6150 + }, + { + "epoch": 0.94, + "grad_norm": 2.8757638527449303, + "learning_rate": 1.7878484846371001e-07, + "loss": 0.7434, + "step": 6151 + }, + { + "epoch": 0.94, + "grad_norm": 2.6085872075147596, + "learning_rate": 1.7785280827071982e-07, + "loss": 0.6898, + "step": 6152 + }, + { + "epoch": 0.94, + "grad_norm": 2.5746517753489577, + "learning_rate": 1.769231820860362e-07, + "loss": 0.74, + "step": 6153 + }, + { + "epoch": 0.94, + "grad_norm": 2.667581760238676, + "learning_rate": 1.759959701381353e-07, + "loss": 0.6863, + "step": 6154 + }, + { + "epoch": 0.94, + "grad_norm": 2.799957147402608, + "learning_rate": 1.7507117265490148e-07, + "loss": 0.6544, + "step": 6155 + }, + { + "epoch": 0.94, + "grad_norm": 2.6982037422852576, + "learning_rate": 1.7414878986362294e-07, + "loss": 0.6762, + "step": 6156 + }, + { + "epoch": 0.94, + "grad_norm": 2.976137628722369, + "learning_rate": 1.732288219909961e-07, + "loss": 0.8104, + "step": 6157 + }, + { + "epoch": 0.94, + "grad_norm": 2.559301938412268, + "learning_rate": 1.7231126926312235e-07, + "loss": 0.6893, + "step": 6158 + }, + { + "epoch": 0.94, + "grad_norm": 2.9686201652518025, + "learning_rate": 1.7139613190550906e-07, + "loss": 0.7745, + "step": 6159 + }, + { + "epoch": 0.94, + "grad_norm": 2.741683295872832, + "learning_rate": 1.7048341014307523e-07, + "loss": 0.6785, + "step": 6160 + }, + { + "epoch": 0.94, + "grad_norm": 2.8906234050957478, + "learning_rate": 1.6957310420013918e-07, + "loss": 0.734, + "step": 6161 + }, + { + "epoch": 0.94, + "grad_norm": 2.6247075942288927, + "learning_rate": 1.686652143004286e-07, + "loss": 0.6899, + "step": 6162 + }, + { + "epoch": 0.94, + "grad_norm": 2.558633377476938, + "learning_rate": 1.6775974066707833e-07, + "loss": 0.7669, + "step": 6163 + }, + { + "epoch": 0.94, + "grad_norm": 2.6371825253937002, + "learning_rate": 1.668566835226293e-07, + "loss": 0.6652, + "step": 6164 + }, + { + "epoch": 0.94, + "grad_norm": 2.7199806217099205, + "learning_rate": 1.6595604308902613e-07, + "loss": 0.8627, + "step": 6165 + }, + { + "epoch": 0.94, + "grad_norm": 2.6341507876042325, + "learning_rate": 1.650578195876218e-07, + "loss": 0.7276, + "step": 6166 + }, + { + "epoch": 0.94, + "grad_norm": 3.1415513717540815, + "learning_rate": 1.6416201323917413e-07, + "loss": 0.7839, + "step": 6167 + }, + { + "epoch": 0.94, + "grad_norm": 2.4806645277468076, + "learning_rate": 1.6326862426384705e-07, + "loss": 0.7079, + "step": 6168 + }, + { + "epoch": 0.94, + "grad_norm": 2.567681957131878, + "learning_rate": 1.6237765288121044e-07, + "loss": 0.7302, + "step": 6169 + }, + { + "epoch": 0.94, + "grad_norm": 2.749034703344554, + "learning_rate": 1.6148909931024026e-07, + "loss": 0.7658, + "step": 6170 + }, + { + "epoch": 0.94, + "grad_norm": 2.7358323500883133, + "learning_rate": 1.606029637693185e-07, + "loss": 0.7315, + "step": 6171 + }, + { + "epoch": 0.94, + "grad_norm": 2.783968035090616, + "learning_rate": 1.5971924647623204e-07, + "loss": 0.8276, + "step": 6172 + }, + { + "epoch": 0.94, + "grad_norm": 2.8293213016993257, + "learning_rate": 1.5883794764817272e-07, + "loss": 0.7387, + "step": 6173 + }, + { + "epoch": 0.95, + "grad_norm": 2.8949693147568425, + "learning_rate": 1.5795906750174062e-07, + "loss": 0.814, + "step": 6174 + }, + { + "epoch": 0.95, + "grad_norm": 3.2324609639018966, + "learning_rate": 1.570826062529407e-07, + "loss": 0.7689, + "step": 6175 + }, + { + "epoch": 0.95, + "grad_norm": 2.5822059542555635, + "learning_rate": 1.5620856411717954e-07, + "loss": 0.6804, + "step": 6176 + }, + { + "epoch": 0.95, + "grad_norm": 2.777882167249254, + "learning_rate": 1.5533694130927424e-07, + "loss": 0.7848, + "step": 6177 + }, + { + "epoch": 0.95, + "grad_norm": 3.1167534290212546, + "learning_rate": 1.544677380434445e-07, + "loss": 0.6931, + "step": 6178 + }, + { + "epoch": 0.95, + "grad_norm": 2.8203501178896424, + "learning_rate": 1.5360095453331724e-07, + "loss": 0.6658, + "step": 6179 + }, + { + "epoch": 0.95, + "grad_norm": 2.5251340746824225, + "learning_rate": 1.5273659099192317e-07, + "loss": 0.6895, + "step": 6180 + }, + { + "epoch": 0.95, + "grad_norm": 2.599439628395259, + "learning_rate": 1.5187464763169678e-07, + "loss": 0.8254, + "step": 6181 + }, + { + "epoch": 0.95, + "grad_norm": 2.804159345742806, + "learning_rate": 1.5101512466448197e-07, + "loss": 0.7402, + "step": 6182 + }, + { + "epoch": 0.95, + "grad_norm": 2.6898781919312302, + "learning_rate": 1.501580223015242e-07, + "loss": 0.7552, + "step": 6183 + }, + { + "epoch": 0.95, + "grad_norm": 2.7033042374066762, + "learning_rate": 1.4930334075347497e-07, + "loss": 0.7328, + "step": 6184 + }, + { + "epoch": 0.95, + "grad_norm": 3.473450686295006, + "learning_rate": 1.4845108023039178e-07, + "loss": 0.8645, + "step": 6185 + }, + { + "epoch": 0.95, + "grad_norm": 2.556613318569326, + "learning_rate": 1.47601240941736e-07, + "loss": 0.7423, + "step": 6186 + }, + { + "epoch": 0.95, + "grad_norm": 2.6293302467512274, + "learning_rate": 1.4675382309637386e-07, + "loss": 0.6899, + "step": 6187 + }, + { + "epoch": 0.95, + "grad_norm": 2.9116476239785314, + "learning_rate": 1.4590882690257768e-07, + "loss": 0.7177, + "step": 6188 + }, + { + "epoch": 0.95, + "grad_norm": 2.6550641452752664, + "learning_rate": 1.4506625256802355e-07, + "loss": 0.6495, + "step": 6189 + }, + { + "epoch": 0.95, + "grad_norm": 2.6021279534255637, + "learning_rate": 1.4422610029979244e-07, + "loss": 0.7871, + "step": 6190 + }, + { + "epoch": 0.95, + "grad_norm": 2.76386657740609, + "learning_rate": 1.4338837030437147e-07, + "loss": 0.7694, + "step": 6191 + }, + { + "epoch": 0.95, + "grad_norm": 2.7179569750741943, + "learning_rate": 1.4255306278764813e-07, + "loss": 0.7451, + "step": 6192 + }, + { + "epoch": 0.95, + "grad_norm": 2.7081378794105344, + "learning_rate": 1.4172017795492044e-07, + "loss": 0.8077, + "step": 6193 + }, + { + "epoch": 0.95, + "grad_norm": 3.263696185374978, + "learning_rate": 1.4088971601088796e-07, + "loss": 0.8088, + "step": 6194 + }, + { + "epoch": 0.95, + "grad_norm": 2.7987236616266005, + "learning_rate": 1.400616771596519e-07, + "loss": 0.7513, + "step": 6195 + }, + { + "epoch": 0.95, + "grad_norm": 2.7879734412104082, + "learning_rate": 1.3923606160472504e-07, + "loss": 0.7186, + "step": 6196 + }, + { + "epoch": 0.95, + "grad_norm": 2.675492212746545, + "learning_rate": 1.3841286954901834e-07, + "loss": 0.7624, + "step": 6197 + }, + { + "epoch": 0.95, + "grad_norm": 2.5732871399578583, + "learning_rate": 1.3759210119485e-07, + "loss": 0.7256, + "step": 6198 + }, + { + "epoch": 0.95, + "grad_norm": 2.719745063329388, + "learning_rate": 1.3677375674394088e-07, + "loss": 0.8057, + "step": 6199 + }, + { + "epoch": 0.95, + "grad_norm": 2.8279350287112672, + "learning_rate": 1.359578363974179e-07, + "loss": 0.7716, + "step": 6200 + }, + { + "epoch": 0.95, + "grad_norm": 2.6119360982876247, + "learning_rate": 1.351443403558117e-07, + "loss": 0.7593, + "step": 6201 + }, + { + "epoch": 0.95, + "grad_norm": 2.548879171886614, + "learning_rate": 1.3433326881905683e-07, + "loss": 0.6883, + "step": 6202 + }, + { + "epoch": 0.95, + "grad_norm": 2.768212090009197, + "learning_rate": 1.3352462198649163e-07, + "loss": 0.8027, + "step": 6203 + }, + { + "epoch": 0.95, + "grad_norm": 2.6002595913786246, + "learning_rate": 1.327184000568582e-07, + "loss": 0.7569, + "step": 6204 + }, + { + "epoch": 0.95, + "grad_norm": 2.8178858099550923, + "learning_rate": 1.3191460322830364e-07, + "loss": 0.7394, + "step": 6205 + }, + { + "epoch": 0.95, + "grad_norm": 2.758556782006995, + "learning_rate": 1.3111323169837875e-07, + "loss": 0.855, + "step": 6206 + }, + { + "epoch": 0.95, + "grad_norm": 2.8694004245810385, + "learning_rate": 1.3031428566403824e-07, + "loss": 0.7254, + "step": 6207 + }, + { + "epoch": 0.95, + "grad_norm": 2.875488729219647, + "learning_rate": 1.2951776532164062e-07, + "loss": 0.7762, + "step": 6208 + }, + { + "epoch": 0.95, + "grad_norm": 2.542501426417321, + "learning_rate": 1.2872367086694704e-07, + "loss": 0.6791, + "step": 6209 + }, + { + "epoch": 0.95, + "grad_norm": 2.637201824519699, + "learning_rate": 1.2793200249512693e-07, + "loss": 0.7678, + "step": 6210 + }, + { + "epoch": 0.95, + "grad_norm": 2.6419157596436613, + "learning_rate": 1.271427604007458e-07, + "loss": 0.7553, + "step": 6211 + }, + { + "epoch": 0.95, + "grad_norm": 2.639956132902984, + "learning_rate": 1.263559447777818e-07, + "loss": 0.7385, + "step": 6212 + }, + { + "epoch": 0.95, + "grad_norm": 2.936008032052921, + "learning_rate": 1.2557155581960913e-07, + "loss": 0.745, + "step": 6213 + }, + { + "epoch": 0.95, + "grad_norm": 2.565648666042382, + "learning_rate": 1.247895937190091e-07, + "loss": 0.7251, + "step": 6214 + }, + { + "epoch": 0.95, + "grad_norm": 2.6059344542245975, + "learning_rate": 1.2401005866816586e-07, + "loss": 0.7442, + "step": 6215 + }, + { + "epoch": 0.95, + "grad_norm": 2.5199423385190025, + "learning_rate": 1.2323295085866827e-07, + "loss": 0.7081, + "step": 6216 + }, + { + "epoch": 0.95, + "grad_norm": 2.766694260160205, + "learning_rate": 1.2245827048150694e-07, + "loss": 0.8202, + "step": 6217 + }, + { + "epoch": 0.95, + "grad_norm": 2.7075914640665246, + "learning_rate": 1.2168601772707622e-07, + "loss": 0.7708, + "step": 6218 + }, + { + "epoch": 0.95, + "grad_norm": 2.623914655169994, + "learning_rate": 1.209161927851754e-07, + "loss": 0.836, + "step": 6219 + }, + { + "epoch": 0.95, + "grad_norm": 2.5301146209908945, + "learning_rate": 1.2014879584500427e-07, + "loss": 0.6935, + "step": 6220 + }, + { + "epoch": 0.95, + "grad_norm": 2.884730413869238, + "learning_rate": 1.193838270951686e-07, + "loss": 0.7502, + "step": 6221 + }, + { + "epoch": 0.95, + "grad_norm": 2.862630433813254, + "learning_rate": 1.1862128672367579e-07, + "loss": 0.7959, + "step": 6222 + }, + { + "epoch": 0.95, + "grad_norm": 2.7660713652174667, + "learning_rate": 1.1786117491793702e-07, + "loss": 0.7949, + "step": 6223 + }, + { + "epoch": 0.95, + "grad_norm": 2.8777554137925074, + "learning_rate": 1.1710349186476732e-07, + "loss": 0.7505, + "step": 6224 + }, + { + "epoch": 0.95, + "grad_norm": 2.6031163077151893, + "learning_rate": 1.1634823775038218e-07, + "loss": 0.7089, + "step": 6225 + }, + { + "epoch": 0.95, + "grad_norm": 3.2398709920206232, + "learning_rate": 1.155954127604042e-07, + "loss": 0.7993, + "step": 6226 + }, + { + "epoch": 0.95, + "grad_norm": 2.7285117964587773, + "learning_rate": 1.148450170798543e-07, + "loss": 0.7544, + "step": 6227 + }, + { + "epoch": 0.95, + "grad_norm": 3.0268938632735596, + "learning_rate": 1.1409705089315826e-07, + "loss": 0.6847, + "step": 6228 + }, + { + "epoch": 0.95, + "grad_norm": 2.6551913985471, + "learning_rate": 1.1335151438414905e-07, + "loss": 0.7259, + "step": 6229 + }, + { + "epoch": 0.95, + "grad_norm": 2.9149685796855356, + "learning_rate": 1.1260840773605452e-07, + "loss": 0.7453, + "step": 6230 + }, + { + "epoch": 0.95, + "grad_norm": 2.519006685781597, + "learning_rate": 1.1186773113151083e-07, + "loss": 0.7421, + "step": 6231 + }, + { + "epoch": 0.95, + "grad_norm": 2.5509960965895147, + "learning_rate": 1.1112948475255681e-07, + "loss": 0.7092, + "step": 6232 + }, + { + "epoch": 0.95, + "grad_norm": 2.5465021594958617, + "learning_rate": 1.1039366878063062e-07, + "loss": 0.7868, + "step": 6233 + }, + { + "epoch": 0.95, + "grad_norm": 2.7905289925773573, + "learning_rate": 1.0966028339657763e-07, + "loss": 0.7965, + "step": 6234 + }, + { + "epoch": 0.95, + "grad_norm": 2.847803802086891, + "learning_rate": 1.0892932878064144e-07, + "loss": 0.8927, + "step": 6235 + }, + { + "epoch": 0.95, + "grad_norm": 2.8952086928132865, + "learning_rate": 1.0820080511247055e-07, + "loss": 0.888, + "step": 6236 + }, + { + "epoch": 0.95, + "grad_norm": 2.726833808841528, + "learning_rate": 1.0747471257111619e-07, + "loss": 0.756, + "step": 6237 + }, + { + "epoch": 0.95, + "grad_norm": 2.6147257812951286, + "learning_rate": 1.0675105133503116e-07, + "loss": 0.6977, + "step": 6238 + }, + { + "epoch": 0.95, + "grad_norm": 2.5060729867642375, + "learning_rate": 1.0602982158206987e-07, + "loss": 0.825, + "step": 6239 + }, + { + "epoch": 0.96, + "grad_norm": 2.403041630415913, + "learning_rate": 1.0531102348949273e-07, + "loss": 0.6932, + "step": 6240 + }, + { + "epoch": 0.96, + "grad_norm": 2.5488272121277284, + "learning_rate": 1.0459465723395956e-07, + "loss": 0.7599, + "step": 6241 + }, + { + "epoch": 0.96, + "grad_norm": 2.6571821875824044, + "learning_rate": 1.0388072299153174e-07, + "loss": 0.7615, + "step": 6242 + }, + { + "epoch": 0.96, + "grad_norm": 2.871469208438707, + "learning_rate": 1.0316922093767556e-07, + "loss": 0.7261, + "step": 6243 + }, + { + "epoch": 0.96, + "grad_norm": 2.6783501614843437, + "learning_rate": 1.0246015124725672e-07, + "loss": 0.7352, + "step": 6244 + }, + { + "epoch": 0.96, + "grad_norm": 2.862113879224904, + "learning_rate": 1.0175351409454693e-07, + "loss": 0.7957, + "step": 6245 + }, + { + "epoch": 0.96, + "grad_norm": 2.5378602149474845, + "learning_rate": 1.0104930965321724e-07, + "loss": 0.7019, + "step": 6246 + }, + { + "epoch": 0.96, + "grad_norm": 2.9359078320937506, + "learning_rate": 1.0034753809634035e-07, + "loss": 0.6046, + "step": 6247 + }, + { + "epoch": 0.96, + "grad_norm": 3.2423297564849984, + "learning_rate": 9.96481995963916e-08, + "loss": 0.738, + "step": 6248 + }, + { + "epoch": 0.96, + "grad_norm": 2.491630187666835, + "learning_rate": 9.89512943252513e-08, + "loss": 0.71, + "step": 6249 + }, + { + "epoch": 0.96, + "grad_norm": 2.6773863279396015, + "learning_rate": 9.825682245419576e-08, + "loss": 0.7626, + "step": 6250 + }, + { + "epoch": 0.96, + "grad_norm": 2.7660942543458504, + "learning_rate": 9.756478415390847e-08, + "loss": 0.7542, + "step": 6251 + }, + { + "epoch": 0.96, + "grad_norm": 2.897450522419471, + "learning_rate": 9.687517959447446e-08, + "loss": 0.7878, + "step": 6252 + }, + { + "epoch": 0.96, + "grad_norm": 2.499684952206905, + "learning_rate": 9.618800894537594e-08, + "loss": 0.7011, + "step": 6253 + }, + { + "epoch": 0.96, + "grad_norm": 2.691932857931784, + "learning_rate": 9.550327237550339e-08, + "loss": 0.792, + "step": 6254 + }, + { + "epoch": 0.96, + "grad_norm": 2.4931560935692882, + "learning_rate": 9.482097005314328e-08, + "loss": 0.8742, + "step": 6255 + }, + { + "epoch": 0.96, + "grad_norm": 2.820129725912161, + "learning_rate": 9.414110214598815e-08, + "loss": 0.6588, + "step": 6256 + }, + { + "epoch": 0.96, + "grad_norm": 2.7829327595354405, + "learning_rate": 9.3463668821131e-08, + "loss": 0.732, + "step": 6257 + }, + { + "epoch": 0.96, + "grad_norm": 2.8675835655307327, + "learning_rate": 9.278867024506421e-08, + "loss": 0.7629, + "step": 6258 + }, + { + "epoch": 0.96, + "grad_norm": 2.736723949619082, + "learning_rate": 9.211610658368397e-08, + "loss": 0.7268, + "step": 6259 + }, + { + "epoch": 0.96, + "grad_norm": 2.5946668669950257, + "learning_rate": 9.144597800228693e-08, + "loss": 0.7606, + "step": 6260 + }, + { + "epoch": 0.96, + "grad_norm": 2.7121437423369064, + "learning_rate": 9.077828466557359e-08, + "loss": 0.8796, + "step": 6261 + }, + { + "epoch": 0.96, + "grad_norm": 2.449222452836164, + "learning_rate": 9.011302673764266e-08, + "loss": 0.7207, + "step": 6262 + }, + { + "epoch": 0.96, + "grad_norm": 2.5211066598013563, + "learning_rate": 8.94502043819967e-08, + "loss": 0.7476, + "step": 6263 + }, + { + "epoch": 0.96, + "grad_norm": 2.7837470093262406, + "learning_rate": 8.878981776153872e-08, + "loss": 0.7562, + "step": 6264 + }, + { + "epoch": 0.96, + "grad_norm": 2.6387684488289946, + "learning_rate": 8.813186703857334e-08, + "loss": 0.73, + "step": 6265 + }, + { + "epoch": 0.96, + "grad_norm": 2.514520139552492, + "learning_rate": 8.747635237480679e-08, + "loss": 0.7326, + "step": 6266 + }, + { + "epoch": 0.96, + "grad_norm": 2.642189124321444, + "learning_rate": 8.68232739313446e-08, + "loss": 0.7199, + "step": 6267 + }, + { + "epoch": 0.96, + "grad_norm": 2.658159002696627, + "learning_rate": 8.617263186869728e-08, + "loss": 0.8156, + "step": 6268 + }, + { + "epoch": 0.96, + "grad_norm": 2.5569705911088567, + "learning_rate": 8.552442634677361e-08, + "loss": 0.7859, + "step": 6269 + }, + { + "epoch": 0.96, + "grad_norm": 2.507594117809746, + "learning_rate": 8.487865752488277e-08, + "loss": 0.6458, + "step": 6270 + }, + { + "epoch": 0.96, + "grad_norm": 2.7535773207528407, + "learning_rate": 8.423532556173896e-08, + "loss": 0.8386, + "step": 6271 + }, + { + "epoch": 0.96, + "grad_norm": 2.4238202953270656, + "learning_rate": 8.359443061545348e-08, + "loss": 0.6733, + "step": 6272 + }, + { + "epoch": 0.96, + "grad_norm": 2.6580601292273744, + "learning_rate": 8.29559728435414e-08, + "loss": 0.7644, + "step": 6273 + }, + { + "epoch": 0.96, + "grad_norm": 2.5288605289633383, + "learning_rate": 8.231995240291613e-08, + "loss": 0.7586, + "step": 6274 + }, + { + "epoch": 0.96, + "grad_norm": 2.7429246054717367, + "learning_rate": 8.168636944989595e-08, + "loss": 0.8357, + "step": 6275 + }, + { + "epoch": 0.96, + "grad_norm": 2.836135023842705, + "learning_rate": 8.105522414019629e-08, + "loss": 0.7551, + "step": 6276 + }, + { + "epoch": 0.96, + "grad_norm": 2.601433221659957, + "learning_rate": 8.04265166289353e-08, + "loss": 0.7847, + "step": 6277 + }, + { + "epoch": 0.96, + "grad_norm": 2.8694538043296087, + "learning_rate": 7.980024707063161e-08, + "loss": 0.7775, + "step": 6278 + }, + { + "epoch": 0.96, + "grad_norm": 2.6236106508897428, + "learning_rate": 7.91764156192043e-08, + "loss": 0.7561, + "step": 6279 + }, + { + "epoch": 0.96, + "grad_norm": 2.7156377840729413, + "learning_rate": 7.85550224279752e-08, + "loss": 0.7812, + "step": 6280 + }, + { + "epoch": 0.96, + "grad_norm": 2.8693681860985936, + "learning_rate": 7.793606764966321e-08, + "loss": 0.7878, + "step": 6281 + }, + { + "epoch": 0.96, + "grad_norm": 2.859687806470752, + "learning_rate": 7.731955143639225e-08, + "loss": 0.7898, + "step": 6282 + }, + { + "epoch": 0.96, + "grad_norm": 2.753093997291165, + "learning_rate": 7.67054739396822e-08, + "loss": 0.7418, + "step": 6283 + }, + { + "epoch": 0.96, + "grad_norm": 2.697051038845374, + "learning_rate": 7.609383531045788e-08, + "loss": 0.7606, + "step": 6284 + }, + { + "epoch": 0.96, + "grad_norm": 2.6081068715682965, + "learning_rate": 7.548463569904241e-08, + "loss": 0.7459, + "step": 6285 + }, + { + "epoch": 0.96, + "grad_norm": 5.493989378746843, + "learning_rate": 7.487787525516044e-08, + "loss": 0.8218, + "step": 6286 + }, + { + "epoch": 0.96, + "grad_norm": 3.1768599720968322, + "learning_rate": 7.427355412793602e-08, + "loss": 0.7762, + "step": 6287 + }, + { + "epoch": 0.96, + "grad_norm": 2.8093572987998923, + "learning_rate": 7.367167246589479e-08, + "loss": 0.7584, + "step": 6288 + }, + { + "epoch": 0.96, + "grad_norm": 2.5319574525103574, + "learning_rate": 7.307223041696177e-08, + "loss": 0.7272, + "step": 6289 + }, + { + "epoch": 0.96, + "grad_norm": 2.7085439960965894, + "learning_rate": 7.247522812846353e-08, + "loss": 0.7297, + "step": 6290 + }, + { + "epoch": 0.96, + "grad_norm": 2.656018882388865, + "learning_rate": 7.188066574712604e-08, + "loss": 0.743, + "step": 6291 + }, + { + "epoch": 0.96, + "grad_norm": 2.854981479330724, + "learning_rate": 7.128854341907688e-08, + "loss": 0.8181, + "step": 6292 + }, + { + "epoch": 0.96, + "grad_norm": 2.7236793623907425, + "learning_rate": 7.069886128984294e-08, + "loss": 0.7819, + "step": 6293 + }, + { + "epoch": 0.96, + "grad_norm": 2.592995981693292, + "learning_rate": 7.011161950435053e-08, + "loss": 0.7192, + "step": 6294 + }, + { + "epoch": 0.96, + "grad_norm": 2.73348258434267, + "learning_rate": 6.952681820692753e-08, + "loss": 0.7506, + "step": 6295 + }, + { + "epoch": 0.96, + "grad_norm": 2.794527286958416, + "learning_rate": 6.89444575413023e-08, + "loss": 0.7659, + "step": 6296 + }, + { + "epoch": 0.96, + "grad_norm": 2.6722789816838124, + "learning_rate": 6.836453765060258e-08, + "loss": 0.7245, + "step": 6297 + }, + { + "epoch": 0.96, + "grad_norm": 2.6796626725806423, + "learning_rate": 6.778705867735657e-08, + "loss": 0.7804, + "step": 6298 + }, + { + "epoch": 0.96, + "grad_norm": 2.795808761513518, + "learning_rate": 6.721202076349187e-08, + "loss": 0.7924, + "step": 6299 + }, + { + "epoch": 0.96, + "grad_norm": 2.578487552690196, + "learning_rate": 6.663942405033763e-08, + "loss": 0.7203, + "step": 6300 + }, + { + "epoch": 0.96, + "grad_norm": 2.912508034176297, + "learning_rate": 6.606926867862129e-08, + "loss": 0.62, + "step": 6301 + }, + { + "epoch": 0.96, + "grad_norm": 2.5523100748647063, + "learning_rate": 6.550155478847075e-08, + "loss": 0.7968, + "step": 6302 + }, + { + "epoch": 0.96, + "grad_norm": 2.8976628589496327, + "learning_rate": 6.493628251941442e-08, + "loss": 0.7717, + "step": 6303 + }, + { + "epoch": 0.96, + "grad_norm": 2.4732197073116837, + "learning_rate": 6.437345201038115e-08, + "loss": 0.7558, + "step": 6304 + }, + { + "epoch": 0.97, + "grad_norm": 2.6265852771064067, + "learning_rate": 6.381306339969806e-08, + "loss": 0.6971, + "step": 6305 + }, + { + "epoch": 0.97, + "grad_norm": 2.573298694817957, + "learning_rate": 6.325511682509277e-08, + "loss": 0.7839, + "step": 6306 + }, + { + "epoch": 0.97, + "grad_norm": 3.0305915808088173, + "learning_rate": 6.269961242369338e-08, + "loss": 0.8317, + "step": 6307 + }, + { + "epoch": 0.97, + "grad_norm": 2.924612836068098, + "learning_rate": 6.214655033202732e-08, + "loss": 0.7161, + "step": 6308 + }, + { + "epoch": 0.97, + "grad_norm": 2.6344706369779183, + "learning_rate": 6.159593068602255e-08, + "loss": 0.6954, + "step": 6309 + }, + { + "epoch": 0.97, + "grad_norm": 2.4740253678071262, + "learning_rate": 6.104775362100301e-08, + "loss": 0.6835, + "step": 6310 + }, + { + "epoch": 0.97, + "grad_norm": 2.807355565461176, + "learning_rate": 6.050201927169875e-08, + "loss": 0.699, + "step": 6311 + }, + { + "epoch": 0.97, + "grad_norm": 2.689978770639496, + "learning_rate": 5.995872777223466e-08, + "loss": 0.8269, + "step": 6312 + }, + { + "epoch": 0.97, + "grad_norm": 2.728285657498286, + "learning_rate": 5.941787925613507e-08, + "loss": 0.748, + "step": 6313 + }, + { + "epoch": 0.97, + "grad_norm": 2.6673535571875577, + "learning_rate": 5.8879473856328084e-08, + "loss": 0.8318, + "step": 6314 + }, + { + "epoch": 0.97, + "grad_norm": 2.858044430958844, + "learning_rate": 5.8343511705136746e-08, + "loss": 0.755, + "step": 6315 + }, + { + "epoch": 0.97, + "grad_norm": 2.6411821646605738, + "learning_rate": 5.780999293428569e-08, + "loss": 0.7643, + "step": 6316 + }, + { + "epoch": 0.97, + "grad_norm": 2.8070989561640527, + "learning_rate": 5.727891767490001e-08, + "loss": 0.8535, + "step": 6317 + }, + { + "epoch": 0.97, + "grad_norm": 2.5051496078033573, + "learning_rate": 5.675028605750199e-08, + "loss": 0.6572, + "step": 6318 + }, + { + "epoch": 0.97, + "grad_norm": 2.635854501865161, + "learning_rate": 5.622409821201436e-08, + "loss": 0.7762, + "step": 6319 + }, + { + "epoch": 0.97, + "grad_norm": 2.9390171766065847, + "learning_rate": 5.5700354267760326e-08, + "loss": 0.7604, + "step": 6320 + }, + { + "epoch": 0.97, + "grad_norm": 2.9946880040924264, + "learning_rate": 5.517905435345916e-08, + "loss": 0.7079, + "step": 6321 + }, + { + "epoch": 0.97, + "grad_norm": 2.561259049882208, + "learning_rate": 5.4660198597235035e-08, + "loss": 0.795, + "step": 6322 + }, + { + "epoch": 0.97, + "grad_norm": 2.6513264646111168, + "learning_rate": 5.414378712660706e-08, + "loss": 0.6741, + "step": 6323 + }, + { + "epoch": 0.97, + "grad_norm": 2.741628132065017, + "learning_rate": 5.36298200684926e-08, + "loss": 0.8026, + "step": 6324 + }, + { + "epoch": 0.97, + "grad_norm": 2.7135972268576647, + "learning_rate": 5.3118297549212826e-08, + "loss": 0.7927, + "step": 6325 + }, + { + "epoch": 0.97, + "grad_norm": 2.848539842298522, + "learning_rate": 5.260921969448496e-08, + "loss": 0.8676, + "step": 6326 + }, + { + "epoch": 0.97, + "grad_norm": 2.789041773355471, + "learning_rate": 5.210258662942669e-08, + "loss": 0.815, + "step": 6327 + }, + { + "epoch": 0.97, + "grad_norm": 2.5460210484795236, + "learning_rate": 5.159839847855175e-08, + "loss": 0.7364, + "step": 6328 + }, + { + "epoch": 0.97, + "grad_norm": 2.7098943104095854, + "learning_rate": 5.109665536577768e-08, + "loss": 0.6325, + "step": 6329 + }, + { + "epoch": 0.97, + "grad_norm": 2.855953216059735, + "learning_rate": 5.059735741441807e-08, + "loss": 0.6845, + "step": 6330 + }, + { + "epoch": 0.97, + "grad_norm": 2.7627575881511905, + "learning_rate": 5.0100504747186974e-08, + "loss": 0.751, + "step": 6331 + }, + { + "epoch": 0.97, + "grad_norm": 2.6643373656366363, + "learning_rate": 4.9606097486195604e-08, + "loss": 0.7861, + "step": 6332 + }, + { + "epoch": 0.97, + "grad_norm": 2.840794267648225, + "learning_rate": 4.911413575295787e-08, + "loss": 0.7249, + "step": 6333 + }, + { + "epoch": 0.97, + "grad_norm": 3.635273402121381, + "learning_rate": 4.8624619668381504e-08, + "loss": 0.6318, + "step": 6334 + }, + { + "epoch": 0.97, + "grad_norm": 2.7977870462313095, + "learning_rate": 4.813754935277581e-08, + "loss": 0.8487, + "step": 6335 + }, + { + "epoch": 0.97, + "grad_norm": 2.6463144854142757, + "learning_rate": 4.765292492585172e-08, + "loss": 0.6637, + "step": 6336 + }, + { + "epoch": 0.97, + "grad_norm": 3.065956639630013, + "learning_rate": 4.717074650671394e-08, + "loss": 0.8322, + "step": 6337 + }, + { + "epoch": 0.97, + "grad_norm": 2.7850962673987762, + "learning_rate": 4.6691014213868794e-08, + "loss": 0.7225, + "step": 6338 + }, + { + "epoch": 0.97, + "grad_norm": 2.9330373089277932, + "learning_rate": 4.621372816522196e-08, + "loss": 0.8766, + "step": 6339 + }, + { + "epoch": 0.97, + "grad_norm": 2.653237142876649, + "learning_rate": 4.573888847807517e-08, + "loss": 0.7426, + "step": 6340 + }, + { + "epoch": 0.97, + "grad_norm": 2.4358443741832705, + "learning_rate": 4.5266495269132846e-08, + "loss": 0.7164, + "step": 6341 + }, + { + "epoch": 0.97, + "grad_norm": 2.392735653487316, + "learning_rate": 4.479654865449545e-08, + "loss": 0.6767, + "step": 6342 + }, + { + "epoch": 0.97, + "grad_norm": 2.972324375045306, + "learning_rate": 4.432904874966171e-08, + "loss": 0.8116, + "step": 6343 + }, + { + "epoch": 0.97, + "grad_norm": 2.380025455047499, + "learning_rate": 4.386399566953081e-08, + "loss": 0.6901, + "step": 6344 + }, + { + "epoch": 0.97, + "grad_norm": 2.771678928999211, + "learning_rate": 4.340138952839912e-08, + "loss": 0.7367, + "step": 6345 + }, + { + "epoch": 0.97, + "grad_norm": 2.557862003826193, + "learning_rate": 4.294123043996235e-08, + "loss": 0.777, + "step": 6346 + }, + { + "epoch": 0.97, + "grad_norm": 2.840869855641693, + "learning_rate": 4.248351851731558e-08, + "loss": 0.7093, + "step": 6347 + }, + { + "epoch": 0.97, + "grad_norm": 2.7641169505849392, + "learning_rate": 4.2028253872949954e-08, + "loss": 0.813, + "step": 6348 + }, + { + "epoch": 0.97, + "grad_norm": 2.7348826985320174, + "learning_rate": 4.157543661875929e-08, + "loss": 0.7661, + "step": 6349 + }, + { + "epoch": 0.97, + "grad_norm": 2.7141646574375673, + "learning_rate": 4.112506686603013e-08, + "loss": 0.7166, + "step": 6350 + }, + { + "epoch": 0.97, + "grad_norm": 2.433956966142259, + "learning_rate": 4.067714472545281e-08, + "loss": 0.6179, + "step": 6351 + }, + { + "epoch": 0.97, + "grad_norm": 2.5873762825931, + "learning_rate": 4.023167030711484e-08, + "loss": 0.7205, + "step": 6352 + }, + { + "epoch": 0.97, + "grad_norm": 2.5709124602943914, + "learning_rate": 3.978864372049973e-08, + "loss": 0.7412, + "step": 6353 + }, + { + "epoch": 0.97, + "grad_norm": 2.894106131249421, + "learning_rate": 3.934806507449041e-08, + "loss": 0.7344, + "step": 6354 + }, + { + "epoch": 0.97, + "grad_norm": 2.649700374213911, + "learning_rate": 3.8909934477370234e-08, + "loss": 0.7375, + "step": 6355 + }, + { + "epoch": 0.97, + "grad_norm": 2.7352121319525016, + "learning_rate": 3.847425203681865e-08, + "loss": 0.6973, + "step": 6356 + }, + { + "epoch": 0.97, + "grad_norm": 2.546676278554904, + "learning_rate": 3.804101785991443e-08, + "loss": 0.8082, + "step": 6357 + }, + { + "epoch": 0.97, + "grad_norm": 2.731368752423272, + "learning_rate": 3.7610232053135745e-08, + "loss": 0.7807, + "step": 6358 + }, + { + "epoch": 0.97, + "grad_norm": 2.600089510254957, + "learning_rate": 3.7181894722355674e-08, + "loss": 0.7551, + "step": 6359 + }, + { + "epoch": 0.97, + "grad_norm": 2.710146340660481, + "learning_rate": 3.675600597284779e-08, + "loss": 0.7711, + "step": 6360 + }, + { + "epoch": 0.97, + "grad_norm": 2.728594166988212, + "learning_rate": 3.6332565909283915e-08, + "loss": 0.7805, + "step": 6361 + }, + { + "epoch": 0.97, + "grad_norm": 3.1455008400453077, + "learning_rate": 3.591157463573303e-08, + "loss": 0.7706, + "step": 6362 + }, + { + "epoch": 0.97, + "grad_norm": 2.7493337696123508, + "learning_rate": 3.5493032255665694e-08, + "loss": 0.7219, + "step": 6363 + }, + { + "epoch": 0.97, + "grad_norm": 2.582480537155428, + "learning_rate": 3.507693887194408e-08, + "loss": 0.7119, + "step": 6364 + }, + { + "epoch": 0.97, + "grad_norm": 2.9089662705341968, + "learning_rate": 3.4663294586835264e-08, + "loss": 0.7601, + "step": 6365 + }, + { + "epoch": 0.97, + "grad_norm": 2.579269195629373, + "learning_rate": 3.425209950200015e-08, + "loss": 0.719, + "step": 6366 + }, + { + "epoch": 0.97, + "grad_norm": 2.9961981717863577, + "learning_rate": 3.3843353718499004e-08, + "loss": 0.7513, + "step": 6367 + }, + { + "epoch": 0.97, + "grad_norm": 2.7274870617059004, + "learning_rate": 3.343705733679037e-08, + "loss": 0.7911, + "step": 6368 + }, + { + "epoch": 0.97, + "grad_norm": 2.6028781172920112, + "learning_rate": 3.3033210456729915e-08, + "loss": 0.8072, + "step": 6369 + }, + { + "epoch": 0.98, + "grad_norm": 2.5206585417321614, + "learning_rate": 3.263181317757269e-08, + "loss": 0.7499, + "step": 6370 + }, + { + "epoch": 0.98, + "grad_norm": 2.89486420226331, + "learning_rate": 3.2232865597969786e-08, + "loss": 0.763, + "step": 6371 + }, + { + "epoch": 0.98, + "grad_norm": 2.4263650727508166, + "learning_rate": 3.183636781597277e-08, + "loss": 0.7045, + "step": 6372 + }, + { + "epoch": 0.98, + "grad_norm": 2.688965635402593, + "learning_rate": 3.144231992902813e-08, + "loss": 0.8026, + "step": 6373 + }, + { + "epoch": 0.98, + "grad_norm": 2.9726884382447123, + "learning_rate": 3.105072203398285e-08, + "loss": 0.8126, + "step": 6374 + }, + { + "epoch": 0.98, + "grad_norm": 2.5089298805790174, + "learning_rate": 3.0661574227081046e-08, + "loss": 0.82, + "step": 6375 + }, + { + "epoch": 0.98, + "grad_norm": 2.6440762681304477, + "learning_rate": 3.0274876603962885e-08, + "loss": 0.7883, + "step": 6376 + }, + { + "epoch": 0.98, + "grad_norm": 2.514536365196918, + "learning_rate": 2.989062925966901e-08, + "loss": 0.7623, + "step": 6377 + }, + { + "epoch": 0.98, + "grad_norm": 2.4245283656582233, + "learning_rate": 2.95088322886361e-08, + "loss": 0.7147, + "step": 6378 + }, + { + "epoch": 0.98, + "grad_norm": 2.896910981296976, + "learning_rate": 2.9129485784699096e-08, + "loss": 0.8048, + "step": 6379 + }, + { + "epoch": 0.98, + "grad_norm": 2.694298363324352, + "learning_rate": 2.8752589841092306e-08, + "loss": 0.7603, + "step": 6380 + }, + { + "epoch": 0.98, + "grad_norm": 2.7538631724452465, + "learning_rate": 2.8378144550443852e-08, + "loss": 0.8414, + "step": 6381 + }, + { + "epoch": 0.98, + "grad_norm": 2.607625116690004, + "learning_rate": 2.8006150004782352e-08, + "loss": 0.7615, + "step": 6382 + }, + { + "epoch": 0.98, + "grad_norm": 2.7452716377783775, + "learning_rate": 2.7636606295534664e-08, + "loss": 0.7256, + "step": 6383 + }, + { + "epoch": 0.98, + "grad_norm": 2.6681791298014645, + "learning_rate": 2.7269513513523692e-08, + "loss": 0.7814, + "step": 6384 + }, + { + "epoch": 0.98, + "grad_norm": 2.785266623894804, + "learning_rate": 2.6904871748970607e-08, + "loss": 0.6774, + "step": 6385 + }, + { + "epoch": 0.98, + "grad_norm": 2.548359718667786, + "learning_rate": 2.6542681091493715e-08, + "loss": 0.6805, + "step": 6386 + }, + { + "epoch": 0.98, + "grad_norm": 2.8727987933271812, + "learning_rate": 2.6182941630109594e-08, + "loss": 0.7931, + "step": 6387 + }, + { + "epoch": 0.98, + "grad_norm": 2.954983354484794, + "learning_rate": 2.5825653453233067e-08, + "loss": 0.84, + "step": 6388 + }, + { + "epoch": 0.98, + "grad_norm": 2.4906140254521496, + "learning_rate": 2.547081664867501e-08, + "loss": 0.6294, + "step": 6389 + }, + { + "epoch": 0.98, + "grad_norm": 2.8461489930565773, + "learning_rate": 2.511843130364233e-08, + "loss": 0.8024, + "step": 6390 + }, + { + "epoch": 0.98, + "grad_norm": 2.539588939846344, + "learning_rate": 2.4768497504744637e-08, + "loss": 0.7963, + "step": 6391 + }, + { + "epoch": 0.98, + "grad_norm": 2.551724315283847, + "learning_rate": 2.4421015337984244e-08, + "loss": 0.7367, + "step": 6392 + }, + { + "epoch": 0.98, + "grad_norm": 2.792982810540755, + "learning_rate": 2.4075984888762838e-08, + "loss": 0.7481, + "step": 6393 + }, + { + "epoch": 0.98, + "grad_norm": 2.6340251459926645, + "learning_rate": 2.3733406241878145e-08, + "loss": 0.7786, + "step": 6394 + }, + { + "epoch": 0.98, + "grad_norm": 2.795483266771458, + "learning_rate": 2.3393279481527255e-08, + "loss": 0.7786, + "step": 6395 + }, + { + "epoch": 0.98, + "grad_norm": 2.7073715073593534, + "learning_rate": 2.305560469130552e-08, + "loss": 0.7669, + "step": 6396 + }, + { + "epoch": 0.98, + "grad_norm": 2.5520780876011058, + "learning_rate": 2.2720381954201008e-08, + "loss": 0.6954, + "step": 6397 + }, + { + "epoch": 0.98, + "grad_norm": 2.6048219054047697, + "learning_rate": 2.2387611352603365e-08, + "loss": 0.7811, + "step": 6398 + }, + { + "epoch": 0.98, + "grad_norm": 2.6189944256832876, + "learning_rate": 2.2057292968298284e-08, + "loss": 0.7038, + "step": 6399 + }, + { + "epoch": 0.98, + "grad_norm": 2.615322166675953, + "learning_rate": 2.1729426882468596e-08, + "loss": 0.7856, + "step": 6400 + }, + { + "epoch": 0.98, + "grad_norm": 2.950085694256778, + "learning_rate": 2.1404013175694292e-08, + "loss": 0.7141, + "step": 6401 + }, + { + "epoch": 0.98, + "grad_norm": 2.7197599076092187, + "learning_rate": 2.1081051927953623e-08, + "loss": 0.726, + "step": 6402 + }, + { + "epoch": 0.98, + "grad_norm": 2.9021177208654376, + "learning_rate": 2.0760543218621976e-08, + "loss": 0.7622, + "step": 6403 + }, + { + "epoch": 0.98, + "grad_norm": 2.5346673680250844, + "learning_rate": 2.044248712646968e-08, + "loss": 0.6864, + "step": 6404 + }, + { + "epoch": 0.98, + "grad_norm": 2.7369348943349383, + "learning_rate": 2.0126883729667534e-08, + "loss": 0.7364, + "step": 6405 + }, + { + "epoch": 0.98, + "grad_norm": 2.534941329247269, + "learning_rate": 1.9813733105780163e-08, + "loss": 0.741, + "step": 6406 + }, + { + "epoch": 0.98, + "grad_norm": 2.9269037015104162, + "learning_rate": 1.950303533177378e-08, + "loss": 0.7941, + "step": 6407 + }, + { + "epoch": 0.98, + "grad_norm": 2.6677188601438586, + "learning_rate": 1.91947904840073e-08, + "loss": 0.7969, + "step": 6408 + }, + { + "epoch": 0.98, + "grad_norm": 2.5997357128251033, + "learning_rate": 1.88889986382379e-08, + "loss": 0.7151, + "step": 6409 + }, + { + "epoch": 0.98, + "grad_norm": 2.7504245622344503, + "learning_rate": 1.858565986962324e-08, + "loss": 0.7679, + "step": 6410 + }, + { + "epoch": 0.98, + "grad_norm": 2.626561601579103, + "learning_rate": 1.8284774252713688e-08, + "loss": 0.7941, + "step": 6411 + }, + { + "epoch": 0.98, + "grad_norm": 2.771230928418999, + "learning_rate": 1.7986341861458976e-08, + "loss": 0.7155, + "step": 6412 + }, + { + "epoch": 0.98, + "grad_norm": 2.696959350794206, + "learning_rate": 1.7690362769205993e-08, + "loss": 0.747, + "step": 6413 + }, + { + "epoch": 0.98, + "grad_norm": 3.13649995719672, + "learning_rate": 1.7396837048696547e-08, + "loss": 0.8031, + "step": 6414 + }, + { + "epoch": 0.98, + "grad_norm": 2.6578860908672857, + "learning_rate": 1.710576477207293e-08, + "loss": 0.803, + "step": 6415 + }, + { + "epoch": 0.98, + "grad_norm": 2.844554639142442, + "learning_rate": 1.6817146010871255e-08, + "loss": 0.7004, + "step": 6416 + }, + { + "epoch": 0.98, + "grad_norm": 2.5950797820771916, + "learning_rate": 1.653098083602478e-08, + "loss": 0.7316, + "step": 6417 + }, + { + "epoch": 0.98, + "grad_norm": 2.939752922140131, + "learning_rate": 1.6247269317868353e-08, + "loss": 0.7377, + "step": 6418 + }, + { + "epoch": 0.98, + "grad_norm": 2.654338826560342, + "learning_rate": 1.596601152612731e-08, + "loss": 0.8092, + "step": 6419 + }, + { + "epoch": 0.98, + "grad_norm": 2.55037502648767, + "learning_rate": 1.5687207529927472e-08, + "loss": 0.6677, + "step": 6420 + }, + { + "epoch": 0.98, + "grad_norm": 2.4745890974533937, + "learning_rate": 1.54108573977918e-08, + "loss": 0.6845, + "step": 6421 + }, + { + "epoch": 0.98, + "grad_norm": 3.352244373978386, + "learning_rate": 1.51369611976393e-08, + "loss": 0.7089, + "step": 6422 + }, + { + "epoch": 0.98, + "grad_norm": 2.67104638009709, + "learning_rate": 1.4865518996786122e-08, + "loss": 0.8284, + "step": 6423 + }, + { + "epoch": 0.98, + "grad_norm": 2.9781450975243584, + "learning_rate": 1.4596530861944458e-08, + "loss": 0.7792, + "step": 6424 + }, + { + "epoch": 0.98, + "grad_norm": 2.5664080284926336, + "learning_rate": 1.432999685922365e-08, + "loss": 0.8157, + "step": 6425 + }, + { + "epoch": 0.98, + "grad_norm": 2.7769335025657664, + "learning_rate": 1.4065917054132405e-08, + "loss": 0.8154, + "step": 6426 + }, + { + "epoch": 0.98, + "grad_norm": 2.649007252116273, + "learning_rate": 1.3804291511572144e-08, + "loss": 0.8174, + "step": 6427 + }, + { + "epoch": 0.98, + "grad_norm": 2.926878045065561, + "learning_rate": 1.3545120295843651e-08, + "loss": 0.7972, + "step": 6428 + }, + { + "epoch": 0.98, + "grad_norm": 2.8927224247284036, + "learning_rate": 1.3288403470643751e-08, + "loss": 0.7388, + "step": 6429 + }, + { + "epoch": 0.98, + "grad_norm": 2.7844682550817415, + "learning_rate": 1.3034141099066422e-08, + "loss": 0.7739, + "step": 6430 + }, + { + "epoch": 0.98, + "grad_norm": 2.6199308848828426, + "learning_rate": 1.2782333243601675e-08, + "loss": 0.8188, + "step": 6431 + }, + { + "epoch": 0.98, + "grad_norm": 2.5090276146240362, + "learning_rate": 1.2532979966138892e-08, + "loss": 0.7832, + "step": 6432 + }, + { + "epoch": 0.98, + "grad_norm": 2.7804406982385927, + "learning_rate": 1.2286081327959055e-08, + "loss": 0.6882, + "step": 6433 + }, + { + "epoch": 0.98, + "grad_norm": 3.2823754032742363, + "learning_rate": 1.2041637389745842e-08, + "loss": 0.8219, + "step": 6434 + }, + { + "epoch": 0.98, + "grad_norm": 2.592159073695726, + "learning_rate": 1.1799648211574533e-08, + "loss": 0.8091, + "step": 6435 + }, + { + "epoch": 0.99, + "grad_norm": 2.667384164685074, + "learning_rate": 1.1560113852919774e-08, + "loss": 0.7555, + "step": 6436 + }, + { + "epoch": 0.99, + "grad_norm": 2.6250053411416956, + "learning_rate": 1.1323034372653364e-08, + "loss": 0.7027, + "step": 6437 + }, + { + "epoch": 0.99, + "grad_norm": 2.558165207717662, + "learning_rate": 1.1088409829042023e-08, + "loss": 0.6945, + "step": 6438 + }, + { + "epoch": 0.99, + "grad_norm": 2.554335993292053, + "learning_rate": 1.0856240279750741e-08, + "loss": 0.7743, + "step": 6439 + }, + { + "epoch": 0.99, + "grad_norm": 2.6211979677469794, + "learning_rate": 1.0626525781838316e-08, + "loss": 0.6903, + "step": 6440 + }, + { + "epoch": 0.99, + "grad_norm": 2.5971448988828993, + "learning_rate": 1.0399266391764029e-08, + "loss": 0.7464, + "step": 6441 + }, + { + "epoch": 0.99, + "grad_norm": 2.5474481779950495, + "learning_rate": 1.0174462165380983e-08, + "loss": 0.8297, + "step": 6442 + }, + { + "epoch": 0.99, + "grad_norm": 2.4791215754553084, + "learning_rate": 9.952113157940534e-09, + "loss": 0.7697, + "step": 6443 + }, + { + "epoch": 0.99, + "grad_norm": 2.479645255730251, + "learning_rate": 9.732219424087863e-09, + "loss": 0.7865, + "step": 6444 + }, + { + "epoch": 0.99, + "grad_norm": 2.620899042191569, + "learning_rate": 9.514781017869734e-09, + "loss": 0.6784, + "step": 6445 + }, + { + "epoch": 0.99, + "grad_norm": 2.7710163417045157, + "learning_rate": 9.299797992724514e-09, + "loss": 0.7089, + "step": 6446 + }, + { + "epoch": 0.99, + "grad_norm": 2.6119013634252335, + "learning_rate": 9.087270401488823e-09, + "loss": 0.7556, + "step": 6447 + }, + { + "epoch": 0.99, + "grad_norm": 2.5940185225330477, + "learning_rate": 8.877198296396438e-09, + "loss": 0.6927, + "step": 6448 + }, + { + "epoch": 0.99, + "grad_norm": 2.5758034981359446, + "learning_rate": 8.66958172907717e-09, + "loss": 0.705, + "step": 6449 + }, + { + "epoch": 0.99, + "grad_norm": 3.4943602171675057, + "learning_rate": 8.464420750556868e-09, + "loss": 0.8198, + "step": 6450 + }, + { + "epoch": 0.99, + "grad_norm": 2.5946902014802995, + "learning_rate": 8.26171541125964e-09, + "loss": 0.7201, + "step": 6451 + }, + { + "epoch": 0.99, + "grad_norm": 2.5390879531895627, + "learning_rate": 8.061465761003417e-09, + "loss": 0.8203, + "step": 6452 + }, + { + "epoch": 0.99, + "grad_norm": 2.982939394034132, + "learning_rate": 7.863671849004384e-09, + "loss": 0.8149, + "step": 6453 + }, + { + "epoch": 0.99, + "grad_norm": 2.6976834349346293, + "learning_rate": 7.668333723874765e-09, + "loss": 0.8409, + "step": 6454 + }, + { + "epoch": 0.99, + "grad_norm": 2.7794087904559786, + "learning_rate": 7.475451433623936e-09, + "loss": 0.7434, + "step": 6455 + }, + { + "epoch": 0.99, + "grad_norm": 2.7798006349430144, + "learning_rate": 7.2850250256562e-09, + "loss": 0.8585, + "step": 6456 + }, + { + "epoch": 0.99, + "grad_norm": 2.749781684197471, + "learning_rate": 7.097054546773008e-09, + "loss": 0.7369, + "step": 6457 + }, + { + "epoch": 0.99, + "grad_norm": 2.754818961661538, + "learning_rate": 6.911540043171849e-09, + "loss": 0.7442, + "step": 6458 + }, + { + "epoch": 0.99, + "grad_norm": 2.4299632918254526, + "learning_rate": 6.728481560448474e-09, + "loss": 0.5999, + "step": 6459 + }, + { + "epoch": 0.99, + "grad_norm": 2.691427186417734, + "learning_rate": 6.54787914359134e-09, + "loss": 0.7845, + "step": 6460 + }, + { + "epoch": 0.99, + "grad_norm": 2.47678081900891, + "learning_rate": 6.369732836989384e-09, + "loss": 0.7412, + "step": 6461 + }, + { + "epoch": 0.99, + "grad_norm": 2.7436779125657464, + "learning_rate": 6.194042684425361e-09, + "loss": 0.7747, + "step": 6462 + }, + { + "epoch": 0.99, + "grad_norm": 2.8750941731143174, + "learning_rate": 6.0208087290780645e-09, + "loss": 0.7888, + "step": 6463 + }, + { + "epoch": 0.99, + "grad_norm": 2.770040007515961, + "learning_rate": 5.850031013524549e-09, + "loss": 0.6909, + "step": 6464 + }, + { + "epoch": 0.99, + "grad_norm": 2.6684774674425613, + "learning_rate": 5.681709579737904e-09, + "loss": 0.7019, + "step": 6465 + }, + { + "epoch": 0.99, + "grad_norm": 3.0274878532422287, + "learning_rate": 5.515844469085041e-09, + "loss": 0.7444, + "step": 6466 + }, + { + "epoch": 0.99, + "grad_norm": 2.5402296932243495, + "learning_rate": 5.352435722332238e-09, + "loss": 0.6953, + "step": 6467 + }, + { + "epoch": 0.99, + "grad_norm": 2.6199879604104117, + "learning_rate": 5.191483379639595e-09, + "loss": 0.7152, + "step": 6468 + }, + { + "epoch": 0.99, + "grad_norm": 2.5531274187025073, + "learning_rate": 5.0329874805654656e-09, + "loss": 0.7096, + "step": 6469 + }, + { + "epoch": 0.99, + "grad_norm": 2.634831711496866, + "learning_rate": 4.876948064064246e-09, + "loss": 0.7461, + "step": 6470 + }, + { + "epoch": 0.99, + "grad_norm": 2.3984961627115484, + "learning_rate": 4.723365168485261e-09, + "loss": 0.7232, + "step": 6471 + }, + { + "epoch": 0.99, + "grad_norm": 2.4114883364205943, + "learning_rate": 4.572238831574982e-09, + "loss": 0.717, + "step": 6472 + }, + { + "epoch": 0.99, + "grad_norm": 2.5541673170994525, + "learning_rate": 4.4235690904759206e-09, + "loss": 0.8181, + "step": 6473 + }, + { + "epoch": 0.99, + "grad_norm": 2.7352721716539095, + "learning_rate": 4.277355981727738e-09, + "loss": 0.6996, + "step": 6474 + }, + { + "epoch": 0.99, + "grad_norm": 2.7112077492868942, + "learning_rate": 4.133599541265021e-09, + "loss": 0.7444, + "step": 6475 + }, + { + "epoch": 0.99, + "grad_norm": 2.8709744528710486, + "learning_rate": 3.992299804418398e-09, + "loss": 0.7478, + "step": 6476 + }, + { + "epoch": 0.99, + "grad_norm": 2.830825498249224, + "learning_rate": 3.853456805915645e-09, + "loss": 0.8736, + "step": 6477 + }, + { + "epoch": 0.99, + "grad_norm": 2.7189102796459337, + "learning_rate": 3.7170705798816875e-09, + "loss": 0.717, + "step": 6478 + }, + { + "epoch": 0.99, + "grad_norm": 3.2342398789984927, + "learning_rate": 3.583141159834158e-09, + "loss": 0.801, + "step": 6479 + }, + { + "epoch": 0.99, + "grad_norm": 2.80624204172367, + "learning_rate": 3.4516685786922798e-09, + "loss": 0.8927, + "step": 6480 + }, + { + "epoch": 0.99, + "grad_norm": 2.8881355207422845, + "learning_rate": 3.322652868764653e-09, + "loss": 0.7925, + "step": 6481 + }, + { + "epoch": 0.99, + "grad_norm": 2.833888014886496, + "learning_rate": 3.196094061762578e-09, + "loss": 0.8012, + "step": 6482 + }, + { + "epoch": 0.99, + "grad_norm": 2.6885761878683825, + "learning_rate": 3.071992188790063e-09, + "loss": 0.8459, + "step": 6483 + }, + { + "epoch": 0.99, + "grad_norm": 2.6920429486726296, + "learning_rate": 2.9503472803471546e-09, + "loss": 0.7366, + "step": 6484 + }, + { + "epoch": 0.99, + "grad_norm": 2.793627620980447, + "learning_rate": 2.831159366331049e-09, + "loss": 0.7325, + "step": 6485 + }, + { + "epoch": 0.99, + "grad_norm": 2.4166025338417283, + "learning_rate": 2.7144284760349804e-09, + "loss": 0.6311, + "step": 6486 + }, + { + "epoch": 0.99, + "grad_norm": 2.788670358621249, + "learning_rate": 2.600154638148222e-09, + "loss": 0.7814, + "step": 6487 + }, + { + "epoch": 0.99, + "grad_norm": 2.7429804605417925, + "learning_rate": 2.488337880754976e-09, + "loss": 0.8176, + "step": 6488 + }, + { + "epoch": 0.99, + "grad_norm": 2.699912986291421, + "learning_rate": 2.378978231338813e-09, + "loss": 0.7569, + "step": 6489 + }, + { + "epoch": 0.99, + "grad_norm": 2.6965672451161726, + "learning_rate": 2.272075716774902e-09, + "loss": 0.711, + "step": 6490 + }, + { + "epoch": 0.99, + "grad_norm": 2.637709653615361, + "learning_rate": 2.167630363338891e-09, + "loss": 0.786, + "step": 6491 + }, + { + "epoch": 0.99, + "grad_norm": 2.7864743894646486, + "learning_rate": 2.065642196699136e-09, + "loss": 0.7065, + "step": 6492 + }, + { + "epoch": 0.99, + "grad_norm": 2.5309378266446063, + "learning_rate": 1.966111241922253e-09, + "loss": 0.7645, + "step": 6493 + }, + { + "epoch": 0.99, + "grad_norm": 2.9183202809954496, + "learning_rate": 1.8690375234697854e-09, + "loss": 0.7266, + "step": 6494 + }, + { + "epoch": 0.99, + "grad_norm": 2.8177372518135932, + "learning_rate": 1.7744210651993167e-09, + "loss": 0.7258, + "step": 6495 + }, + { + "epoch": 0.99, + "grad_norm": 2.797322429322555, + "learning_rate": 1.6822618903655773e-09, + "loss": 0.7366, + "step": 6496 + }, + { + "epoch": 0.99, + "grad_norm": 2.6958226240614174, + "learning_rate": 1.592560021618228e-09, + "loss": 0.7854, + "step": 6497 + }, + { + "epoch": 0.99, + "grad_norm": 2.5733350687801146, + "learning_rate": 1.5053154810040772e-09, + "loss": 0.6753, + "step": 6498 + }, + { + "epoch": 0.99, + "grad_norm": 2.8514235606033753, + "learning_rate": 1.4205282899659722e-09, + "loss": 0.7067, + "step": 6499 + }, + { + "epoch": 0.99, + "grad_norm": 3.0375905676960664, + "learning_rate": 1.3381984693405793e-09, + "loss": 0.7599, + "step": 6500 + }, + { + "epoch": 1.0, + "grad_norm": 3.483747166388334, + "learning_rate": 1.2583260393628226e-09, + "loss": 0.7857, + "step": 6501 + }, + { + "epoch": 1.0, + "grad_norm": 2.8775232736931855, + "learning_rate": 1.1809110196636663e-09, + "loss": 0.8218, + "step": 6502 + }, + { + "epoch": 1.0, + "grad_norm": 3.0171976204757547, + "learning_rate": 1.1059534292690022e-09, + "loss": 0.7009, + "step": 6503 + }, + { + "epoch": 1.0, + "grad_norm": 2.5790803766513988, + "learning_rate": 1.0334532866007608e-09, + "loss": 0.7804, + "step": 6504 + }, + { + "epoch": 1.0, + "grad_norm": 2.8344092373568186, + "learning_rate": 9.634106094791317e-10, + "loss": 0.7493, + "step": 6505 + }, + { + "epoch": 1.0, + "grad_norm": 2.5103505676785516, + "learning_rate": 8.958254151170131e-10, + "loss": 0.7996, + "step": 6506 + }, + { + "epoch": 1.0, + "grad_norm": 2.6642999642212963, + "learning_rate": 8.30697720126672e-10, + "loss": 0.6592, + "step": 6507 + }, + { + "epoch": 1.0, + "grad_norm": 2.861760072908654, + "learning_rate": 7.680275405130833e-10, + "loss": 0.749, + "step": 6508 + }, + { + "epoch": 1.0, + "grad_norm": 2.5797148250536144, + "learning_rate": 7.078148916783711e-10, + "loss": 0.7418, + "step": 6509 + }, + { + "epoch": 1.0, + "grad_norm": 2.572541953632293, + "learning_rate": 6.500597884229188e-10, + "loss": 0.7554, + "step": 6510 + }, + { + "epoch": 1.0, + "grad_norm": 2.605496462403277, + "learning_rate": 5.947622449409274e-10, + "loss": 0.8051, + "step": 6511 + }, + { + "epoch": 1.0, + "grad_norm": 2.316767161854669, + "learning_rate": 5.419222748226372e-10, + "loss": 0.7009, + "step": 6512 + }, + { + "epoch": 1.0, + "grad_norm": 2.547132182034208, + "learning_rate": 4.915398910532166e-10, + "loss": 0.8116, + "step": 6513 + }, + { + "epoch": 1.0, + "grad_norm": 2.6269179574986836, + "learning_rate": 4.436151060183136e-10, + "loss": 0.7224, + "step": 6514 + }, + { + "epoch": 1.0, + "grad_norm": 2.8338788225773315, + "learning_rate": 3.9814793149295373e-10, + "loss": 0.7944, + "step": 6515 + }, + { + "epoch": 1.0, + "grad_norm": 2.8067035466603394, + "learning_rate": 3.5513837865486236e-10, + "loss": 0.8032, + "step": 6516 + }, + { + "epoch": 1.0, + "grad_norm": 2.550073722813949, + "learning_rate": 3.145864580722524e-10, + "loss": 0.7207, + "step": 6517 + }, + { + "epoch": 1.0, + "grad_norm": 2.636265469611505, + "learning_rate": 2.764921797138165e-10, + "loss": 0.7546, + "step": 6518 + }, + { + "epoch": 1.0, + "grad_norm": 2.616310209319721, + "learning_rate": 2.4085555293984484e-10, + "loss": 0.8064, + "step": 6519 + }, + { + "epoch": 1.0, + "grad_norm": 2.817876730596737, + "learning_rate": 2.0767658650999723e-10, + "loss": 0.7837, + "step": 6520 + }, + { + "epoch": 1.0, + "grad_norm": 2.4123576010128955, + "learning_rate": 1.7695528857886169e-10, + "loss": 0.7163, + "step": 6521 + }, + { + "epoch": 1.0, + "grad_norm": 2.801149751350197, + "learning_rate": 1.4869166669595482e-10, + "loss": 0.791, + "step": 6522 + }, + { + "epoch": 1.0, + "grad_norm": 2.669239979861841, + "learning_rate": 1.2288572780905227e-10, + "loss": 0.6997, + "step": 6523 + }, + { + "epoch": 1.0, + "grad_norm": 3.0859647495600746, + "learning_rate": 9.95374782586378e-11, + "loss": 0.7652, + "step": 6524 + }, + { + "epoch": 1.0, + "grad_norm": 2.8567736532925334, + "learning_rate": 7.86469237845644e-11, + "loss": 0.8433, + "step": 6525 + }, + { + "epoch": 1.0, + "grad_norm": 2.95683604369921, + "learning_rate": 6.021406952161357e-11, + "loss": 0.8292, + "step": 6526 + }, + { + "epoch": 1.0, + "grad_norm": 2.5110923123790236, + "learning_rate": 4.423891999838503e-11, + "loss": 0.7575, + "step": 6527 + }, + { + "epoch": 1.0, + "grad_norm": 2.3604571293366843, + "learning_rate": 3.072147914284784e-11, + "loss": 0.6375, + "step": 6528 + }, + { + "epoch": 1.0, + "grad_norm": 2.6002150822854406, + "learning_rate": 1.966175027567907e-11, + "loss": 0.6676, + "step": 6529 + }, + { + "epoch": 1.0, + "grad_norm": 2.781193187571927, + "learning_rate": 1.1059736115814901e-11, + "loss": 0.7323, + "step": 6530 + }, + { + "epoch": 1.0, + "grad_norm": 2.670757867546929, + "learning_rate": 4.915438777119974e-12, + "loss": 0.7783, + "step": 6531 + }, + { + "epoch": 1.0, + "grad_norm": 2.786026911255166, + "learning_rate": 1.2288597694976034e-12, + "loss": 0.6062, + "step": 6532 + }, + { + "epoch": 1.0, + "grad_norm": 4.022792001865195, + "learning_rate": 0.0, + "loss": 0.8392, + "step": 6533 + }, + { + "epoch": 1.0, + "step": 6533, + "total_flos": 6195132391227392.0, + "train_loss": 0.8881614911151154, + "train_runtime": 203520.9615, + "train_samples_per_second": 4.109, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 6533, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 400, + "total_flos": 6195132391227392.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}