diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20806 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 10.0, + "global_step": 2968, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013477088948787063, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.4965, + "step": 1 + }, + { + "epoch": 0.0026954177897574125, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.7316, + "step": 2 + }, + { + "epoch": 0.004043126684636119, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.7793, + "step": 3 + }, + { + "epoch": 0.005390835579514825, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.4895, + "step": 4 + }, + { + "epoch": 0.006738544474393531, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.8158, + "step": 5 + }, + { + "epoch": 0.008086253369272238, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.7961, + "step": 6 + }, + { + "epoch": 0.009433962264150943, + "grad_norm": 125.61207719089278, + "learning_rate": 1.1111111111111112e-07, + "loss": 1.8011, + "step": 7 + }, + { + "epoch": 0.01078167115902965, + "grad_norm": 98.23506086937144, + "learning_rate": 2.2222222222222224e-07, + "loss": 1.3899, + "step": 8 + }, + { + "epoch": 0.012129380053908356, + "grad_norm": 118.21454575937221, + "learning_rate": 3.3333333333333335e-07, + "loss": 1.6654, + "step": 9 + }, + { + "epoch": 0.013477088948787063, + "grad_norm": 126.2446047171038, + "learning_rate": 4.444444444444445e-07, + "loss": 1.7041, + "step": 10 + }, + { + "epoch": 0.014824797843665768, + "grad_norm": 110.73046762632127, + "learning_rate": 5.555555555555555e-07, + "loss": 1.5553, + "step": 11 + }, + { + "epoch": 0.016172506738544475, + "grad_norm": 63.309686670366844, + "learning_rate": 6.666666666666667e-07, + "loss": 1.2495, + "step": 12 + }, + { + "epoch": 0.01752021563342318, + "grad_norm": 52.8489242206399, + "learning_rate": 7.777777777777779e-07, + "loss": 1.1774, + "step": 13 + }, + { + "epoch": 0.018867924528301886, + "grad_norm": 32.96070712156255, + "learning_rate": 8.88888888888889e-07, + "loss": 1.0179, + "step": 14 + }, + { + "epoch": 0.02021563342318059, + "grad_norm": 35.537448147274496, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0232, + "step": 15 + }, + { + "epoch": 0.0215633423180593, + "grad_norm": 73.41324772720895, + "learning_rate": 1.111111111111111e-06, + "loss": 1.035, + "step": 16 + }, + { + "epoch": 0.022911051212938006, + "grad_norm": 73.41324772720895, + "learning_rate": 1.111111111111111e-06, + "loss": 1.1794, + "step": 17 + }, + { + "epoch": 0.02425876010781671, + "grad_norm": 109.6299108731624, + "learning_rate": 1.2222222222222223e-06, + "loss": 1.2452, + "step": 18 + }, + { + "epoch": 0.025606469002695417, + "grad_norm": 117.8970881867975, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.244, + "step": 19 + }, + { + "epoch": 0.026954177897574125, + "grad_norm": 139.8707056543549, + "learning_rate": 1.4444444444444445e-06, + "loss": 1.3268, + "step": 20 + }, + { + "epoch": 0.02830188679245283, + "grad_norm": 127.24603490377721, + "learning_rate": 1.5555555555555558e-06, + "loss": 1.2716, + "step": 21 + }, + { + "epoch": 0.029649595687331536, + "grad_norm": 100.79834986663668, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.9979, + "step": 22 + }, + { + "epoch": 0.03099730458221024, + "grad_norm": 83.10285838411271, + "learning_rate": 1.777777777777778e-06, + "loss": 0.9061, + "step": 23 + }, + { + "epoch": 0.03234501347708895, + "grad_norm": 46.828286103613564, + "learning_rate": 1.888888888888889e-06, + "loss": 0.7808, + "step": 24 + }, + { + "epoch": 0.03369272237196765, + "grad_norm": 26.9875948959685, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6967, + "step": 25 + }, + { + "epoch": 0.03504043126684636, + "grad_norm": 25.53072818703274, + "learning_rate": 2.1111111111111114e-06, + "loss": 0.6744, + "step": 26 + }, + { + "epoch": 0.03638814016172507, + "grad_norm": 43.37963600215304, + "learning_rate": 2.222222222222222e-06, + "loss": 0.6779, + "step": 27 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 35.14058514329771, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.6255, + "step": 28 + }, + { + "epoch": 0.03908355795148248, + "grad_norm": 45.69048246290026, + "learning_rate": 2.4444444444444447e-06, + "loss": 0.6325, + "step": 29 + }, + { + "epoch": 0.04043126684636118, + "grad_norm": 44.69113961684672, + "learning_rate": 2.5555555555555557e-06, + "loss": 0.5578, + "step": 30 + }, + { + "epoch": 0.04177897574123989, + "grad_norm": 37.17052342819259, + "learning_rate": 2.666666666666667e-06, + "loss": 0.5745, + "step": 31 + }, + { + "epoch": 0.0431266846361186, + "grad_norm": 28.10252538244143, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.501, + "step": 32 + }, + { + "epoch": 0.0444743935309973, + "grad_norm": 14.027573666035739, + "learning_rate": 2.888888888888889e-06, + "loss": 0.517, + "step": 33 + }, + { + "epoch": 0.04582210242587601, + "grad_norm": 13.738232776182715, + "learning_rate": 3e-06, + "loss": 0.4633, + "step": 34 + }, + { + "epoch": 0.04716981132075472, + "grad_norm": 27.550121109589607, + "learning_rate": 3.1111111111111116e-06, + "loss": 0.4749, + "step": 35 + }, + { + "epoch": 0.04851752021563342, + "grad_norm": 28.307537576758794, + "learning_rate": 3.2222222222222227e-06, + "loss": 0.4444, + "step": 36 + }, + { + "epoch": 0.04986522911051213, + "grad_norm": 56.555770594277284, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5176, + "step": 37 + }, + { + "epoch": 0.05121293800539083, + "grad_norm": 49.06620593547809, + "learning_rate": 3.444444444444445e-06, + "loss": 0.4832, + "step": 38 + }, + { + "epoch": 0.05256064690026954, + "grad_norm": 34.84067743965358, + "learning_rate": 3.555555555555556e-06, + "loss": 0.402, + "step": 39 + }, + { + "epoch": 0.05390835579514825, + "grad_norm": 45.606628886410945, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.4299, + "step": 40 + }, + { + "epoch": 0.05525606469002695, + "grad_norm": 23.779030150082647, + "learning_rate": 3.777777777777778e-06, + "loss": 0.3801, + "step": 41 + }, + { + "epoch": 0.05660377358490566, + "grad_norm": 14.950583573615173, + "learning_rate": 3.88888888888889e-06, + "loss": 0.3274, + "step": 42 + }, + { + "epoch": 0.057951482479784364, + "grad_norm": 28.90248453023528, + "learning_rate": 4.000000000000001e-06, + "loss": 0.3886, + "step": 43 + }, + { + "epoch": 0.05929919137466307, + "grad_norm": 53.4099079972952, + "learning_rate": 4.111111111111111e-06, + "loss": 0.38, + "step": 44 + }, + { + "epoch": 0.06064690026954178, + "grad_norm": 38.96863172694806, + "learning_rate": 4.222222222222223e-06, + "loss": 0.3325, + "step": 45 + }, + { + "epoch": 0.06199460916442048, + "grad_norm": 72.12866200206949, + "learning_rate": 4.333333333333334e-06, + "loss": 0.4206, + "step": 46 + }, + { + "epoch": 0.06334231805929919, + "grad_norm": 66.19219785449302, + "learning_rate": 4.444444444444444e-06, + "loss": 0.398, + "step": 47 + }, + { + "epoch": 0.0646900269541779, + "grad_norm": 37.65051473594846, + "learning_rate": 4.555555555555556e-06, + "loss": 0.3341, + "step": 48 + }, + { + "epoch": 0.0660377358490566, + "grad_norm": 50.125497292388744, + "learning_rate": 4.666666666666667e-06, + "loss": 0.3177, + "step": 49 + }, + { + "epoch": 0.0673854447439353, + "grad_norm": 27.278155874550993, + "learning_rate": 4.777777777777778e-06, + "loss": 0.3391, + "step": 50 + }, + { + "epoch": 0.06873315363881402, + "grad_norm": 51.78944733015054, + "learning_rate": 4.888888888888889e-06, + "loss": 0.3505, + "step": 51 + }, + { + "epoch": 0.07008086253369272, + "grad_norm": 67.92363636946386, + "learning_rate": 5e-06, + "loss": 0.4006, + "step": 52 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 62.24646812118444, + "learning_rate": 5.1111111111111115e-06, + "loss": 0.3516, + "step": 53 + }, + { + "epoch": 0.07277628032345014, + "grad_norm": 74.6429285043474, + "learning_rate": 5.2222222222222226e-06, + "loss": 0.3652, + "step": 54 + }, + { + "epoch": 0.07412398921832884, + "grad_norm": 56.47246304537127, + "learning_rate": 5.333333333333334e-06, + "loss": 0.3336, + "step": 55 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 33.70465393499382, + "learning_rate": 5.444444444444445e-06, + "loss": 0.2602, + "step": 56 + }, + { + "epoch": 0.07681940700808626, + "grad_norm": 17.39152752635625, + "learning_rate": 5.555555555555557e-06, + "loss": 0.2238, + "step": 57 + }, + { + "epoch": 0.07816711590296496, + "grad_norm": 13.353268527381852, + "learning_rate": 5.666666666666667e-06, + "loss": 0.2376, + "step": 58 + }, + { + "epoch": 0.07951482479784366, + "grad_norm": 44.48567504391602, + "learning_rate": 5.777777777777778e-06, + "loss": 0.2676, + "step": 59 + }, + { + "epoch": 0.08086253369272237, + "grad_norm": 45.460669680166134, + "learning_rate": 5.88888888888889e-06, + "loss": 0.2955, + "step": 60 + }, + { + "epoch": 0.08221024258760108, + "grad_norm": 47.704379599898196, + "learning_rate": 6e-06, + "loss": 0.32, + "step": 61 + }, + { + "epoch": 0.08355795148247978, + "grad_norm": 46.34810663042404, + "learning_rate": 6.111111111111112e-06, + "loss": 0.2772, + "step": 62 + }, + { + "epoch": 0.08490566037735849, + "grad_norm": 26.26495461553407, + "learning_rate": 6.222222222222223e-06, + "loss": 0.2442, + "step": 63 + }, + { + "epoch": 0.0862533692722372, + "grad_norm": 36.74944868759868, + "learning_rate": 6.333333333333333e-06, + "loss": 0.2838, + "step": 64 + }, + { + "epoch": 0.0876010781671159, + "grad_norm": 21.189898475706347, + "learning_rate": 6.444444444444445e-06, + "loss": 0.2329, + "step": 65 + }, + { + "epoch": 0.0889487870619946, + "grad_norm": 16.089352318470358, + "learning_rate": 6.555555555555556e-06, + "loss": 0.2507, + "step": 66 + }, + { + "epoch": 0.09029649595687332, + "grad_norm": 51.85170847556683, + "learning_rate": 6.666666666666667e-06, + "loss": 0.279, + "step": 67 + }, + { + "epoch": 0.09164420485175202, + "grad_norm": 70.64383489157596, + "learning_rate": 6.777777777777779e-06, + "loss": 0.3031, + "step": 68 + }, + { + "epoch": 0.09299191374663072, + "grad_norm": 67.69300029522314, + "learning_rate": 6.88888888888889e-06, + "loss": 0.2921, + "step": 69 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 62.7500653381619, + "learning_rate": 7e-06, + "loss": 0.2552, + "step": 70 + }, + { + "epoch": 0.09568733153638814, + "grad_norm": 32.83567301528463, + "learning_rate": 7.111111111111112e-06, + "loss": 0.2908, + "step": 71 + }, + { + "epoch": 0.09703504043126684, + "grad_norm": 26.17203400319299, + "learning_rate": 7.222222222222223e-06, + "loss": 0.2682, + "step": 72 + }, + { + "epoch": 0.09838274932614555, + "grad_norm": 29.28078092811994, + "learning_rate": 7.333333333333333e-06, + "loss": 0.2441, + "step": 73 + }, + { + "epoch": 0.09973045822102426, + "grad_norm": 32.1220579594131, + "learning_rate": 7.444444444444445e-06, + "loss": 0.2243, + "step": 74 + }, + { + "epoch": 0.10107816711590296, + "grad_norm": 20.023366265882274, + "learning_rate": 7.555555555555556e-06, + "loss": 0.1801, + "step": 75 + }, + { + "epoch": 0.10242587601078167, + "grad_norm": 15.029544004934412, + "learning_rate": 7.666666666666667e-06, + "loss": 0.2154, + "step": 76 + }, + { + "epoch": 0.10377358490566038, + "grad_norm": 40.724561557489416, + "learning_rate": 7.77777777777778e-06, + "loss": 0.232, + "step": 77 + }, + { + "epoch": 0.10512129380053908, + "grad_norm": 32.3195955865964, + "learning_rate": 7.88888888888889e-06, + "loss": 0.2238, + "step": 78 + }, + { + "epoch": 0.10646900269541779, + "grad_norm": 21.806982371437734, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2554, + "step": 79 + }, + { + "epoch": 0.1078167115902965, + "grad_norm": 15.302329433597961, + "learning_rate": 8.111111111111112e-06, + "loss": 0.1839, + "step": 80 + }, + { + "epoch": 0.1091644204851752, + "grad_norm": 18.617188483842565, + "learning_rate": 8.222222222222222e-06, + "loss": 0.2162, + "step": 81 + }, + { + "epoch": 0.1105121293800539, + "grad_norm": 61.49211658814918, + "learning_rate": 8.333333333333334e-06, + "loss": 0.2345, + "step": 82 + }, + { + "epoch": 0.11185983827493262, + "grad_norm": 82.48435683770734, + "learning_rate": 8.444444444444446e-06, + "loss": 0.3249, + "step": 83 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 95.11618741436764, + "learning_rate": 8.555555555555556e-06, + "loss": 0.3188, + "step": 84 + }, + { + "epoch": 0.11455525606469003, + "grad_norm": 89.59786197084256, + "learning_rate": 8.666666666666668e-06, + "loss": 0.3148, + "step": 85 + }, + { + "epoch": 0.11590296495956873, + "grad_norm": 76.9513831208834, + "learning_rate": 8.777777777777778e-06, + "loss": 0.2767, + "step": 86 + }, + { + "epoch": 0.11725067385444744, + "grad_norm": 91.86723151717425, + "learning_rate": 8.888888888888888e-06, + "loss": 0.3037, + "step": 87 + }, + { + "epoch": 0.11859838274932614, + "grad_norm": 49.23765781184444, + "learning_rate": 9e-06, + "loss": 0.1934, + "step": 88 + }, + { + "epoch": 0.11994609164420485, + "grad_norm": 21.449002233034083, + "learning_rate": 9.111111111111112e-06, + "loss": 0.2333, + "step": 89 + }, + { + "epoch": 0.12129380053908356, + "grad_norm": 56.259585040515084, + "learning_rate": 9.222222222222224e-06, + "loss": 0.2038, + "step": 90 + }, + { + "epoch": 0.12264150943396226, + "grad_norm": 60.64784698943617, + "learning_rate": 9.333333333333334e-06, + "loss": 0.2323, + "step": 91 + }, + { + "epoch": 0.12398921832884097, + "grad_norm": 64.57516541808731, + "learning_rate": 9.444444444444445e-06, + "loss": 0.239, + "step": 92 + }, + { + "epoch": 0.12533692722371967, + "grad_norm": 64.24927104670861, + "learning_rate": 9.555555555555556e-06, + "loss": 0.2557, + "step": 93 + }, + { + "epoch": 0.12668463611859837, + "grad_norm": 82.8327943119739, + "learning_rate": 9.666666666666667e-06, + "loss": 0.2675, + "step": 94 + }, + { + "epoch": 0.1280323450134771, + "grad_norm": 71.32395068993172, + "learning_rate": 9.777777777777779e-06, + "loss": 0.2597, + "step": 95 + }, + { + "epoch": 0.1293800539083558, + "grad_norm": 46.61894852134876, + "learning_rate": 9.88888888888889e-06, + "loss": 0.2365, + "step": 96 + }, + { + "epoch": 0.1307277628032345, + "grad_norm": 14.407839256888954, + "learning_rate": 1e-05, + "loss": 0.1754, + "step": 97 + }, + { + "epoch": 0.1320754716981132, + "grad_norm": 13.422748378006194, + "learning_rate": 9.99999702108486e-06, + "loss": 0.2115, + "step": 98 + }, + { + "epoch": 0.1334231805929919, + "grad_norm": 24.073885410162287, + "learning_rate": 9.999988084342989e-06, + "loss": 0.1753, + "step": 99 + }, + { + "epoch": 0.1347708894878706, + "grad_norm": 72.04898720252486, + "learning_rate": 9.999973189785035e-06, + "loss": 0.2593, + "step": 100 + }, + { + "epoch": 0.13611859838274934, + "grad_norm": 65.36252696486945, + "learning_rate": 9.999952337428749e-06, + "loss": 0.2535, + "step": 101 + }, + { + "epoch": 0.13746630727762804, + "grad_norm": 76.59348732452403, + "learning_rate": 9.999925527298973e-06, + "loss": 0.2691, + "step": 102 + }, + { + "epoch": 0.13881401617250674, + "grad_norm": 60.4067810909582, + "learning_rate": 9.999892759427657e-06, + "loss": 0.1827, + "step": 103 + }, + { + "epoch": 0.14016172506738545, + "grad_norm": 18.859694236312162, + "learning_rate": 9.999854033853843e-06, + "loss": 0.18, + "step": 104 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 18.931436379236544, + "learning_rate": 9.999809350623678e-06, + "loss": 0.1607, + "step": 105 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 12.643279782569419, + "learning_rate": 9.999758709790403e-06, + "loss": 0.2037, + "step": 106 + }, + { + "epoch": 0.14420485175202155, + "grad_norm": 42.03546024944768, + "learning_rate": 9.999702111414362e-06, + "loss": 0.1948, + "step": 107 + }, + { + "epoch": 0.14555256064690028, + "grad_norm": 92.39530951078983, + "learning_rate": 9.999639555562993e-06, + "loss": 0.2775, + "step": 108 + }, + { + "epoch": 0.14690026954177898, + "grad_norm": 81.32535101312078, + "learning_rate": 9.999571042310838e-06, + "loss": 0.2958, + "step": 109 + }, + { + "epoch": 0.14824797843665768, + "grad_norm": 91.53629748618035, + "learning_rate": 9.999496571739534e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.1495956873315364, + "grad_norm": 82.4132019419034, + "learning_rate": 9.999416143937816e-06, + "loss": 0.2536, + "step": 111 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 81.77417618124623, + "learning_rate": 9.999329759001521e-06, + "loss": 0.2655, + "step": 112 + }, + { + "epoch": 0.1522911051212938, + "grad_norm": 53.604910507971056, + "learning_rate": 9.999237417033582e-06, + "loss": 0.2135, + "step": 113 + }, + { + "epoch": 0.15363881401617252, + "grad_norm": 62.954047065266586, + "learning_rate": 9.999139118144032e-06, + "loss": 0.2048, + "step": 114 + }, + { + "epoch": 0.15498652291105122, + "grad_norm": 10.199314341287616, + "learning_rate": 9.999034862449997e-06, + "loss": 0.1638, + "step": 115 + }, + { + "epoch": 0.15633423180592992, + "grad_norm": 7.853745852398947, + "learning_rate": 9.998924650075707e-06, + "loss": 0.168, + "step": 116 + }, + { + "epoch": 0.15768194070080863, + "grad_norm": 57.4174403976984, + "learning_rate": 9.998808481152488e-06, + "loss": 0.205, + "step": 117 + }, + { + "epoch": 0.15902964959568733, + "grad_norm": 44.59263675542461, + "learning_rate": 9.998686355818763e-06, + "loss": 0.1853, + "step": 118 + }, + { + "epoch": 0.16037735849056603, + "grad_norm": 92.02152742870838, + "learning_rate": 9.998558274220048e-06, + "loss": 0.2357, + "step": 119 + }, + { + "epoch": 0.16172506738544473, + "grad_norm": 114.71686889095547, + "learning_rate": 9.998424236508966e-06, + "loss": 0.3395, + "step": 120 + }, + { + "epoch": 0.16307277628032346, + "grad_norm": 95.62106340588882, + "learning_rate": 9.998284242845229e-06, + "loss": 0.2704, + "step": 121 + }, + { + "epoch": 0.16442048517520216, + "grad_norm": 107.47265151256973, + "learning_rate": 9.998138293395649e-06, + "loss": 0.3086, + "step": 122 + }, + { + "epoch": 0.16576819407008087, + "grad_norm": 88.11804083846752, + "learning_rate": 9.997986388334137e-06, + "loss": 0.2656, + "step": 123 + }, + { + "epoch": 0.16711590296495957, + "grad_norm": 65.01682558827005, + "learning_rate": 9.997828527841692e-06, + "loss": 0.2343, + "step": 124 + }, + { + "epoch": 0.16846361185983827, + "grad_norm": 43.03829871981552, + "learning_rate": 9.997664712106424e-06, + "loss": 0.1981, + "step": 125 + }, + { + "epoch": 0.16981132075471697, + "grad_norm": 33.47602648531359, + "learning_rate": 9.997494941323522e-06, + "loss": 0.2078, + "step": 126 + }, + { + "epoch": 0.1711590296495957, + "grad_norm": 33.791904033234, + "learning_rate": 9.997319215695282e-06, + "loss": 0.1817, + "step": 127 + }, + { + "epoch": 0.1725067385444744, + "grad_norm": 24.12474786416528, + "learning_rate": 9.997137535431094e-06, + "loss": 0.1606, + "step": 128 + }, + { + "epoch": 0.1738544474393531, + "grad_norm": 70.69352108597073, + "learning_rate": 9.996949900747441e-06, + "loss": 0.2361, + "step": 129 + }, + { + "epoch": 0.1752021563342318, + "grad_norm": 72.20445027157899, + "learning_rate": 9.996756311867904e-06, + "loss": 0.2478, + "step": 130 + }, + { + "epoch": 0.1765498652291105, + "grad_norm": 82.12412678491732, + "learning_rate": 9.996556769023152e-06, + "loss": 0.2624, + "step": 131 + }, + { + "epoch": 0.1778975741239892, + "grad_norm": 109.02837882068626, + "learning_rate": 9.996351272450959e-06, + "loss": 0.2982, + "step": 132 + }, + { + "epoch": 0.1792452830188679, + "grad_norm": 74.10175262936568, + "learning_rate": 9.996139822396185e-06, + "loss": 0.2737, + "step": 133 + }, + { + "epoch": 0.18059299191374664, + "grad_norm": 80.89085133132998, + "learning_rate": 9.995922419110786e-06, + "loss": 0.2361, + "step": 134 + }, + { + "epoch": 0.18194070080862534, + "grad_norm": 69.31696095690423, + "learning_rate": 9.995699062853814e-06, + "loss": 0.1961, + "step": 135 + }, + { + "epoch": 0.18328840970350405, + "grad_norm": 29.754075196162294, + "learning_rate": 9.995469753891412e-06, + "loss": 0.1657, + "step": 136 + }, + { + "epoch": 0.18463611859838275, + "grad_norm": 7.838309995694679, + "learning_rate": 9.995234492496818e-06, + "loss": 0.1617, + "step": 137 + }, + { + "epoch": 0.18598382749326145, + "grad_norm": 10.125849486927097, + "learning_rate": 9.994993278950358e-06, + "loss": 0.2188, + "step": 138 + }, + { + "epoch": 0.18733153638814015, + "grad_norm": 37.9804403576614, + "learning_rate": 9.99474611353946e-06, + "loss": 0.2054, + "step": 139 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 24.78781511140767, + "learning_rate": 9.994492996558632e-06, + "loss": 0.158, + "step": 140 + }, + { + "epoch": 0.19002695417789758, + "grad_norm": 67.89299066041484, + "learning_rate": 9.99423392830948e-06, + "loss": 0.2387, + "step": 141 + }, + { + "epoch": 0.19137466307277629, + "grad_norm": 45.67296667751866, + "learning_rate": 9.993968909100705e-06, + "loss": 0.1976, + "step": 142 + }, + { + "epoch": 0.192722371967655, + "grad_norm": 53.22531213674166, + "learning_rate": 9.993697939248093e-06, + "loss": 0.2125, + "step": 143 + }, + { + "epoch": 0.1940700808625337, + "grad_norm": 45.05002818406896, + "learning_rate": 9.99342101907452e-06, + "loss": 0.2055, + "step": 144 + }, + { + "epoch": 0.1954177897574124, + "grad_norm": 21.50388479430456, + "learning_rate": 9.99313814890996e-06, + "loss": 0.1955, + "step": 145 + }, + { + "epoch": 0.1967654986522911, + "grad_norm": 11.439322084091629, + "learning_rate": 9.992849329091466e-06, + "loss": 0.1593, + "step": 146 + }, + { + "epoch": 0.19811320754716982, + "grad_norm": 15.368127152710288, + "learning_rate": 9.992554559963189e-06, + "loss": 0.1927, + "step": 147 + }, + { + "epoch": 0.19946091644204852, + "grad_norm": 32.38517669530493, + "learning_rate": 9.992253841876365e-06, + "loss": 0.1649, + "step": 148 + }, + { + "epoch": 0.20080862533692723, + "grad_norm": 41.57935049724927, + "learning_rate": 9.99194717518932e-06, + "loss": 0.1889, + "step": 149 + }, + { + "epoch": 0.20215633423180593, + "grad_norm": 16.998279760466197, + "learning_rate": 9.991634560267467e-06, + "loss": 0.1554, + "step": 150 + }, + { + "epoch": 0.20350404312668463, + "grad_norm": 25.575091429147935, + "learning_rate": 9.991315997483307e-06, + "loss": 0.1341, + "step": 151 + }, + { + "epoch": 0.20485175202156333, + "grad_norm": 26.663285219339144, + "learning_rate": 9.990991487216428e-06, + "loss": 0.1846, + "step": 152 + }, + { + "epoch": 0.20619946091644206, + "grad_norm": 19.84348122752292, + "learning_rate": 9.990661029853508e-06, + "loss": 0.1431, + "step": 153 + }, + { + "epoch": 0.20754716981132076, + "grad_norm": 21.619214646712184, + "learning_rate": 9.990324625788308e-06, + "loss": 0.1536, + "step": 154 + }, + { + "epoch": 0.20889487870619947, + "grad_norm": 14.514888918479356, + "learning_rate": 9.989982275421674e-06, + "loss": 0.188, + "step": 155 + }, + { + "epoch": 0.21024258760107817, + "grad_norm": 13.357299872095762, + "learning_rate": 9.989633979161539e-06, + "loss": 0.2015, + "step": 156 + }, + { + "epoch": 0.21159029649595687, + "grad_norm": 53.846239109464776, + "learning_rate": 9.989279737422923e-06, + "loss": 0.2092, + "step": 157 + }, + { + "epoch": 0.21293800539083557, + "grad_norm": 29.308940479568605, + "learning_rate": 9.988919550627929e-06, + "loss": 0.1744, + "step": 158 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 7.5539854745072965, + "learning_rate": 9.98855341920574e-06, + "loss": 0.1338, + "step": 159 + }, + { + "epoch": 0.215633423180593, + "grad_norm": 16.451032398107994, + "learning_rate": 9.988181343592628e-06, + "loss": 0.1809, + "step": 160 + }, + { + "epoch": 0.2169811320754717, + "grad_norm": 17.91030767016008, + "learning_rate": 9.987803324231945e-06, + "loss": 0.1616, + "step": 161 + }, + { + "epoch": 0.2183288409703504, + "grad_norm": 11.0784635890078, + "learning_rate": 9.987419361574127e-06, + "loss": 0.1797, + "step": 162 + }, + { + "epoch": 0.2196765498652291, + "grad_norm": 15.31475936923448, + "learning_rate": 9.987029456076688e-06, + "loss": 0.1544, + "step": 163 + }, + { + "epoch": 0.2210242587601078, + "grad_norm": 27.680003312167383, + "learning_rate": 9.98663360820423e-06, + "loss": 0.1721, + "step": 164 + }, + { + "epoch": 0.2223719676549865, + "grad_norm": 37.64362476872803, + "learning_rate": 9.986231818428432e-06, + "loss": 0.1797, + "step": 165 + }, + { + "epoch": 0.22371967654986524, + "grad_norm": 34.41276035265965, + "learning_rate": 9.98582408722805e-06, + "loss": 0.1598, + "step": 166 + }, + { + "epoch": 0.22506738544474394, + "grad_norm": 53.73250731996025, + "learning_rate": 9.985410415088923e-06, + "loss": 0.173, + "step": 167 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 52.52888949779512, + "learning_rate": 9.98499080250397e-06, + "loss": 0.1424, + "step": 168 + }, + { + "epoch": 0.22776280323450135, + "grad_norm": 4.884339496023134, + "learning_rate": 9.984565249973187e-06, + "loss": 0.133, + "step": 169 + }, + { + "epoch": 0.22911051212938005, + "grad_norm": 6.033422389690938, + "learning_rate": 9.984133758003649e-06, + "loss": 0.1327, + "step": 170 + }, + { + "epoch": 0.23045822102425875, + "grad_norm": 24.2999686290198, + "learning_rate": 9.983696327109504e-06, + "loss": 0.1653, + "step": 171 + }, + { + "epoch": 0.23180592991913745, + "grad_norm": 24.969252404118237, + "learning_rate": 9.983252957811982e-06, + "loss": 0.1557, + "step": 172 + }, + { + "epoch": 0.23315363881401618, + "grad_norm": 11.542707189097792, + "learning_rate": 9.982803650639385e-06, + "loss": 0.1458, + "step": 173 + }, + { + "epoch": 0.23450134770889489, + "grad_norm": 8.591922919119126, + "learning_rate": 9.982348406127096e-06, + "loss": 0.1526, + "step": 174 + }, + { + "epoch": 0.2358490566037736, + "grad_norm": 7.486346559365512, + "learning_rate": 9.981887224817565e-06, + "loss": 0.1344, + "step": 175 + }, + { + "epoch": 0.2371967654986523, + "grad_norm": 9.001344163801335, + "learning_rate": 9.981420107260325e-06, + "loss": 0.1483, + "step": 176 + }, + { + "epoch": 0.238544474393531, + "grad_norm": 20.630456031842627, + "learning_rate": 9.98094705401197e-06, + "loss": 0.158, + "step": 177 + }, + { + "epoch": 0.2398921832884097, + "grad_norm": 20.794346689731377, + "learning_rate": 9.98046806563618e-06, + "loss": 0.159, + "step": 178 + }, + { + "epoch": 0.24123989218328842, + "grad_norm": 6.591359368066544, + "learning_rate": 9.979983142703699e-06, + "loss": 0.1768, + "step": 179 + }, + { + "epoch": 0.24258760107816713, + "grad_norm": 19.313702930895563, + "learning_rate": 9.979492285792345e-06, + "loss": 0.146, + "step": 180 + }, + { + "epoch": 0.24393530997304583, + "grad_norm": 26.850689095926754, + "learning_rate": 9.978995495487007e-06, + "loss": 0.1505, + "step": 181 + }, + { + "epoch": 0.24528301886792453, + "grad_norm": 15.017388639066786, + "learning_rate": 9.978492772379642e-06, + "loss": 0.1112, + "step": 182 + }, + { + "epoch": 0.24663072776280323, + "grad_norm": 19.617331720447655, + "learning_rate": 9.97798411706928e-06, + "loss": 0.1123, + "step": 183 + }, + { + "epoch": 0.24797843665768193, + "grad_norm": 5.5798191895100135, + "learning_rate": 9.977469530162015e-06, + "loss": 0.1336, + "step": 184 + }, + { + "epoch": 0.24932614555256064, + "grad_norm": 6.798847392215386, + "learning_rate": 9.976949012271015e-06, + "loss": 0.1545, + "step": 185 + }, + { + "epoch": 0.25067385444743934, + "grad_norm": 10.312245562614425, + "learning_rate": 9.976422564016509e-06, + "loss": 0.1667, + "step": 186 + }, + { + "epoch": 0.25202156334231807, + "grad_norm": 6.644547599484291, + "learning_rate": 9.975890186025792e-06, + "loss": 0.1392, + "step": 187 + }, + { + "epoch": 0.25336927223719674, + "grad_norm": 11.045272862981264, + "learning_rate": 9.975351878933233e-06, + "loss": 0.1456, + "step": 188 + }, + { + "epoch": 0.25471698113207547, + "grad_norm": 29.381999603075865, + "learning_rate": 9.974807643380256e-06, + "loss": 0.1979, + "step": 189 + }, + { + "epoch": 0.2560646900269542, + "grad_norm": 11.216737348991092, + "learning_rate": 9.974257480015356e-06, + "loss": 0.1018, + "step": 190 + }, + { + "epoch": 0.2574123989218329, + "grad_norm": 37.60838202089057, + "learning_rate": 9.973701389494088e-06, + "loss": 0.1519, + "step": 191 + }, + { + "epoch": 0.2587601078167116, + "grad_norm": 45.95031563700449, + "learning_rate": 9.973139372479072e-06, + "loss": 0.1758, + "step": 192 + }, + { + "epoch": 0.2601078167115903, + "grad_norm": 8.59098950465088, + "learning_rate": 9.972571429639987e-06, + "loss": 0.1274, + "step": 193 + }, + { + "epoch": 0.261455525606469, + "grad_norm": 22.675674984986685, + "learning_rate": 9.971997561653577e-06, + "loss": 0.1342, + "step": 194 + }, + { + "epoch": 0.2628032345013477, + "grad_norm": 25.33850385878082, + "learning_rate": 9.971417769203639e-06, + "loss": 0.1422, + "step": 195 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 32.99144841924167, + "learning_rate": 9.970832052981037e-06, + "loss": 0.1415, + "step": 196 + }, + { + "epoch": 0.26549865229110514, + "grad_norm": 7.691341917885444, + "learning_rate": 9.97024041368369e-06, + "loss": 0.1335, + "step": 197 + }, + { + "epoch": 0.2668463611859838, + "grad_norm": 5.437395497442224, + "learning_rate": 9.969642852016576e-06, + "loss": 0.1158, + "step": 198 + }, + { + "epoch": 0.26819407008086255, + "grad_norm": 25.113618597115693, + "learning_rate": 9.969039368691728e-06, + "loss": 0.1522, + "step": 199 + }, + { + "epoch": 0.2695417789757412, + "grad_norm": 20.043499055992555, + "learning_rate": 9.968429964428236e-06, + "loss": 0.1703, + "step": 200 + }, + { + "epoch": 0.27088948787061995, + "grad_norm": 15.688323537548595, + "learning_rate": 9.967814639952248e-06, + "loss": 0.1304, + "step": 201 + }, + { + "epoch": 0.2722371967654987, + "grad_norm": 13.8927333544344, + "learning_rate": 9.967193395996962e-06, + "loss": 0.1682, + "step": 202 + }, + { + "epoch": 0.27358490566037735, + "grad_norm": 17.935810916799703, + "learning_rate": 9.96656623330263e-06, + "loss": 0.1671, + "step": 203 + }, + { + "epoch": 0.2749326145552561, + "grad_norm": 3.689118637636182, + "learning_rate": 9.965933152616558e-06, + "loss": 0.1193, + "step": 204 + }, + { + "epoch": 0.27628032345013476, + "grad_norm": 10.401523415079438, + "learning_rate": 9.965294154693107e-06, + "loss": 0.091, + "step": 205 + }, + { + "epoch": 0.2776280323450135, + "grad_norm": 4.171141921642675, + "learning_rate": 9.964649240293681e-06, + "loss": 0.1191, + "step": 206 + }, + { + "epoch": 0.27897574123989216, + "grad_norm": 10.970234623188182, + "learning_rate": 9.963998410186741e-06, + "loss": 0.1242, + "step": 207 + }, + { + "epoch": 0.2803234501347709, + "grad_norm": 18.268114250194465, + "learning_rate": 9.963341665147793e-06, + "loss": 0.1579, + "step": 208 + }, + { + "epoch": 0.2816711590296496, + "grad_norm": 49.85820766114499, + "learning_rate": 9.96267900595939e-06, + "loss": 0.187, + "step": 209 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 45.20887886025685, + "learning_rate": 9.962010433411138e-06, + "loss": 0.1486, + "step": 210 + }, + { + "epoch": 0.284366576819407, + "grad_norm": 20.567629273244723, + "learning_rate": 9.961335948299681e-06, + "loss": 0.1411, + "step": 211 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 20.397010745654214, + "learning_rate": 9.960655551428718e-06, + "loss": 0.1424, + "step": 212 + }, + { + "epoch": 0.28706199460916443, + "grad_norm": 8.460944021649432, + "learning_rate": 9.959969243608983e-06, + "loss": 0.1454, + "step": 213 + }, + { + "epoch": 0.2884097035040431, + "grad_norm": 49.09230429338558, + "learning_rate": 9.959277025658258e-06, + "loss": 0.1501, + "step": 214 + }, + { + "epoch": 0.28975741239892183, + "grad_norm": 42.49658197476098, + "learning_rate": 9.958578898401365e-06, + "loss": 0.1562, + "step": 215 + }, + { + "epoch": 0.29110512129380056, + "grad_norm": 21.97562096695854, + "learning_rate": 9.957874862670172e-06, + "loss": 0.1628, + "step": 216 + }, + { + "epoch": 0.29245283018867924, + "grad_norm": 41.64536114922512, + "learning_rate": 9.95716491930358e-06, + "loss": 0.1702, + "step": 217 + }, + { + "epoch": 0.29380053908355797, + "grad_norm": 33.22260965441105, + "learning_rate": 9.956449069147537e-06, + "loss": 0.1692, + "step": 218 + }, + { + "epoch": 0.29514824797843664, + "grad_norm": 17.429866845237502, + "learning_rate": 9.955727313055026e-06, + "loss": 0.1382, + "step": 219 + }, + { + "epoch": 0.29649595687331537, + "grad_norm": 33.33584610653785, + "learning_rate": 9.954999651886064e-06, + "loss": 0.1782, + "step": 220 + }, + { + "epoch": 0.29784366576819404, + "grad_norm": 40.84908753571336, + "learning_rate": 9.95426608650771e-06, + "loss": 0.1453, + "step": 221 + }, + { + "epoch": 0.2991913746630728, + "grad_norm": 8.85500427034178, + "learning_rate": 9.953526617794051e-06, + "loss": 0.1391, + "step": 222 + }, + { + "epoch": 0.3005390835579515, + "grad_norm": 3.953839877316965, + "learning_rate": 9.95278124662622e-06, + "loss": 0.1866, + "step": 223 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 21.872976846576627, + "learning_rate": 9.95202997389237e-06, + "loss": 0.1388, + "step": 224 + }, + { + "epoch": 0.3032345013477089, + "grad_norm": 41.899974946244, + "learning_rate": 9.951272800487695e-06, + "loss": 0.1983, + "step": 225 + }, + { + "epoch": 0.3045822102425876, + "grad_norm": 11.687200794235938, + "learning_rate": 9.950509727314415e-06, + "loss": 0.1219, + "step": 226 + }, + { + "epoch": 0.3059299191374663, + "grad_norm": 23.80069104374853, + "learning_rate": 9.949740755281784e-06, + "loss": 0.1814, + "step": 227 + }, + { + "epoch": 0.30727762803234504, + "grad_norm": 48.65157952112168, + "learning_rate": 9.948965885306085e-06, + "loss": 0.1689, + "step": 228 + }, + { + "epoch": 0.3086253369272237, + "grad_norm": 42.270184643227694, + "learning_rate": 9.948185118310623e-06, + "loss": 0.1937, + "step": 229 + }, + { + "epoch": 0.30997304582210244, + "grad_norm": 36.30643454032922, + "learning_rate": 9.947398455225733e-06, + "loss": 0.187, + "step": 230 + }, + { + "epoch": 0.3113207547169811, + "grad_norm": 32.702426399275325, + "learning_rate": 9.94660589698878e-06, + "loss": 0.1945, + "step": 231 + }, + { + "epoch": 0.31266846361185985, + "grad_norm": 15.462928054577416, + "learning_rate": 9.945807444544146e-06, + "loss": 0.1502, + "step": 232 + }, + { + "epoch": 0.3140161725067385, + "grad_norm": 19.176299879536224, + "learning_rate": 9.94500309884324e-06, + "loss": 0.1767, + "step": 233 + }, + { + "epoch": 0.31536388140161725, + "grad_norm": 38.076775404950865, + "learning_rate": 9.944192860844496e-06, + "loss": 0.1561, + "step": 234 + }, + { + "epoch": 0.316711590296496, + "grad_norm": 51.80162431965208, + "learning_rate": 9.943376731513364e-06, + "loss": 0.2146, + "step": 235 + }, + { + "epoch": 0.31805929919137466, + "grad_norm": 57.13829443667131, + "learning_rate": 9.942554711822314e-06, + "loss": 0.1918, + "step": 236 + }, + { + "epoch": 0.3194070080862534, + "grad_norm": 27.905780088512664, + "learning_rate": 9.941726802750842e-06, + "loss": 0.158, + "step": 237 + }, + { + "epoch": 0.32075471698113206, + "grad_norm": 49.3489960873823, + "learning_rate": 9.940893005285451e-06, + "loss": 0.1543, + "step": 238 + }, + { + "epoch": 0.3221024258760108, + "grad_norm": 46.990418224582854, + "learning_rate": 9.940053320419668e-06, + "loss": 0.1436, + "step": 239 + }, + { + "epoch": 0.32345013477088946, + "grad_norm": 48.529470829280065, + "learning_rate": 9.939207749154035e-06, + "loss": 0.1551, + "step": 240 + }, + { + "epoch": 0.3247978436657682, + "grad_norm": 28.15893964784892, + "learning_rate": 9.938356292496104e-06, + "loss": 0.1334, + "step": 241 + }, + { + "epoch": 0.3261455525606469, + "grad_norm": 12.984395054700736, + "learning_rate": 9.93749895146044e-06, + "loss": 0.1696, + "step": 242 + }, + { + "epoch": 0.3274932614555256, + "grad_norm": 5.911782676887572, + "learning_rate": 9.936635727068624e-06, + "loss": 0.1253, + "step": 243 + }, + { + "epoch": 0.3288409703504043, + "grad_norm": 19.27209222105451, + "learning_rate": 9.935766620349246e-06, + "loss": 0.1144, + "step": 244 + }, + { + "epoch": 0.330188679245283, + "grad_norm": 10.006084668396747, + "learning_rate": 9.934891632337899e-06, + "loss": 0.1244, + "step": 245 + }, + { + "epoch": 0.33153638814016173, + "grad_norm": 39.45493044715105, + "learning_rate": 9.934010764077196e-06, + "loss": 0.1375, + "step": 246 + }, + { + "epoch": 0.3328840970350404, + "grad_norm": 23.856109182163603, + "learning_rate": 9.933124016616744e-06, + "loss": 0.1518, + "step": 247 + }, + { + "epoch": 0.33423180592991913, + "grad_norm": 59.21607988595415, + "learning_rate": 9.932231391013162e-06, + "loss": 0.1725, + "step": 248 + }, + { + "epoch": 0.33557951482479786, + "grad_norm": 49.93119634420406, + "learning_rate": 9.931332888330076e-06, + "loss": 0.158, + "step": 249 + }, + { + "epoch": 0.33692722371967654, + "grad_norm": 39.859474269778204, + "learning_rate": 9.930428509638109e-06, + "loss": 0.1871, + "step": 250 + }, + { + "epoch": 0.33827493261455527, + "grad_norm": 29.053302859584036, + "learning_rate": 9.929518256014885e-06, + "loss": 0.1435, + "step": 251 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 24.384592979212044, + "learning_rate": 9.928602128545036e-06, + "loss": 0.1181, + "step": 252 + }, + { + "epoch": 0.34097035040431267, + "grad_norm": 46.426907641638934, + "learning_rate": 9.927680128320188e-06, + "loss": 0.157, + "step": 253 + }, + { + "epoch": 0.3423180592991914, + "grad_norm": 29.326243066900645, + "learning_rate": 9.92675225643896e-06, + "loss": 0.1025, + "step": 254 + }, + { + "epoch": 0.3436657681940701, + "grad_norm": 6.56589644117148, + "learning_rate": 9.92581851400698e-06, + "loss": 0.1735, + "step": 255 + }, + { + "epoch": 0.3450134770889488, + "grad_norm": 38.91642313229621, + "learning_rate": 9.924878902136859e-06, + "loss": 0.1882, + "step": 256 + }, + { + "epoch": 0.3463611859838275, + "grad_norm": 40.530847852199074, + "learning_rate": 9.923933421948208e-06, + "loss": 0.159, + "step": 257 + }, + { + "epoch": 0.3477088948787062, + "grad_norm": 29.91131868635505, + "learning_rate": 9.922982074567628e-06, + "loss": 0.1643, + "step": 258 + }, + { + "epoch": 0.3490566037735849, + "grad_norm": 38.288862896560346, + "learning_rate": 9.922024861128714e-06, + "loss": 0.1274, + "step": 259 + }, + { + "epoch": 0.3504043126684636, + "grad_norm": 52.83011805029242, + "learning_rate": 9.921061782772048e-06, + "loss": 0.1697, + "step": 260 + }, + { + "epoch": 0.35175202156334234, + "grad_norm": 83.08249919994793, + "learning_rate": 9.9200928406452e-06, + "loss": 0.2109, + "step": 261 + }, + { + "epoch": 0.353099730458221, + "grad_norm": 60.556020686929294, + "learning_rate": 9.919118035902732e-06, + "loss": 0.1829, + "step": 262 + }, + { + "epoch": 0.35444743935309975, + "grad_norm": 51.45444872498502, + "learning_rate": 9.918137369706187e-06, + "loss": 0.1767, + "step": 263 + }, + { + "epoch": 0.3557951482479784, + "grad_norm": 37.098066149253135, + "learning_rate": 9.917150843224093e-06, + "loss": 0.1402, + "step": 264 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 5.242278091549337, + "learning_rate": 9.916158457631959e-06, + "loss": 0.1123, + "step": 265 + }, + { + "epoch": 0.3584905660377358, + "grad_norm": 12.634146909420425, + "learning_rate": 9.915160214112282e-06, + "loss": 0.1314, + "step": 266 + }, + { + "epoch": 0.35983827493261455, + "grad_norm": 17.326483396952597, + "learning_rate": 9.914156113854534e-06, + "loss": 0.1492, + "step": 267 + }, + { + "epoch": 0.3611859838274933, + "grad_norm": 44.929706958447674, + "learning_rate": 9.913146158055166e-06, + "loss": 0.1581, + "step": 268 + }, + { + "epoch": 0.36253369272237196, + "grad_norm": 26.414560818961196, + "learning_rate": 9.912130347917607e-06, + "loss": 0.1763, + "step": 269 + }, + { + "epoch": 0.3638814016172507, + "grad_norm": 30.13258081826216, + "learning_rate": 9.911108684652263e-06, + "loss": 0.17, + "step": 270 + }, + { + "epoch": 0.36522911051212936, + "grad_norm": 36.77333159909072, + "learning_rate": 9.910081169476512e-06, + "loss": 0.1284, + "step": 271 + }, + { + "epoch": 0.3665768194070081, + "grad_norm": 58.098716262261554, + "learning_rate": 9.909047803614707e-06, + "loss": 0.1835, + "step": 272 + }, + { + "epoch": 0.36792452830188677, + "grad_norm": 13.191020634650648, + "learning_rate": 9.908008588298171e-06, + "loss": 0.1646, + "step": 273 + }, + { + "epoch": 0.3692722371967655, + "grad_norm": 28.8333318322399, + "learning_rate": 9.906963524765199e-06, + "loss": 0.1478, + "step": 274 + }, + { + "epoch": 0.3706199460916442, + "grad_norm": 2.437780942914017, + "learning_rate": 9.90591261426105e-06, + "loss": 0.1277, + "step": 275 + }, + { + "epoch": 0.3719676549865229, + "grad_norm": 6.563372948224654, + "learning_rate": 9.904855858037958e-06, + "loss": 0.1268, + "step": 276 + }, + { + "epoch": 0.37331536388140163, + "grad_norm": 11.68194026632853, + "learning_rate": 9.903793257355114e-06, + "loss": 0.0978, + "step": 277 + }, + { + "epoch": 0.3746630727762803, + "grad_norm": 33.4478852535239, + "learning_rate": 9.90272481347868e-06, + "loss": 0.1304, + "step": 278 + }, + { + "epoch": 0.37601078167115903, + "grad_norm": 14.431741225479795, + "learning_rate": 9.901650527681774e-06, + "loss": 0.1439, + "step": 279 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 29.0555554686847, + "learning_rate": 9.900570401244482e-06, + "loss": 0.1835, + "step": 280 + }, + { + "epoch": 0.37870619946091644, + "grad_norm": 19.306561806563042, + "learning_rate": 9.899484435453843e-06, + "loss": 0.1468, + "step": 281 + }, + { + "epoch": 0.38005390835579517, + "grad_norm": 7.631001153553925, + "learning_rate": 9.898392631603859e-06, + "loss": 0.139, + "step": 282 + }, + { + "epoch": 0.38140161725067384, + "grad_norm": 8.299005193985657, + "learning_rate": 9.897294990995486e-06, + "loss": 0.1757, + "step": 283 + }, + { + "epoch": 0.38274932614555257, + "grad_norm": 63.56479518419959, + "learning_rate": 9.896191514936635e-06, + "loss": 0.1758, + "step": 284 + }, + { + "epoch": 0.38409703504043125, + "grad_norm": 33.43457943014574, + "learning_rate": 9.89508220474217e-06, + "loss": 0.1919, + "step": 285 + }, + { + "epoch": 0.38544474393531, + "grad_norm": 7.145523501555237, + "learning_rate": 9.893967061733908e-06, + "loss": 0.1874, + "step": 286 + }, + { + "epoch": 0.3867924528301887, + "grad_norm": 41.945421306629996, + "learning_rate": 9.892846087240614e-06, + "loss": 0.1606, + "step": 287 + }, + { + "epoch": 0.3881401617250674, + "grad_norm": 52.38570558018691, + "learning_rate": 9.891719282598009e-06, + "loss": 0.1923, + "step": 288 + }, + { + "epoch": 0.3894878706199461, + "grad_norm": 13.34898875011023, + "learning_rate": 9.890586649148747e-06, + "loss": 0.1409, + "step": 289 + }, + { + "epoch": 0.3908355795148248, + "grad_norm": 10.19870361899652, + "learning_rate": 9.88944818824244e-06, + "loss": 0.1664, + "step": 290 + }, + { + "epoch": 0.3921832884097035, + "grad_norm": 6.225446324448117, + "learning_rate": 9.88830390123564e-06, + "loss": 0.0891, + "step": 291 + }, + { + "epoch": 0.3935309973045822, + "grad_norm": 17.294777283496376, + "learning_rate": 9.88715378949184e-06, + "loss": 0.132, + "step": 292 + }, + { + "epoch": 0.3948787061994609, + "grad_norm": 14.657452854306397, + "learning_rate": 9.88599785438147e-06, + "loss": 0.1236, + "step": 293 + }, + { + "epoch": 0.39622641509433965, + "grad_norm": 12.80212974443883, + "learning_rate": 9.884836097281911e-06, + "loss": 0.1231, + "step": 294 + }, + { + "epoch": 0.3975741239892183, + "grad_norm": 9.95388134989833, + "learning_rate": 9.883668519577464e-06, + "loss": 0.1777, + "step": 295 + }, + { + "epoch": 0.39892183288409705, + "grad_norm": 53.844746150573556, + "learning_rate": 9.882495122659384e-06, + "loss": 0.2218, + "step": 296 + }, + { + "epoch": 0.4002695417789757, + "grad_norm": 17.09697086288405, + "learning_rate": 9.881315907925845e-06, + "loss": 0.1481, + "step": 297 + }, + { + "epoch": 0.40161725067385445, + "grad_norm": 18.751886072835507, + "learning_rate": 9.880130876781962e-06, + "loss": 0.1485, + "step": 298 + }, + { + "epoch": 0.4029649595687331, + "grad_norm": 16.963221754938193, + "learning_rate": 9.878940030639776e-06, + "loss": 0.1732, + "step": 299 + }, + { + "epoch": 0.40431266846361186, + "grad_norm": 17.09017100363569, + "learning_rate": 9.87774337091826e-06, + "loss": 0.126, + "step": 300 + }, + { + "epoch": 0.4056603773584906, + "grad_norm": 2.8064299126506556, + "learning_rate": 9.876540899043312e-06, + "loss": 0.1484, + "step": 301 + }, + { + "epoch": 0.40700808625336926, + "grad_norm": 16.643508127774712, + "learning_rate": 9.875332616447758e-06, + "loss": 0.135, + "step": 302 + }, + { + "epoch": 0.408355795148248, + "grad_norm": 21.16330316986015, + "learning_rate": 9.874118524571345e-06, + "loss": 0.1438, + "step": 303 + }, + { + "epoch": 0.40970350404312667, + "grad_norm": 60.645865023654984, + "learning_rate": 9.872898624860746e-06, + "loss": 0.1811, + "step": 304 + }, + { + "epoch": 0.4110512129380054, + "grad_norm": 50.00869811535701, + "learning_rate": 9.87167291876955e-06, + "loss": 0.1543, + "step": 305 + }, + { + "epoch": 0.4123989218328841, + "grad_norm": 31.155956878467894, + "learning_rate": 9.87044140775827e-06, + "loss": 0.1546, + "step": 306 + }, + { + "epoch": 0.4137466307277628, + "grad_norm": 40.19145392937788, + "learning_rate": 9.869204093294326e-06, + "loss": 0.1265, + "step": 307 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 11.087047498395703, + "learning_rate": 9.867960976852066e-06, + "loss": 0.1434, + "step": 308 + }, + { + "epoch": 0.4164420485175202, + "grad_norm": 31.59450698696892, + "learning_rate": 9.866712059912745e-06, + "loss": 0.1214, + "step": 309 + }, + { + "epoch": 0.41778975741239893, + "grad_norm": 2.7157497023932913, + "learning_rate": 9.865457343964528e-06, + "loss": 0.1502, + "step": 310 + }, + { + "epoch": 0.4191374663072776, + "grad_norm": 5.126745581715616, + "learning_rate": 9.864196830502493e-06, + "loss": 0.1271, + "step": 311 + }, + { + "epoch": 0.42048517520215634, + "grad_norm": 30.41101065594472, + "learning_rate": 9.862930521028621e-06, + "loss": 0.1235, + "step": 312 + }, + { + "epoch": 0.42183288409703507, + "grad_norm": 31.805278673795282, + "learning_rate": 9.86165841705181e-06, + "loss": 0.1467, + "step": 313 + }, + { + "epoch": 0.42318059299191374, + "grad_norm": 26.37179132302691, + "learning_rate": 9.860380520087854e-06, + "loss": 0.1704, + "step": 314 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 53.49814314172076, + "learning_rate": 9.85909683165945e-06, + "loss": 0.1668, + "step": 315 + }, + { + "epoch": 0.42587601078167114, + "grad_norm": 58.57144153948368, + "learning_rate": 9.857807353296195e-06, + "loss": 0.1822, + "step": 316 + }, + { + "epoch": 0.4272237196765499, + "grad_norm": 49.25801457712303, + "learning_rate": 9.856512086534593e-06, + "loss": 0.1305, + "step": 317 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 42.216015200252365, + "learning_rate": 9.855211032918037e-06, + "loss": 0.1518, + "step": 318 + }, + { + "epoch": 0.4299191374663073, + "grad_norm": 58.86153328788254, + "learning_rate": 9.85390419399682e-06, + "loss": 0.2129, + "step": 319 + }, + { + "epoch": 0.431266846361186, + "grad_norm": 72.70582302109207, + "learning_rate": 9.852591571328126e-06, + "loss": 0.1754, + "step": 320 + }, + { + "epoch": 0.4326145552560647, + "grad_norm": 24.405521563368897, + "learning_rate": 9.85127316647603e-06, + "loss": 0.1268, + "step": 321 + }, + { + "epoch": 0.4339622641509434, + "grad_norm": 27.323149777810126, + "learning_rate": 9.8499489810115e-06, + "loss": 0.1465, + "step": 322 + }, + { + "epoch": 0.4353099730458221, + "grad_norm": 10.778868769113878, + "learning_rate": 9.84861901651239e-06, + "loss": 0.1582, + "step": 323 + }, + { + "epoch": 0.4366576819407008, + "grad_norm": 41.47372303603242, + "learning_rate": 9.847283274563441e-06, + "loss": 0.1494, + "step": 324 + }, + { + "epoch": 0.4380053908355795, + "grad_norm": 17.65141456197565, + "learning_rate": 9.84594175675628e-06, + "loss": 0.1919, + "step": 325 + }, + { + "epoch": 0.4393530997304582, + "grad_norm": 54.80693401918428, + "learning_rate": 9.84459446468941e-06, + "loss": 0.1764, + "step": 326 + }, + { + "epoch": 0.44070080862533695, + "grad_norm": 33.5238434221311, + "learning_rate": 9.84324139996822e-06, + "loss": 0.1483, + "step": 327 + }, + { + "epoch": 0.4420485175202156, + "grad_norm": 68.22321426454904, + "learning_rate": 9.841882564204977e-06, + "loss": 0.1526, + "step": 328 + }, + { + "epoch": 0.44339622641509435, + "grad_norm": 27.67323963228479, + "learning_rate": 9.840517959018822e-06, + "loss": 0.1519, + "step": 329 + }, + { + "epoch": 0.444743935309973, + "grad_norm": 12.248572741179483, + "learning_rate": 9.839147586035776e-06, + "loss": 0.1507, + "step": 330 + }, + { + "epoch": 0.44609164420485176, + "grad_norm": 53.081123528533325, + "learning_rate": 9.837771446888721e-06, + "loss": 0.2575, + "step": 331 + }, + { + "epoch": 0.4474393530997305, + "grad_norm": 2.7797168134495567, + "learning_rate": 9.836389543217426e-06, + "loss": 0.1297, + "step": 332 + }, + { + "epoch": 0.44878706199460916, + "grad_norm": 12.16870926813253, + "learning_rate": 9.835001876668517e-06, + "loss": 0.151, + "step": 333 + }, + { + "epoch": 0.4501347708894879, + "grad_norm": 14.304811001132766, + "learning_rate": 9.83360844889549e-06, + "loss": 0.1255, + "step": 334 + }, + { + "epoch": 0.45148247978436656, + "grad_norm": 31.93528796331407, + "learning_rate": 9.832209261558707e-06, + "loss": 0.0999, + "step": 335 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 16.310712405259423, + "learning_rate": 9.830804316325393e-06, + "loss": 0.124, + "step": 336 + }, + { + "epoch": 0.45417789757412397, + "grad_norm": 46.503956083481604, + "learning_rate": 9.82939361486963e-06, + "loss": 0.1614, + "step": 337 + }, + { + "epoch": 0.4555256064690027, + "grad_norm": 44.04360134459606, + "learning_rate": 9.827977158872364e-06, + "loss": 0.1562, + "step": 338 + }, + { + "epoch": 0.4568733153638814, + "grad_norm": 47.66219072087223, + "learning_rate": 9.826554950021397e-06, + "loss": 0.1481, + "step": 339 + }, + { + "epoch": 0.4582210242587601, + "grad_norm": 44.953589268843594, + "learning_rate": 9.825126990011385e-06, + "loss": 0.1251, + "step": 340 + }, + { + "epoch": 0.45956873315363883, + "grad_norm": 30.222043530042995, + "learning_rate": 9.823693280543832e-06, + "loss": 0.1606, + "step": 341 + }, + { + "epoch": 0.4609164420485175, + "grad_norm": 40.922476855807254, + "learning_rate": 9.822253823327103e-06, + "loss": 0.116, + "step": 342 + }, + { + "epoch": 0.46226415094339623, + "grad_norm": 46.64234955676677, + "learning_rate": 9.820808620076403e-06, + "loss": 0.1617, + "step": 343 + }, + { + "epoch": 0.4636118598382749, + "grad_norm": 17.07567621602795, + "learning_rate": 9.81935767251379e-06, + "loss": 0.1878, + "step": 344 + }, + { + "epoch": 0.46495956873315364, + "grad_norm": 35.953665291847074, + "learning_rate": 9.817900982368161e-06, + "loss": 0.137, + "step": 345 + }, + { + "epoch": 0.46630727762803237, + "grad_norm": 22.057858921119426, + "learning_rate": 9.816438551375259e-06, + "loss": 0.1356, + "step": 346 + }, + { + "epoch": 0.46765498652291104, + "grad_norm": 12.33779225122145, + "learning_rate": 9.81497038127767e-06, + "loss": 0.1599, + "step": 347 + }, + { + "epoch": 0.46900269541778977, + "grad_norm": 13.60437560875559, + "learning_rate": 9.81349647382481e-06, + "loss": 0.1456, + "step": 348 + }, + { + "epoch": 0.47035040431266845, + "grad_norm": 29.879416832525852, + "learning_rate": 9.812016830772944e-06, + "loss": 0.1461, + "step": 349 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 35.989502244278064, + "learning_rate": 9.81053145388516e-06, + "loss": 0.1426, + "step": 350 + }, + { + "epoch": 0.47304582210242585, + "grad_norm": 14.93713485362336, + "learning_rate": 9.809040344931382e-06, + "loss": 0.1435, + "step": 351 + }, + { + "epoch": 0.4743935309973046, + "grad_norm": 44.59529803721918, + "learning_rate": 9.807543505688368e-06, + "loss": 0.1696, + "step": 352 + }, + { + "epoch": 0.4757412398921833, + "grad_norm": 53.3373895534619, + "learning_rate": 9.8060409379397e-06, + "loss": 0.173, + "step": 353 + }, + { + "epoch": 0.477088948787062, + "grad_norm": 16.713061217803954, + "learning_rate": 9.804532643475787e-06, + "loss": 0.1594, + "step": 354 + }, + { + "epoch": 0.4784366576819407, + "grad_norm": 6.319391177112821, + "learning_rate": 9.803018624093859e-06, + "loss": 0.129, + "step": 355 + }, + { + "epoch": 0.4797843665768194, + "grad_norm": 10.098766976143285, + "learning_rate": 9.80149888159797e-06, + "loss": 0.1383, + "step": 356 + }, + { + "epoch": 0.4811320754716981, + "grad_norm": 41.64811268405399, + "learning_rate": 9.799973417798998e-06, + "loss": 0.1689, + "step": 357 + }, + { + "epoch": 0.48247978436657685, + "grad_norm": 93.2057202360967, + "learning_rate": 9.79844223451463e-06, + "loss": 0.194, + "step": 358 + }, + { + "epoch": 0.4838274932614555, + "grad_norm": 67.29527065321074, + "learning_rate": 9.796905333569374e-06, + "loss": 0.1849, + "step": 359 + }, + { + "epoch": 0.48517520215633425, + "grad_norm": 93.56026223138542, + "learning_rate": 9.795362716794548e-06, + "loss": 0.2088, + "step": 360 + }, + { + "epoch": 0.4865229110512129, + "grad_norm": 58.093295149838575, + "learning_rate": 9.79381438602828e-06, + "loss": 0.1979, + "step": 361 + }, + { + "epoch": 0.48787061994609165, + "grad_norm": 105.96289844013717, + "learning_rate": 9.792260343115512e-06, + "loss": 0.2494, + "step": 362 + }, + { + "epoch": 0.48921832884097033, + "grad_norm": 76.78203127765165, + "learning_rate": 9.790700589907986e-06, + "loss": 0.2192, + "step": 363 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 92.1134274344442, + "learning_rate": 9.789135128264253e-06, + "loss": 0.2345, + "step": 364 + }, + { + "epoch": 0.4919137466307278, + "grad_norm": 72.13784133871704, + "learning_rate": 9.787563960049665e-06, + "loss": 0.1709, + "step": 365 + }, + { + "epoch": 0.49326145552560646, + "grad_norm": 89.42961847049244, + "learning_rate": 9.785987087136368e-06, + "loss": 0.2037, + "step": 366 + }, + { + "epoch": 0.4946091644204852, + "grad_norm": 87.83182157804875, + "learning_rate": 9.784404511403313e-06, + "loss": 0.2369, + "step": 367 + }, + { + "epoch": 0.49595687331536387, + "grad_norm": 19.606108543647103, + "learning_rate": 9.782816234736246e-06, + "loss": 0.1413, + "step": 368 + }, + { + "epoch": 0.4973045822102426, + "grad_norm": 9.880452109661945, + "learning_rate": 9.781222259027699e-06, + "loss": 0.1444, + "step": 369 + }, + { + "epoch": 0.49865229110512127, + "grad_norm": 25.810533557544776, + "learning_rate": 9.779622586177002e-06, + "loss": 0.1308, + "step": 370 + }, + { + "epoch": 0.5, + "grad_norm": 4.895902869315189, + "learning_rate": 9.77801721809027e-06, + "loss": 0.1315, + "step": 371 + }, + { + "epoch": 0.5013477088948787, + "grad_norm": 6.293261381102398, + "learning_rate": 9.776406156680405e-06, + "loss": 0.1477, + "step": 372 + }, + { + "epoch": 0.5026954177897575, + "grad_norm": 37.21968102490209, + "learning_rate": 9.774789403867095e-06, + "loss": 0.1406, + "step": 373 + }, + { + "epoch": 0.5040431266846361, + "grad_norm": 50.98981275807179, + "learning_rate": 9.773166961576805e-06, + "loss": 0.1469, + "step": 374 + }, + { + "epoch": 0.5053908355795148, + "grad_norm": 56.90633107255107, + "learning_rate": 9.771538831742785e-06, + "loss": 0.1929, + "step": 375 + }, + { + "epoch": 0.5067385444743935, + "grad_norm": 68.77799790919518, + "learning_rate": 9.769905016305055e-06, + "loss": 0.1859, + "step": 376 + }, + { + "epoch": 0.5080862533692723, + "grad_norm": 52.786307147679224, + "learning_rate": 9.768265517210419e-06, + "loss": 0.1497, + "step": 377 + }, + { + "epoch": 0.5094339622641509, + "grad_norm": 56.82549415392909, + "learning_rate": 9.766620336412446e-06, + "loss": 0.1455, + "step": 378 + }, + { + "epoch": 0.5107816711590296, + "grad_norm": 63.88829431499342, + "learning_rate": 9.764969475871477e-06, + "loss": 0.1596, + "step": 379 + }, + { + "epoch": 0.5121293800539084, + "grad_norm": 35.26481290001438, + "learning_rate": 9.763312937554623e-06, + "loss": 0.1569, + "step": 380 + }, + { + "epoch": 0.5134770889487871, + "grad_norm": 36.590771013019875, + "learning_rate": 9.761650723435758e-06, + "loss": 0.1481, + "step": 381 + }, + { + "epoch": 0.5148247978436657, + "grad_norm": 54.36072256267158, + "learning_rate": 9.759982835495519e-06, + "loss": 0.1596, + "step": 382 + }, + { + "epoch": 0.5161725067385444, + "grad_norm": 52.68148973557427, + "learning_rate": 9.758309275721305e-06, + "loss": 0.1362, + "step": 383 + }, + { + "epoch": 0.5175202156334232, + "grad_norm": 44.223697409480906, + "learning_rate": 9.756630046107276e-06, + "loss": 0.1535, + "step": 384 + }, + { + "epoch": 0.5188679245283019, + "grad_norm": 17.32942600511623, + "learning_rate": 9.75494514865434e-06, + "loss": 0.1365, + "step": 385 + }, + { + "epoch": 0.5202156334231806, + "grad_norm": 4.723583046920972, + "learning_rate": 9.753254585370168e-06, + "loss": 0.133, + "step": 386 + }, + { + "epoch": 0.5215633423180593, + "grad_norm": 49.12898339278434, + "learning_rate": 9.751558358269175e-06, + "loss": 0.1929, + "step": 387 + }, + { + "epoch": 0.522911051212938, + "grad_norm": 29.77239306393112, + "learning_rate": 9.74985646937253e-06, + "loss": 0.1555, + "step": 388 + }, + { + "epoch": 0.5242587601078167, + "grad_norm": 67.23397851516717, + "learning_rate": 9.748148920708143e-06, + "loss": 0.1354, + "step": 389 + }, + { + "epoch": 0.5256064690026954, + "grad_norm": 69.44970892499674, + "learning_rate": 9.746435714310673e-06, + "loss": 0.1593, + "step": 390 + }, + { + "epoch": 0.5269541778975741, + "grad_norm": 63.875288049061496, + "learning_rate": 9.74471685222152e-06, + "loss": 0.1912, + "step": 391 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 79.24551574613815, + "learning_rate": 9.742992336488818e-06, + "loss": 0.1799, + "step": 392 + }, + { + "epoch": 0.5296495956873315, + "grad_norm": 88.67339810033933, + "learning_rate": 9.741262169167445e-06, + "loss": 0.2255, + "step": 393 + }, + { + "epoch": 0.5309973045822103, + "grad_norm": 74.30092106575312, + "learning_rate": 9.739526352319007e-06, + "loss": 0.1698, + "step": 394 + }, + { + "epoch": 0.532345013477089, + "grad_norm": 113.14157087022767, + "learning_rate": 9.737784888011847e-06, + "loss": 0.2469, + "step": 395 + }, + { + "epoch": 0.5336927223719676, + "grad_norm": 100.74642314255249, + "learning_rate": 9.736037778321032e-06, + "loss": 0.2645, + "step": 396 + }, + { + "epoch": 0.5350404312668463, + "grad_norm": 51.31556063612228, + "learning_rate": 9.73428502532836e-06, + "loss": 0.1565, + "step": 397 + }, + { + "epoch": 0.5363881401617251, + "grad_norm": 71.26430157986898, + "learning_rate": 9.732526631122352e-06, + "loss": 0.1771, + "step": 398 + }, + { + "epoch": 0.5377358490566038, + "grad_norm": 44.03721842062689, + "learning_rate": 9.73076259779825e-06, + "loss": 0.1153, + "step": 399 + }, + { + "epoch": 0.5390835579514824, + "grad_norm": 52.08775500150747, + "learning_rate": 9.72899292745802e-06, + "loss": 0.1348, + "step": 400 + }, + { + "epoch": 0.5404312668463612, + "grad_norm": 40.35024929600591, + "learning_rate": 9.727217622210337e-06, + "loss": 0.1596, + "step": 401 + }, + { + "epoch": 0.5417789757412399, + "grad_norm": 13.983044813775106, + "learning_rate": 9.725436684170592e-06, + "loss": 0.1748, + "step": 402 + }, + { + "epoch": 0.5431266846361186, + "grad_norm": 10.352642555293139, + "learning_rate": 9.723650115460897e-06, + "loss": 0.1675, + "step": 403 + }, + { + "epoch": 0.5444743935309974, + "grad_norm": 15.384805940070105, + "learning_rate": 9.721857918210064e-06, + "loss": 0.1415, + "step": 404 + }, + { + "epoch": 0.545822102425876, + "grad_norm": 23.440920716466472, + "learning_rate": 9.720060094553613e-06, + "loss": 0.1498, + "step": 405 + }, + { + "epoch": 0.5471698113207547, + "grad_norm": 20.9736647419531, + "learning_rate": 9.71825664663377e-06, + "loss": 0.1374, + "step": 406 + }, + { + "epoch": 0.5485175202156334, + "grad_norm": 46.8999957691335, + "learning_rate": 9.716447576599463e-06, + "loss": 0.1311, + "step": 407 + }, + { + "epoch": 0.5498652291105122, + "grad_norm": 19.339420207927233, + "learning_rate": 9.714632886606319e-06, + "loss": 0.1215, + "step": 408 + }, + { + "epoch": 0.5512129380053908, + "grad_norm": 47.7836296360254, + "learning_rate": 9.71281257881666e-06, + "loss": 0.1594, + "step": 409 + }, + { + "epoch": 0.5525606469002695, + "grad_norm": 62.29759681865715, + "learning_rate": 9.710986655399504e-06, + "loss": 0.1484, + "step": 410 + }, + { + "epoch": 0.5539083557951483, + "grad_norm": 26.783007837322113, + "learning_rate": 9.709155118530557e-06, + "loss": 0.1219, + "step": 411 + }, + { + "epoch": 0.555256064690027, + "grad_norm": 31.383281029362056, + "learning_rate": 9.707317970392218e-06, + "loss": 0.1819, + "step": 412 + }, + { + "epoch": 0.5566037735849056, + "grad_norm": 61.88026906895395, + "learning_rate": 9.705475213173572e-06, + "loss": 0.1412, + "step": 413 + }, + { + "epoch": 0.5579514824797843, + "grad_norm": 50.46853740708549, + "learning_rate": 9.703626849070383e-06, + "loss": 0.141, + "step": 414 + }, + { + "epoch": 0.5592991913746631, + "grad_norm": 39.64156576997649, + "learning_rate": 9.701772880285098e-06, + "loss": 0.1658, + "step": 415 + }, + { + "epoch": 0.5606469002695418, + "grad_norm": 26.547349103374547, + "learning_rate": 9.699913309026848e-06, + "loss": 0.1544, + "step": 416 + }, + { + "epoch": 0.5619946091644205, + "grad_norm": 8.997478673314651, + "learning_rate": 9.698048137511432e-06, + "loss": 0.1501, + "step": 417 + }, + { + "epoch": 0.5633423180592992, + "grad_norm": 36.26215464982081, + "learning_rate": 9.696177367961325e-06, + "loss": 0.14, + "step": 418 + }, + { + "epoch": 0.5646900269541779, + "grad_norm": 14.310533273579404, + "learning_rate": 9.694301002605672e-06, + "loss": 0.1317, + "step": 419 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 8.983107144394515, + "learning_rate": 9.69241904368029e-06, + "loss": 0.1245, + "step": 420 + }, + { + "epoch": 0.5673854447439353, + "grad_norm": 29.30231304437641, + "learning_rate": 9.690531493427652e-06, + "loss": 0.1619, + "step": 421 + }, + { + "epoch": 0.568733153638814, + "grad_norm": 13.182260530262305, + "learning_rate": 9.688638354096902e-06, + "loss": 0.1232, + "step": 422 + }, + { + "epoch": 0.5700808625336927, + "grad_norm": 31.445078983132596, + "learning_rate": 9.68673962794384e-06, + "loss": 0.1397, + "step": 423 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 19.744406752093038, + "learning_rate": 9.684835317230923e-06, + "loss": 0.1328, + "step": 424 + }, + { + "epoch": 0.5727762803234502, + "grad_norm": 39.617326790512315, + "learning_rate": 9.682925424227265e-06, + "loss": 0.1451, + "step": 425 + }, + { + "epoch": 0.5741239892183289, + "grad_norm": 31.028041390584516, + "learning_rate": 9.681009951208627e-06, + "loss": 0.1688, + "step": 426 + }, + { + "epoch": 0.5754716981132075, + "grad_norm": 20.563429803813193, + "learning_rate": 9.679088900457423e-06, + "loss": 0.1216, + "step": 427 + }, + { + "epoch": 0.5768194070080862, + "grad_norm": 27.006696057933993, + "learning_rate": 9.677162274262711e-06, + "loss": 0.1135, + "step": 428 + }, + { + "epoch": 0.578167115902965, + "grad_norm": 8.39248178540939, + "learning_rate": 9.675230074920195e-06, + "loss": 0.1487, + "step": 429 + }, + { + "epoch": 0.5795148247978437, + "grad_norm": 24.301644431371223, + "learning_rate": 9.673292304732216e-06, + "loss": 0.1657, + "step": 430 + }, + { + "epoch": 0.5808625336927223, + "grad_norm": 3.0512600059776895, + "learning_rate": 9.671348966007759e-06, + "loss": 0.1648, + "step": 431 + }, + { + "epoch": 0.5822102425876011, + "grad_norm": 2.7686862052262264, + "learning_rate": 9.669400061062435e-06, + "loss": 0.1464, + "step": 432 + }, + { + "epoch": 0.5835579514824798, + "grad_norm": 20.94114764543427, + "learning_rate": 9.667445592218499e-06, + "loss": 0.1482, + "step": 433 + }, + { + "epoch": 0.5849056603773585, + "grad_norm": 46.674218609560235, + "learning_rate": 9.665485561804824e-06, + "loss": 0.1606, + "step": 434 + }, + { + "epoch": 0.5862533692722371, + "grad_norm": 73.51662407355673, + "learning_rate": 9.663519972156919e-06, + "loss": 0.1881, + "step": 435 + }, + { + "epoch": 0.5876010781671159, + "grad_norm": 64.17473153598924, + "learning_rate": 9.661548825616914e-06, + "loss": 0.1562, + "step": 436 + }, + { + "epoch": 0.5889487870619946, + "grad_norm": 57.55748973136535, + "learning_rate": 9.659572124533559e-06, + "loss": 0.1986, + "step": 437 + }, + { + "epoch": 0.5902964959568733, + "grad_norm": 58.452672808931176, + "learning_rate": 9.657589871262223e-06, + "loss": 0.1386, + "step": 438 + }, + { + "epoch": 0.5916442048517521, + "grad_norm": 65.29401457168774, + "learning_rate": 9.655602068164895e-06, + "loss": 0.1757, + "step": 439 + }, + { + "epoch": 0.5929919137466307, + "grad_norm": 68.16386483658967, + "learning_rate": 9.65360871761017e-06, + "loss": 0.1793, + "step": 440 + }, + { + "epoch": 0.5943396226415094, + "grad_norm": 96.98448110719954, + "learning_rate": 9.65160982197326e-06, + "loss": 0.2105, + "step": 441 + }, + { + "epoch": 0.5956873315363881, + "grad_norm": 79.04657405633344, + "learning_rate": 9.64960538363598e-06, + "loss": 0.1977, + "step": 442 + }, + { + "epoch": 0.5970350404312669, + "grad_norm": 56.47094546492654, + "learning_rate": 9.64759540498675e-06, + "loss": 0.1701, + "step": 443 + }, + { + "epoch": 0.5983827493261455, + "grad_norm": 39.47572717609311, + "learning_rate": 9.645579888420594e-06, + "loss": 0.1605, + "step": 444 + }, + { + "epoch": 0.5997304582210242, + "grad_norm": 43.188913614090644, + "learning_rate": 9.643558836339131e-06, + "loss": 0.1469, + "step": 445 + }, + { + "epoch": 0.601078167115903, + "grad_norm": 41.23951220638007, + "learning_rate": 9.64153225115058e-06, + "loss": 0.1492, + "step": 446 + }, + { + "epoch": 0.6024258760107817, + "grad_norm": 21.440302941577617, + "learning_rate": 9.639500135269749e-06, + "loss": 0.1419, + "step": 447 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 25.180675674797104, + "learning_rate": 9.637462491118041e-06, + "loss": 0.1328, + "step": 448 + }, + { + "epoch": 0.605121293800539, + "grad_norm": 3.3139027890830386, + "learning_rate": 9.635419321123441e-06, + "loss": 0.1306, + "step": 449 + }, + { + "epoch": 0.6064690026954178, + "grad_norm": 55.726178717044576, + "learning_rate": 9.633370627720521e-06, + "loss": 0.1879, + "step": 450 + }, + { + "epoch": 0.6078167115902965, + "grad_norm": 33.09719159714796, + "learning_rate": 9.631316413350438e-06, + "loss": 0.1505, + "step": 451 + }, + { + "epoch": 0.6091644204851752, + "grad_norm": 24.8661091128541, + "learning_rate": 9.62925668046092e-06, + "loss": 0.1297, + "step": 452 + }, + { + "epoch": 0.610512129380054, + "grad_norm": 55.941539687278535, + "learning_rate": 9.627191431506278e-06, + "loss": 0.1198, + "step": 453 + }, + { + "epoch": 0.6118598382749326, + "grad_norm": 74.31406479323235, + "learning_rate": 9.625120668947389e-06, + "loss": 0.1871, + "step": 454 + }, + { + "epoch": 0.6132075471698113, + "grad_norm": 75.42215271824082, + "learning_rate": 9.623044395251709e-06, + "loss": 0.1753, + "step": 455 + }, + { + "epoch": 0.6145552560646901, + "grad_norm": 105.85595473321933, + "learning_rate": 9.620962612893248e-06, + "loss": 0.2231, + "step": 456 + }, + { + "epoch": 0.6159029649595688, + "grad_norm": 33.17330182602598, + "learning_rate": 9.618875324352594e-06, + "loss": 0.1115, + "step": 457 + }, + { + "epoch": 0.6172506738544474, + "grad_norm": 65.1260212631549, + "learning_rate": 9.616782532116883e-06, + "loss": 0.1681, + "step": 458 + }, + { + "epoch": 0.6185983827493261, + "grad_norm": 46.57187565529923, + "learning_rate": 9.614684238679821e-06, + "loss": 0.1535, + "step": 459 + }, + { + "epoch": 0.6199460916442049, + "grad_norm": 56.89740858330641, + "learning_rate": 9.612580446541659e-06, + "loss": 0.1296, + "step": 460 + }, + { + "epoch": 0.6212938005390836, + "grad_norm": 29.22100659364491, + "learning_rate": 9.610471158209206e-06, + "loss": 0.128, + "step": 461 + }, + { + "epoch": 0.6226415094339622, + "grad_norm": 70.71508550657603, + "learning_rate": 9.60835637619582e-06, + "loss": 0.2107, + "step": 462 + }, + { + "epoch": 0.623989218328841, + "grad_norm": 68.97948245390305, + "learning_rate": 9.6062361030214e-06, + "loss": 0.2047, + "step": 463 + }, + { + "epoch": 0.6253369272237197, + "grad_norm": 25.382686585019663, + "learning_rate": 9.604110341212394e-06, + "loss": 0.1296, + "step": 464 + }, + { + "epoch": 0.6266846361185984, + "grad_norm": 29.5225675793953, + "learning_rate": 9.601979093301785e-06, + "loss": 0.1122, + "step": 465 + }, + { + "epoch": 0.628032345013477, + "grad_norm": 23.471245527862205, + "learning_rate": 9.5998423618291e-06, + "loss": 0.1396, + "step": 466 + }, + { + "epoch": 0.6293800539083558, + "grad_norm": 37.42561681333412, + "learning_rate": 9.597700149340392e-06, + "loss": 0.1332, + "step": 467 + }, + { + "epoch": 0.6307277628032345, + "grad_norm": 28.654885537393923, + "learning_rate": 9.59555245838825e-06, + "loss": 0.1171, + "step": 468 + }, + { + "epoch": 0.6320754716981132, + "grad_norm": 49.1377693765772, + "learning_rate": 9.593399291531789e-06, + "loss": 0.1519, + "step": 469 + }, + { + "epoch": 0.633423180592992, + "grad_norm": 27.757127043202914, + "learning_rate": 9.59124065133665e-06, + "loss": 0.1679, + "step": 470 + }, + { + "epoch": 0.6347708894878706, + "grad_norm": 52.76644823698273, + "learning_rate": 9.589076540374998e-06, + "loss": 0.1644, + "step": 471 + }, + { + "epoch": 0.6361185983827493, + "grad_norm": 28.0331118052067, + "learning_rate": 9.586906961225509e-06, + "loss": 0.1233, + "step": 472 + }, + { + "epoch": 0.637466307277628, + "grad_norm": 58.32479225922101, + "learning_rate": 9.584731916473382e-06, + "loss": 0.1686, + "step": 473 + }, + { + "epoch": 0.6388140161725068, + "grad_norm": 26.054640746254268, + "learning_rate": 9.582551408710329e-06, + "loss": 0.2094, + "step": 474 + }, + { + "epoch": 0.6401617250673854, + "grad_norm": 38.00731161381952, + "learning_rate": 9.580365440534567e-06, + "loss": 0.1345, + "step": 475 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 36.5972999519646, + "learning_rate": 9.57817401455082e-06, + "loss": 0.1337, + "step": 476 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 27.390363129861477, + "learning_rate": 9.575977133370318e-06, + "loss": 0.1583, + "step": 477 + }, + { + "epoch": 0.6442048517520216, + "grad_norm": 9.739349946147911, + "learning_rate": 9.573774799610792e-06, + "loss": 0.1352, + "step": 478 + }, + { + "epoch": 0.6455525606469003, + "grad_norm": 14.080654240248998, + "learning_rate": 9.571567015896465e-06, + "loss": 0.1276, + "step": 479 + }, + { + "epoch": 0.6469002695417789, + "grad_norm": 44.71159992313211, + "learning_rate": 9.569353784858059e-06, + "loss": 0.193, + "step": 480 + }, + { + "epoch": 0.6482479784366577, + "grad_norm": 15.140475478506886, + "learning_rate": 9.567135109132786e-06, + "loss": 0.156, + "step": 481 + }, + { + "epoch": 0.6495956873315364, + "grad_norm": 31.03459632213024, + "learning_rate": 9.564910991364342e-06, + "loss": 0.1214, + "step": 482 + }, + { + "epoch": 0.6509433962264151, + "grad_norm": 29.89949237044646, + "learning_rate": 9.562681434202911e-06, + "loss": 0.1784, + "step": 483 + }, + { + "epoch": 0.6522911051212938, + "grad_norm": 67.16498414742483, + "learning_rate": 9.56044644030516e-06, + "loss": 0.1696, + "step": 484 + }, + { + "epoch": 0.6536388140161725, + "grad_norm": 62.506457456600266, + "learning_rate": 9.55820601233423e-06, + "loss": 0.149, + "step": 485 + }, + { + "epoch": 0.6549865229110512, + "grad_norm": 58.076511796889065, + "learning_rate": 9.555960152959737e-06, + "loss": 0.1691, + "step": 486 + }, + { + "epoch": 0.6563342318059299, + "grad_norm": 98.07764535791624, + "learning_rate": 9.553708864857775e-06, + "loss": 0.2005, + "step": 487 + }, + { + "epoch": 0.6576819407008087, + "grad_norm": 66.78574060188438, + "learning_rate": 9.551452150710899e-06, + "loss": 0.1411, + "step": 488 + }, + { + "epoch": 0.6590296495956873, + "grad_norm": 45.574848340391064, + "learning_rate": 9.549190013208135e-06, + "loss": 0.1883, + "step": 489 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 49.6564807056717, + "learning_rate": 9.546922455044966e-06, + "loss": 0.1492, + "step": 490 + }, + { + "epoch": 0.6617250673854448, + "grad_norm": 73.22543656566751, + "learning_rate": 9.544649478923342e-06, + "loss": 0.1665, + "step": 491 + }, + { + "epoch": 0.6630727762803235, + "grad_norm": 72.36142408517122, + "learning_rate": 9.542371087551663e-06, + "loss": 0.1856, + "step": 492 + }, + { + "epoch": 0.6644204851752021, + "grad_norm": 32.133646764157085, + "learning_rate": 9.54008728364478e-06, + "loss": 0.0996, + "step": 493 + }, + { + "epoch": 0.6657681940700808, + "grad_norm": 30.370125033942305, + "learning_rate": 9.537798069923998e-06, + "loss": 0.1513, + "step": 494 + }, + { + "epoch": 0.6671159029649596, + "grad_norm": 20.72972617250389, + "learning_rate": 9.535503449117067e-06, + "loss": 0.1308, + "step": 495 + }, + { + "epoch": 0.6684636118598383, + "grad_norm": 17.575270135065292, + "learning_rate": 9.53320342395818e-06, + "loss": 0.1115, + "step": 496 + }, + { + "epoch": 0.6698113207547169, + "grad_norm": 2.343809976570899, + "learning_rate": 9.530897997187964e-06, + "loss": 0.0925, + "step": 497 + }, + { + "epoch": 0.6711590296495957, + "grad_norm": 30.772149851879053, + "learning_rate": 9.528587171553494e-06, + "loss": 0.1651, + "step": 498 + }, + { + "epoch": 0.6725067385444744, + "grad_norm": 35.61568213544565, + "learning_rate": 9.526270949808268e-06, + "loss": 0.1189, + "step": 499 + }, + { + "epoch": 0.6738544474393531, + "grad_norm": 62.09620312153279, + "learning_rate": 9.523949334712218e-06, + "loss": 0.1611, + "step": 500 + }, + { + "epoch": 0.6752021563342318, + "grad_norm": 45.94424753734832, + "learning_rate": 9.521622329031699e-06, + "loss": 0.1443, + "step": 501 + }, + { + "epoch": 0.6765498652291105, + "grad_norm": 48.01157383607107, + "learning_rate": 9.519289935539495e-06, + "loss": 0.1301, + "step": 502 + }, + { + "epoch": 0.6778975741239892, + "grad_norm": 61.156385722498975, + "learning_rate": 9.516952157014807e-06, + "loss": 0.1288, + "step": 503 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 51.22229588762735, + "learning_rate": 9.51460899624325e-06, + "loss": 0.1802, + "step": 504 + }, + { + "epoch": 0.6805929919137467, + "grad_norm": 66.70942254176441, + "learning_rate": 9.512260456016858e-06, + "loss": 0.1305, + "step": 505 + }, + { + "epoch": 0.6819407008086253, + "grad_norm": 43.55130325020912, + "learning_rate": 9.509906539134069e-06, + "loss": 0.216, + "step": 506 + }, + { + "epoch": 0.683288409703504, + "grad_norm": 43.402672767992556, + "learning_rate": 9.507547248399734e-06, + "loss": 0.1649, + "step": 507 + }, + { + "epoch": 0.6846361185983828, + "grad_norm": 33.602314008301484, + "learning_rate": 9.5051825866251e-06, + "loss": 0.1884, + "step": 508 + }, + { + "epoch": 0.6859838274932615, + "grad_norm": 38.00513059794873, + "learning_rate": 9.50281255662782e-06, + "loss": 0.153, + "step": 509 + }, + { + "epoch": 0.6873315363881402, + "grad_norm": 3.2772012799549306, + "learning_rate": 9.500437161231938e-06, + "loss": 0.1047, + "step": 510 + }, + { + "epoch": 0.6886792452830188, + "grad_norm": 2.7076720657946085, + "learning_rate": 9.4980564032679e-06, + "loss": 0.1493, + "step": 511 + }, + { + "epoch": 0.6900269541778976, + "grad_norm": 4.990358440341732, + "learning_rate": 9.49567028557253e-06, + "loss": 0.1552, + "step": 512 + }, + { + "epoch": 0.6913746630727763, + "grad_norm": 29.033891781067915, + "learning_rate": 9.49327881098905e-06, + "loss": 0.1408, + "step": 513 + }, + { + "epoch": 0.692722371967655, + "grad_norm": 52.579900092702246, + "learning_rate": 9.49088198236706e-06, + "loss": 0.1746, + "step": 514 + }, + { + "epoch": 0.6940700808625337, + "grad_norm": 56.83765835511297, + "learning_rate": 9.488479802562535e-06, + "loss": 0.1222, + "step": 515 + }, + { + "epoch": 0.6954177897574124, + "grad_norm": 48.41220327621307, + "learning_rate": 9.486072274437837e-06, + "loss": 0.1496, + "step": 516 + }, + { + "epoch": 0.6967654986522911, + "grad_norm": 70.73148389423216, + "learning_rate": 9.48365940086169e-06, + "loss": 0.2207, + "step": 517 + }, + { + "epoch": 0.6981132075471698, + "grad_norm": 90.60548979134887, + "learning_rate": 9.481241184709194e-06, + "loss": 0.201, + "step": 518 + }, + { + "epoch": 0.6994609164420486, + "grad_norm": 94.38777098242979, + "learning_rate": 9.478817628861812e-06, + "loss": 0.1953, + "step": 519 + }, + { + "epoch": 0.7008086253369272, + "grad_norm": 57.699533335871585, + "learning_rate": 9.476388736207372e-06, + "loss": 0.1651, + "step": 520 + }, + { + "epoch": 0.7021563342318059, + "grad_norm": 70.01073029125482, + "learning_rate": 9.473954509640062e-06, + "loss": 0.1557, + "step": 521 + }, + { + "epoch": 0.7035040431266847, + "grad_norm": 65.2151729975666, + "learning_rate": 9.471514952060419e-06, + "loss": 0.18, + "step": 522 + }, + { + "epoch": 0.7048517520215634, + "grad_norm": 77.31355712833412, + "learning_rate": 9.469070066375342e-06, + "loss": 0.1406, + "step": 523 + }, + { + "epoch": 0.706199460916442, + "grad_norm": 58.61003222357775, + "learning_rate": 9.46661985549807e-06, + "loss": 0.1706, + "step": 524 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 83.95623505055414, + "learning_rate": 9.464164322348193e-06, + "loss": 0.1934, + "step": 525 + }, + { + "epoch": 0.7088948787061995, + "grad_norm": 51.40189523280256, + "learning_rate": 9.461703469851642e-06, + "loss": 0.1809, + "step": 526 + }, + { + "epoch": 0.7102425876010782, + "grad_norm": 53.46265090732973, + "learning_rate": 9.459237300940683e-06, + "loss": 0.2354, + "step": 527 + }, + { + "epoch": 0.7115902964959568, + "grad_norm": 40.67473791039382, + "learning_rate": 9.456765818553919e-06, + "loss": 0.1394, + "step": 528 + }, + { + "epoch": 0.7129380053908356, + "grad_norm": 5.164864553069915, + "learning_rate": 9.454289025636287e-06, + "loss": 0.1109, + "step": 529 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 7.78808386120813, + "learning_rate": 9.451806925139048e-06, + "loss": 0.1613, + "step": 530 + }, + { + "epoch": 0.715633423180593, + "grad_norm": 9.504852309987978, + "learning_rate": 9.449319520019788e-06, + "loss": 0.1029, + "step": 531 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 12.651185095133341, + "learning_rate": 9.446826813242416e-06, + "loss": 0.1255, + "step": 532 + }, + { + "epoch": 0.7183288409703504, + "grad_norm": 55.51690204037741, + "learning_rate": 9.444328807777155e-06, + "loss": 0.2028, + "step": 533 + }, + { + "epoch": 0.7196765498652291, + "grad_norm": 61.89706229024663, + "learning_rate": 9.441825506600543e-06, + "loss": 0.1405, + "step": 534 + }, + { + "epoch": 0.7210242587601078, + "grad_norm": 53.053278488030664, + "learning_rate": 9.439316912695433e-06, + "loss": 0.1901, + "step": 535 + }, + { + "epoch": 0.7223719676549866, + "grad_norm": 63.50724713529975, + "learning_rate": 9.436803029050976e-06, + "loss": 0.137, + "step": 536 + }, + { + "epoch": 0.7237196765498652, + "grad_norm": 75.29341510767952, + "learning_rate": 9.434283858662632e-06, + "loss": 0.1857, + "step": 537 + }, + { + "epoch": 0.7250673854447439, + "grad_norm": 57.636711615164195, + "learning_rate": 9.431759404532161e-06, + "loss": 0.1359, + "step": 538 + }, + { + "epoch": 0.7264150943396226, + "grad_norm": 71.14768342801014, + "learning_rate": 9.429229669667613e-06, + "loss": 0.1749, + "step": 539 + }, + { + "epoch": 0.7277628032345014, + "grad_norm": 74.22759941220224, + "learning_rate": 9.426694657083335e-06, + "loss": 0.171, + "step": 540 + }, + { + "epoch": 0.72911051212938, + "grad_norm": 46.96691839200615, + "learning_rate": 9.424154369799964e-06, + "loss": 0.1674, + "step": 541 + }, + { + "epoch": 0.7304582210242587, + "grad_norm": 43.641915511493714, + "learning_rate": 9.421608810844418e-06, + "loss": 0.1384, + "step": 542 + }, + { + "epoch": 0.7318059299191375, + "grad_norm": 62.25123414461279, + "learning_rate": 9.419057983249903e-06, + "loss": 0.1217, + "step": 543 + }, + { + "epoch": 0.7331536388140162, + "grad_norm": 38.812143816073544, + "learning_rate": 9.416501890055892e-06, + "loss": 0.1714, + "step": 544 + }, + { + "epoch": 0.7345013477088949, + "grad_norm": 32.96235362954222, + "learning_rate": 9.413940534308142e-06, + "loss": 0.1014, + "step": 545 + }, + { + "epoch": 0.7358490566037735, + "grad_norm": 18.70881752452829, + "learning_rate": 9.411373919058677e-06, + "loss": 0.1622, + "step": 546 + }, + { + "epoch": 0.7371967654986523, + "grad_norm": 16.431298777503326, + "learning_rate": 9.408802047365792e-06, + "loss": 0.1135, + "step": 547 + }, + { + "epoch": 0.738544474393531, + "grad_norm": 26.90945383536926, + "learning_rate": 9.406224922294038e-06, + "loss": 0.1207, + "step": 548 + }, + { + "epoch": 0.7398921832884097, + "grad_norm": 42.26171443915004, + "learning_rate": 9.403642546914231e-06, + "loss": 0.1493, + "step": 549 + }, + { + "epoch": 0.7412398921832885, + "grad_norm": 56.96266653071134, + "learning_rate": 9.401054924303441e-06, + "loss": 0.1637, + "step": 550 + }, + { + "epoch": 0.7425876010781671, + "grad_norm": 32.182429959185995, + "learning_rate": 9.398462057544992e-06, + "loss": 0.1661, + "step": 551 + }, + { + "epoch": 0.7439353099730458, + "grad_norm": 58.119232344893035, + "learning_rate": 9.395863949728458e-06, + "loss": 0.1752, + "step": 552 + }, + { + "epoch": 0.7452830188679245, + "grad_norm": 49.34348725819003, + "learning_rate": 9.393260603949654e-06, + "loss": 0.1747, + "step": 553 + }, + { + "epoch": 0.7466307277628033, + "grad_norm": 81.91184016213379, + "learning_rate": 9.390652023310638e-06, + "loss": 0.1656, + "step": 554 + }, + { + "epoch": 0.7479784366576819, + "grad_norm": 77.13851587981706, + "learning_rate": 9.388038210919706e-06, + "loss": 0.1831, + "step": 555 + }, + { + "epoch": 0.7493261455525606, + "grad_norm": 75.79353957814907, + "learning_rate": 9.38541916989139e-06, + "loss": 0.1928, + "step": 556 + }, + { + "epoch": 0.7506738544474394, + "grad_norm": 77.6531770418516, + "learning_rate": 9.38279490334645e-06, + "loss": 0.206, + "step": 557 + }, + { + "epoch": 0.7520215633423181, + "grad_norm": 68.24873899564115, + "learning_rate": 9.380165414411872e-06, + "loss": 0.1954, + "step": 558 + }, + { + "epoch": 0.7533692722371967, + "grad_norm": 125.23964528490693, + "learning_rate": 9.377530706220865e-06, + "loss": 0.2875, + "step": 559 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 71.95758509987667, + "learning_rate": 9.37489078191286e-06, + "loss": 0.2111, + "step": 560 + }, + { + "epoch": 0.7560646900269542, + "grad_norm": 71.97963069537671, + "learning_rate": 9.372245644633499e-06, + "loss": 0.1807, + "step": 561 + }, + { + "epoch": 0.7574123989218329, + "grad_norm": 69.22767257903705, + "learning_rate": 9.36959529753464e-06, + "loss": 0.1716, + "step": 562 + }, + { + "epoch": 0.7587601078167115, + "grad_norm": 68.31940550156479, + "learning_rate": 9.366939743774344e-06, + "loss": 0.1782, + "step": 563 + }, + { + "epoch": 0.7601078167115903, + "grad_norm": 64.59390928295655, + "learning_rate": 9.36427898651688e-06, + "loss": 0.1659, + "step": 564 + }, + { + "epoch": 0.761455525606469, + "grad_norm": 13.238039341472172, + "learning_rate": 9.361613028932718e-06, + "loss": 0.1155, + "step": 565 + }, + { + "epoch": 0.7628032345013477, + "grad_norm": 37.47706592566639, + "learning_rate": 9.358941874198522e-06, + "loss": 0.1459, + "step": 566 + }, + { + "epoch": 0.7641509433962265, + "grad_norm": 40.44193744856893, + "learning_rate": 9.356265525497146e-06, + "loss": 0.1398, + "step": 567 + }, + { + "epoch": 0.7654986522911051, + "grad_norm": 10.877614312029793, + "learning_rate": 9.353583986017638e-06, + "loss": 0.1255, + "step": 568 + }, + { + "epoch": 0.7668463611859838, + "grad_norm": 23.425603580487014, + "learning_rate": 9.350897258955232e-06, + "loss": 0.1328, + "step": 569 + }, + { + "epoch": 0.7681940700808625, + "grad_norm": 53.49688765956833, + "learning_rate": 9.348205347511337e-06, + "loss": 0.1776, + "step": 570 + }, + { + "epoch": 0.7695417789757413, + "grad_norm": 26.015902139176653, + "learning_rate": 9.345508254893546e-06, + "loss": 0.1563, + "step": 571 + }, + { + "epoch": 0.77088948787062, + "grad_norm": 32.52517645280189, + "learning_rate": 9.34280598431562e-06, + "loss": 0.1345, + "step": 572 + }, + { + "epoch": 0.7722371967654986, + "grad_norm": 51.840737799126046, + "learning_rate": 9.340098538997497e-06, + "loss": 0.1651, + "step": 573 + }, + { + "epoch": 0.7735849056603774, + "grad_norm": 51.941814666760955, + "learning_rate": 9.337385922165275e-06, + "loss": 0.2163, + "step": 574 + }, + { + "epoch": 0.7749326145552561, + "grad_norm": 32.92889378442764, + "learning_rate": 9.334668137051213e-06, + "loss": 0.1585, + "step": 575 + }, + { + "epoch": 0.7762803234501348, + "grad_norm": 75.4983153131046, + "learning_rate": 9.331945186893736e-06, + "loss": 0.1709, + "step": 576 + }, + { + "epoch": 0.7776280323450134, + "grad_norm": 42.780430219604185, + "learning_rate": 9.329217074937418e-06, + "loss": 0.1592, + "step": 577 + }, + { + "epoch": 0.7789757412398922, + "grad_norm": 66.94163238626226, + "learning_rate": 9.326483804432983e-06, + "loss": 0.1754, + "step": 578 + }, + { + "epoch": 0.7803234501347709, + "grad_norm": 81.36095249303122, + "learning_rate": 9.323745378637307e-06, + "loss": 0.1831, + "step": 579 + }, + { + "epoch": 0.7816711590296496, + "grad_norm": 60.62612545488431, + "learning_rate": 9.3210018008134e-06, + "loss": 0.1434, + "step": 580 + }, + { + "epoch": 0.7830188679245284, + "grad_norm": 69.27072706685483, + "learning_rate": 9.318253074230418e-06, + "loss": 0.1848, + "step": 581 + }, + { + "epoch": 0.784366576819407, + "grad_norm": 63.14978406367883, + "learning_rate": 9.315499202163654e-06, + "loss": 0.172, + "step": 582 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 82.1482756771791, + "learning_rate": 9.312740187894524e-06, + "loss": 0.1867, + "step": 583 + }, + { + "epoch": 0.7870619946091644, + "grad_norm": 53.1759568036821, + "learning_rate": 9.309976034710577e-06, + "loss": 0.1191, + "step": 584 + }, + { + "epoch": 0.7884097035040432, + "grad_norm": 25.330124989094518, + "learning_rate": 9.307206745905485e-06, + "loss": 0.1424, + "step": 585 + }, + { + "epoch": 0.7897574123989218, + "grad_norm": 44.86767762925957, + "learning_rate": 9.304432324779038e-06, + "loss": 0.1443, + "step": 586 + }, + { + "epoch": 0.7911051212938005, + "grad_norm": 6.460795039945339, + "learning_rate": 9.30165277463714e-06, + "loss": 0.1513, + "step": 587 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 9.390520527340628, + "learning_rate": 9.298868098791813e-06, + "loss": 0.1575, + "step": 588 + }, + { + "epoch": 0.793800539083558, + "grad_norm": 13.803286780801944, + "learning_rate": 9.29607830056118e-06, + "loss": 0.1125, + "step": 589 + }, + { + "epoch": 0.7951482479784366, + "grad_norm": 28.71103545058596, + "learning_rate": 9.293283383269467e-06, + "loss": 0.1434, + "step": 590 + }, + { + "epoch": 0.7964959568733153, + "grad_norm": 12.429434000065907, + "learning_rate": 9.290483350247008e-06, + "loss": 0.1454, + "step": 591 + }, + { + "epoch": 0.7978436657681941, + "grad_norm": 42.16508253447605, + "learning_rate": 9.287678204830225e-06, + "loss": 0.1564, + "step": 592 + }, + { + "epoch": 0.7991913746630728, + "grad_norm": 45.30007116029795, + "learning_rate": 9.28486795036163e-06, + "loss": 0.1298, + "step": 593 + }, + { + "epoch": 0.8005390835579514, + "grad_norm": 72.66403216817939, + "learning_rate": 9.282052590189833e-06, + "loss": 0.1627, + "step": 594 + }, + { + "epoch": 0.8018867924528302, + "grad_norm": 67.96030970132249, + "learning_rate": 9.279232127669519e-06, + "loss": 0.1717, + "step": 595 + }, + { + "epoch": 0.8032345013477089, + "grad_norm": 20.524235564770155, + "learning_rate": 9.276406566161455e-06, + "loss": 0.1373, + "step": 596 + }, + { + "epoch": 0.8045822102425876, + "grad_norm": 22.688359470659105, + "learning_rate": 9.273575909032485e-06, + "loss": 0.1052, + "step": 597 + }, + { + "epoch": 0.8059299191374663, + "grad_norm": 24.75835612099443, + "learning_rate": 9.270740159655523e-06, + "loss": 0.107, + "step": 598 + }, + { + "epoch": 0.807277628032345, + "grad_norm": 48.568528044239336, + "learning_rate": 9.267899321409552e-06, + "loss": 0.1303, + "step": 599 + }, + { + "epoch": 0.8086253369272237, + "grad_norm": 48.0621926595, + "learning_rate": 9.26505339767962e-06, + "loss": 0.1517, + "step": 600 + }, + { + "epoch": 0.8099730458221024, + "grad_norm": 5.083673567942998, + "learning_rate": 9.262202391856831e-06, + "loss": 0.1179, + "step": 601 + }, + { + "epoch": 0.8113207547169812, + "grad_norm": 28.09300976048735, + "learning_rate": 9.259346307338346e-06, + "loss": 0.1573, + "step": 602 + }, + { + "epoch": 0.8126684636118598, + "grad_norm": 26.503986507491973, + "learning_rate": 9.256485147527384e-06, + "loss": 0.1371, + "step": 603 + }, + { + "epoch": 0.8140161725067385, + "grad_norm": 2.061798379353852, + "learning_rate": 9.253618915833198e-06, + "loss": 0.14, + "step": 604 + }, + { + "epoch": 0.8153638814016172, + "grad_norm": 1.9742610299216936, + "learning_rate": 9.250747615671098e-06, + "loss": 0.1188, + "step": 605 + }, + { + "epoch": 0.816711590296496, + "grad_norm": 1.8574176559268898, + "learning_rate": 9.247871250462427e-06, + "loss": 0.146, + "step": 606 + }, + { + "epoch": 0.8180592991913747, + "grad_norm": 32.710983028288666, + "learning_rate": 9.244989823634562e-06, + "loss": 0.1268, + "step": 607 + }, + { + "epoch": 0.8194070080862533, + "grad_norm": 37.93531717874145, + "learning_rate": 9.242103338620915e-06, + "loss": 0.1468, + "step": 608 + }, + { + "epoch": 0.8207547169811321, + "grad_norm": 41.33098534866461, + "learning_rate": 9.239211798860923e-06, + "loss": 0.1359, + "step": 609 + }, + { + "epoch": 0.8221024258760108, + "grad_norm": 60.70099442646, + "learning_rate": 9.236315207800048e-06, + "loss": 0.1357, + "step": 610 + }, + { + "epoch": 0.8234501347708895, + "grad_norm": 53.57762316954346, + "learning_rate": 9.233413568889766e-06, + "loss": 0.1506, + "step": 611 + }, + { + "epoch": 0.8247978436657682, + "grad_norm": 44.106821106432335, + "learning_rate": 9.230506885587575e-06, + "loss": 0.1162, + "step": 612 + }, + { + "epoch": 0.8261455525606469, + "grad_norm": 59.8514293269243, + "learning_rate": 9.22759516135698e-06, + "loss": 0.161, + "step": 613 + }, + { + "epoch": 0.8274932614555256, + "grad_norm": 46.08836496429359, + "learning_rate": 9.22467839966749e-06, + "loss": 0.1414, + "step": 614 + }, + { + "epoch": 0.8288409703504043, + "grad_norm": 57.73816801234934, + "learning_rate": 9.221756603994622e-06, + "loss": 0.1448, + "step": 615 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 62.047698643470135, + "learning_rate": 9.21882977781989e-06, + "loss": 0.1259, + "step": 616 + }, + { + "epoch": 0.8315363881401617, + "grad_norm": 18.00213942001759, + "learning_rate": 9.215897924630794e-06, + "loss": 0.1211, + "step": 617 + }, + { + "epoch": 0.8328840970350404, + "grad_norm": 16.07135025295571, + "learning_rate": 9.212961047920838e-06, + "loss": 0.095, + "step": 618 + }, + { + "epoch": 0.8342318059299192, + "grad_norm": 5.024515600210648, + "learning_rate": 9.2100191511895e-06, + "loss": 0.1734, + "step": 619 + }, + { + "epoch": 0.8355795148247979, + "grad_norm": 28.49715795265217, + "learning_rate": 9.207072237942245e-06, + "loss": 0.1534, + "step": 620 + }, + { + "epoch": 0.8369272237196765, + "grad_norm": 5.159484572636771, + "learning_rate": 9.204120311690518e-06, + "loss": 0.1083, + "step": 621 + }, + { + "epoch": 0.8382749326145552, + "grad_norm": 7.279269159748703, + "learning_rate": 9.201163375951731e-06, + "loss": 0.1541, + "step": 622 + }, + { + "epoch": 0.839622641509434, + "grad_norm": 19.243323932886028, + "learning_rate": 9.198201434249268e-06, + "loss": 0.1411, + "step": 623 + }, + { + "epoch": 0.8409703504043127, + "grad_norm": 3.6342082432172433, + "learning_rate": 9.195234490112482e-06, + "loss": 0.1213, + "step": 624 + }, + { + "epoch": 0.8423180592991913, + "grad_norm": 45.613718164446055, + "learning_rate": 9.192262547076677e-06, + "loss": 0.1649, + "step": 625 + }, + { + "epoch": 0.8436657681940701, + "grad_norm": 64.5926292519281, + "learning_rate": 9.189285608683123e-06, + "loss": 0.1752, + "step": 626 + }, + { + "epoch": 0.8450134770889488, + "grad_norm": 65.72779069186068, + "learning_rate": 9.18630367847904e-06, + "loss": 0.1472, + "step": 627 + }, + { + "epoch": 0.8463611859838275, + "grad_norm": 66.8512346275039, + "learning_rate": 9.183316760017592e-06, + "loss": 0.2194, + "step": 628 + }, + { + "epoch": 0.8477088948787062, + "grad_norm": 71.27274370365983, + "learning_rate": 9.180324856857892e-06, + "loss": 0.2001, + "step": 629 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 90.86243362237249, + "learning_rate": 9.177327972564988e-06, + "loss": 0.1664, + "step": 630 + }, + { + "epoch": 0.8504043126684636, + "grad_norm": 87.07214685991477, + "learning_rate": 9.174326110709867e-06, + "loss": 0.1758, + "step": 631 + }, + { + "epoch": 0.8517520215633423, + "grad_norm": 92.68860876547659, + "learning_rate": 9.171319274869445e-06, + "loss": 0.1909, + "step": 632 + }, + { + "epoch": 0.8530997304582211, + "grad_norm": 51.83970245749084, + "learning_rate": 9.168307468626568e-06, + "loss": 0.1916, + "step": 633 + }, + { + "epoch": 0.8544474393530997, + "grad_norm": 61.17970116160292, + "learning_rate": 9.165290695569996e-06, + "loss": 0.1822, + "step": 634 + }, + { + "epoch": 0.8557951482479784, + "grad_norm": 105.24232864355892, + "learning_rate": 9.162268959294421e-06, + "loss": 0.2575, + "step": 635 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 59.18474032780993, + "learning_rate": 9.159242263400435e-06, + "loss": 0.1822, + "step": 636 + }, + { + "epoch": 0.8584905660377359, + "grad_norm": 73.62081944492928, + "learning_rate": 9.15621061149455e-06, + "loss": 0.176, + "step": 637 + }, + { + "epoch": 0.8598382749326146, + "grad_norm": 41.49156354189563, + "learning_rate": 9.153174007189178e-06, + "loss": 0.1382, + "step": 638 + }, + { + "epoch": 0.8611859838274932, + "grad_norm": 26.95056509004383, + "learning_rate": 9.150132454102635e-06, + "loss": 0.1106, + "step": 639 + }, + { + "epoch": 0.862533692722372, + "grad_norm": 25.519594608760567, + "learning_rate": 9.14708595585913e-06, + "loss": 0.1556, + "step": 640 + }, + { + "epoch": 0.8638814016172507, + "grad_norm": 19.782785599284818, + "learning_rate": 9.14403451608877e-06, + "loss": 0.1264, + "step": 641 + }, + { + "epoch": 0.8652291105121294, + "grad_norm": 2.9344852191136455, + "learning_rate": 9.140978138427543e-06, + "loss": 0.1681, + "step": 642 + }, + { + "epoch": 0.866576819407008, + "grad_norm": 3.01735631179554, + "learning_rate": 9.13791682651733e-06, + "loss": 0.165, + "step": 643 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 44.99745746721842, + "learning_rate": 9.13485058400588e-06, + "loss": 0.1546, + "step": 644 + }, + { + "epoch": 0.8692722371967655, + "grad_norm": 35.40742175325014, + "learning_rate": 9.13177941454683e-06, + "loss": 0.1362, + "step": 645 + }, + { + "epoch": 0.8706199460916442, + "grad_norm": 18.365379413480024, + "learning_rate": 9.128703321799676e-06, + "loss": 0.143, + "step": 646 + }, + { + "epoch": 0.871967654986523, + "grad_norm": 54.40508850663043, + "learning_rate": 9.125622309429792e-06, + "loss": 0.1451, + "step": 647 + }, + { + "epoch": 0.8733153638814016, + "grad_norm": 87.8288388055285, + "learning_rate": 9.1225363811084e-06, + "loss": 0.2133, + "step": 648 + }, + { + "epoch": 0.8746630727762803, + "grad_norm": 79.23664306320171, + "learning_rate": 9.119445540512592e-06, + "loss": 0.1851, + "step": 649 + }, + { + "epoch": 0.876010781671159, + "grad_norm": 77.2608919149798, + "learning_rate": 9.116349791325307e-06, + "loss": 0.1708, + "step": 650 + }, + { + "epoch": 0.8773584905660378, + "grad_norm": 100.85906205016863, + "learning_rate": 9.113249137235338e-06, + "loss": 0.2371, + "step": 651 + }, + { + "epoch": 0.8787061994609164, + "grad_norm": 84.14647795829706, + "learning_rate": 9.110143581937314e-06, + "loss": 0.2024, + "step": 652 + }, + { + "epoch": 0.8800539083557951, + "grad_norm": 66.74757682898255, + "learning_rate": 9.107033129131714e-06, + "loss": 0.1857, + "step": 653 + }, + { + "epoch": 0.8814016172506739, + "grad_norm": 96.49921371553492, + "learning_rate": 9.103917782524847e-06, + "loss": 0.2243, + "step": 654 + }, + { + "epoch": 0.8827493261455526, + "grad_norm": 93.8391352957189, + "learning_rate": 9.10079754582885e-06, + "loss": 0.2199, + "step": 655 + }, + { + "epoch": 0.8840970350404312, + "grad_norm": 59.53691798038528, + "learning_rate": 9.097672422761697e-06, + "loss": 0.1656, + "step": 656 + }, + { + "epoch": 0.8854447439353099, + "grad_norm": 42.933658869000745, + "learning_rate": 9.094542417047177e-06, + "loss": 0.1278, + "step": 657 + }, + { + "epoch": 0.8867924528301887, + "grad_norm": 78.64236486378812, + "learning_rate": 9.091407532414895e-06, + "loss": 0.1879, + "step": 658 + }, + { + "epoch": 0.8881401617250674, + "grad_norm": 53.69614616137512, + "learning_rate": 9.088267772600276e-06, + "loss": 0.1583, + "step": 659 + }, + { + "epoch": 0.889487870619946, + "grad_norm": 35.4804687975775, + "learning_rate": 9.085123141344552e-06, + "loss": 0.1519, + "step": 660 + }, + { + "epoch": 0.8908355795148248, + "grad_norm": 17.63418221886193, + "learning_rate": 9.081973642394758e-06, + "loss": 0.1215, + "step": 661 + }, + { + "epoch": 0.8921832884097035, + "grad_norm": 20.956362756612698, + "learning_rate": 9.078819279503727e-06, + "loss": 0.1427, + "step": 662 + }, + { + "epoch": 0.8935309973045822, + "grad_norm": 22.20700284866621, + "learning_rate": 9.075660056430096e-06, + "loss": 0.1501, + "step": 663 + }, + { + "epoch": 0.894878706199461, + "grad_norm": 2.439616874548088, + "learning_rate": 9.072495976938285e-06, + "loss": 0.1592, + "step": 664 + }, + { + "epoch": 0.8962264150943396, + "grad_norm": 5.074223956260454, + "learning_rate": 9.069327044798506e-06, + "loss": 0.144, + "step": 665 + }, + { + "epoch": 0.8975741239892183, + "grad_norm": 18.819206604517962, + "learning_rate": 9.066153263786745e-06, + "loss": 0.1534, + "step": 666 + }, + { + "epoch": 0.898921832884097, + "grad_norm": 31.696549770171035, + "learning_rate": 9.06297463768478e-06, + "loss": 0.1332, + "step": 667 + }, + { + "epoch": 0.9002695417789758, + "grad_norm": 65.97852786144544, + "learning_rate": 9.059791170280148e-06, + "loss": 0.168, + "step": 668 + }, + { + "epoch": 0.9016172506738545, + "grad_norm": 50.55604057070273, + "learning_rate": 9.056602865366163e-06, + "loss": 0.1776, + "step": 669 + }, + { + "epoch": 0.9029649595687331, + "grad_norm": 85.89807095861163, + "learning_rate": 9.0534097267419e-06, + "loss": 0.2106, + "step": 670 + }, + { + "epoch": 0.9043126684636119, + "grad_norm": 59.66211066048071, + "learning_rate": 9.050211758212197e-06, + "loss": 0.1767, + "step": 671 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 108.98278384225277, + "learning_rate": 9.04700896358764e-06, + "loss": 0.2312, + "step": 672 + }, + { + "epoch": 0.9070080862533693, + "grad_norm": 71.95575514596848, + "learning_rate": 9.043801346684576e-06, + "loss": 0.2224, + "step": 673 + }, + { + "epoch": 0.9083557951482479, + "grad_norm": 85.12406357495809, + "learning_rate": 9.040588911325087e-06, + "loss": 0.2118, + "step": 674 + }, + { + "epoch": 0.9097035040431267, + "grad_norm": 56.63147241029235, + "learning_rate": 9.037371661337006e-06, + "loss": 0.174, + "step": 675 + }, + { + "epoch": 0.9110512129380054, + "grad_norm": 88.21489698704377, + "learning_rate": 9.0341496005539e-06, + "loss": 0.1899, + "step": 676 + }, + { + "epoch": 0.9123989218328841, + "grad_norm": 63.43663443602954, + "learning_rate": 9.030922732815061e-06, + "loss": 0.1703, + "step": 677 + }, + { + "epoch": 0.9137466307277629, + "grad_norm": 92.48079121446479, + "learning_rate": 9.02769106196552e-06, + "loss": 0.2451, + "step": 678 + }, + { + "epoch": 0.9150943396226415, + "grad_norm": 74.36571955781272, + "learning_rate": 9.024454591856024e-06, + "loss": 0.1664, + "step": 679 + }, + { + "epoch": 0.9164420485175202, + "grad_norm": 68.00948208617552, + "learning_rate": 9.021213326343043e-06, + "loss": 0.172, + "step": 680 + }, + { + "epoch": 0.9177897574123989, + "grad_norm": 35.928021301984, + "learning_rate": 9.017967269288759e-06, + "loss": 0.1495, + "step": 681 + }, + { + "epoch": 0.9191374663072777, + "grad_norm": 60.436879052146914, + "learning_rate": 9.01471642456106e-06, + "loss": 0.1823, + "step": 682 + }, + { + "epoch": 0.9204851752021563, + "grad_norm": 39.18066406227695, + "learning_rate": 9.011460796033548e-06, + "loss": 0.1432, + "step": 683 + }, + { + "epoch": 0.921832884097035, + "grad_norm": 35.691088358831706, + "learning_rate": 9.008200387585513e-06, + "loss": 0.1691, + "step": 684 + }, + { + "epoch": 0.9231805929919138, + "grad_norm": 34.00000210896791, + "learning_rate": 9.004935203101951e-06, + "loss": 0.1221, + "step": 685 + }, + { + "epoch": 0.9245283018867925, + "grad_norm": 25.468486859486163, + "learning_rate": 9.001665246473545e-06, + "loss": 0.1267, + "step": 686 + }, + { + "epoch": 0.9258760107816711, + "grad_norm": 27.956678211865146, + "learning_rate": 8.998390521596663e-06, + "loss": 0.1195, + "step": 687 + }, + { + "epoch": 0.9272237196765498, + "grad_norm": 6.9672148673323555, + "learning_rate": 8.995111032373357e-06, + "loss": 0.1197, + "step": 688 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 46.69568601249259, + "learning_rate": 8.991826782711353e-06, + "loss": 0.1684, + "step": 689 + }, + { + "epoch": 0.9299191374663073, + "grad_norm": 31.418590227669963, + "learning_rate": 8.988537776524053e-06, + "loss": 0.1518, + "step": 690 + }, + { + "epoch": 0.931266846361186, + "grad_norm": 30.11415468845868, + "learning_rate": 8.985244017730524e-06, + "loss": 0.1051, + "step": 691 + }, + { + "epoch": 0.9326145552560647, + "grad_norm": 24.768118419148248, + "learning_rate": 8.981945510255501e-06, + "loss": 0.1512, + "step": 692 + }, + { + "epoch": 0.9339622641509434, + "grad_norm": 54.89909397876401, + "learning_rate": 8.978642258029369e-06, + "loss": 0.1276, + "step": 693 + }, + { + "epoch": 0.9353099730458221, + "grad_norm": 57.81663554080905, + "learning_rate": 8.975334264988172e-06, + "loss": 0.1634, + "step": 694 + }, + { + "epoch": 0.9366576819407008, + "grad_norm": 62.27857402184147, + "learning_rate": 8.972021535073605e-06, + "loss": 0.1799, + "step": 695 + }, + { + "epoch": 0.9380053908355795, + "grad_norm": 32.496742419215586, + "learning_rate": 8.968704072233002e-06, + "loss": 0.1273, + "step": 696 + }, + { + "epoch": 0.9393530997304582, + "grad_norm": 73.05886930879416, + "learning_rate": 8.965381880419339e-06, + "loss": 0.1708, + "step": 697 + }, + { + "epoch": 0.9407008086253369, + "grad_norm": 69.14501996306903, + "learning_rate": 8.96205496359123e-06, + "loss": 0.1704, + "step": 698 + }, + { + "epoch": 0.9420485175202157, + "grad_norm": 18.25728068277052, + "learning_rate": 8.958723325712912e-06, + "loss": 0.1144, + "step": 699 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 12.609634078907927, + "learning_rate": 8.955386970754255e-06, + "loss": 0.133, + "step": 700 + }, + { + "epoch": 0.944743935309973, + "grad_norm": 13.626809945373157, + "learning_rate": 8.952045902690742e-06, + "loss": 0.1112, + "step": 701 + }, + { + "epoch": 0.9460916442048517, + "grad_norm": 12.292371423058762, + "learning_rate": 8.948700125503482e-06, + "loss": 0.0945, + "step": 702 + }, + { + "epoch": 0.9474393530997305, + "grad_norm": 1.6109238233664116, + "learning_rate": 8.945349643179186e-06, + "loss": 0.1204, + "step": 703 + }, + { + "epoch": 0.9487870619946092, + "grad_norm": 3.931188715353685, + "learning_rate": 8.941994459710175e-06, + "loss": 0.123, + "step": 704 + }, + { + "epoch": 0.9501347708894878, + "grad_norm": 14.077270155369943, + "learning_rate": 8.938634579094373e-06, + "loss": 0.1511, + "step": 705 + }, + { + "epoch": 0.9514824797843666, + "grad_norm": 45.00047274119425, + "learning_rate": 8.9352700053353e-06, + "loss": 0.1686, + "step": 706 + }, + { + "epoch": 0.9528301886792453, + "grad_norm": 22.5966162820501, + "learning_rate": 8.931900742442066e-06, + "loss": 0.112, + "step": 707 + }, + { + "epoch": 0.954177897574124, + "grad_norm": 57.15606553507399, + "learning_rate": 8.928526794429373e-06, + "loss": 0.2011, + "step": 708 + }, + { + "epoch": 0.9555256064690026, + "grad_norm": 55.13109510371408, + "learning_rate": 8.925148165317499e-06, + "loss": 0.1424, + "step": 709 + }, + { + "epoch": 0.9568733153638814, + "grad_norm": 105.01758207516417, + "learning_rate": 8.921764859132308e-06, + "loss": 0.2365, + "step": 710 + }, + { + "epoch": 0.9582210242587601, + "grad_norm": 49.440456870933644, + "learning_rate": 8.918376879905229e-06, + "loss": 0.144, + "step": 711 + }, + { + "epoch": 0.9595687331536388, + "grad_norm": 60.19032296340638, + "learning_rate": 8.914984231673265e-06, + "loss": 0.1191, + "step": 712 + }, + { + "epoch": 0.9609164420485176, + "grad_norm": 61.58292507306651, + "learning_rate": 8.91158691847898e-06, + "loss": 0.1742, + "step": 713 + }, + { + "epoch": 0.9622641509433962, + "grad_norm": 50.8531364890058, + "learning_rate": 8.908184944370499e-06, + "loss": 0.133, + "step": 714 + }, + { + "epoch": 0.9636118598382749, + "grad_norm": 101.74723552770821, + "learning_rate": 8.904778313401497e-06, + "loss": 0.2692, + "step": 715 + }, + { + "epoch": 0.9649595687331537, + "grad_norm": 68.54029931601121, + "learning_rate": 8.901367029631199e-06, + "loss": 0.1859, + "step": 716 + }, + { + "epoch": 0.9663072776280324, + "grad_norm": 73.22527767515342, + "learning_rate": 8.897951097124378e-06, + "loss": 0.1879, + "step": 717 + }, + { + "epoch": 0.967654986522911, + "grad_norm": 44.57376688098822, + "learning_rate": 8.894530519951339e-06, + "loss": 0.1303, + "step": 718 + }, + { + "epoch": 0.9690026954177897, + "grad_norm": 45.59278603016841, + "learning_rate": 8.89110530218793e-06, + "loss": 0.1331, + "step": 719 + }, + { + "epoch": 0.9703504043126685, + "grad_norm": 27.93193484661389, + "learning_rate": 8.88767544791552e-06, + "loss": 0.0957, + "step": 720 + }, + { + "epoch": 0.9716981132075472, + "grad_norm": 36.42902195333915, + "learning_rate": 8.884240961221011e-06, + "loss": 0.1212, + "step": 721 + }, + { + "epoch": 0.9730458221024259, + "grad_norm": 40.13034342255332, + "learning_rate": 8.880801846196818e-06, + "loss": 0.1676, + "step": 722 + }, + { + "epoch": 0.9743935309973046, + "grad_norm": 25.749130999023798, + "learning_rate": 8.877358106940875e-06, + "loss": 0.1323, + "step": 723 + }, + { + "epoch": 0.9757412398921833, + "grad_norm": 19.076903162113254, + "learning_rate": 8.873909747556623e-06, + "loss": 0.0835, + "step": 724 + }, + { + "epoch": 0.977088948787062, + "grad_norm": 9.287222600495818, + "learning_rate": 8.870456772153014e-06, + "loss": 0.1394, + "step": 725 + }, + { + "epoch": 0.9784366576819407, + "grad_norm": 63.410212624826976, + "learning_rate": 8.866999184844492e-06, + "loss": 0.1611, + "step": 726 + }, + { + "epoch": 0.9797843665768194, + "grad_norm": 36.576528138784234, + "learning_rate": 8.863536989751003e-06, + "loss": 0.0989, + "step": 727 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 16.712024598590077, + "learning_rate": 8.86007019099798e-06, + "loss": 0.1655, + "step": 728 + }, + { + "epoch": 0.9824797843665768, + "grad_norm": 10.254155246070992, + "learning_rate": 8.856598792716345e-06, + "loss": 0.1053, + "step": 729 + }, + { + "epoch": 0.9838274932614556, + "grad_norm": 35.61940723179949, + "learning_rate": 8.853122799042493e-06, + "loss": 0.1741, + "step": 730 + }, + { + "epoch": 0.9851752021563343, + "grad_norm": 49.794560679173046, + "learning_rate": 8.849642214118305e-06, + "loss": 0.1205, + "step": 731 + }, + { + "epoch": 0.9865229110512129, + "grad_norm": 55.63073048952866, + "learning_rate": 8.846157042091128e-06, + "loss": 0.1995, + "step": 732 + }, + { + "epoch": 0.9878706199460916, + "grad_norm": 60.33934589537435, + "learning_rate": 8.842667287113773e-06, + "loss": 0.1321, + "step": 733 + }, + { + "epoch": 0.9892183288409704, + "grad_norm": 55.305054305115824, + "learning_rate": 8.839172953344513e-06, + "loss": 0.1903, + "step": 734 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 72.21173117359581, + "learning_rate": 8.835674044947078e-06, + "loss": 0.2148, + "step": 735 + }, + { + "epoch": 0.9919137466307277, + "grad_norm": 56.961204874714745, + "learning_rate": 8.83217056609065e-06, + "loss": 0.1717, + "step": 736 + }, + { + "epoch": 0.9932614555256065, + "grad_norm": 32.92010961445034, + "learning_rate": 8.828662520949854e-06, + "loss": 0.1161, + "step": 737 + }, + { + "epoch": 0.9946091644204852, + "grad_norm": 18.788935473521704, + "learning_rate": 8.825149913704756e-06, + "loss": 0.1788, + "step": 738 + }, + { + "epoch": 0.9959568733153639, + "grad_norm": 43.571977995928286, + "learning_rate": 8.821632748540862e-06, + "loss": 0.1801, + "step": 739 + }, + { + "epoch": 0.9973045822102425, + "grad_norm": 2.851169927531399, + "learning_rate": 8.818111029649105e-06, + "loss": 0.1603, + "step": 740 + }, + { + "epoch": 0.9986522911051213, + "grad_norm": 10.2092002843653, + "learning_rate": 8.81458476122585e-06, + "loss": 0.1536, + "step": 741 + }, + { + "epoch": 1.0, + "grad_norm": 1.5455237388203742, + "learning_rate": 8.811053947472873e-06, + "loss": 0.105, + "step": 742 + }, + { + "epoch": 1.0013477088948788, + "grad_norm": 11.340132955172777, + "learning_rate": 8.807518592597375e-06, + "loss": 0.1047, + "step": 743 + }, + { + "epoch": 1.0026954177897573, + "grad_norm": 4.623000458699269, + "learning_rate": 8.803978700811964e-06, + "loss": 0.0893, + "step": 744 + }, + { + "epoch": 1.0040431266846361, + "grad_norm": 8.883596607941477, + "learning_rate": 8.800434276334652e-06, + "loss": 0.1097, + "step": 745 + }, + { + "epoch": 1.005390835579515, + "grad_norm": 8.967387279142535, + "learning_rate": 8.796885323388862e-06, + "loss": 0.124, + "step": 746 + }, + { + "epoch": 1.0067385444743935, + "grad_norm": 17.873771978345882, + "learning_rate": 8.7933318462034e-06, + "loss": 0.1145, + "step": 747 + }, + { + "epoch": 1.0080862533692723, + "grad_norm": 10.591609172898059, + "learning_rate": 8.789773849012471e-06, + "loss": 0.1179, + "step": 748 + }, + { + "epoch": 1.009433962264151, + "grad_norm": 18.936732828072287, + "learning_rate": 8.786211336055664e-06, + "loss": 0.1215, + "step": 749 + }, + { + "epoch": 1.0107816711590296, + "grad_norm": 25.519160112986864, + "learning_rate": 8.782644311577946e-06, + "loss": 0.1198, + "step": 750 + }, + { + "epoch": 1.0121293800539084, + "grad_norm": 13.084725031770748, + "learning_rate": 8.779072779829664e-06, + "loss": 0.1208, + "step": 751 + }, + { + "epoch": 1.013477088948787, + "grad_norm": 18.004005422555174, + "learning_rate": 8.775496745066533e-06, + "loss": 0.14, + "step": 752 + }, + { + "epoch": 1.0148247978436657, + "grad_norm": 25.990486755686874, + "learning_rate": 8.771916211549638e-06, + "loss": 0.1506, + "step": 753 + }, + { + "epoch": 1.0161725067385445, + "grad_norm": 11.839542705695088, + "learning_rate": 8.76833118354542e-06, + "loss": 0.114, + "step": 754 + }, + { + "epoch": 1.017520215633423, + "grad_norm": 17.06844617984455, + "learning_rate": 8.764741665325672e-06, + "loss": 0.1185, + "step": 755 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 9.660763433411988, + "learning_rate": 8.761147661167549e-06, + "loss": 0.101, + "step": 756 + }, + { + "epoch": 1.0202156334231807, + "grad_norm": 7.856258064548563, + "learning_rate": 8.757549175353536e-06, + "loss": 0.1162, + "step": 757 + }, + { + "epoch": 1.0215633423180592, + "grad_norm": 19.765957043727, + "learning_rate": 8.753946212171476e-06, + "loss": 0.1156, + "step": 758 + }, + { + "epoch": 1.022911051212938, + "grad_norm": 3.517156318134228, + "learning_rate": 8.750338775914532e-06, + "loss": 0.1452, + "step": 759 + }, + { + "epoch": 1.0242587601078168, + "grad_norm": 6.408355621081997, + "learning_rate": 8.746726870881204e-06, + "loss": 0.1237, + "step": 760 + }, + { + "epoch": 1.0256064690026954, + "grad_norm": 22.55520994225022, + "learning_rate": 8.743110501375314e-06, + "loss": 0.1319, + "step": 761 + }, + { + "epoch": 1.0269541778975741, + "grad_norm": 10.040812364947387, + "learning_rate": 8.739489671706007e-06, + "loss": 0.1321, + "step": 762 + }, + { + "epoch": 1.028301886792453, + "grad_norm": 20.547854604799287, + "learning_rate": 8.73586438618774e-06, + "loss": 0.157, + "step": 763 + }, + { + "epoch": 1.0296495956873315, + "grad_norm": 30.04520788148507, + "learning_rate": 8.73223464914028e-06, + "loss": 0.1257, + "step": 764 + }, + { + "epoch": 1.0309973045822103, + "grad_norm": 24.513560241202686, + "learning_rate": 8.728600464888698e-06, + "loss": 0.1208, + "step": 765 + }, + { + "epoch": 1.0323450134770888, + "grad_norm": 44.9028736739974, + "learning_rate": 8.724961837763368e-06, + "loss": 0.1464, + "step": 766 + }, + { + "epoch": 1.0336927223719676, + "grad_norm": 30.238847254060765, + "learning_rate": 8.721318772099949e-06, + "loss": 0.1398, + "step": 767 + }, + { + "epoch": 1.0350404312668464, + "grad_norm": 36.63006509117649, + "learning_rate": 8.717671272239398e-06, + "loss": 0.1293, + "step": 768 + }, + { + "epoch": 1.036388140161725, + "grad_norm": 21.936510747044686, + "learning_rate": 8.71401934252795e-06, + "loss": 0.1105, + "step": 769 + }, + { + "epoch": 1.0377358490566038, + "grad_norm": 43.52132958778894, + "learning_rate": 8.710362987317124e-06, + "loss": 0.1317, + "step": 770 + }, + { + "epoch": 1.0390835579514826, + "grad_norm": 12.954583898614654, + "learning_rate": 8.706702210963706e-06, + "loss": 0.0962, + "step": 771 + }, + { + "epoch": 1.0404312668463611, + "grad_norm": 32.92712018900622, + "learning_rate": 8.703037017829753e-06, + "loss": 0.1077, + "step": 772 + }, + { + "epoch": 1.04177897574124, + "grad_norm": 19.97785540411922, + "learning_rate": 8.699367412282584e-06, + "loss": 0.1195, + "step": 773 + }, + { + "epoch": 1.0431266846361187, + "grad_norm": 8.205480113240736, + "learning_rate": 8.69569339869478e-06, + "loss": 0.0999, + "step": 774 + }, + { + "epoch": 1.0444743935309972, + "grad_norm": 3.101413138915814, + "learning_rate": 8.692014981444166e-06, + "loss": 0.1189, + "step": 775 + }, + { + "epoch": 1.045822102425876, + "grad_norm": 2.283055948197573, + "learning_rate": 8.688332164913822e-06, + "loss": 0.1262, + "step": 776 + }, + { + "epoch": 1.0471698113207548, + "grad_norm": 14.883821889265258, + "learning_rate": 8.684644953492067e-06, + "loss": 0.1148, + "step": 777 + }, + { + "epoch": 1.0485175202156334, + "grad_norm": 14.105431251894263, + "learning_rate": 8.680953351572456e-06, + "loss": 0.0997, + "step": 778 + }, + { + "epoch": 1.0498652291105122, + "grad_norm": 22.467684573486178, + "learning_rate": 8.677257363553778e-06, + "loss": 0.1314, + "step": 779 + }, + { + "epoch": 1.0512129380053907, + "grad_norm": 23.659981510383574, + "learning_rate": 8.673556993840046e-06, + "loss": 0.1085, + "step": 780 + }, + { + "epoch": 1.0525606469002695, + "grad_norm": 6.24970816922605, + "learning_rate": 8.669852246840495e-06, + "loss": 0.0933, + "step": 781 + }, + { + "epoch": 1.0539083557951483, + "grad_norm": 23.39593907305167, + "learning_rate": 8.666143126969576e-06, + "loss": 0.1, + "step": 782 + }, + { + "epoch": 1.0552560646900269, + "grad_norm": 8.123130264542544, + "learning_rate": 8.662429638646948e-06, + "loss": 0.1028, + "step": 783 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 5.014099155441371, + "learning_rate": 8.65871178629748e-06, + "loss": 0.1683, + "step": 784 + }, + { + "epoch": 1.0579514824797844, + "grad_norm": 22.45602975784216, + "learning_rate": 8.654989574351241e-06, + "loss": 0.1204, + "step": 785 + }, + { + "epoch": 1.059299191374663, + "grad_norm": 3.761458016938634, + "learning_rate": 8.651263007243489e-06, + "loss": 0.0766, + "step": 786 + }, + { + "epoch": 1.0606469002695418, + "grad_norm": 26.31373869619325, + "learning_rate": 8.647532089414674e-06, + "loss": 0.1354, + "step": 787 + }, + { + "epoch": 1.0619946091644206, + "grad_norm": 29.31979080474117, + "learning_rate": 8.643796825310432e-06, + "loss": 0.0972, + "step": 788 + }, + { + "epoch": 1.0633423180592991, + "grad_norm": 3.1166588638371735, + "learning_rate": 8.640057219381582e-06, + "loss": 0.0815, + "step": 789 + }, + { + "epoch": 1.064690026954178, + "grad_norm": 3.0935968743732727, + "learning_rate": 8.636313276084104e-06, + "loss": 0.1332, + "step": 790 + }, + { + "epoch": 1.0660377358490567, + "grad_norm": 2.0381472263754365, + "learning_rate": 8.632564999879156e-06, + "loss": 0.092, + "step": 791 + }, + { + "epoch": 1.0673854447439353, + "grad_norm": 2.9863599251597286, + "learning_rate": 8.62881239523306e-06, + "loss": 0.1011, + "step": 792 + }, + { + "epoch": 1.068733153638814, + "grad_norm": 22.236597342549704, + "learning_rate": 8.625055466617288e-06, + "loss": 0.1308, + "step": 793 + }, + { + "epoch": 1.0700808625336928, + "grad_norm": 19.54289116147246, + "learning_rate": 8.621294218508471e-06, + "loss": 0.1504, + "step": 794 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 23.186974190964627, + "learning_rate": 8.617528655388384e-06, + "loss": 0.1418, + "step": 795 + }, + { + "epoch": 1.0727762803234502, + "grad_norm": 14.344827393082216, + "learning_rate": 8.613758781743945e-06, + "loss": 0.1048, + "step": 796 + }, + { + "epoch": 1.0741239892183287, + "grad_norm": 47.985507621264404, + "learning_rate": 8.609984602067206e-06, + "loss": 0.1237, + "step": 797 + }, + { + "epoch": 1.0754716981132075, + "grad_norm": 4.067939320892777, + "learning_rate": 8.606206120855351e-06, + "loss": 0.1095, + "step": 798 + }, + { + "epoch": 1.0768194070080863, + "grad_norm": 30.002605399813007, + "learning_rate": 8.602423342610692e-06, + "loss": 0.1034, + "step": 799 + }, + { + "epoch": 1.0781671159029649, + "grad_norm": 24.10756382596904, + "learning_rate": 8.598636271840658e-06, + "loss": 0.0954, + "step": 800 + }, + { + "epoch": 1.0795148247978437, + "grad_norm": 5.583677920814206, + "learning_rate": 8.594844913057796e-06, + "loss": 0.1198, + "step": 801 + }, + { + "epoch": 1.0808625336927224, + "grad_norm": 15.313668915568401, + "learning_rate": 8.591049270779757e-06, + "loss": 0.1027, + "step": 802 + }, + { + "epoch": 1.082210242587601, + "grad_norm": 17.894389378861558, + "learning_rate": 8.587249349529303e-06, + "loss": 0.1154, + "step": 803 + }, + { + "epoch": 1.0835579514824798, + "grad_norm": 6.333767080485184, + "learning_rate": 8.583445153834286e-06, + "loss": 0.1363, + "step": 804 + }, + { + "epoch": 1.0849056603773586, + "grad_norm": 14.170300301344652, + "learning_rate": 8.579636688227663e-06, + "loss": 0.1139, + "step": 805 + }, + { + "epoch": 1.0862533692722371, + "grad_norm": 10.468221050141898, + "learning_rate": 8.575823957247466e-06, + "loss": 0.1076, + "step": 806 + }, + { + "epoch": 1.087601078167116, + "grad_norm": 19.99153416646693, + "learning_rate": 8.572006965436822e-06, + "loss": 0.0889, + "step": 807 + }, + { + "epoch": 1.0889487870619945, + "grad_norm": 19.802441158637322, + "learning_rate": 8.568185717343923e-06, + "loss": 0.1256, + "step": 808 + }, + { + "epoch": 1.0902964959568733, + "grad_norm": 41.906144607956435, + "learning_rate": 8.564360217522045e-06, + "loss": 0.1495, + "step": 809 + }, + { + "epoch": 1.091644204851752, + "grad_norm": 47.05578540512516, + "learning_rate": 8.560530470529519e-06, + "loss": 0.1405, + "step": 810 + }, + { + "epoch": 1.0929919137466306, + "grad_norm": 30.4121951336433, + "learning_rate": 8.556696480929739e-06, + "loss": 0.0942, + "step": 811 + }, + { + "epoch": 1.0943396226415094, + "grad_norm": 37.22719907198771, + "learning_rate": 8.552858253291163e-06, + "loss": 0.1716, + "step": 812 + }, + { + "epoch": 1.0956873315363882, + "grad_norm": 37.73810759469924, + "learning_rate": 8.54901579218729e-06, + "loss": 0.1651, + "step": 813 + }, + { + "epoch": 1.0970350404312668, + "grad_norm": 14.545364613313549, + "learning_rate": 8.545169102196666e-06, + "loss": 0.1211, + "step": 814 + }, + { + "epoch": 1.0983827493261455, + "grad_norm": 52.23942729910137, + "learning_rate": 8.541318187902879e-06, + "loss": 0.1608, + "step": 815 + }, + { + "epoch": 1.0997304582210243, + "grad_norm": 37.72948666452416, + "learning_rate": 8.537463053894543e-06, + "loss": 0.1335, + "step": 816 + }, + { + "epoch": 1.101078167115903, + "grad_norm": 69.27972411598832, + "learning_rate": 8.533603704765308e-06, + "loss": 0.1447, + "step": 817 + }, + { + "epoch": 1.1024258760107817, + "grad_norm": 48.53283267252406, + "learning_rate": 8.529740145113842e-06, + "loss": 0.1471, + "step": 818 + }, + { + "epoch": 1.1037735849056605, + "grad_norm": 31.91467697093213, + "learning_rate": 8.525872379543833e-06, + "loss": 0.1179, + "step": 819 + }, + { + "epoch": 1.105121293800539, + "grad_norm": 28.69389972243588, + "learning_rate": 8.522000412663978e-06, + "loss": 0.124, + "step": 820 + }, + { + "epoch": 1.1064690026954178, + "grad_norm": 4.451855529460002, + "learning_rate": 8.518124249087983e-06, + "loss": 0.1045, + "step": 821 + }, + { + "epoch": 1.1078167115902966, + "grad_norm": 24.292741678071025, + "learning_rate": 8.514243893434549e-06, + "loss": 0.1154, + "step": 822 + }, + { + "epoch": 1.1091644204851752, + "grad_norm": 5.032537821458209, + "learning_rate": 8.51035935032738e-06, + "loss": 0.1137, + "step": 823 + }, + { + "epoch": 1.110512129380054, + "grad_norm": 8.18379018339823, + "learning_rate": 8.506470624395164e-06, + "loss": 0.1352, + "step": 824 + }, + { + "epoch": 1.1118598382749325, + "grad_norm": 27.16147119349904, + "learning_rate": 8.502577720271576e-06, + "loss": 0.1289, + "step": 825 + }, + { + "epoch": 1.1132075471698113, + "grad_norm": 40.66282941574022, + "learning_rate": 8.498680642595268e-06, + "loss": 0.1188, + "step": 826 + }, + { + "epoch": 1.11455525606469, + "grad_norm": 32.88591494022974, + "learning_rate": 8.494779396009864e-06, + "loss": 0.1081, + "step": 827 + }, + { + "epoch": 1.1159029649595686, + "grad_norm": 28.528139128791448, + "learning_rate": 8.49087398516396e-06, + "loss": 0.1115, + "step": 828 + }, + { + "epoch": 1.1172506738544474, + "grad_norm": 29.996184490170503, + "learning_rate": 8.486964414711107e-06, + "loss": 0.1189, + "step": 829 + }, + { + "epoch": 1.1185983827493262, + "grad_norm": 22.24433579990352, + "learning_rate": 8.48305068930982e-06, + "loss": 0.1415, + "step": 830 + }, + { + "epoch": 1.1199460916442048, + "grad_norm": 2.444616706538533, + "learning_rate": 8.479132813623558e-06, + "loss": 0.099, + "step": 831 + }, + { + "epoch": 1.1212938005390836, + "grad_norm": 2.1167940530990736, + "learning_rate": 8.475210792320733e-06, + "loss": 0.0806, + "step": 832 + }, + { + "epoch": 1.1226415094339623, + "grad_norm": 2.2954828429661305, + "learning_rate": 8.471284630074688e-06, + "loss": 0.1171, + "step": 833 + }, + { + "epoch": 1.123989218328841, + "grad_norm": 12.624610406366203, + "learning_rate": 8.467354331563709e-06, + "loss": 0.119, + "step": 834 + }, + { + "epoch": 1.1253369272237197, + "grad_norm": 34.73243825110076, + "learning_rate": 8.463419901471002e-06, + "loss": 0.1223, + "step": 835 + }, + { + "epoch": 1.1266846361185983, + "grad_norm": 18.23699945587048, + "learning_rate": 8.459481344484704e-06, + "loss": 0.1014, + "step": 836 + }, + { + "epoch": 1.128032345013477, + "grad_norm": 4.2574858836325955, + "learning_rate": 8.455538665297862e-06, + "loss": 0.0874, + "step": 837 + }, + { + "epoch": 1.1293800539083558, + "grad_norm": 11.126948272282787, + "learning_rate": 8.451591868608443e-06, + "loss": 0.1416, + "step": 838 + }, + { + "epoch": 1.1307277628032346, + "grad_norm": 4.866081235999342, + "learning_rate": 8.447640959119312e-06, + "loss": 0.1104, + "step": 839 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 13.482018214887708, + "learning_rate": 8.443685941538242e-06, + "loss": 0.1424, + "step": 840 + }, + { + "epoch": 1.133423180592992, + "grad_norm": 6.459429216693512, + "learning_rate": 8.439726820577895e-06, + "loss": 0.1133, + "step": 841 + }, + { + "epoch": 1.1347708894878705, + "grad_norm": 6.778492872467015, + "learning_rate": 8.435763600955827e-06, + "loss": 0.1242, + "step": 842 + }, + { + "epoch": 1.1361185983827493, + "grad_norm": 25.88764497444041, + "learning_rate": 8.431796287394476e-06, + "loss": 0.1232, + "step": 843 + }, + { + "epoch": 1.137466307277628, + "grad_norm": 18.497385222212717, + "learning_rate": 8.427824884621156e-06, + "loss": 0.0946, + "step": 844 + }, + { + "epoch": 1.1388140161725067, + "grad_norm": 44.54430347400736, + "learning_rate": 8.423849397368058e-06, + "loss": 0.1548, + "step": 845 + }, + { + "epoch": 1.1401617250673854, + "grad_norm": 33.10489969737261, + "learning_rate": 8.419869830372237e-06, + "loss": 0.0999, + "step": 846 + }, + { + "epoch": 1.1415094339622642, + "grad_norm": 45.561458817288184, + "learning_rate": 8.41588618837561e-06, + "loss": 0.1177, + "step": 847 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 37.90071712718538, + "learning_rate": 8.411898476124949e-06, + "loss": 0.1151, + "step": 848 + }, + { + "epoch": 1.1442048517520216, + "grad_norm": 44.458893864143825, + "learning_rate": 8.407906698371878e-06, + "loss": 0.1386, + "step": 849 + }, + { + "epoch": 1.1455525606469004, + "grad_norm": 45.754433596632225, + "learning_rate": 8.40391085987286e-06, + "loss": 0.1414, + "step": 850 + }, + { + "epoch": 1.146900269541779, + "grad_norm": 22.544352196641043, + "learning_rate": 8.399910965389206e-06, + "loss": 0.1372, + "step": 851 + }, + { + "epoch": 1.1482479784366577, + "grad_norm": 47.72532414224419, + "learning_rate": 8.395907019687051e-06, + "loss": 0.1137, + "step": 852 + }, + { + "epoch": 1.1495956873315363, + "grad_norm": 26.139256205737066, + "learning_rate": 8.391899027537362e-06, + "loss": 0.08, + "step": 853 + }, + { + "epoch": 1.150943396226415, + "grad_norm": 29.659509232869837, + "learning_rate": 8.387886993715924e-06, + "loss": 0.1174, + "step": 854 + }, + { + "epoch": 1.1522911051212938, + "grad_norm": 28.43976991946177, + "learning_rate": 8.383870923003345e-06, + "loss": 0.1011, + "step": 855 + }, + { + "epoch": 1.1536388140161726, + "grad_norm": 6.85682354494262, + "learning_rate": 8.379850820185034e-06, + "loss": 0.136, + "step": 856 + }, + { + "epoch": 1.1549865229110512, + "grad_norm": 14.82537791684829, + "learning_rate": 8.375826690051213e-06, + "loss": 0.0977, + "step": 857 + }, + { + "epoch": 1.15633423180593, + "grad_norm": 2.7542314001317556, + "learning_rate": 8.371798537396895e-06, + "loss": 0.1143, + "step": 858 + }, + { + "epoch": 1.1576819407008085, + "grad_norm": 19.405616835571298, + "learning_rate": 8.367766367021895e-06, + "loss": 0.0735, + "step": 859 + }, + { + "epoch": 1.1590296495956873, + "grad_norm": 50.25959492337316, + "learning_rate": 8.363730183730802e-06, + "loss": 0.1678, + "step": 860 + }, + { + "epoch": 1.1603773584905661, + "grad_norm": 58.15587650962807, + "learning_rate": 8.359689992333005e-06, + "loss": 0.1535, + "step": 861 + }, + { + "epoch": 1.1617250673854447, + "grad_norm": 7.7911463080227055, + "learning_rate": 8.35564579764265e-06, + "loss": 0.1313, + "step": 862 + }, + { + "epoch": 1.1630727762803235, + "grad_norm": 38.09122378075344, + "learning_rate": 8.35159760447867e-06, + "loss": 0.1214, + "step": 863 + }, + { + "epoch": 1.1644204851752022, + "grad_norm": 21.70230617533526, + "learning_rate": 8.347545417664749e-06, + "loss": 0.1041, + "step": 864 + }, + { + "epoch": 1.1657681940700808, + "grad_norm": 33.38756044603026, + "learning_rate": 8.343489242029337e-06, + "loss": 0.0938, + "step": 865 + }, + { + "epoch": 1.1671159029649596, + "grad_norm": 39.97866309691813, + "learning_rate": 8.339429082405634e-06, + "loss": 0.1204, + "step": 866 + }, + { + "epoch": 1.1684636118598384, + "grad_norm": 42.365494469113635, + "learning_rate": 8.335364943631591e-06, + "loss": 0.0869, + "step": 867 + }, + { + "epoch": 1.169811320754717, + "grad_norm": 50.55376723178448, + "learning_rate": 8.331296830549898e-06, + "loss": 0.1204, + "step": 868 + }, + { + "epoch": 1.1711590296495957, + "grad_norm": 30.45074922728905, + "learning_rate": 8.327224748007977e-06, + "loss": 0.1077, + "step": 869 + }, + { + "epoch": 1.1725067385444743, + "grad_norm": 63.92946391916935, + "learning_rate": 8.323148700857984e-06, + "loss": 0.1392, + "step": 870 + }, + { + "epoch": 1.173854447439353, + "grad_norm": 54.481414137308136, + "learning_rate": 8.319068693956803e-06, + "loss": 0.1522, + "step": 871 + }, + { + "epoch": 1.1752021563342319, + "grad_norm": 5.0102576684726925, + "learning_rate": 8.314984732166025e-06, + "loss": 0.1188, + "step": 872 + }, + { + "epoch": 1.1765498652291104, + "grad_norm": 24.860753636902622, + "learning_rate": 8.310896820351966e-06, + "loss": 0.1052, + "step": 873 + }, + { + "epoch": 1.1778975741239892, + "grad_norm": 18.811044955157666, + "learning_rate": 8.306804963385639e-06, + "loss": 0.1046, + "step": 874 + }, + { + "epoch": 1.179245283018868, + "grad_norm": 7.621425204619976, + "learning_rate": 8.302709166142765e-06, + "loss": 0.1034, + "step": 875 + }, + { + "epoch": 1.1805929919137466, + "grad_norm": 15.671968063343714, + "learning_rate": 8.298609433503754e-06, + "loss": 0.1142, + "step": 876 + }, + { + "epoch": 1.1819407008086253, + "grad_norm": 15.440515393442167, + "learning_rate": 8.294505770353711e-06, + "loss": 0.1044, + "step": 877 + }, + { + "epoch": 1.1832884097035041, + "grad_norm": 19.381427581843667, + "learning_rate": 8.29039818158242e-06, + "loss": 0.1261, + "step": 878 + }, + { + "epoch": 1.1846361185983827, + "grad_norm": 22.994553169806004, + "learning_rate": 8.286286672084346e-06, + "loss": 0.1235, + "step": 879 + }, + { + "epoch": 1.1859838274932615, + "grad_norm": 20.69491974207516, + "learning_rate": 8.28217124675862e-06, + "loss": 0.1025, + "step": 880 + }, + { + "epoch": 1.18733153638814, + "grad_norm": 16.523924719593147, + "learning_rate": 8.278051910509048e-06, + "loss": 0.0823, + "step": 881 + }, + { + "epoch": 1.1886792452830188, + "grad_norm": 15.172820448626965, + "learning_rate": 8.273928668244088e-06, + "loss": 0.105, + "step": 882 + }, + { + "epoch": 1.1900269541778976, + "grad_norm": 34.71408771196782, + "learning_rate": 8.269801524876859e-06, + "loss": 0.1113, + "step": 883 + }, + { + "epoch": 1.1913746630727764, + "grad_norm": 20.407048964747712, + "learning_rate": 8.26567048532512e-06, + "loss": 0.1502, + "step": 884 + }, + { + "epoch": 1.192722371967655, + "grad_norm": 59.089150498395675, + "learning_rate": 8.261535554511282e-06, + "loss": 0.1487, + "step": 885 + }, + { + "epoch": 1.1940700808625337, + "grad_norm": 50.99728617591187, + "learning_rate": 8.257396737362386e-06, + "loss": 0.1217, + "step": 886 + }, + { + "epoch": 1.1954177897574123, + "grad_norm": 7.220560348903716, + "learning_rate": 8.253254038810106e-06, + "loss": 0.113, + "step": 887 + }, + { + "epoch": 1.196765498652291, + "grad_norm": 26.747897315791068, + "learning_rate": 8.249107463790742e-06, + "loss": 0.1348, + "step": 888 + }, + { + "epoch": 1.1981132075471699, + "grad_norm": 49.77819396943203, + "learning_rate": 8.244957017245212e-06, + "loss": 0.1523, + "step": 889 + }, + { + "epoch": 1.1994609164420484, + "grad_norm": 12.738977523076596, + "learning_rate": 8.240802704119046e-06, + "loss": 0.0946, + "step": 890 + }, + { + "epoch": 1.2008086253369272, + "grad_norm": 10.458964017643767, + "learning_rate": 8.236644529362384e-06, + "loss": 0.074, + "step": 891 + }, + { + "epoch": 1.202156334231806, + "grad_norm": 4.444722114979576, + "learning_rate": 8.232482497929965e-06, + "loss": 0.1256, + "step": 892 + }, + { + "epoch": 1.2035040431266846, + "grad_norm": 16.425951014091332, + "learning_rate": 8.228316614781124e-06, + "loss": 0.1157, + "step": 893 + }, + { + "epoch": 1.2048517520215634, + "grad_norm": 20.735049054659413, + "learning_rate": 8.224146884879786e-06, + "loss": 0.0882, + "step": 894 + }, + { + "epoch": 1.2061994609164421, + "grad_norm": 11.461935070101037, + "learning_rate": 8.219973313194461e-06, + "loss": 0.1171, + "step": 895 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 3.9700442119966803, + "learning_rate": 8.215795904698234e-06, + "loss": 0.1123, + "step": 896 + }, + { + "epoch": 1.2088948787061995, + "grad_norm": 22.550591448454927, + "learning_rate": 8.211614664368764e-06, + "loss": 0.1222, + "step": 897 + }, + { + "epoch": 1.210242587601078, + "grad_norm": 47.42152074507092, + "learning_rate": 8.207429597188275e-06, + "loss": 0.1264, + "step": 898 + }, + { + "epoch": 1.2115902964959568, + "grad_norm": 24.354574735172918, + "learning_rate": 8.20324070814355e-06, + "loss": 0.1085, + "step": 899 + }, + { + "epoch": 1.2129380053908356, + "grad_norm": 1.9437402240317392, + "learning_rate": 8.199048002225927e-06, + "loss": 0.1045, + "step": 900 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 29.05227249436911, + "learning_rate": 8.194851484431291e-06, + "loss": 0.115, + "step": 901 + }, + { + "epoch": 1.215633423180593, + "grad_norm": 12.783951208242675, + "learning_rate": 8.190651159760075e-06, + "loss": 0.1335, + "step": 902 + }, + { + "epoch": 1.2169811320754718, + "grad_norm": 7.67370355395683, + "learning_rate": 8.18644703321724e-06, + "loss": 0.1121, + "step": 903 + }, + { + "epoch": 1.2183288409703503, + "grad_norm": 6.5215203844610885, + "learning_rate": 8.18223910981228e-06, + "loss": 0.1173, + "step": 904 + }, + { + "epoch": 1.219676549865229, + "grad_norm": 5.9487983027165034, + "learning_rate": 8.178027394559213e-06, + "loss": 0.1074, + "step": 905 + }, + { + "epoch": 1.221024258760108, + "grad_norm": 5.662232965102052, + "learning_rate": 8.17381189247658e-06, + "loss": 0.1114, + "step": 906 + }, + { + "epoch": 1.2223719676549865, + "grad_norm": 13.34423797196247, + "learning_rate": 8.169592608587427e-06, + "loss": 0.1147, + "step": 907 + }, + { + "epoch": 1.2237196765498652, + "grad_norm": 26.50882820593779, + "learning_rate": 8.165369547919308e-06, + "loss": 0.1053, + "step": 908 + }, + { + "epoch": 1.225067385444744, + "grad_norm": 1.6839624536579263, + "learning_rate": 8.16114271550428e-06, + "loss": 0.1059, + "step": 909 + }, + { + "epoch": 1.2264150943396226, + "grad_norm": 7.86692348738743, + "learning_rate": 8.156912116378897e-06, + "loss": 0.1058, + "step": 910 + }, + { + "epoch": 1.2277628032345014, + "grad_norm": 18.25630007803528, + "learning_rate": 8.152677755584192e-06, + "loss": 0.1059, + "step": 911 + }, + { + "epoch": 1.2291105121293802, + "grad_norm": 11.362114432031028, + "learning_rate": 8.148439638165688e-06, + "loss": 0.0926, + "step": 912 + }, + { + "epoch": 1.2304582210242587, + "grad_norm": 17.844862816169307, + "learning_rate": 8.144197769173381e-06, + "loss": 0.1272, + "step": 913 + }, + { + "epoch": 1.2318059299191375, + "grad_norm": 18.63329620218173, + "learning_rate": 8.139952153661738e-06, + "loss": 0.1254, + "step": 914 + }, + { + "epoch": 1.233153638814016, + "grad_norm": 5.733730940751637, + "learning_rate": 8.135702796689693e-06, + "loss": 0.1064, + "step": 915 + }, + { + "epoch": 1.2345013477088949, + "grad_norm": 6.867568659787743, + "learning_rate": 8.131449703320633e-06, + "loss": 0.1273, + "step": 916 + }, + { + "epoch": 1.2358490566037736, + "grad_norm": 25.475187915507775, + "learning_rate": 8.127192878622398e-06, + "loss": 0.1015, + "step": 917 + }, + { + "epoch": 1.2371967654986522, + "grad_norm": 2.880284411830121, + "learning_rate": 8.12293232766728e-06, + "loss": 0.09, + "step": 918 + }, + { + "epoch": 1.238544474393531, + "grad_norm": 16.044330986561764, + "learning_rate": 8.118668055532003e-06, + "loss": 0.1048, + "step": 919 + }, + { + "epoch": 1.2398921832884098, + "grad_norm": 5.934346510991366, + "learning_rate": 8.114400067297733e-06, + "loss": 0.0994, + "step": 920 + }, + { + "epoch": 1.2412398921832883, + "grad_norm": 25.194221950191935, + "learning_rate": 8.110128368050056e-06, + "loss": 0.1221, + "step": 921 + }, + { + "epoch": 1.2425876010781671, + "grad_norm": 10.581148491361258, + "learning_rate": 8.105852962878987e-06, + "loss": 0.1021, + "step": 922 + }, + { + "epoch": 1.243935309973046, + "grad_norm": 5.221806380207294, + "learning_rate": 8.10157385687895e-06, + "loss": 0.111, + "step": 923 + }, + { + "epoch": 1.2452830188679245, + "grad_norm": 29.676605036166546, + "learning_rate": 8.097291055148785e-06, + "loss": 0.1087, + "step": 924 + }, + { + "epoch": 1.2466307277628033, + "grad_norm": 24.214966868066234, + "learning_rate": 8.093004562791736e-06, + "loss": 0.1303, + "step": 925 + }, + { + "epoch": 1.2479784366576818, + "grad_norm": 9.959387201884962, + "learning_rate": 8.088714384915437e-06, + "loss": 0.0921, + "step": 926 + }, + { + "epoch": 1.2493261455525606, + "grad_norm": 11.090297064669507, + "learning_rate": 8.084420526631918e-06, + "loss": 0.0922, + "step": 927 + }, + { + "epoch": 1.2506738544474394, + "grad_norm": 3.110132169230399, + "learning_rate": 8.080122993057598e-06, + "loss": 0.1083, + "step": 928 + }, + { + "epoch": 1.2520215633423182, + "grad_norm": 40.334465940765654, + "learning_rate": 8.07582178931327e-06, + "loss": 0.0985, + "step": 929 + }, + { + "epoch": 1.2533692722371967, + "grad_norm": 8.185940506919685, + "learning_rate": 8.071516920524105e-06, + "loss": 0.1207, + "step": 930 + }, + { + "epoch": 1.2547169811320755, + "grad_norm": 5.087707388969879, + "learning_rate": 8.067208391819637e-06, + "loss": 0.1107, + "step": 931 + }, + { + "epoch": 1.256064690026954, + "grad_norm": 7.856767157655405, + "learning_rate": 8.06289620833376e-06, + "loss": 0.1956, + "step": 932 + }, + { + "epoch": 1.2574123989218329, + "grad_norm": 11.137137223777419, + "learning_rate": 8.058580375204728e-06, + "loss": 0.0896, + "step": 933 + }, + { + "epoch": 1.2587601078167117, + "grad_norm": 7.907574417143278, + "learning_rate": 8.054260897575143e-06, + "loss": 0.0964, + "step": 934 + }, + { + "epoch": 1.2601078167115902, + "grad_norm": 30.668364546854864, + "learning_rate": 8.049937780591944e-06, + "loss": 0.1, + "step": 935 + }, + { + "epoch": 1.261455525606469, + "grad_norm": 14.080082750728149, + "learning_rate": 8.045611029406412e-06, + "loss": 0.1156, + "step": 936 + }, + { + "epoch": 1.2628032345013476, + "grad_norm": 36.48176354712488, + "learning_rate": 8.041280649174161e-06, + "loss": 0.1132, + "step": 937 + }, + { + "epoch": 1.2641509433962264, + "grad_norm": 11.017302872293405, + "learning_rate": 8.036946645055117e-06, + "loss": 0.1142, + "step": 938 + }, + { + "epoch": 1.2654986522911051, + "grad_norm": 32.59182058582043, + "learning_rate": 8.032609022213539e-06, + "loss": 0.0929, + "step": 939 + }, + { + "epoch": 1.266846361185984, + "grad_norm": 22.47718690309813, + "learning_rate": 8.028267785817988e-06, + "loss": 0.1081, + "step": 940 + }, + { + "epoch": 1.2681940700808625, + "grad_norm": 33.409413856998945, + "learning_rate": 8.023922941041336e-06, + "loss": 0.1278, + "step": 941 + }, + { + "epoch": 1.2695417789757413, + "grad_norm": 3.187385533120976, + "learning_rate": 8.01957449306075e-06, + "loss": 0.0809, + "step": 942 + }, + { + "epoch": 1.2708894878706198, + "grad_norm": 25.191470832180023, + "learning_rate": 8.015222447057694e-06, + "loss": 0.0988, + "step": 943 + }, + { + "epoch": 1.2722371967654986, + "grad_norm": 25.284847963659182, + "learning_rate": 8.010866808217917e-06, + "loss": 0.095, + "step": 944 + }, + { + "epoch": 1.2735849056603774, + "grad_norm": 5.653445466065407, + "learning_rate": 8.006507581731453e-06, + "loss": 0.1474, + "step": 945 + }, + { + "epoch": 1.2749326145552562, + "grad_norm": 6.208429597258365, + "learning_rate": 8.002144772792608e-06, + "loss": 0.0747, + "step": 946 + }, + { + "epoch": 1.2762803234501348, + "grad_norm": 7.154969886351634, + "learning_rate": 7.997778386599955e-06, + "loss": 0.0777, + "step": 947 + }, + { + "epoch": 1.2776280323450135, + "grad_norm": 14.475594478981877, + "learning_rate": 7.993408428356336e-06, + "loss": 0.1162, + "step": 948 + }, + { + "epoch": 1.278975741239892, + "grad_norm": 14.447874068209902, + "learning_rate": 7.989034903268837e-06, + "loss": 0.0877, + "step": 949 + }, + { + "epoch": 1.280323450134771, + "grad_norm": 18.12903236911136, + "learning_rate": 7.98465781654881e-06, + "loss": 0.0989, + "step": 950 + }, + { + "epoch": 1.2816711590296497, + "grad_norm": 18.358560764245194, + "learning_rate": 7.980277173411838e-06, + "loss": 0.0902, + "step": 951 + }, + { + "epoch": 1.2830188679245282, + "grad_norm": 12.976138623800194, + "learning_rate": 7.975892979077751e-06, + "loss": 0.0975, + "step": 952 + }, + { + "epoch": 1.284366576819407, + "grad_norm": 16.208384052366934, + "learning_rate": 7.9715052387706e-06, + "loss": 0.1089, + "step": 953 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 49.37826926347926, + "learning_rate": 7.967113957718674e-06, + "loss": 0.1304, + "step": 954 + }, + { + "epoch": 1.2870619946091644, + "grad_norm": 16.18848807386049, + "learning_rate": 7.962719141154469e-06, + "loss": 0.1409, + "step": 955 + }, + { + "epoch": 1.2884097035040432, + "grad_norm": 8.410331356302077, + "learning_rate": 7.958320794314702e-06, + "loss": 0.0709, + "step": 956 + }, + { + "epoch": 1.289757412398922, + "grad_norm": 7.160655839539009, + "learning_rate": 7.953918922440295e-06, + "loss": 0.1089, + "step": 957 + }, + { + "epoch": 1.2911051212938005, + "grad_norm": 3.593216847083426, + "learning_rate": 7.949513530776367e-06, + "loss": 0.1422, + "step": 958 + }, + { + "epoch": 1.2924528301886793, + "grad_norm": 20.142293920515357, + "learning_rate": 7.945104624572233e-06, + "loss": 0.0744, + "step": 959 + }, + { + "epoch": 1.2938005390835579, + "grad_norm": 25.893471545718896, + "learning_rate": 7.940692209081396e-06, + "loss": 0.0851, + "step": 960 + }, + { + "epoch": 1.2951482479784366, + "grad_norm": 38.21276723562215, + "learning_rate": 7.936276289561543e-06, + "loss": 0.1405, + "step": 961 + }, + { + "epoch": 1.2964959568733154, + "grad_norm": 19.933176697795734, + "learning_rate": 7.93185687127453e-06, + "loss": 0.0996, + "step": 962 + }, + { + "epoch": 1.297843665768194, + "grad_norm": 61.21573111534167, + "learning_rate": 7.92743395948639e-06, + "loss": 0.1264, + "step": 963 + }, + { + "epoch": 1.2991913746630728, + "grad_norm": 28.962499222623318, + "learning_rate": 7.923007559467313e-06, + "loss": 0.1567, + "step": 964 + }, + { + "epoch": 1.3005390835579516, + "grad_norm": 43.93393133618395, + "learning_rate": 7.918577676491643e-06, + "loss": 0.1135, + "step": 965 + }, + { + "epoch": 1.3018867924528301, + "grad_norm": 34.42151552091543, + "learning_rate": 7.914144315837883e-06, + "loss": 0.0903, + "step": 966 + }, + { + "epoch": 1.303234501347709, + "grad_norm": 34.02280380104221, + "learning_rate": 7.909707482788674e-06, + "loss": 0.1079, + "step": 967 + }, + { + "epoch": 1.3045822102425877, + "grad_norm": 32.73822371477655, + "learning_rate": 7.905267182630797e-06, + "loss": 0.1206, + "step": 968 + }, + { + "epoch": 1.3059299191374663, + "grad_norm": 48.34909133857089, + "learning_rate": 7.900823420655158e-06, + "loss": 0.1472, + "step": 969 + }, + { + "epoch": 1.307277628032345, + "grad_norm": 52.09898157042411, + "learning_rate": 7.896376202156799e-06, + "loss": 0.1378, + "step": 970 + }, + { + "epoch": 1.3086253369272236, + "grad_norm": 24.740270241499115, + "learning_rate": 7.89192553243487e-06, + "loss": 0.1005, + "step": 971 + }, + { + "epoch": 1.3099730458221024, + "grad_norm": 13.809385588570777, + "learning_rate": 7.88747141679264e-06, + "loss": 0.1018, + "step": 972 + }, + { + "epoch": 1.3113207547169812, + "grad_norm": 43.97400729901852, + "learning_rate": 7.883013860537483e-06, + "loss": 0.1412, + "step": 973 + }, + { + "epoch": 1.31266846361186, + "grad_norm": 26.63349236105383, + "learning_rate": 7.878552868980868e-06, + "loss": 0.1398, + "step": 974 + }, + { + "epoch": 1.3140161725067385, + "grad_norm": 10.522890330747442, + "learning_rate": 7.874088447438366e-06, + "loss": 0.1252, + "step": 975 + }, + { + "epoch": 1.3153638814016173, + "grad_norm": 20.225078935210846, + "learning_rate": 7.869620601229627e-06, + "loss": 0.1215, + "step": 976 + }, + { + "epoch": 1.3167115902964959, + "grad_norm": 9.321622860102774, + "learning_rate": 7.865149335678386e-06, + "loss": 0.0914, + "step": 977 + }, + { + "epoch": 1.3180592991913747, + "grad_norm": 31.518510110740117, + "learning_rate": 7.86067465611245e-06, + "loss": 0.1016, + "step": 978 + }, + { + "epoch": 1.3194070080862534, + "grad_norm": 23.296941703077227, + "learning_rate": 7.856196567863697e-06, + "loss": 0.0827, + "step": 979 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 33.82277481312892, + "learning_rate": 7.851715076268062e-06, + "loss": 0.1095, + "step": 980 + }, + { + "epoch": 1.3221024258760108, + "grad_norm": 45.20497812803161, + "learning_rate": 7.847230186665543e-06, + "loss": 0.1223, + "step": 981 + }, + { + "epoch": 1.3234501347708894, + "grad_norm": 10.029977159012898, + "learning_rate": 7.84274190440018e-06, + "loss": 0.1061, + "step": 982 + }, + { + "epoch": 1.3247978436657681, + "grad_norm": 36.33379557026886, + "learning_rate": 7.838250234820058e-06, + "loss": 0.1234, + "step": 983 + }, + { + "epoch": 1.326145552560647, + "grad_norm": 14.918707910912905, + "learning_rate": 7.833755183277294e-06, + "loss": 0.1376, + "step": 984 + }, + { + "epoch": 1.3274932614555257, + "grad_norm": 12.724828325680742, + "learning_rate": 7.829256755128046e-06, + "loss": 0.1197, + "step": 985 + }, + { + "epoch": 1.3288409703504043, + "grad_norm": 54.76220098219921, + "learning_rate": 7.824754955732481e-06, + "loss": 0.1396, + "step": 986 + }, + { + "epoch": 1.330188679245283, + "grad_norm": 10.491484151695149, + "learning_rate": 7.820249790454796e-06, + "loss": 0.1401, + "step": 987 + }, + { + "epoch": 1.3315363881401616, + "grad_norm": 20.582149931116316, + "learning_rate": 7.81574126466319e-06, + "loss": 0.1145, + "step": 988 + }, + { + "epoch": 1.3328840970350404, + "grad_norm": 49.65280041461766, + "learning_rate": 7.811229383729872e-06, + "loss": 0.1242, + "step": 989 + }, + { + "epoch": 1.3342318059299192, + "grad_norm": 25.25962419508191, + "learning_rate": 7.806714153031043e-06, + "loss": 0.1267, + "step": 990 + }, + { + "epoch": 1.335579514824798, + "grad_norm": 5.94345450515603, + "learning_rate": 7.8021955779469e-06, + "loss": 0.1044, + "step": 991 + }, + { + "epoch": 1.3369272237196765, + "grad_norm": 6.945503742883835, + "learning_rate": 7.797673663861625e-06, + "loss": 0.149, + "step": 992 + }, + { + "epoch": 1.3382749326145553, + "grad_norm": 14.65868064285003, + "learning_rate": 7.793148416163375e-06, + "loss": 0.1153, + "step": 993 + }, + { + "epoch": 1.3396226415094339, + "grad_norm": 10.908047061764897, + "learning_rate": 7.788619840244284e-06, + "loss": 0.1111, + "step": 994 + }, + { + "epoch": 1.3409703504043127, + "grad_norm": 5.348947682035502, + "learning_rate": 7.784087941500446e-06, + "loss": 0.0969, + "step": 995 + }, + { + "epoch": 1.3423180592991915, + "grad_norm": 2.0175323558521967, + "learning_rate": 7.77955272533192e-06, + "loss": 0.0936, + "step": 996 + }, + { + "epoch": 1.34366576819407, + "grad_norm": 12.700274355559104, + "learning_rate": 7.775014197142716e-06, + "loss": 0.12, + "step": 997 + }, + { + "epoch": 1.3450134770889488, + "grad_norm": 24.07459262656964, + "learning_rate": 7.77047236234079e-06, + "loss": 0.1033, + "step": 998 + }, + { + "epoch": 1.3463611859838274, + "grad_norm": 25.957985182791102, + "learning_rate": 7.765927226338037e-06, + "loss": 0.1433, + "step": 999 + }, + { + "epoch": 1.3477088948787062, + "grad_norm": 12.682916200365911, + "learning_rate": 7.761378794550288e-06, + "loss": 0.1078, + "step": 1000 + }, + { + "epoch": 1.349056603773585, + "grad_norm": 32.07844213567014, + "learning_rate": 7.756827072397299e-06, + "loss": 0.1399, + "step": 1001 + }, + { + "epoch": 1.3504043126684637, + "grad_norm": 34.99353935249154, + "learning_rate": 7.752272065302746e-06, + "loss": 0.1357, + "step": 1002 + }, + { + "epoch": 1.3517520215633423, + "grad_norm": 14.823963885230082, + "learning_rate": 7.747713778694225e-06, + "loss": 0.0871, + "step": 1003 + }, + { + "epoch": 1.353099730458221, + "grad_norm": 52.61258875146785, + "learning_rate": 7.743152218003234e-06, + "loss": 0.1646, + "step": 1004 + }, + { + "epoch": 1.3544474393530996, + "grad_norm": 3.7832155008911843, + "learning_rate": 7.738587388665171e-06, + "loss": 0.1007, + "step": 1005 + }, + { + "epoch": 1.3557951482479784, + "grad_norm": 3.1511347974602653, + "learning_rate": 7.734019296119336e-06, + "loss": 0.1527, + "step": 1006 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 5.332651747852219, + "learning_rate": 7.72944794580891e-06, + "loss": 0.119, + "step": 1007 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 48.09891871252668, + "learning_rate": 7.724873343180961e-06, + "loss": 0.1712, + "step": 1008 + }, + { + "epoch": 1.3598382749326146, + "grad_norm": 10.033166758752769, + "learning_rate": 7.720295493686429e-06, + "loss": 0.1131, + "step": 1009 + }, + { + "epoch": 1.3611859838274933, + "grad_norm": 9.225307912278433, + "learning_rate": 7.715714402780124e-06, + "loss": 0.1188, + "step": 1010 + }, + { + "epoch": 1.362533692722372, + "grad_norm": 13.055457577101487, + "learning_rate": 7.711130075920717e-06, + "loss": 0.1191, + "step": 1011 + }, + { + "epoch": 1.3638814016172507, + "grad_norm": 29.515077094800397, + "learning_rate": 7.70654251857074e-06, + "loss": 0.1118, + "step": 1012 + }, + { + "epoch": 1.3652291105121295, + "grad_norm": 11.93475198739081, + "learning_rate": 7.701951736196566e-06, + "loss": 0.1052, + "step": 1013 + }, + { + "epoch": 1.366576819407008, + "grad_norm": 11.562640673537897, + "learning_rate": 7.697357734268418e-06, + "loss": 0.1176, + "step": 1014 + }, + { + "epoch": 1.3679245283018868, + "grad_norm": 14.710925276861829, + "learning_rate": 7.692760518260355e-06, + "loss": 0.1246, + "step": 1015 + }, + { + "epoch": 1.3692722371967654, + "grad_norm": 8.806029982033136, + "learning_rate": 7.688160093650259e-06, + "loss": 0.1019, + "step": 1016 + }, + { + "epoch": 1.3706199460916442, + "grad_norm": 7.032866036742388, + "learning_rate": 7.683556465919844e-06, + "loss": 0.1088, + "step": 1017 + }, + { + "epoch": 1.371967654986523, + "grad_norm": 4.621329747998142, + "learning_rate": 7.678949640554634e-06, + "loss": 0.1483, + "step": 1018 + }, + { + "epoch": 1.3733153638814017, + "grad_norm": 31.012520120699048, + "learning_rate": 7.674339623043967e-06, + "loss": 0.134, + "step": 1019 + }, + { + "epoch": 1.3746630727762803, + "grad_norm": 4.331856738756752, + "learning_rate": 7.66972641888098e-06, + "loss": 0.0885, + "step": 1020 + }, + { + "epoch": 1.376010781671159, + "grad_norm": 22.089283007899795, + "learning_rate": 7.665110033562614e-06, + "loss": 0.1252, + "step": 1021 + }, + { + "epoch": 1.3773584905660377, + "grad_norm": 27.470655338773316, + "learning_rate": 7.660490472589598e-06, + "loss": 0.1143, + "step": 1022 + }, + { + "epoch": 1.3787061994609164, + "grad_norm": 14.756207570970693, + "learning_rate": 7.65586774146644e-06, + "loss": 0.1113, + "step": 1023 + }, + { + "epoch": 1.3800539083557952, + "grad_norm": 7.178434750331886, + "learning_rate": 7.651241845701435e-06, + "loss": 0.1329, + "step": 1024 + }, + { + "epoch": 1.3814016172506738, + "grad_norm": 12.220818706905236, + "learning_rate": 7.646612790806638e-06, + "loss": 0.1072, + "step": 1025 + }, + { + "epoch": 1.3827493261455526, + "grad_norm": 32.79775970456627, + "learning_rate": 7.641980582297874e-06, + "loss": 0.0926, + "step": 1026 + }, + { + "epoch": 1.3840970350404311, + "grad_norm": 4.876317907593398, + "learning_rate": 7.63734522569473e-06, + "loss": 0.0888, + "step": 1027 + }, + { + "epoch": 1.38544474393531, + "grad_norm": 27.92969151177633, + "learning_rate": 7.632706726520535e-06, + "loss": 0.0946, + "step": 1028 + }, + { + "epoch": 1.3867924528301887, + "grad_norm": 4.623214515002581, + "learning_rate": 7.628065090302371e-06, + "loss": 0.1247, + "step": 1029 + }, + { + "epoch": 1.3881401617250675, + "grad_norm": 33.67004477338842, + "learning_rate": 7.623420322571051e-06, + "loss": 0.0804, + "step": 1030 + }, + { + "epoch": 1.389487870619946, + "grad_norm": 13.250996748675746, + "learning_rate": 7.618772428861125e-06, + "loss": 0.0945, + "step": 1031 + }, + { + "epoch": 1.3908355795148248, + "grad_norm": 28.26420466131732, + "learning_rate": 7.6141214147108636e-06, + "loss": 0.111, + "step": 1032 + }, + { + "epoch": 1.3921832884097034, + "grad_norm": 7.3692397828764715, + "learning_rate": 7.609467285662257e-06, + "loss": 0.1089, + "step": 1033 + }, + { + "epoch": 1.3935309973045822, + "grad_norm": 20.906630712379382, + "learning_rate": 7.604810047261008e-06, + "loss": 0.0879, + "step": 1034 + }, + { + "epoch": 1.394878706199461, + "grad_norm": 18.91651101387468, + "learning_rate": 7.6001497050565256e-06, + "loss": 0.1319, + "step": 1035 + }, + { + "epoch": 1.3962264150943398, + "grad_norm": 2.5823313371574135, + "learning_rate": 7.595486264601912e-06, + "loss": 0.0743, + "step": 1036 + }, + { + "epoch": 1.3975741239892183, + "grad_norm": 2.385269230959107, + "learning_rate": 7.590819731453968e-06, + "loss": 0.1026, + "step": 1037 + }, + { + "epoch": 1.398921832884097, + "grad_norm": 9.323617862112442, + "learning_rate": 7.586150111173174e-06, + "loss": 0.1247, + "step": 1038 + }, + { + "epoch": 1.4002695417789757, + "grad_norm": 5.903010725753807, + "learning_rate": 7.581477409323692e-06, + "loss": 0.0777, + "step": 1039 + }, + { + "epoch": 1.4016172506738545, + "grad_norm": 4.590839856636562, + "learning_rate": 7.576801631473353e-06, + "loss": 0.1062, + "step": 1040 + }, + { + "epoch": 1.4029649595687332, + "grad_norm": 7.828747694542345, + "learning_rate": 7.572122783193657e-06, + "loss": 0.114, + "step": 1041 + }, + { + "epoch": 1.4043126684636118, + "grad_norm": 3.468915005012156, + "learning_rate": 7.5674408700597615e-06, + "loss": 0.1365, + "step": 1042 + }, + { + "epoch": 1.4056603773584906, + "grad_norm": 4.954761342454677, + "learning_rate": 7.562755897650473e-06, + "loss": 0.0745, + "step": 1043 + }, + { + "epoch": 1.4070080862533692, + "grad_norm": 12.945085997836593, + "learning_rate": 7.558067871548248e-06, + "loss": 0.1207, + "step": 1044 + }, + { + "epoch": 1.408355795148248, + "grad_norm": 40.40293635058792, + "learning_rate": 7.553376797339178e-06, + "loss": 0.12, + "step": 1045 + }, + { + "epoch": 1.4097035040431267, + "grad_norm": 2.270260914621187, + "learning_rate": 7.548682680612987e-06, + "loss": 0.1307, + "step": 1046 + }, + { + "epoch": 1.4110512129380055, + "grad_norm": 6.778446691361667, + "learning_rate": 7.543985526963026e-06, + "loss": 0.1002, + "step": 1047 + }, + { + "epoch": 1.412398921832884, + "grad_norm": 6.3330437734283205, + "learning_rate": 7.539285341986264e-06, + "loss": 0.1089, + "step": 1048 + }, + { + "epoch": 1.4137466307277629, + "grad_norm": 3.7350026089009383, + "learning_rate": 7.534582131283281e-06, + "loss": 0.11, + "step": 1049 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 26.590643328892877, + "learning_rate": 7.529875900458266e-06, + "loss": 0.1055, + "step": 1050 + }, + { + "epoch": 1.4164420485175202, + "grad_norm": 2.585585255025196, + "learning_rate": 7.525166655119001e-06, + "loss": 0.1055, + "step": 1051 + }, + { + "epoch": 1.417789757412399, + "grad_norm": 11.302276055732396, + "learning_rate": 7.520454400876862e-06, + "loss": 0.0899, + "step": 1052 + }, + { + "epoch": 1.4191374663072776, + "grad_norm": 4.583513229645144, + "learning_rate": 7.515739143346814e-06, + "loss": 0.0748, + "step": 1053 + }, + { + "epoch": 1.4204851752021563, + "grad_norm": 24.568899471150306, + "learning_rate": 7.511020888147397e-06, + "loss": 0.0843, + "step": 1054 + }, + { + "epoch": 1.4218328840970351, + "grad_norm": 5.1921226189637855, + "learning_rate": 7.506299640900725e-06, + "loss": 0.1274, + "step": 1055 + }, + { + "epoch": 1.4231805929919137, + "grad_norm": 31.921158569377543, + "learning_rate": 7.501575407232473e-06, + "loss": 0.0878, + "step": 1056 + }, + { + "epoch": 1.4245283018867925, + "grad_norm": 26.721461914937116, + "learning_rate": 7.496848192771879e-06, + "loss": 0.1101, + "step": 1057 + }, + { + "epoch": 1.4258760107816713, + "grad_norm": 13.903612117641035, + "learning_rate": 7.4921180031517316e-06, + "loss": 0.1103, + "step": 1058 + }, + { + "epoch": 1.4272237196765498, + "grad_norm": 9.850664055006078, + "learning_rate": 7.487384844008363e-06, + "loss": 0.1049, + "step": 1059 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 23.339097653552795, + "learning_rate": 7.482648720981647e-06, + "loss": 0.0863, + "step": 1060 + }, + { + "epoch": 1.4299191374663072, + "grad_norm": 18.9149873894291, + "learning_rate": 7.477909639714984e-06, + "loss": 0.1159, + "step": 1061 + }, + { + "epoch": 1.431266846361186, + "grad_norm": 5.22117677005158, + "learning_rate": 7.473167605855305e-06, + "loss": 0.1101, + "step": 1062 + }, + { + "epoch": 1.4326145552560647, + "grad_norm": 8.237349371216037, + "learning_rate": 7.468422625053057e-06, + "loss": 0.0965, + "step": 1063 + }, + { + "epoch": 1.4339622641509435, + "grad_norm": 2.136009463973772, + "learning_rate": 7.463674702962196e-06, + "loss": 0.0832, + "step": 1064 + }, + { + "epoch": 1.435309973045822, + "grad_norm": 15.686274744605235, + "learning_rate": 7.4589238452401845e-06, + "loss": 0.1246, + "step": 1065 + }, + { + "epoch": 1.4366576819407009, + "grad_norm": 4.464083513795896, + "learning_rate": 7.454170057547986e-06, + "loss": 0.1, + "step": 1066 + }, + { + "epoch": 1.4380053908355794, + "grad_norm": 7.3177154259464325, + "learning_rate": 7.449413345550052e-06, + "loss": 0.0902, + "step": 1067 + }, + { + "epoch": 1.4393530997304582, + "grad_norm": 4.05673608939445, + "learning_rate": 7.444653714914316e-06, + "loss": 0.0809, + "step": 1068 + }, + { + "epoch": 1.440700808625337, + "grad_norm": 45.821328428077344, + "learning_rate": 7.439891171312196e-06, + "loss": 0.1264, + "step": 1069 + }, + { + "epoch": 1.4420485175202156, + "grad_norm": 31.52509102485694, + "learning_rate": 7.4351257204185735e-06, + "loss": 0.0899, + "step": 1070 + }, + { + "epoch": 1.4433962264150944, + "grad_norm": 15.705358163528055, + "learning_rate": 7.430357367911801e-06, + "loss": 0.1337, + "step": 1071 + }, + { + "epoch": 1.444743935309973, + "grad_norm": 10.970263995608827, + "learning_rate": 7.425586119473687e-06, + "loss": 0.1171, + "step": 1072 + }, + { + "epoch": 1.4460916442048517, + "grad_norm": 24.406783619522763, + "learning_rate": 7.420811980789484e-06, + "loss": 0.0859, + "step": 1073 + }, + { + "epoch": 1.4474393530997305, + "grad_norm": 17.76425982157703, + "learning_rate": 7.416034957547898e-06, + "loss": 0.1184, + "step": 1074 + }, + { + "epoch": 1.4487870619946093, + "grad_norm": 21.628904528327297, + "learning_rate": 7.411255055441064e-06, + "loss": 0.1145, + "step": 1075 + }, + { + "epoch": 1.4501347708894878, + "grad_norm": 5.936251270041572, + "learning_rate": 7.406472280164556e-06, + "loss": 0.0956, + "step": 1076 + }, + { + "epoch": 1.4514824797843666, + "grad_norm": 21.25644102672, + "learning_rate": 7.401686637417362e-06, + "loss": 0.1058, + "step": 1077 + }, + { + "epoch": 1.4528301886792452, + "grad_norm": 17.910753579683917, + "learning_rate": 7.396898132901895e-06, + "loss": 0.0982, + "step": 1078 + }, + { + "epoch": 1.454177897574124, + "grad_norm": 26.518551423719124, + "learning_rate": 7.3921067723239735e-06, + "loss": 0.1233, + "step": 1079 + }, + { + "epoch": 1.4555256064690028, + "grad_norm": 6.39356858056367, + "learning_rate": 7.387312561392818e-06, + "loss": 0.0796, + "step": 1080 + }, + { + "epoch": 1.4568733153638815, + "grad_norm": 22.662876402489186, + "learning_rate": 7.382515505821049e-06, + "loss": 0.0946, + "step": 1081 + }, + { + "epoch": 1.45822102425876, + "grad_norm": 13.242291581954587, + "learning_rate": 7.377715611324676e-06, + "loss": 0.1148, + "step": 1082 + }, + { + "epoch": 1.4595687331536389, + "grad_norm": 39.39700203316144, + "learning_rate": 7.372912883623089e-06, + "loss": 0.0899, + "step": 1083 + }, + { + "epoch": 1.4609164420485174, + "grad_norm": 41.838735245054494, + "learning_rate": 7.368107328439056e-06, + "loss": 0.1173, + "step": 1084 + }, + { + "epoch": 1.4622641509433962, + "grad_norm": 27.576023820187338, + "learning_rate": 7.363298951498712e-06, + "loss": 0.1298, + "step": 1085 + }, + { + "epoch": 1.463611859838275, + "grad_norm": 22.1734857031026, + "learning_rate": 7.358487758531559e-06, + "loss": 0.1332, + "step": 1086 + }, + { + "epoch": 1.4649595687331536, + "grad_norm": 31.329983930087728, + "learning_rate": 7.353673755270448e-06, + "loss": 0.0876, + "step": 1087 + }, + { + "epoch": 1.4663072776280324, + "grad_norm": 31.038174887439062, + "learning_rate": 7.348856947451583e-06, + "loss": 0.1023, + "step": 1088 + }, + { + "epoch": 1.467654986522911, + "grad_norm": 25.39417525894147, + "learning_rate": 7.344037340814508e-06, + "loss": 0.0904, + "step": 1089 + }, + { + "epoch": 1.4690026954177897, + "grad_norm": 17.642007002124206, + "learning_rate": 7.3392149411021054e-06, + "loss": 0.1038, + "step": 1090 + }, + { + "epoch": 1.4703504043126685, + "grad_norm": 14.935454102066993, + "learning_rate": 7.33438975406058e-06, + "loss": 0.0869, + "step": 1091 + }, + { + "epoch": 1.4716981132075473, + "grad_norm": 5.762237022735922, + "learning_rate": 7.329561785439462e-06, + "loss": 0.1008, + "step": 1092 + }, + { + "epoch": 1.4730458221024259, + "grad_norm": 7.47128806276577, + "learning_rate": 7.324731040991595e-06, + "loss": 0.1083, + "step": 1093 + }, + { + "epoch": 1.4743935309973046, + "grad_norm": 23.39017574267935, + "learning_rate": 7.3198975264731294e-06, + "loss": 0.1438, + "step": 1094 + }, + { + "epoch": 1.4757412398921832, + "grad_norm": 20.421115373146566, + "learning_rate": 7.315061247643518e-06, + "loss": 0.1113, + "step": 1095 + }, + { + "epoch": 1.477088948787062, + "grad_norm": 9.445842471969664, + "learning_rate": 7.310222210265507e-06, + "loss": 0.1338, + "step": 1096 + }, + { + "epoch": 1.4784366576819408, + "grad_norm": 4.385299409885909, + "learning_rate": 7.305380420105127e-06, + "loss": 0.0851, + "step": 1097 + }, + { + "epoch": 1.4797843665768193, + "grad_norm": 37.5857712002039, + "learning_rate": 7.3005358829316915e-06, + "loss": 0.1249, + "step": 1098 + }, + { + "epoch": 1.4811320754716981, + "grad_norm": 33.636691510343326, + "learning_rate": 7.295688604517789e-06, + "loss": 0.1135, + "step": 1099 + }, + { + "epoch": 1.482479784366577, + "grad_norm": 16.625982083168932, + "learning_rate": 7.290838590639269e-06, + "loss": 0.1166, + "step": 1100 + }, + { + "epoch": 1.4838274932614555, + "grad_norm": 22.135230847232098, + "learning_rate": 7.285985847075243e-06, + "loss": 0.0833, + "step": 1101 + }, + { + "epoch": 1.4851752021563343, + "grad_norm": 2.126207539243428, + "learning_rate": 7.281130379608079e-06, + "loss": 0.0793, + "step": 1102 + }, + { + "epoch": 1.486522911051213, + "grad_norm": 9.34923656071516, + "learning_rate": 7.276272194023385e-06, + "loss": 0.1034, + "step": 1103 + }, + { + "epoch": 1.4878706199460916, + "grad_norm": 2.1597046273755214, + "learning_rate": 7.271411296110009e-06, + "loss": 0.1065, + "step": 1104 + }, + { + "epoch": 1.4892183288409704, + "grad_norm": 33.121917950270934, + "learning_rate": 7.266547691660033e-06, + "loss": 0.116, + "step": 1105 + }, + { + "epoch": 1.490566037735849, + "grad_norm": 7.343302314636782, + "learning_rate": 7.2616813864687644e-06, + "loss": 0.1287, + "step": 1106 + }, + { + "epoch": 1.4919137466307277, + "grad_norm": 3.2908583463407988, + "learning_rate": 7.256812386334724e-06, + "loss": 0.1117, + "step": 1107 + }, + { + "epoch": 1.4932614555256065, + "grad_norm": 2.2939997981193376, + "learning_rate": 7.25194069705965e-06, + "loss": 0.0739, + "step": 1108 + }, + { + "epoch": 1.4946091644204853, + "grad_norm": 5.1607764197270365, + "learning_rate": 7.247066324448482e-06, + "loss": 0.1065, + "step": 1109 + }, + { + "epoch": 1.4959568733153639, + "grad_norm": 18.393930078880278, + "learning_rate": 7.242189274309355e-06, + "loss": 0.123, + "step": 1110 + }, + { + "epoch": 1.4973045822102427, + "grad_norm": 9.464329343623643, + "learning_rate": 7.237309552453597e-06, + "loss": 0.1115, + "step": 1111 + }, + { + "epoch": 1.4986522911051212, + "grad_norm": 37.83361608650681, + "learning_rate": 7.23242716469572e-06, + "loss": 0.1268, + "step": 1112 + }, + { + "epoch": 1.5, + "grad_norm": 12.216934030467762, + "learning_rate": 7.22754211685341e-06, + "loss": 0.1265, + "step": 1113 + }, + { + "epoch": 1.5013477088948788, + "grad_norm": 2.8136895086939444, + "learning_rate": 7.222654414747526e-06, + "loss": 0.1293, + "step": 1114 + }, + { + "epoch": 1.5026954177897576, + "grad_norm": 4.008257470814644, + "learning_rate": 7.2177640642020875e-06, + "loss": 0.1278, + "step": 1115 + }, + { + "epoch": 1.5040431266846361, + "grad_norm": 2.3811983165617576, + "learning_rate": 7.212871071044268e-06, + "loss": 0.1213, + "step": 1116 + }, + { + "epoch": 1.5053908355795147, + "grad_norm": 2.7661614037180082, + "learning_rate": 7.2079754411043956e-06, + "loss": 0.1058, + "step": 1117 + }, + { + "epoch": 1.5067385444743935, + "grad_norm": 20.818382238101755, + "learning_rate": 7.203077180215933e-06, + "loss": 0.1127, + "step": 1118 + }, + { + "epoch": 1.5080862533692723, + "grad_norm": 19.99102228628015, + "learning_rate": 7.198176294215483e-06, + "loss": 0.0965, + "step": 1119 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 42.420000304536266, + "learning_rate": 7.1932727889427775e-06, + "loss": 0.1465, + "step": 1120 + }, + { + "epoch": 1.5107816711590296, + "grad_norm": 1.6007012658028104, + "learning_rate": 7.188366670240664e-06, + "loss": 0.0726, + "step": 1121 + }, + { + "epoch": 1.5121293800539084, + "grad_norm": 18.688273124654945, + "learning_rate": 7.183457943955108e-06, + "loss": 0.0813, + "step": 1122 + }, + { + "epoch": 1.513477088948787, + "grad_norm": 7.793377717587547, + "learning_rate": 7.178546615935181e-06, + "loss": 0.1, + "step": 1123 + }, + { + "epoch": 1.5148247978436657, + "grad_norm": 1.4738669140791756, + "learning_rate": 7.1736326920330544e-06, + "loss": 0.1009, + "step": 1124 + }, + { + "epoch": 1.5161725067385445, + "grad_norm": 17.156255105329993, + "learning_rate": 7.168716178103994e-06, + "loss": 0.0945, + "step": 1125 + }, + { + "epoch": 1.5175202156334233, + "grad_norm": 2.1629234144760043, + "learning_rate": 7.1637970800063505e-06, + "loss": 0.1131, + "step": 1126 + }, + { + "epoch": 1.5188679245283019, + "grad_norm": 1.8549401560640506, + "learning_rate": 7.158875403601555e-06, + "loss": 0.1008, + "step": 1127 + }, + { + "epoch": 1.5202156334231804, + "grad_norm": 9.490832972590944, + "learning_rate": 7.153951154754108e-06, + "loss": 0.1176, + "step": 1128 + }, + { + "epoch": 1.5215633423180592, + "grad_norm": 9.26536721824739, + "learning_rate": 7.149024339331579e-06, + "loss": 0.0847, + "step": 1129 + }, + { + "epoch": 1.522911051212938, + "grad_norm": 6.238060860335915, + "learning_rate": 7.144094963204593e-06, + "loss": 0.0989, + "step": 1130 + }, + { + "epoch": 1.5242587601078168, + "grad_norm": 14.82562993910571, + "learning_rate": 7.139163032246828e-06, + "loss": 0.1137, + "step": 1131 + }, + { + "epoch": 1.5256064690026954, + "grad_norm": 29.323774071798635, + "learning_rate": 7.134228552335005e-06, + "loss": 0.13, + "step": 1132 + }, + { + "epoch": 1.5269541778975741, + "grad_norm": 10.037121463647193, + "learning_rate": 7.129291529348883e-06, + "loss": 0.0691, + "step": 1133 + }, + { + "epoch": 1.5283018867924527, + "grad_norm": 22.123139477008667, + "learning_rate": 7.124351969171251e-06, + "loss": 0.0948, + "step": 1134 + }, + { + "epoch": 1.5296495956873315, + "grad_norm": 7.36648017028669, + "learning_rate": 7.119409877687923e-06, + "loss": 0.0884, + "step": 1135 + }, + { + "epoch": 1.5309973045822103, + "grad_norm": 11.771267245564815, + "learning_rate": 7.114465260787724e-06, + "loss": 0.1114, + "step": 1136 + }, + { + "epoch": 1.532345013477089, + "grad_norm": 8.637887037011216, + "learning_rate": 7.109518124362493e-06, + "loss": 0.0986, + "step": 1137 + }, + { + "epoch": 1.5336927223719676, + "grad_norm": 10.61634111561255, + "learning_rate": 7.104568474307072e-06, + "loss": 0.0826, + "step": 1138 + }, + { + "epoch": 1.5350404312668462, + "grad_norm": 11.563778709889638, + "learning_rate": 7.099616316519295e-06, + "loss": 0.1438, + "step": 1139 + }, + { + "epoch": 1.536388140161725, + "grad_norm": 15.357697605021723, + "learning_rate": 7.094661656899982e-06, + "loss": 0.0924, + "step": 1140 + }, + { + "epoch": 1.5377358490566038, + "grad_norm": 12.516176006251278, + "learning_rate": 7.089704501352941e-06, + "loss": 0.0905, + "step": 1141 + }, + { + "epoch": 1.5390835579514826, + "grad_norm": 27.165967938607718, + "learning_rate": 7.084744855784947e-06, + "loss": 0.1246, + "step": 1142 + }, + { + "epoch": 1.5404312668463613, + "grad_norm": 9.494120570408535, + "learning_rate": 7.0797827261057484e-06, + "loss": 0.1017, + "step": 1143 + }, + { + "epoch": 1.54177897574124, + "grad_norm": 6.653710984404262, + "learning_rate": 7.07481811822805e-06, + "loss": 0.0837, + "step": 1144 + }, + { + "epoch": 1.5431266846361185, + "grad_norm": 8.511256807390327, + "learning_rate": 7.069851038067509e-06, + "loss": 0.073, + "step": 1145 + }, + { + "epoch": 1.5444743935309972, + "grad_norm": 5.264840681897747, + "learning_rate": 7.0648814915427285e-06, + "loss": 0.1166, + "step": 1146 + }, + { + "epoch": 1.545822102425876, + "grad_norm": 2.4143561250594052, + "learning_rate": 7.059909484575256e-06, + "loss": 0.1077, + "step": 1147 + }, + { + "epoch": 1.5471698113207548, + "grad_norm": 16.55033703624872, + "learning_rate": 7.05493502308956e-06, + "loss": 0.0696, + "step": 1148 + }, + { + "epoch": 1.5485175202156334, + "grad_norm": 8.25640393545228, + "learning_rate": 7.049958113013044e-06, + "loss": 0.0976, + "step": 1149 + }, + { + "epoch": 1.5498652291105122, + "grad_norm": 13.88341536834796, + "learning_rate": 7.044978760276025e-06, + "loss": 0.0775, + "step": 1150 + }, + { + "epoch": 1.5512129380053907, + "grad_norm": 13.945405328291223, + "learning_rate": 7.039996970811729e-06, + "loss": 0.1343, + "step": 1151 + }, + { + "epoch": 1.5525606469002695, + "grad_norm": 15.51029951786006, + "learning_rate": 7.0350127505562875e-06, + "loss": 0.0721, + "step": 1152 + }, + { + "epoch": 1.5539083557951483, + "grad_norm": 12.405571004302521, + "learning_rate": 7.030026105448728e-06, + "loss": 0.0782, + "step": 1153 + }, + { + "epoch": 1.555256064690027, + "grad_norm": 2.81035866297853, + "learning_rate": 7.02503704143097e-06, + "loss": 0.0898, + "step": 1154 + }, + { + "epoch": 1.5566037735849056, + "grad_norm": 8.821265713825078, + "learning_rate": 7.0200455644478105e-06, + "loss": 0.1062, + "step": 1155 + }, + { + "epoch": 1.5579514824797842, + "grad_norm": 18.708558378256342, + "learning_rate": 7.015051680446925e-06, + "loss": 0.1041, + "step": 1156 + }, + { + "epoch": 1.559299191374663, + "grad_norm": 2.479616735413377, + "learning_rate": 7.010055395378854e-06, + "loss": 0.1037, + "step": 1157 + }, + { + "epoch": 1.5606469002695418, + "grad_norm": 16.99804959370196, + "learning_rate": 7.005056715197004e-06, + "loss": 0.1135, + "step": 1158 + }, + { + "epoch": 1.5619946091644206, + "grad_norm": 9.796525538548483, + "learning_rate": 7.000055645857633e-06, + "loss": 0.0813, + "step": 1159 + }, + { + "epoch": 1.5633423180592994, + "grad_norm": 28.494352223060666, + "learning_rate": 6.995052193319842e-06, + "loss": 0.0797, + "step": 1160 + }, + { + "epoch": 1.564690026954178, + "grad_norm": 8.736405757610926, + "learning_rate": 6.9900463635455796e-06, + "loss": 0.0958, + "step": 1161 + }, + { + "epoch": 1.5660377358490565, + "grad_norm": 28.242556708726337, + "learning_rate": 6.9850381624996175e-06, + "loss": 0.094, + "step": 1162 + }, + { + "epoch": 1.5673854447439353, + "grad_norm": 16.89714123228305, + "learning_rate": 6.980027596149563e-06, + "loss": 0.1102, + "step": 1163 + }, + { + "epoch": 1.568733153638814, + "grad_norm": 4.8676784187888025, + "learning_rate": 6.975014670465834e-06, + "loss": 0.11, + "step": 1164 + }, + { + "epoch": 1.5700808625336928, + "grad_norm": 13.791472040868245, + "learning_rate": 6.969999391421664e-06, + "loss": 0.1144, + "step": 1165 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 4.411298748249035, + "learning_rate": 6.964981764993088e-06, + "loss": 0.0969, + "step": 1166 + }, + { + "epoch": 1.5727762803234502, + "grad_norm": 19.891909986905784, + "learning_rate": 6.9599617971589395e-06, + "loss": 0.0801, + "step": 1167 + }, + { + "epoch": 1.5741239892183287, + "grad_norm": 3.871978393700357, + "learning_rate": 6.954939493900843e-06, + "loss": 0.1045, + "step": 1168 + }, + { + "epoch": 1.5754716981132075, + "grad_norm": 25.50501535818795, + "learning_rate": 6.949914861203204e-06, + "loss": 0.1198, + "step": 1169 + }, + { + "epoch": 1.5768194070080863, + "grad_norm": 9.982832552032603, + "learning_rate": 6.944887905053203e-06, + "loss": 0.1303, + "step": 1170 + }, + { + "epoch": 1.578167115902965, + "grad_norm": 54.40709135679041, + "learning_rate": 6.939858631440792e-06, + "loss": 0.1488, + "step": 1171 + }, + { + "epoch": 1.5795148247978437, + "grad_norm": 14.09668238806814, + "learning_rate": 6.934827046358682e-06, + "loss": 0.1295, + "step": 1172 + }, + { + "epoch": 1.5808625336927222, + "grad_norm": 29.77850286816509, + "learning_rate": 6.92979315580234e-06, + "loss": 0.1051, + "step": 1173 + }, + { + "epoch": 1.582210242587601, + "grad_norm": 3.6639452724841157, + "learning_rate": 6.924756965769977e-06, + "loss": 0.0986, + "step": 1174 + }, + { + "epoch": 1.5835579514824798, + "grad_norm": 1.9273162897988128, + "learning_rate": 6.91971848226255e-06, + "loss": 0.1145, + "step": 1175 + }, + { + "epoch": 1.5849056603773586, + "grad_norm": 2.992958540282665, + "learning_rate": 6.914677711283739e-06, + "loss": 0.0947, + "step": 1176 + }, + { + "epoch": 1.5862533692722371, + "grad_norm": 12.73364513801502, + "learning_rate": 6.90963465883996e-06, + "loss": 0.0978, + "step": 1177 + }, + { + "epoch": 1.587601078167116, + "grad_norm": 25.064696100659468, + "learning_rate": 6.904589330940342e-06, + "loss": 0.1164, + "step": 1178 + }, + { + "epoch": 1.5889487870619945, + "grad_norm": 9.109269853794904, + "learning_rate": 6.8995417335967265e-06, + "loss": 0.1051, + "step": 1179 + }, + { + "epoch": 1.5902964959568733, + "grad_norm": 4.591646646499147, + "learning_rate": 6.894491872823659e-06, + "loss": 0.0951, + "step": 1180 + }, + { + "epoch": 1.591644204851752, + "grad_norm": 6.391533099970024, + "learning_rate": 6.889439754638382e-06, + "loss": 0.0886, + "step": 1181 + }, + { + "epoch": 1.5929919137466308, + "grad_norm": 13.720766515806753, + "learning_rate": 6.8843853850608275e-06, + "loss": 0.0985, + "step": 1182 + }, + { + "epoch": 1.5943396226415094, + "grad_norm": 27.29185563213414, + "learning_rate": 6.879328770113614e-06, + "loss": 0.07, + "step": 1183 + }, + { + "epoch": 1.595687331536388, + "grad_norm": 15.442713268670104, + "learning_rate": 6.874269915822028e-06, + "loss": 0.104, + "step": 1184 + }, + { + "epoch": 1.5970350404312668, + "grad_norm": 26.530566114904104, + "learning_rate": 6.869208828214031e-06, + "loss": 0.1253, + "step": 1185 + }, + { + "epoch": 1.5983827493261455, + "grad_norm": 2.9536356648857462, + "learning_rate": 6.864145513320243e-06, + "loss": 0.1028, + "step": 1186 + }, + { + "epoch": 1.5997304582210243, + "grad_norm": 10.244401798527736, + "learning_rate": 6.859079977173937e-06, + "loss": 0.1097, + "step": 1187 + }, + { + "epoch": 1.6010781671159031, + "grad_norm": 17.56313094886711, + "learning_rate": 6.854012225811035e-06, + "loss": 0.0853, + "step": 1188 + }, + { + "epoch": 1.6024258760107817, + "grad_norm": 43.32541810968609, + "learning_rate": 6.848942265270095e-06, + "loss": 0.1203, + "step": 1189 + }, + { + "epoch": 1.6037735849056602, + "grad_norm": 27.710602821262547, + "learning_rate": 6.8438701015923146e-06, + "loss": 0.1143, + "step": 1190 + }, + { + "epoch": 1.605121293800539, + "grad_norm": 36.56712041557793, + "learning_rate": 6.8387957408215075e-06, + "loss": 0.0994, + "step": 1191 + }, + { + "epoch": 1.6064690026954178, + "grad_norm": 39.9028483631125, + "learning_rate": 6.8337191890041136e-06, + "loss": 0.1672, + "step": 1192 + }, + { + "epoch": 1.6078167115902966, + "grad_norm": 21.935615596490596, + "learning_rate": 6.828640452189175e-06, + "loss": 0.0916, + "step": 1193 + }, + { + "epoch": 1.6091644204851752, + "grad_norm": 3.444736772467093, + "learning_rate": 6.823559536428347e-06, + "loss": 0.1097, + "step": 1194 + }, + { + "epoch": 1.610512129380054, + "grad_norm": 31.697200169227436, + "learning_rate": 6.818476447775873e-06, + "loss": 0.1089, + "step": 1195 + }, + { + "epoch": 1.6118598382749325, + "grad_norm": 7.759691035690136, + "learning_rate": 6.813391192288591e-06, + "loss": 0.1082, + "step": 1196 + }, + { + "epoch": 1.6132075471698113, + "grad_norm": 19.186773322033847, + "learning_rate": 6.808303776025917e-06, + "loss": 0.0912, + "step": 1197 + }, + { + "epoch": 1.61455525606469, + "grad_norm": 10.612196304639252, + "learning_rate": 6.803214205049844e-06, + "loss": 0.1096, + "step": 1198 + }, + { + "epoch": 1.6159029649595689, + "grad_norm": 10.36097048689935, + "learning_rate": 6.798122485424934e-06, + "loss": 0.088, + "step": 1199 + }, + { + "epoch": 1.6172506738544474, + "grad_norm": 31.085756029280716, + "learning_rate": 6.793028623218304e-06, + "loss": 0.1181, + "step": 1200 + }, + { + "epoch": 1.618598382749326, + "grad_norm": 11.0875340955073, + "learning_rate": 6.787932624499629e-06, + "loss": 0.1278, + "step": 1201 + }, + { + "epoch": 1.6199460916442048, + "grad_norm": 4.729548974450747, + "learning_rate": 6.782834495341128e-06, + "loss": 0.0879, + "step": 1202 + }, + { + "epoch": 1.6212938005390836, + "grad_norm": 5.329176206677219, + "learning_rate": 6.77773424181756e-06, + "loss": 0.1252, + "step": 1203 + }, + { + "epoch": 1.6226415094339623, + "grad_norm": 13.840401629148346, + "learning_rate": 6.772631870006211e-06, + "loss": 0.1031, + "step": 1204 + }, + { + "epoch": 1.6239892183288411, + "grad_norm": 35.10935660255721, + "learning_rate": 6.767527385986897e-06, + "loss": 0.0929, + "step": 1205 + }, + { + "epoch": 1.6253369272237197, + "grad_norm": 24.656977376360842, + "learning_rate": 6.7624207958419465e-06, + "loss": 0.1299, + "step": 1206 + }, + { + "epoch": 1.6266846361185983, + "grad_norm": 47.117020352404154, + "learning_rate": 6.757312105656199e-06, + "loss": 0.1666, + "step": 1207 + }, + { + "epoch": 1.628032345013477, + "grad_norm": 38.68384948164893, + "learning_rate": 6.752201321516995e-06, + "loss": 0.1352, + "step": 1208 + }, + { + "epoch": 1.6293800539083558, + "grad_norm": 31.61555179617573, + "learning_rate": 6.747088449514176e-06, + "loss": 0.1752, + "step": 1209 + }, + { + "epoch": 1.6307277628032346, + "grad_norm": 25.986968830667053, + "learning_rate": 6.74197349574006e-06, + "loss": 0.1221, + "step": 1210 + }, + { + "epoch": 1.6320754716981132, + "grad_norm": 38.46439361663359, + "learning_rate": 6.736856466289458e-06, + "loss": 0.1386, + "step": 1211 + }, + { + "epoch": 1.633423180592992, + "grad_norm": 13.445555078661352, + "learning_rate": 6.731737367259646e-06, + "loss": 0.1186, + "step": 1212 + }, + { + "epoch": 1.6347708894878705, + "grad_norm": 41.476674569377096, + "learning_rate": 6.726616204750369e-06, + "loss": 0.1175, + "step": 1213 + }, + { + "epoch": 1.6361185983827493, + "grad_norm": 41.38432245773835, + "learning_rate": 6.721492984863831e-06, + "loss": 0.1349, + "step": 1214 + }, + { + "epoch": 1.637466307277628, + "grad_norm": 44.95132635920724, + "learning_rate": 6.7163677137046855e-06, + "loss": 0.1244, + "step": 1215 + }, + { + "epoch": 1.6388140161725069, + "grad_norm": 32.4962831018852, + "learning_rate": 6.7112403973800325e-06, + "loss": 0.122, + "step": 1216 + }, + { + "epoch": 1.6401617250673854, + "grad_norm": 25.308386858932877, + "learning_rate": 6.706111041999409e-06, + "loss": 0.1021, + "step": 1217 + }, + { + "epoch": 1.641509433962264, + "grad_norm": 15.925230798704916, + "learning_rate": 6.700979653674779e-06, + "loss": 0.1489, + "step": 1218 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 6.538157260931235, + "learning_rate": 6.695846238520531e-06, + "loss": 0.1588, + "step": 1219 + }, + { + "epoch": 1.6442048517520216, + "grad_norm": 8.631297195640133, + "learning_rate": 6.690710802653471e-06, + "loss": 0.1083, + "step": 1220 + }, + { + "epoch": 1.6455525606469004, + "grad_norm": 15.81376427768351, + "learning_rate": 6.685573352192808e-06, + "loss": 0.0919, + "step": 1221 + }, + { + "epoch": 1.646900269541779, + "grad_norm": 18.904992407748285, + "learning_rate": 6.6804338932601505e-06, + "loss": 0.097, + "step": 1222 + }, + { + "epoch": 1.6482479784366577, + "grad_norm": 18.663628342820118, + "learning_rate": 6.67529243197951e-06, + "loss": 0.1122, + "step": 1223 + }, + { + "epoch": 1.6495956873315363, + "grad_norm": 29.945220179437086, + "learning_rate": 6.670148974477271e-06, + "loss": 0.0694, + "step": 1224 + }, + { + "epoch": 1.650943396226415, + "grad_norm": 34.688653633610734, + "learning_rate": 6.665003526882204e-06, + "loss": 0.1339, + "step": 1225 + }, + { + "epoch": 1.6522911051212938, + "grad_norm": 32.608689638499996, + "learning_rate": 6.659856095325455e-06, + "loss": 0.1223, + "step": 1226 + }, + { + "epoch": 1.6536388140161726, + "grad_norm": 48.50080584571213, + "learning_rate": 6.654706685940522e-06, + "loss": 0.1435, + "step": 1227 + }, + { + "epoch": 1.6549865229110512, + "grad_norm": 65.29465577693831, + "learning_rate": 6.649555304863269e-06, + "loss": 0.1371, + "step": 1228 + }, + { + "epoch": 1.6563342318059298, + "grad_norm": 49.76779283578899, + "learning_rate": 6.6444019582319074e-06, + "loss": 0.0986, + "step": 1229 + }, + { + "epoch": 1.6576819407008085, + "grad_norm": 33.30741714366988, + "learning_rate": 6.63924665218699e-06, + "loss": 0.1147, + "step": 1230 + }, + { + "epoch": 1.6590296495956873, + "grad_norm": 11.736601106685816, + "learning_rate": 6.634089392871405e-06, + "loss": 0.1133, + "step": 1231 + }, + { + "epoch": 1.6603773584905661, + "grad_norm": 58.31353888771556, + "learning_rate": 6.628930186430367e-06, + "loss": 0.1454, + "step": 1232 + }, + { + "epoch": 1.661725067385445, + "grad_norm": 45.12864108447989, + "learning_rate": 6.62376903901141e-06, + "loss": 0.0972, + "step": 1233 + }, + { + "epoch": 1.6630727762803235, + "grad_norm": 45.352479069457026, + "learning_rate": 6.618605956764383e-06, + "loss": 0.1233, + "step": 1234 + }, + { + "epoch": 1.664420485175202, + "grad_norm": 37.53241717193761, + "learning_rate": 6.6134409458414415e-06, + "loss": 0.1529, + "step": 1235 + }, + { + "epoch": 1.6657681940700808, + "grad_norm": 45.25353054473023, + "learning_rate": 6.608274012397033e-06, + "loss": 0.1157, + "step": 1236 + }, + { + "epoch": 1.6671159029649596, + "grad_norm": 21.422004370857188, + "learning_rate": 6.603105162587904e-06, + "loss": 0.0975, + "step": 1237 + }, + { + "epoch": 1.6684636118598384, + "grad_norm": 8.36586784017517, + "learning_rate": 6.59793440257308e-06, + "loss": 0.1021, + "step": 1238 + }, + { + "epoch": 1.669811320754717, + "grad_norm": 8.679161799456223, + "learning_rate": 6.59276173851386e-06, + "loss": 0.1059, + "step": 1239 + }, + { + "epoch": 1.6711590296495957, + "grad_norm": 17.191189176260615, + "learning_rate": 6.587587176573816e-06, + "loss": 0.1057, + "step": 1240 + }, + { + "epoch": 1.6725067385444743, + "grad_norm": 13.149605342662214, + "learning_rate": 6.582410722918784e-06, + "loss": 0.1176, + "step": 1241 + }, + { + "epoch": 1.673854447439353, + "grad_norm": 30.04964299176277, + "learning_rate": 6.577232383716846e-06, + "loss": 0.0901, + "step": 1242 + }, + { + "epoch": 1.6752021563342319, + "grad_norm": 6.572153105571111, + "learning_rate": 6.572052165138338e-06, + "loss": 0.0814, + "step": 1243 + }, + { + "epoch": 1.6765498652291106, + "grad_norm": 16.79054717595195, + "learning_rate": 6.566870073355831e-06, + "loss": 0.0998, + "step": 1244 + }, + { + "epoch": 1.6778975741239892, + "grad_norm": 40.28440248338788, + "learning_rate": 6.56168611454413e-06, + "loss": 0.1383, + "step": 1245 + }, + { + "epoch": 1.6792452830188678, + "grad_norm": 34.496263467293595, + "learning_rate": 6.556500294880265e-06, + "loss": 0.1391, + "step": 1246 + }, + { + "epoch": 1.6805929919137466, + "grad_norm": 59.88436542838312, + "learning_rate": 6.551312620543483e-06, + "loss": 0.1493, + "step": 1247 + }, + { + "epoch": 1.6819407008086253, + "grad_norm": 33.56778010431017, + "learning_rate": 6.546123097715239e-06, + "loss": 0.0935, + "step": 1248 + }, + { + "epoch": 1.6832884097035041, + "grad_norm": 68.72981157613054, + "learning_rate": 6.5409317325791925e-06, + "loss": 0.1476, + "step": 1249 + }, + { + "epoch": 1.684636118598383, + "grad_norm": 37.96658174494101, + "learning_rate": 6.535738531321201e-06, + "loss": 0.1081, + "step": 1250 + }, + { + "epoch": 1.6859838274932615, + "grad_norm": 44.95537210053708, + "learning_rate": 6.5305435001293015e-06, + "loss": 0.0954, + "step": 1251 + }, + { + "epoch": 1.68733153638814, + "grad_norm": 43.8663596492044, + "learning_rate": 6.525346645193722e-06, + "loss": 0.1342, + "step": 1252 + }, + { + "epoch": 1.6886792452830188, + "grad_norm": 37.51604577550003, + "learning_rate": 6.520147972706856e-06, + "loss": 0.1474, + "step": 1253 + }, + { + "epoch": 1.6900269541778976, + "grad_norm": 27.713651044112602, + "learning_rate": 6.514947488863265e-06, + "loss": 0.0938, + "step": 1254 + }, + { + "epoch": 1.6913746630727764, + "grad_norm": 17.31074941028693, + "learning_rate": 6.50974519985967e-06, + "loss": 0.0869, + "step": 1255 + }, + { + "epoch": 1.692722371967655, + "grad_norm": 41.044294602366946, + "learning_rate": 6.504541111894941e-06, + "loss": 0.0828, + "step": 1256 + }, + { + "epoch": 1.6940700808625337, + "grad_norm": 36.80750823714753, + "learning_rate": 6.499335231170094e-06, + "loss": 0.0865, + "step": 1257 + }, + { + "epoch": 1.6954177897574123, + "grad_norm": 30.545195216209756, + "learning_rate": 6.494127563888277e-06, + "loss": 0.0866, + "step": 1258 + }, + { + "epoch": 1.696765498652291, + "grad_norm": 21.902068209891787, + "learning_rate": 6.488918116254773e-06, + "loss": 0.1364, + "step": 1259 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 13.910446227231002, + "learning_rate": 6.48370689447698e-06, + "loss": 0.1043, + "step": 1260 + }, + { + "epoch": 1.6994609164420487, + "grad_norm": 30.379092718731492, + "learning_rate": 6.478493904764415e-06, + "loss": 0.1336, + "step": 1261 + }, + { + "epoch": 1.7008086253369272, + "grad_norm": 4.90987118168899, + "learning_rate": 6.4732791533287e-06, + "loss": 0.1292, + "step": 1262 + }, + { + "epoch": 1.7021563342318058, + "grad_norm": 7.908822042748592, + "learning_rate": 6.468062646383553e-06, + "loss": 0.1136, + "step": 1263 + }, + { + "epoch": 1.7035040431266846, + "grad_norm": 2.9673589298913483, + "learning_rate": 6.462844390144789e-06, + "loss": 0.0846, + "step": 1264 + }, + { + "epoch": 1.7048517520215634, + "grad_norm": 34.1923214338531, + "learning_rate": 6.457624390830305e-06, + "loss": 0.0778, + "step": 1265 + }, + { + "epoch": 1.7061994609164421, + "grad_norm": 22.20352275427439, + "learning_rate": 6.452402654660072e-06, + "loss": 0.1184, + "step": 1266 + }, + { + "epoch": 1.7075471698113207, + "grad_norm": 6.744303875198106, + "learning_rate": 6.447179187856138e-06, + "loss": 0.0829, + "step": 1267 + }, + { + "epoch": 1.7088948787061995, + "grad_norm": 36.15074852445199, + "learning_rate": 6.441953996642607e-06, + "loss": 0.1436, + "step": 1268 + }, + { + "epoch": 1.710242587601078, + "grad_norm": 19.691147904376766, + "learning_rate": 6.436727087245639e-06, + "loss": 0.1374, + "step": 1269 + }, + { + "epoch": 1.7115902964959568, + "grad_norm": 57.667849490243775, + "learning_rate": 6.431498465893441e-06, + "loss": 0.1083, + "step": 1270 + }, + { + "epoch": 1.7129380053908356, + "grad_norm": 68.62494274825639, + "learning_rate": 6.426268138816263e-06, + "loss": 0.1248, + "step": 1271 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 38.2680311565065, + "learning_rate": 6.421036112246381e-06, + "loss": 0.107, + "step": 1272 + }, + { + "epoch": 1.715633423180593, + "grad_norm": 18.604174658810006, + "learning_rate": 6.4158023924181055e-06, + "loss": 0.1002, + "step": 1273 + }, + { + "epoch": 1.7169811320754715, + "grad_norm": 26.442122679923433, + "learning_rate": 6.410566985567758e-06, + "loss": 0.0801, + "step": 1274 + }, + { + "epoch": 1.7183288409703503, + "grad_norm": 50.07634613318523, + "learning_rate": 6.405329897933669e-06, + "loss": 0.1083, + "step": 1275 + }, + { + "epoch": 1.719676549865229, + "grad_norm": 27.753771307048627, + "learning_rate": 6.400091135756175e-06, + "loss": 0.1392, + "step": 1276 + }, + { + "epoch": 1.721024258760108, + "grad_norm": 40.765571395887356, + "learning_rate": 6.39485070527761e-06, + "loss": 0.0953, + "step": 1277 + }, + { + "epoch": 1.7223719676549867, + "grad_norm": 30.260970795008827, + "learning_rate": 6.389608612742291e-06, + "loss": 0.1346, + "step": 1278 + }, + { + "epoch": 1.7237196765498652, + "grad_norm": 15.614631220706945, + "learning_rate": 6.384364864396516e-06, + "loss": 0.1251, + "step": 1279 + }, + { + "epoch": 1.7250673854447438, + "grad_norm": 9.53195082176003, + "learning_rate": 6.3791194664885615e-06, + "loss": 0.1273, + "step": 1280 + }, + { + "epoch": 1.7264150943396226, + "grad_norm": 29.998941885373196, + "learning_rate": 6.373872425268663e-06, + "loss": 0.132, + "step": 1281 + }, + { + "epoch": 1.7277628032345014, + "grad_norm": 20.588034540327506, + "learning_rate": 6.368623746989017e-06, + "loss": 0.1169, + "step": 1282 + }, + { + "epoch": 1.7291105121293802, + "grad_norm": 6.669739004460062, + "learning_rate": 6.363373437903771e-06, + "loss": 0.123, + "step": 1283 + }, + { + "epoch": 1.7304582210242587, + "grad_norm": 24.76193719865685, + "learning_rate": 6.358121504269014e-06, + "loss": 0.1012, + "step": 1284 + }, + { + "epoch": 1.7318059299191375, + "grad_norm": 17.14846878596772, + "learning_rate": 6.352867952342772e-06, + "loss": 0.107, + "step": 1285 + }, + { + "epoch": 1.733153638814016, + "grad_norm": 42.360318744521294, + "learning_rate": 6.3476127883850004e-06, + "loss": 0.1186, + "step": 1286 + }, + { + "epoch": 1.7345013477088949, + "grad_norm": 32.34398563703926, + "learning_rate": 6.342356018657572e-06, + "loss": 0.0995, + "step": 1287 + }, + { + "epoch": 1.7358490566037736, + "grad_norm": 24.848694774801302, + "learning_rate": 6.337097649424277e-06, + "loss": 0.0989, + "step": 1288 + }, + { + "epoch": 1.7371967654986524, + "grad_norm": 18.897283860971818, + "learning_rate": 6.33183768695081e-06, + "loss": 0.0854, + "step": 1289 + }, + { + "epoch": 1.738544474393531, + "grad_norm": 24.500250547348585, + "learning_rate": 6.326576137504763e-06, + "loss": 0.1456, + "step": 1290 + }, + { + "epoch": 1.7398921832884096, + "grad_norm": 36.176626423647136, + "learning_rate": 6.3213130073556185e-06, + "loss": 0.1579, + "step": 1291 + }, + { + "epoch": 1.7412398921832883, + "grad_norm": 46.57101433309929, + "learning_rate": 6.3160483027747466e-06, + "loss": 0.1198, + "step": 1292 + }, + { + "epoch": 1.7425876010781671, + "grad_norm": 67.30502296403954, + "learning_rate": 6.310782030035389e-06, + "loss": 0.1716, + "step": 1293 + }, + { + "epoch": 1.743935309973046, + "grad_norm": 33.018911988047826, + "learning_rate": 6.305514195412657e-06, + "loss": 0.1179, + "step": 1294 + }, + { + "epoch": 1.7452830188679245, + "grad_norm": 41.46849001482197, + "learning_rate": 6.300244805183524e-06, + "loss": 0.1192, + "step": 1295 + }, + { + "epoch": 1.7466307277628033, + "grad_norm": 25.098718629340727, + "learning_rate": 6.294973865626816e-06, + "loss": 0.1133, + "step": 1296 + }, + { + "epoch": 1.7479784366576818, + "grad_norm": 10.122837844200781, + "learning_rate": 6.289701383023206e-06, + "loss": 0.0845, + "step": 1297 + }, + { + "epoch": 1.7493261455525606, + "grad_norm": 15.103957697190689, + "learning_rate": 6.284427363655205e-06, + "loss": 0.1243, + "step": 1298 + }, + { + "epoch": 1.7506738544474394, + "grad_norm": 24.950561920796087, + "learning_rate": 6.2791518138071576e-06, + "loss": 0.1118, + "step": 1299 + }, + { + "epoch": 1.7520215633423182, + "grad_norm": 3.088394336992794, + "learning_rate": 6.273874739765227e-06, + "loss": 0.1076, + "step": 1300 + }, + { + "epoch": 1.7533692722371967, + "grad_norm": 6.766338179774651, + "learning_rate": 6.268596147817397e-06, + "loss": 0.1195, + "step": 1301 + }, + { + "epoch": 1.7547169811320755, + "grad_norm": 33.425848073605664, + "learning_rate": 6.263316044253458e-06, + "loss": 0.103, + "step": 1302 + }, + { + "epoch": 1.756064690026954, + "grad_norm": 5.412061669447912, + "learning_rate": 6.258034435365003e-06, + "loss": 0.0939, + "step": 1303 + }, + { + "epoch": 1.7574123989218329, + "grad_norm": 2.5526583699674004, + "learning_rate": 6.252751327445418e-06, + "loss": 0.1013, + "step": 1304 + }, + { + "epoch": 1.7587601078167117, + "grad_norm": 7.967096068724246, + "learning_rate": 6.247466726789875e-06, + "loss": 0.1097, + "step": 1305 + }, + { + "epoch": 1.7601078167115904, + "grad_norm": 15.47054258693064, + "learning_rate": 6.2421806396953225e-06, + "loss": 0.0757, + "step": 1306 + }, + { + "epoch": 1.761455525606469, + "grad_norm": 29.790324334117606, + "learning_rate": 6.236893072460485e-06, + "loss": 0.1218, + "step": 1307 + }, + { + "epoch": 1.7628032345013476, + "grad_norm": 63.167035497548774, + "learning_rate": 6.231604031385847e-06, + "loss": 0.1407, + "step": 1308 + }, + { + "epoch": 1.7641509433962264, + "grad_norm": 55.55168528876185, + "learning_rate": 6.226313522773651e-06, + "loss": 0.1242, + "step": 1309 + }, + { + "epoch": 1.7654986522911051, + "grad_norm": 37.637608990824354, + "learning_rate": 6.221021552927887e-06, + "loss": 0.0977, + "step": 1310 + }, + { + "epoch": 1.766846361185984, + "grad_norm": 25.310246390682764, + "learning_rate": 6.215728128154286e-06, + "loss": 0.1127, + "step": 1311 + }, + { + "epoch": 1.7681940700808625, + "grad_norm": 38.43805067128463, + "learning_rate": 6.210433254760314e-06, + "loss": 0.1467, + "step": 1312 + }, + { + "epoch": 1.7695417789757413, + "grad_norm": 34.54112023962584, + "learning_rate": 6.205136939055164e-06, + "loss": 0.1029, + "step": 1313 + }, + { + "epoch": 1.7708894878706198, + "grad_norm": 14.865570575940001, + "learning_rate": 6.199839187349744e-06, + "loss": 0.0939, + "step": 1314 + }, + { + "epoch": 1.7722371967654986, + "grad_norm": 26.341817840003184, + "learning_rate": 6.194540005956675e-06, + "loss": 0.1084, + "step": 1315 + }, + { + "epoch": 1.7735849056603774, + "grad_norm": 15.226758539906148, + "learning_rate": 6.189239401190283e-06, + "loss": 0.0761, + "step": 1316 + }, + { + "epoch": 1.7749326145552562, + "grad_norm": 29.378985893057745, + "learning_rate": 6.183937379366587e-06, + "loss": 0.097, + "step": 1317 + }, + { + "epoch": 1.7762803234501348, + "grad_norm": 27.724283552730537, + "learning_rate": 6.178633946803298e-06, + "loss": 0.0915, + "step": 1318 + }, + { + "epoch": 1.7776280323450133, + "grad_norm": 7.516753613988535, + "learning_rate": 6.173329109819805e-06, + "loss": 0.1031, + "step": 1319 + }, + { + "epoch": 1.778975741239892, + "grad_norm": 17.840966150820822, + "learning_rate": 6.168022874737172e-06, + "loss": 0.09, + "step": 1320 + }, + { + "epoch": 1.780323450134771, + "grad_norm": 37.42784107759407, + "learning_rate": 6.162715247878129e-06, + "loss": 0.0977, + "step": 1321 + }, + { + "epoch": 1.7816711590296497, + "grad_norm": 22.128152527772187, + "learning_rate": 6.157406235567063e-06, + "loss": 0.1008, + "step": 1322 + }, + { + "epoch": 1.7830188679245285, + "grad_norm": 16.85995352771574, + "learning_rate": 6.152095844130014e-06, + "loss": 0.1384, + "step": 1323 + }, + { + "epoch": 1.784366576819407, + "grad_norm": 42.208768843530166, + "learning_rate": 6.146784079894663e-06, + "loss": 0.1076, + "step": 1324 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 15.843627198010337, + "learning_rate": 6.14147094919033e-06, + "loss": 0.0924, + "step": 1325 + }, + { + "epoch": 1.7870619946091644, + "grad_norm": 20.019025199850617, + "learning_rate": 6.1361564583479595e-06, + "loss": 0.1027, + "step": 1326 + }, + { + "epoch": 1.7884097035040432, + "grad_norm": 33.09963910790979, + "learning_rate": 6.13084061370012e-06, + "loss": 0.0946, + "step": 1327 + }, + { + "epoch": 1.789757412398922, + "grad_norm": 14.658193329892471, + "learning_rate": 6.125523421580988e-06, + "loss": 0.0828, + "step": 1328 + }, + { + "epoch": 1.7911051212938005, + "grad_norm": 54.968511261598366, + "learning_rate": 6.1202048883263536e-06, + "loss": 0.1455, + "step": 1329 + }, + { + "epoch": 1.7924528301886793, + "grad_norm": 46.841572585791454, + "learning_rate": 6.114885020273597e-06, + "loss": 0.1638, + "step": 1330 + }, + { + "epoch": 1.7938005390835579, + "grad_norm": 2.581682673229848, + "learning_rate": 6.109563823761695e-06, + "loss": 0.0844, + "step": 1331 + }, + { + "epoch": 1.7951482479784366, + "grad_norm": 13.901284307618948, + "learning_rate": 6.104241305131202e-06, + "loss": 0.1232, + "step": 1332 + }, + { + "epoch": 1.7964959568733154, + "grad_norm": 14.565778905102706, + "learning_rate": 6.098917470724252e-06, + "loss": 0.1071, + "step": 1333 + }, + { + "epoch": 1.7978436657681942, + "grad_norm": 25.483809555518132, + "learning_rate": 6.093592326884548e-06, + "loss": 0.1219, + "step": 1334 + }, + { + "epoch": 1.7991913746630728, + "grad_norm": 39.31328739844174, + "learning_rate": 6.088265879957345e-06, + "loss": 0.1422, + "step": 1335 + }, + { + "epoch": 1.8005390835579513, + "grad_norm": 17.12124732601102, + "learning_rate": 6.08293813628946e-06, + "loss": 0.0913, + "step": 1336 + }, + { + "epoch": 1.8018867924528301, + "grad_norm": 23.360869305188082, + "learning_rate": 6.077609102229253e-06, + "loss": 0.0804, + "step": 1337 + }, + { + "epoch": 1.803234501347709, + "grad_norm": 1.9538480429901037, + "learning_rate": 6.072278784126615e-06, + "loss": 0.0893, + "step": 1338 + }, + { + "epoch": 1.8045822102425877, + "grad_norm": 5.134298331135784, + "learning_rate": 6.066947188332978e-06, + "loss": 0.0752, + "step": 1339 + }, + { + "epoch": 1.8059299191374663, + "grad_norm": 17.00701308019667, + "learning_rate": 6.061614321201286e-06, + "loss": 0.1066, + "step": 1340 + }, + { + "epoch": 1.807277628032345, + "grad_norm": 4.644095242807913, + "learning_rate": 6.056280189086006e-06, + "loss": 0.0822, + "step": 1341 + }, + { + "epoch": 1.8086253369272236, + "grad_norm": 17.260944481495244, + "learning_rate": 6.050944798343104e-06, + "loss": 0.0982, + "step": 1342 + }, + { + "epoch": 1.8099730458221024, + "grad_norm": 13.57516656588318, + "learning_rate": 6.045608155330056e-06, + "loss": 0.0897, + "step": 1343 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 32.16964729385561, + "learning_rate": 6.040270266405821e-06, + "loss": 0.1188, + "step": 1344 + }, + { + "epoch": 1.81266846361186, + "grad_norm": 40.28929535060277, + "learning_rate": 6.034931137930847e-06, + "loss": 0.1415, + "step": 1345 + }, + { + "epoch": 1.8140161725067385, + "grad_norm": 27.397743564973116, + "learning_rate": 6.0295907762670604e-06, + "loss": 0.1013, + "step": 1346 + }, + { + "epoch": 1.815363881401617, + "grad_norm": 29.71416076150079, + "learning_rate": 6.024249187777851e-06, + "loss": 0.1199, + "step": 1347 + }, + { + "epoch": 1.8167115902964959, + "grad_norm": 46.429511066063434, + "learning_rate": 6.018906378828077e-06, + "loss": 0.1121, + "step": 1348 + }, + { + "epoch": 1.8180592991913747, + "grad_norm": 40.14207122441297, + "learning_rate": 6.0135623557840495e-06, + "loss": 0.1347, + "step": 1349 + }, + { + "epoch": 1.8194070080862534, + "grad_norm": 36.68569452474385, + "learning_rate": 6.00821712501352e-06, + "loss": 0.1067, + "step": 1350 + }, + { + "epoch": 1.8207547169811322, + "grad_norm": 30.914958207386643, + "learning_rate": 6.00287069288569e-06, + "loss": 0.12, + "step": 1351 + }, + { + "epoch": 1.8221024258760108, + "grad_norm": 18.460114293224187, + "learning_rate": 5.997523065771183e-06, + "loss": 0.1019, + "step": 1352 + }, + { + "epoch": 1.8234501347708894, + "grad_norm": 36.38752824341115, + "learning_rate": 5.99217425004205e-06, + "loss": 0.157, + "step": 1353 + }, + { + "epoch": 1.8247978436657681, + "grad_norm": 29.732678521705505, + "learning_rate": 5.986824252071759e-06, + "loss": 0.1089, + "step": 1354 + }, + { + "epoch": 1.826145552560647, + "grad_norm": 13.339611118244939, + "learning_rate": 5.981473078235186e-06, + "loss": 0.0744, + "step": 1355 + }, + { + "epoch": 1.8274932614555257, + "grad_norm": 12.082635816770566, + "learning_rate": 5.976120734908608e-06, + "loss": 0.0947, + "step": 1356 + }, + { + "epoch": 1.8288409703504043, + "grad_norm": 13.405686005260478, + "learning_rate": 5.970767228469695e-06, + "loss": 0.1037, + "step": 1357 + }, + { + "epoch": 1.830188679245283, + "grad_norm": 14.059632585833, + "learning_rate": 5.9654125652975045e-06, + "loss": 0.1134, + "step": 1358 + }, + { + "epoch": 1.8315363881401616, + "grad_norm": 13.84173336774697, + "learning_rate": 5.9600567517724714e-06, + "loss": 0.112, + "step": 1359 + }, + { + "epoch": 1.8328840970350404, + "grad_norm": 20.23846114831505, + "learning_rate": 5.954699794276401e-06, + "loss": 0.1249, + "step": 1360 + }, + { + "epoch": 1.8342318059299192, + "grad_norm": 26.696112761495108, + "learning_rate": 5.949341699192462e-06, + "loss": 0.0793, + "step": 1361 + }, + { + "epoch": 1.835579514824798, + "grad_norm": 26.761264321958276, + "learning_rate": 5.943982472905178e-06, + "loss": 0.1116, + "step": 1362 + }, + { + "epoch": 1.8369272237196765, + "grad_norm": 40.15603286118293, + "learning_rate": 5.938622121800423e-06, + "loss": 0.1388, + "step": 1363 + }, + { + "epoch": 1.838274932614555, + "grad_norm": 27.957452337284327, + "learning_rate": 5.933260652265407e-06, + "loss": 0.115, + "step": 1364 + }, + { + "epoch": 1.8396226415094339, + "grad_norm": 27.56506697975481, + "learning_rate": 5.927898070688677e-06, + "loss": 0.1154, + "step": 1365 + }, + { + "epoch": 1.8409703504043127, + "grad_norm": 60.57735309155048, + "learning_rate": 5.922534383460101e-06, + "loss": 0.1675, + "step": 1366 + }, + { + "epoch": 1.8423180592991915, + "grad_norm": 34.876813642576835, + "learning_rate": 5.91716959697087e-06, + "loss": 0.127, + "step": 1367 + }, + { + "epoch": 1.8436657681940702, + "grad_norm": 24.904655819690944, + "learning_rate": 5.911803717613478e-06, + "loss": 0.1334, + "step": 1368 + }, + { + "epoch": 1.8450134770889488, + "grad_norm": 43.34111026130619, + "learning_rate": 5.906436751781727e-06, + "loss": 0.1191, + "step": 1369 + }, + { + "epoch": 1.8463611859838274, + "grad_norm": 30.74287075985777, + "learning_rate": 5.9010687058707105e-06, + "loss": 0.107, + "step": 1370 + }, + { + "epoch": 1.8477088948787062, + "grad_norm": 42.56337494037877, + "learning_rate": 5.89569958627681e-06, + "loss": 0.1094, + "step": 1371 + }, + { + "epoch": 1.849056603773585, + "grad_norm": 31.954727575442895, + "learning_rate": 5.890329399397685e-06, + "loss": 0.1144, + "step": 1372 + }, + { + "epoch": 1.8504043126684637, + "grad_norm": 19.845183662847663, + "learning_rate": 5.884958151632269e-06, + "loss": 0.1108, + "step": 1373 + }, + { + "epoch": 1.8517520215633423, + "grad_norm": 13.074599273925406, + "learning_rate": 5.87958584938076e-06, + "loss": 0.1342, + "step": 1374 + }, + { + "epoch": 1.853099730458221, + "grad_norm": 30.953840845007782, + "learning_rate": 5.874212499044609e-06, + "loss": 0.104, + "step": 1375 + }, + { + "epoch": 1.8544474393530996, + "grad_norm": 24.62275758276975, + "learning_rate": 5.868838107026518e-06, + "loss": 0.079, + "step": 1376 + }, + { + "epoch": 1.8557951482479784, + "grad_norm": 16.96373399331299, + "learning_rate": 5.863462679730431e-06, + "loss": 0.1057, + "step": 1377 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 4.1580258605498726, + "learning_rate": 5.858086223561522e-06, + "loss": 0.0626, + "step": 1378 + }, + { + "epoch": 1.858490566037736, + "grad_norm": 13.556414527474818, + "learning_rate": 5.852708744926199e-06, + "loss": 0.1195, + "step": 1379 + }, + { + "epoch": 1.8598382749326146, + "grad_norm": 10.394601201106761, + "learning_rate": 5.847330250232077e-06, + "loss": 0.0961, + "step": 1380 + }, + { + "epoch": 1.8611859838274931, + "grad_norm": 6.109464455778147, + "learning_rate": 5.841950745887991e-06, + "loss": 0.1067, + "step": 1381 + }, + { + "epoch": 1.862533692722372, + "grad_norm": 2.446323850068468, + "learning_rate": 5.836570238303977e-06, + "loss": 0.085, + "step": 1382 + }, + { + "epoch": 1.8638814016172507, + "grad_norm": 7.123970380996732, + "learning_rate": 5.831188733891262e-06, + "loss": 0.0852, + "step": 1383 + }, + { + "epoch": 1.8652291105121295, + "grad_norm": 36.10221997982196, + "learning_rate": 5.825806239062265e-06, + "loss": 0.0868, + "step": 1384 + }, + { + "epoch": 1.866576819407008, + "grad_norm": 27.939292410367248, + "learning_rate": 5.820422760230587e-06, + "loss": 0.0915, + "step": 1385 + }, + { + "epoch": 1.8679245283018868, + "grad_norm": 27.390130401062443, + "learning_rate": 5.815038303810993e-06, + "loss": 0.0957, + "step": 1386 + }, + { + "epoch": 1.8692722371967654, + "grad_norm": 12.897096060364303, + "learning_rate": 5.809652876219425e-06, + "loss": 0.084, + "step": 1387 + }, + { + "epoch": 1.8706199460916442, + "grad_norm": 49.373155264767156, + "learning_rate": 5.80426648387297e-06, + "loss": 0.1221, + "step": 1388 + }, + { + "epoch": 1.871967654986523, + "grad_norm": 4.192702663338079, + "learning_rate": 5.798879133189874e-06, + "loss": 0.0722, + "step": 1389 + }, + { + "epoch": 1.8733153638814017, + "grad_norm": 50.14995370368929, + "learning_rate": 5.79349083058952e-06, + "loss": 0.0984, + "step": 1390 + }, + { + "epoch": 1.8746630727762803, + "grad_norm": 33.115898211463154, + "learning_rate": 5.788101582492426e-06, + "loss": 0.1117, + "step": 1391 + }, + { + "epoch": 1.8760107816711589, + "grad_norm": 11.653837880950354, + "learning_rate": 5.782711395320237e-06, + "loss": 0.1046, + "step": 1392 + }, + { + "epoch": 1.8773584905660377, + "grad_norm": 18.28158312272161, + "learning_rate": 5.777320275495718e-06, + "loss": 0.0823, + "step": 1393 + }, + { + "epoch": 1.8787061994609164, + "grad_norm": 14.768783202604602, + "learning_rate": 5.7719282294427445e-06, + "loss": 0.106, + "step": 1394 + }, + { + "epoch": 1.8800539083557952, + "grad_norm": 10.393862378793864, + "learning_rate": 5.7665352635862945e-06, + "loss": 0.0716, + "step": 1395 + }, + { + "epoch": 1.881401617250674, + "grad_norm": 15.438809554156398, + "learning_rate": 5.761141384352444e-06, + "loss": 0.068, + "step": 1396 + }, + { + "epoch": 1.8827493261455526, + "grad_norm": 18.65140165342761, + "learning_rate": 5.755746598168357e-06, + "loss": 0.0921, + "step": 1397 + }, + { + "epoch": 1.8840970350404311, + "grad_norm": 8.286142056365618, + "learning_rate": 5.7503509114622745e-06, + "loss": 0.0665, + "step": 1398 + }, + { + "epoch": 1.88544474393531, + "grad_norm": 21.069569458009255, + "learning_rate": 5.744954330663517e-06, + "loss": 0.0693, + "step": 1399 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 17.1845408989426, + "learning_rate": 5.739556862202467e-06, + "loss": 0.0635, + "step": 1400 + }, + { + "epoch": 1.8881401617250675, + "grad_norm": 35.88182780646645, + "learning_rate": 5.7341585125105605e-06, + "loss": 0.0875, + "step": 1401 + }, + { + "epoch": 1.889487870619946, + "grad_norm": 35.277373424235975, + "learning_rate": 5.728759288020291e-06, + "loss": 0.079, + "step": 1402 + }, + { + "epoch": 1.8908355795148248, + "grad_norm": 10.101823798440112, + "learning_rate": 5.723359195165193e-06, + "loss": 0.0754, + "step": 1403 + }, + { + "epoch": 1.8921832884097034, + "grad_norm": 45.17335078860093, + "learning_rate": 5.717958240379831e-06, + "loss": 0.135, + "step": 1404 + }, + { + "epoch": 1.8935309973045822, + "grad_norm": 35.227014728752025, + "learning_rate": 5.712556430099798e-06, + "loss": 0.126, + "step": 1405 + }, + { + "epoch": 1.894878706199461, + "grad_norm": 37.31369879485224, + "learning_rate": 5.707153770761713e-06, + "loss": 0.1296, + "step": 1406 + }, + { + "epoch": 1.8962264150943398, + "grad_norm": 34.21801840159295, + "learning_rate": 5.701750268803197e-06, + "loss": 0.0813, + "step": 1407 + }, + { + "epoch": 1.8975741239892183, + "grad_norm": 20.33902491051905, + "learning_rate": 5.696345930662879e-06, + "loss": 0.0906, + "step": 1408 + }, + { + "epoch": 1.8989218328840969, + "grad_norm": 7.477003134339218, + "learning_rate": 5.6909407627803895e-06, + "loss": 0.0923, + "step": 1409 + }, + { + "epoch": 1.9002695417789757, + "grad_norm": 32.51330491853712, + "learning_rate": 5.685534771596338e-06, + "loss": 0.1128, + "step": 1410 + }, + { + "epoch": 1.9016172506738545, + "grad_norm": 26.88374992745288, + "learning_rate": 5.680127963552325e-06, + "loss": 0.089, + "step": 1411 + }, + { + "epoch": 1.9029649595687332, + "grad_norm": 4.975595887949011, + "learning_rate": 5.674720345090916e-06, + "loss": 0.1459, + "step": 1412 + }, + { + "epoch": 1.904312668463612, + "grad_norm": 18.047660434226664, + "learning_rate": 5.669311922655645e-06, + "loss": 0.0961, + "step": 1413 + }, + { + "epoch": 1.9056603773584906, + "grad_norm": 16.64960132287855, + "learning_rate": 5.663902702691007e-06, + "loss": 0.1055, + "step": 1414 + }, + { + "epoch": 1.9070080862533692, + "grad_norm": 21.122917467344067, + "learning_rate": 5.658492691642443e-06, + "loss": 0.0915, + "step": 1415 + }, + { + "epoch": 1.908355795148248, + "grad_norm": 48.60555062729239, + "learning_rate": 5.65308189595634e-06, + "loss": 0.1067, + "step": 1416 + }, + { + "epoch": 1.9097035040431267, + "grad_norm": 36.26956001307304, + "learning_rate": 5.647670322080017e-06, + "loss": 0.1108, + "step": 1417 + }, + { + "epoch": 1.9110512129380055, + "grad_norm": 49.32787935957758, + "learning_rate": 5.642257976461725e-06, + "loss": 0.1572, + "step": 1418 + }, + { + "epoch": 1.912398921832884, + "grad_norm": 42.075819299377635, + "learning_rate": 5.636844865550627e-06, + "loss": 0.1434, + "step": 1419 + }, + { + "epoch": 1.9137466307277629, + "grad_norm": 11.991942842012211, + "learning_rate": 5.631430995796805e-06, + "loss": 0.0878, + "step": 1420 + }, + { + "epoch": 1.9150943396226414, + "grad_norm": 30.23210800745099, + "learning_rate": 5.626016373651242e-06, + "loss": 0.1276, + "step": 1421 + }, + { + "epoch": 1.9164420485175202, + "grad_norm": 39.92040420027348, + "learning_rate": 5.6206010055658165e-06, + "loss": 0.1196, + "step": 1422 + }, + { + "epoch": 1.917789757412399, + "grad_norm": 18.603978120446158, + "learning_rate": 5.6151848979933e-06, + "loss": 0.113, + "step": 1423 + }, + { + "epoch": 1.9191374663072778, + "grad_norm": 26.75942104371344, + "learning_rate": 5.60976805738734e-06, + "loss": 0.0976, + "step": 1424 + }, + { + "epoch": 1.9204851752021563, + "grad_norm": 57.21916047145641, + "learning_rate": 5.60435049020246e-06, + "loss": 0.1317, + "step": 1425 + }, + { + "epoch": 1.921832884097035, + "grad_norm": 33.85786081569241, + "learning_rate": 5.5989322028940505e-06, + "loss": 0.1183, + "step": 1426 + }, + { + "epoch": 1.9231805929919137, + "grad_norm": 2.2186470015361883, + "learning_rate": 5.593513201918358e-06, + "loss": 0.0739, + "step": 1427 + }, + { + "epoch": 1.9245283018867925, + "grad_norm": 35.79869144886063, + "learning_rate": 5.58809349373248e-06, + "loss": 0.1501, + "step": 1428 + }, + { + "epoch": 1.9258760107816713, + "grad_norm": 51.044650259268906, + "learning_rate": 5.582673084794357e-06, + "loss": 0.1158, + "step": 1429 + }, + { + "epoch": 1.9272237196765498, + "grad_norm": 3.2758534254113725, + "learning_rate": 5.5772519815627654e-06, + "loss": 0.0878, + "step": 1430 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 20.226783903157436, + "learning_rate": 5.571830190497306e-06, + "loss": 0.1093, + "step": 1431 + }, + { + "epoch": 1.9299191374663072, + "grad_norm": 14.504493071402013, + "learning_rate": 5.566407718058401e-06, + "loss": 0.1147, + "step": 1432 + }, + { + "epoch": 1.931266846361186, + "grad_norm": 5.778206882333169, + "learning_rate": 5.560984570707286e-06, + "loss": 0.1097, + "step": 1433 + }, + { + "epoch": 1.9326145552560647, + "grad_norm": 9.025281617035915, + "learning_rate": 5.555560754905999e-06, + "loss": 0.1122, + "step": 1434 + }, + { + "epoch": 1.9339622641509435, + "grad_norm": 2.1441592097518845, + "learning_rate": 5.550136277117375e-06, + "loss": 0.0981, + "step": 1435 + }, + { + "epoch": 1.935309973045822, + "grad_norm": 10.145719536719056, + "learning_rate": 5.544711143805036e-06, + "loss": 0.0982, + "step": 1436 + }, + { + "epoch": 1.9366576819407006, + "grad_norm": 16.60293406828361, + "learning_rate": 5.539285361433387e-06, + "loss": 0.0795, + "step": 1437 + }, + { + "epoch": 1.9380053908355794, + "grad_norm": 8.92692407600556, + "learning_rate": 5.533858936467607e-06, + "loss": 0.1099, + "step": 1438 + }, + { + "epoch": 1.9393530997304582, + "grad_norm": 7.642044159151737, + "learning_rate": 5.528431875373641e-06, + "loss": 0.0929, + "step": 1439 + }, + { + "epoch": 1.940700808625337, + "grad_norm": 25.860009557631003, + "learning_rate": 5.523004184618187e-06, + "loss": 0.1252, + "step": 1440 + }, + { + "epoch": 1.9420485175202158, + "grad_norm": 37.8929363063091, + "learning_rate": 5.5175758706687e-06, + "loss": 0.1201, + "step": 1441 + }, + { + "epoch": 1.9433962264150944, + "grad_norm": 40.9665632067674, + "learning_rate": 5.512146939993376e-06, + "loss": 0.1051, + "step": 1442 + }, + { + "epoch": 1.944743935309973, + "grad_norm": 23.70050166352492, + "learning_rate": 5.50671739906114e-06, + "loss": 0.0968, + "step": 1443 + }, + { + "epoch": 1.9460916442048517, + "grad_norm": 21.688073882036, + "learning_rate": 5.501287254341653e-06, + "loss": 0.0991, + "step": 1444 + }, + { + "epoch": 1.9474393530997305, + "grad_norm": 37.57250831349659, + "learning_rate": 5.4958565123052884e-06, + "loss": 0.1069, + "step": 1445 + }, + { + "epoch": 1.9487870619946093, + "grad_norm": 21.888310011194886, + "learning_rate": 5.490425179423135e-06, + "loss": 0.1308, + "step": 1446 + }, + { + "epoch": 1.9501347708894878, + "grad_norm": 39.64122670174048, + "learning_rate": 5.484993262166987e-06, + "loss": 0.1024, + "step": 1447 + }, + { + "epoch": 1.9514824797843666, + "grad_norm": 4.395935309413971, + "learning_rate": 5.479560767009329e-06, + "loss": 0.0743, + "step": 1448 + }, + { + "epoch": 1.9528301886792452, + "grad_norm": 22.411617243312858, + "learning_rate": 5.4741277004233385e-06, + "loss": 0.1061, + "step": 1449 + }, + { + "epoch": 1.954177897574124, + "grad_norm": 21.704251479136392, + "learning_rate": 5.4686940688828725e-06, + "loss": 0.1265, + "step": 1450 + }, + { + "epoch": 1.9555256064690028, + "grad_norm": 24.405893902396205, + "learning_rate": 5.463259878862466e-06, + "loss": 0.0819, + "step": 1451 + }, + { + "epoch": 1.9568733153638815, + "grad_norm": 13.575468361633106, + "learning_rate": 5.457825136837312e-06, + "loss": 0.0886, + "step": 1452 + }, + { + "epoch": 1.95822102425876, + "grad_norm": 15.485011201962617, + "learning_rate": 5.4523898492832635e-06, + "loss": 0.0812, + "step": 1453 + }, + { + "epoch": 1.9595687331536387, + "grad_norm": 39.53256294884658, + "learning_rate": 5.446954022676829e-06, + "loss": 0.0908, + "step": 1454 + }, + { + "epoch": 1.9609164420485174, + "grad_norm": 2.156788253786119, + "learning_rate": 5.4415176634951515e-06, + "loss": 0.0842, + "step": 1455 + }, + { + "epoch": 1.9622641509433962, + "grad_norm": 27.976991034655175, + "learning_rate": 5.436080778216012e-06, + "loss": 0.0787, + "step": 1456 + }, + { + "epoch": 1.963611859838275, + "grad_norm": 30.85244223005629, + "learning_rate": 5.430643373317821e-06, + "loss": 0.0768, + "step": 1457 + }, + { + "epoch": 1.9649595687331538, + "grad_norm": 41.64213558140356, + "learning_rate": 5.425205455279603e-06, + "loss": 0.1164, + "step": 1458 + }, + { + "epoch": 1.9663072776280324, + "grad_norm": 7.2916437082109224, + "learning_rate": 5.419767030580999e-06, + "loss": 0.0875, + "step": 1459 + }, + { + "epoch": 1.967654986522911, + "grad_norm": 31.750991546968493, + "learning_rate": 5.414328105702249e-06, + "loss": 0.106, + "step": 1460 + }, + { + "epoch": 1.9690026954177897, + "grad_norm": 21.17668823567841, + "learning_rate": 5.408888687124192e-06, + "loss": 0.0842, + "step": 1461 + }, + { + "epoch": 1.9703504043126685, + "grad_norm": 40.987115077985294, + "learning_rate": 5.4034487813282545e-06, + "loss": 0.1375, + "step": 1462 + }, + { + "epoch": 1.9716981132075473, + "grad_norm": 45.63768136548763, + "learning_rate": 5.398008394796444e-06, + "loss": 0.0994, + "step": 1463 + }, + { + "epoch": 1.9730458221024259, + "grad_norm": 32.44169475989783, + "learning_rate": 5.39256753401134e-06, + "loss": 0.1163, + "step": 1464 + }, + { + "epoch": 1.9743935309973046, + "grad_norm": 29.4393371244778, + "learning_rate": 5.387126205456088e-06, + "loss": 0.1045, + "step": 1465 + }, + { + "epoch": 1.9757412398921832, + "grad_norm": 32.221460994383065, + "learning_rate": 5.381684415614391e-06, + "loss": 0.1105, + "step": 1466 + }, + { + "epoch": 1.977088948787062, + "grad_norm": 37.19578061982519, + "learning_rate": 5.3762421709705e-06, + "loss": 0.1204, + "step": 1467 + }, + { + "epoch": 1.9784366576819408, + "grad_norm": 31.80415707957099, + "learning_rate": 5.3707994780092076e-06, + "loss": 0.0902, + "step": 1468 + }, + { + "epoch": 1.9797843665768196, + "grad_norm": 44.12904877399175, + "learning_rate": 5.365356343215845e-06, + "loss": 0.1012, + "step": 1469 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 27.560623776057188, + "learning_rate": 5.359912773076265e-06, + "loss": 0.1183, + "step": 1470 + }, + { + "epoch": 1.9824797843665767, + "grad_norm": 25.77835044125415, + "learning_rate": 5.354468774076842e-06, + "loss": 0.0955, + "step": 1471 + }, + { + "epoch": 1.9838274932614555, + "grad_norm": 10.464436189371217, + "learning_rate": 5.34902435270446e-06, + "loss": 0.0979, + "step": 1472 + }, + { + "epoch": 1.9851752021563343, + "grad_norm": 5.0413385960054855, + "learning_rate": 5.343579515446505e-06, + "loss": 0.0866, + "step": 1473 + }, + { + "epoch": 1.986522911051213, + "grad_norm": 23.413045295504592, + "learning_rate": 5.338134268790862e-06, + "loss": 0.103, + "step": 1474 + }, + { + "epoch": 1.9878706199460916, + "grad_norm": 9.705334363746262, + "learning_rate": 5.332688619225903e-06, + "loss": 0.0968, + "step": 1475 + }, + { + "epoch": 1.9892183288409704, + "grad_norm": 31.801299951470767, + "learning_rate": 5.3272425732404775e-06, + "loss": 0.0786, + "step": 1476 + }, + { + "epoch": 1.990566037735849, + "grad_norm": 10.072942565319671, + "learning_rate": 5.321796137323909e-06, + "loss": 0.0846, + "step": 1477 + }, + { + "epoch": 1.9919137466307277, + "grad_norm": 9.409965706545336, + "learning_rate": 5.316349317965989e-06, + "loss": 0.0781, + "step": 1478 + }, + { + "epoch": 1.9932614555256065, + "grad_norm": 29.576144496209082, + "learning_rate": 5.310902121656957e-06, + "loss": 0.1061, + "step": 1479 + }, + { + "epoch": 1.9946091644204853, + "grad_norm": 22.420330625670147, + "learning_rate": 5.3054545548875105e-06, + "loss": 0.0847, + "step": 1480 + }, + { + "epoch": 1.9959568733153639, + "grad_norm": 28.40109736266923, + "learning_rate": 5.300006624148786e-06, + "loss": 0.0868, + "step": 1481 + }, + { + "epoch": 1.9973045822102424, + "grad_norm": 38.6080555974034, + "learning_rate": 5.29455833593235e-06, + "loss": 0.113, + "step": 1482 + }, + { + "epoch": 1.9986522911051212, + "grad_norm": 44.022874202191325, + "learning_rate": 5.2891096967302e-06, + "loss": 0.0885, + "step": 1483 + }, + { + "epoch": 2.0, + "grad_norm": 41.98852795144451, + "learning_rate": 5.28366071303475e-06, + "loss": 0.117, + "step": 1484 + }, + { + "epoch": 2.001347708894879, + "grad_norm": 37.23948026966171, + "learning_rate": 5.2782113913388226e-06, + "loss": 0.0745, + "step": 1485 + }, + { + "epoch": 2.0026954177897576, + "grad_norm": 24.26478852184496, + "learning_rate": 5.2727617381356435e-06, + "loss": 0.0443, + "step": 1486 + }, + { + "epoch": 2.004043126684636, + "grad_norm": 25.84261472981937, + "learning_rate": 5.267311759918836e-06, + "loss": 0.08, + "step": 1487 + }, + { + "epoch": 2.0053908355795147, + "grad_norm": 17.886062386841598, + "learning_rate": 5.2618614631824094e-06, + "loss": 0.1091, + "step": 1488 + }, + { + "epoch": 2.0067385444743935, + "grad_norm": 21.292627482182418, + "learning_rate": 5.256410854420752e-06, + "loss": 0.0768, + "step": 1489 + }, + { + "epoch": 2.0080862533692723, + "grad_norm": 40.033822933511544, + "learning_rate": 5.250959940128624e-06, + "loss": 0.0973, + "step": 1490 + }, + { + "epoch": 2.009433962264151, + "grad_norm": 11.259840528332134, + "learning_rate": 5.24550872680115e-06, + "loss": 0.1047, + "step": 1491 + }, + { + "epoch": 2.01078167115903, + "grad_norm": 6.806547596111846, + "learning_rate": 5.24005722093381e-06, + "loss": 0.0688, + "step": 1492 + }, + { + "epoch": 2.012129380053908, + "grad_norm": 18.722525680387495, + "learning_rate": 5.2346054290224344e-06, + "loss": 0.0723, + "step": 1493 + }, + { + "epoch": 2.013477088948787, + "grad_norm": 5.2294324074600365, + "learning_rate": 5.229153357563194e-06, + "loss": 0.1165, + "step": 1494 + }, + { + "epoch": 2.0148247978436657, + "grad_norm": 5.037305384546009, + "learning_rate": 5.22370101305259e-06, + "loss": 0.0944, + "step": 1495 + }, + { + "epoch": 2.0161725067385445, + "grad_norm": 20.434257721504718, + "learning_rate": 5.218248401987453e-06, + "loss": 0.0796, + "step": 1496 + }, + { + "epoch": 2.0175202156334233, + "grad_norm": 2.753133841746517, + "learning_rate": 5.212795530864928e-06, + "loss": 0.0782, + "step": 1497 + }, + { + "epoch": 2.018867924528302, + "grad_norm": 9.373098862960934, + "learning_rate": 5.20734240618247e-06, + "loss": 0.0764, + "step": 1498 + }, + { + "epoch": 2.0202156334231804, + "grad_norm": 12.935214837037536, + "learning_rate": 5.2018890344378414e-06, + "loss": 0.103, + "step": 1499 + }, + { + "epoch": 2.0215633423180592, + "grad_norm": 35.731928075955395, + "learning_rate": 5.19643542212909e-06, + "loss": 0.0846, + "step": 1500 + }, + { + "epoch": 2.022911051212938, + "grad_norm": 23.08681404973202, + "learning_rate": 5.190981575754558e-06, + "loss": 0.0866, + "step": 1501 + }, + { + "epoch": 2.024258760107817, + "grad_norm": 36.86786286179082, + "learning_rate": 5.185527501812865e-06, + "loss": 0.094, + "step": 1502 + }, + { + "epoch": 2.0256064690026956, + "grad_norm": 12.769983331283786, + "learning_rate": 5.180073206802896e-06, + "loss": 0.0872, + "step": 1503 + }, + { + "epoch": 2.026954177897574, + "grad_norm": 23.154448748955808, + "learning_rate": 5.1746186972238055e-06, + "loss": 0.0836, + "step": 1504 + }, + { + "epoch": 2.0283018867924527, + "grad_norm": 8.202817498494337, + "learning_rate": 5.169163979575005e-06, + "loss": 0.0938, + "step": 1505 + }, + { + "epoch": 2.0296495956873315, + "grad_norm": 27.781146594097596, + "learning_rate": 5.1637090603561465e-06, + "loss": 0.0986, + "step": 1506 + }, + { + "epoch": 2.0309973045822103, + "grad_norm": 26.58670025663311, + "learning_rate": 5.158253946067131e-06, + "loss": 0.0903, + "step": 1507 + }, + { + "epoch": 2.032345013477089, + "grad_norm": 3.993808394739428, + "learning_rate": 5.152798643208085e-06, + "loss": 0.0749, + "step": 1508 + }, + { + "epoch": 2.033692722371968, + "grad_norm": 29.062509530466638, + "learning_rate": 5.14734315827936e-06, + "loss": 0.0707, + "step": 1509 + }, + { + "epoch": 2.035040431266846, + "grad_norm": 8.851657379386424, + "learning_rate": 5.141887497781529e-06, + "loss": 0.082, + "step": 1510 + }, + { + "epoch": 2.036388140161725, + "grad_norm": 14.374937239580346, + "learning_rate": 5.136431668215374e-06, + "loss": 0.0782, + "step": 1511 + }, + { + "epoch": 2.0377358490566038, + "grad_norm": 5.2556756242357645, + "learning_rate": 5.130975676081873e-06, + "loss": 0.0805, + "step": 1512 + }, + { + "epoch": 2.0390835579514826, + "grad_norm": 11.84174278711634, + "learning_rate": 5.1255195278822014e-06, + "loss": 0.0892, + "step": 1513 + }, + { + "epoch": 2.0404312668463613, + "grad_norm": 12.615081508565869, + "learning_rate": 5.120063230117723e-06, + "loss": 0.0558, + "step": 1514 + }, + { + "epoch": 2.0417789757412397, + "grad_norm": 1.625343879964544, + "learning_rate": 5.114606789289973e-06, + "loss": 0.0719, + "step": 1515 + }, + { + "epoch": 2.0431266846361185, + "grad_norm": 4.906343087252333, + "learning_rate": 5.109150211900665e-06, + "loss": 0.0411, + "step": 1516 + }, + { + "epoch": 2.0444743935309972, + "grad_norm": 36.71280866699191, + "learning_rate": 5.103693504451668e-06, + "loss": 0.1249, + "step": 1517 + }, + { + "epoch": 2.045822102425876, + "grad_norm": 16.96999467908217, + "learning_rate": 5.098236673445011e-06, + "loss": 0.0857, + "step": 1518 + }, + { + "epoch": 2.047169811320755, + "grad_norm": 15.486388825125378, + "learning_rate": 5.092779725382869e-06, + "loss": 0.082, + "step": 1519 + }, + { + "epoch": 2.0485175202156336, + "grad_norm": 31.7583296511955, + "learning_rate": 5.087322666767557e-06, + "loss": 0.1003, + "step": 1520 + }, + { + "epoch": 2.049865229110512, + "grad_norm": 36.09684931795807, + "learning_rate": 5.081865504101517e-06, + "loss": 0.1024, + "step": 1521 + }, + { + "epoch": 2.0512129380053907, + "grad_norm": 23.759853353104774, + "learning_rate": 5.076408243887321e-06, + "loss": 0.075, + "step": 1522 + }, + { + "epoch": 2.0525606469002695, + "grad_norm": 35.52727740492361, + "learning_rate": 5.070950892627656e-06, + "loss": 0.0608, + "step": 1523 + }, + { + "epoch": 2.0539083557951483, + "grad_norm": 25.9309638321048, + "learning_rate": 5.065493456825316e-06, + "loss": 0.065, + "step": 1524 + }, + { + "epoch": 2.055256064690027, + "grad_norm": 43.427334974095785, + "learning_rate": 5.060035942983194e-06, + "loss": 0.0925, + "step": 1525 + }, + { + "epoch": 2.056603773584906, + "grad_norm": 38.582081415467606, + "learning_rate": 5.054578357604284e-06, + "loss": 0.0761, + "step": 1526 + }, + { + "epoch": 2.057951482479784, + "grad_norm": 17.1033837354341, + "learning_rate": 5.0491207071916525e-06, + "loss": 0.1141, + "step": 1527 + }, + { + "epoch": 2.059299191374663, + "grad_norm": 36.81882157843135, + "learning_rate": 5.043662998248455e-06, + "loss": 0.1033, + "step": 1528 + }, + { + "epoch": 2.060646900269542, + "grad_norm": 25.205866486630832, + "learning_rate": 5.03820523727791e-06, + "loss": 0.053, + "step": 1529 + }, + { + "epoch": 2.0619946091644206, + "grad_norm": 35.40450548512179, + "learning_rate": 5.032747430783301e-06, + "loss": 0.0734, + "step": 1530 + }, + { + "epoch": 2.0633423180592994, + "grad_norm": 12.912208995987092, + "learning_rate": 5.027289585267967e-06, + "loss": 0.0592, + "step": 1531 + }, + { + "epoch": 2.0646900269541777, + "grad_norm": 45.801520232461215, + "learning_rate": 5.021831707235289e-06, + "loss": 0.1178, + "step": 1532 + }, + { + "epoch": 2.0660377358490565, + "grad_norm": 3.680403641726924, + "learning_rate": 5.016373803188689e-06, + "loss": 0.0664, + "step": 1533 + }, + { + "epoch": 2.0673854447439353, + "grad_norm": 11.357135654204681, + "learning_rate": 5.010915879631619e-06, + "loss": 0.0801, + "step": 1534 + }, + { + "epoch": 2.068733153638814, + "grad_norm": 2.2448739991439357, + "learning_rate": 5.005457943067561e-06, + "loss": 0.072, + "step": 1535 + }, + { + "epoch": 2.070080862533693, + "grad_norm": 10.38994474183449, + "learning_rate": 5e-06, + "loss": 0.0694, + "step": 1536 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 8.508512297897715, + "learning_rate": 4.994542056932442e-06, + "loss": 0.0999, + "step": 1537 + }, + { + "epoch": 2.07277628032345, + "grad_norm": 2.982043570221644, + "learning_rate": 4.989084120368381e-06, + "loss": 0.0794, + "step": 1538 + }, + { + "epoch": 2.0741239892183287, + "grad_norm": 50.220382195811595, + "learning_rate": 4.983626196811313e-06, + "loss": 0.104, + "step": 1539 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 24.7603562967408, + "learning_rate": 4.978168292764714e-06, + "loss": 0.079, + "step": 1540 + }, + { + "epoch": 2.0768194070080863, + "grad_norm": 26.410205748494672, + "learning_rate": 4.972710414732034e-06, + "loss": 0.0946, + "step": 1541 + }, + { + "epoch": 2.078167115902965, + "grad_norm": 21.20913561127269, + "learning_rate": 4.9672525692167e-06, + "loss": 0.0543, + "step": 1542 + }, + { + "epoch": 2.079514824797844, + "grad_norm": 45.565436611230524, + "learning_rate": 4.9617947627220904e-06, + "loss": 0.1048, + "step": 1543 + }, + { + "epoch": 2.0808625336927222, + "grad_norm": 27.035539220648666, + "learning_rate": 4.956337001751546e-06, + "loss": 0.0796, + "step": 1544 + }, + { + "epoch": 2.082210242587601, + "grad_norm": 41.02736174276366, + "learning_rate": 4.950879292808349e-06, + "loss": 0.1033, + "step": 1545 + }, + { + "epoch": 2.08355795148248, + "grad_norm": 24.397132433130377, + "learning_rate": 4.945421642395719e-06, + "loss": 0.0794, + "step": 1546 + }, + { + "epoch": 2.0849056603773586, + "grad_norm": 44.577736994808944, + "learning_rate": 4.939964057016806e-06, + "loss": 0.1017, + "step": 1547 + }, + { + "epoch": 2.0862533692722374, + "grad_norm": 26.240162474732422, + "learning_rate": 4.934506543174686e-06, + "loss": 0.0659, + "step": 1548 + }, + { + "epoch": 2.0876010781671157, + "grad_norm": 23.578837133545584, + "learning_rate": 4.9290491073723465e-06, + "loss": 0.066, + "step": 1549 + }, + { + "epoch": 2.0889487870619945, + "grad_norm": 46.96107648154718, + "learning_rate": 4.92359175611268e-06, + "loss": 0.0993, + "step": 1550 + }, + { + "epoch": 2.0902964959568733, + "grad_norm": 26.069828168430018, + "learning_rate": 4.918134495898485e-06, + "loss": 0.0668, + "step": 1551 + }, + { + "epoch": 2.091644204851752, + "grad_norm": 29.923603458214107, + "learning_rate": 4.912677333232446e-06, + "loss": 0.06, + "step": 1552 + }, + { + "epoch": 2.092991913746631, + "grad_norm": 20.27787797136538, + "learning_rate": 4.907220274617132e-06, + "loss": 0.0529, + "step": 1553 + }, + { + "epoch": 2.0943396226415096, + "grad_norm": 32.44172128201535, + "learning_rate": 4.901763326554991e-06, + "loss": 0.0616, + "step": 1554 + }, + { + "epoch": 2.095687331536388, + "grad_norm": 36.604533990416904, + "learning_rate": 4.896306495548334e-06, + "loss": 0.0811, + "step": 1555 + }, + { + "epoch": 2.0970350404312668, + "grad_norm": 1.7004226388477683, + "learning_rate": 4.890849788099336e-06, + "loss": 0.0714, + "step": 1556 + }, + { + "epoch": 2.0983827493261455, + "grad_norm": 8.551185150746702, + "learning_rate": 4.885393210710028e-06, + "loss": 0.0797, + "step": 1557 + }, + { + "epoch": 2.0997304582210243, + "grad_norm": 11.259688351211429, + "learning_rate": 4.87993676988228e-06, + "loss": 0.0516, + "step": 1558 + }, + { + "epoch": 2.101078167115903, + "grad_norm": 3.4326802735767687, + "learning_rate": 4.8744804721177985e-06, + "loss": 0.0831, + "step": 1559 + }, + { + "epoch": 2.1024258760107815, + "grad_norm": 31.85498273668575, + "learning_rate": 4.869024323918128e-06, + "loss": 0.0978, + "step": 1560 + }, + { + "epoch": 2.1037735849056602, + "grad_norm": 8.880144745677931, + "learning_rate": 4.8635683317846285e-06, + "loss": 0.0713, + "step": 1561 + }, + { + "epoch": 2.105121293800539, + "grad_norm": 17.3442741971847, + "learning_rate": 4.858112502218471e-06, + "loss": 0.0639, + "step": 1562 + }, + { + "epoch": 2.106469002695418, + "grad_norm": 20.23092932100312, + "learning_rate": 4.852656841720642e-06, + "loss": 0.054, + "step": 1563 + }, + { + "epoch": 2.1078167115902966, + "grad_norm": 34.97491799651886, + "learning_rate": 4.8472013567919176e-06, + "loss": 0.0734, + "step": 1564 + }, + { + "epoch": 2.1091644204851754, + "grad_norm": 10.736915113178574, + "learning_rate": 4.84174605393287e-06, + "loss": 0.0846, + "step": 1565 + }, + { + "epoch": 2.1105121293800537, + "grad_norm": 26.961967353550808, + "learning_rate": 4.836290939643854e-06, + "loss": 0.0981, + "step": 1566 + }, + { + "epoch": 2.1118598382749325, + "grad_norm": 37.15550291236645, + "learning_rate": 4.830836020424996e-06, + "loss": 0.0789, + "step": 1567 + }, + { + "epoch": 2.1132075471698113, + "grad_norm": 27.99488012056636, + "learning_rate": 4.8253813027761945e-06, + "loss": 0.0497, + "step": 1568 + }, + { + "epoch": 2.11455525606469, + "grad_norm": 30.690705654614415, + "learning_rate": 4.819926793197106e-06, + "loss": 0.0716, + "step": 1569 + }, + { + "epoch": 2.115902964959569, + "grad_norm": 40.003614474572856, + "learning_rate": 4.814472498187139e-06, + "loss": 0.0744, + "step": 1570 + }, + { + "epoch": 2.1172506738544477, + "grad_norm": 27.519104522719648, + "learning_rate": 4.809018424245442e-06, + "loss": 0.069, + "step": 1571 + }, + { + "epoch": 2.118598382749326, + "grad_norm": 17.24544771699317, + "learning_rate": 4.8035645778709114e-06, + "loss": 0.1053, + "step": 1572 + }, + { + "epoch": 2.1199460916442048, + "grad_norm": 34.10671921819011, + "learning_rate": 4.798110965562161e-06, + "loss": 0.0832, + "step": 1573 + }, + { + "epoch": 2.1212938005390836, + "grad_norm": 6.713219662974832, + "learning_rate": 4.79265759381753e-06, + "loss": 0.0719, + "step": 1574 + }, + { + "epoch": 2.1226415094339623, + "grad_norm": 25.89097940801962, + "learning_rate": 4.7872044691350735e-06, + "loss": 0.0668, + "step": 1575 + }, + { + "epoch": 2.123989218328841, + "grad_norm": 3.440404196723915, + "learning_rate": 4.781751598012549e-06, + "loss": 0.0662, + "step": 1576 + }, + { + "epoch": 2.1253369272237195, + "grad_norm": 13.166713953970635, + "learning_rate": 4.776298986947411e-06, + "loss": 0.065, + "step": 1577 + }, + { + "epoch": 2.1266846361185983, + "grad_norm": 6.03334567719322, + "learning_rate": 4.770846642436809e-06, + "loss": 0.0672, + "step": 1578 + }, + { + "epoch": 2.128032345013477, + "grad_norm": 7.631924778250048, + "learning_rate": 4.765394570977566e-06, + "loss": 0.0807, + "step": 1579 + }, + { + "epoch": 2.129380053908356, + "grad_norm": 12.745639388183623, + "learning_rate": 4.759942779066191e-06, + "loss": 0.0554, + "step": 1580 + }, + { + "epoch": 2.1307277628032346, + "grad_norm": 11.47467457736581, + "learning_rate": 4.754491273198852e-06, + "loss": 0.105, + "step": 1581 + }, + { + "epoch": 2.1320754716981134, + "grad_norm": 12.67120441628888, + "learning_rate": 4.749040059871378e-06, + "loss": 0.0456, + "step": 1582 + }, + { + "epoch": 2.1334231805929917, + "grad_norm": 9.282641808464875, + "learning_rate": 4.743589145579249e-06, + "loss": 0.0571, + "step": 1583 + }, + { + "epoch": 2.1347708894878705, + "grad_norm": 14.083866778171496, + "learning_rate": 4.738138536817592e-06, + "loss": 0.087, + "step": 1584 + }, + { + "epoch": 2.1361185983827493, + "grad_norm": 6.1194959708832535, + "learning_rate": 4.732688240081165e-06, + "loss": 0.0733, + "step": 1585 + }, + { + "epoch": 2.137466307277628, + "grad_norm": 10.074582549115679, + "learning_rate": 4.727238261864357e-06, + "loss": 0.0682, + "step": 1586 + }, + { + "epoch": 2.138814016172507, + "grad_norm": 41.964088675817855, + "learning_rate": 4.72178860866118e-06, + "loss": 0.1146, + "step": 1587 + }, + { + "epoch": 2.1401617250673857, + "grad_norm": 2.040126357197928, + "learning_rate": 4.716339286965252e-06, + "loss": 0.0893, + "step": 1588 + }, + { + "epoch": 2.141509433962264, + "grad_norm": 9.677873926527514, + "learning_rate": 4.7108903032698005e-06, + "loss": 0.0832, + "step": 1589 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 19.74806413943723, + "learning_rate": 4.705441664067651e-06, + "loss": 0.1032, + "step": 1590 + }, + { + "epoch": 2.1442048517520216, + "grad_norm": 31.452387575762078, + "learning_rate": 4.699993375851216e-06, + "loss": 0.0768, + "step": 1591 + }, + { + "epoch": 2.1455525606469004, + "grad_norm": 9.96189319173805, + "learning_rate": 4.69454544511249e-06, + "loss": 0.0838, + "step": 1592 + }, + { + "epoch": 2.146900269541779, + "grad_norm": 10.186231805854481, + "learning_rate": 4.689097878343045e-06, + "loss": 0.0611, + "step": 1593 + }, + { + "epoch": 2.1482479784366575, + "grad_norm": 22.354186906666047, + "learning_rate": 4.6836506820340145e-06, + "loss": 0.064, + "step": 1594 + }, + { + "epoch": 2.1495956873315363, + "grad_norm": 21.360660825259906, + "learning_rate": 4.678203862676091e-06, + "loss": 0.0779, + "step": 1595 + }, + { + "epoch": 2.150943396226415, + "grad_norm": 9.174782350467895, + "learning_rate": 4.672757426759524e-06, + "loss": 0.062, + "step": 1596 + }, + { + "epoch": 2.152291105121294, + "grad_norm": 13.358560918001015, + "learning_rate": 4.667311380774099e-06, + "loss": 0.0957, + "step": 1597 + }, + { + "epoch": 2.1536388140161726, + "grad_norm": 4.3504741582871675, + "learning_rate": 4.661865731209138e-06, + "loss": 0.0571, + "step": 1598 + }, + { + "epoch": 2.1549865229110514, + "grad_norm": 9.243390341384229, + "learning_rate": 4.656420484553496e-06, + "loss": 0.0922, + "step": 1599 + }, + { + "epoch": 2.1563342318059298, + "grad_norm": 5.848697111085247, + "learning_rate": 4.650975647295543e-06, + "loss": 0.0822, + "step": 1600 + }, + { + "epoch": 2.1576819407008085, + "grad_norm": 10.363069575846405, + "learning_rate": 4.64553122592316e-06, + "loss": 0.0735, + "step": 1601 + }, + { + "epoch": 2.1590296495956873, + "grad_norm": 2.896258355184782, + "learning_rate": 4.640087226923738e-06, + "loss": 0.0745, + "step": 1602 + }, + { + "epoch": 2.160377358490566, + "grad_norm": 4.7798037050866276, + "learning_rate": 4.6346436567841564e-06, + "loss": 0.0433, + "step": 1603 + }, + { + "epoch": 2.161725067385445, + "grad_norm": 8.287162712297844, + "learning_rate": 4.629200521990793e-06, + "loss": 0.0637, + "step": 1604 + }, + { + "epoch": 2.1630727762803232, + "grad_norm": 3.305382457414503, + "learning_rate": 4.623757829029503e-06, + "loss": 0.1009, + "step": 1605 + }, + { + "epoch": 2.164420485175202, + "grad_norm": 3.1706345335497472, + "learning_rate": 4.618315584385612e-06, + "loss": 0.0749, + "step": 1606 + }, + { + "epoch": 2.165768194070081, + "grad_norm": 14.038426879331011, + "learning_rate": 4.612873794543912e-06, + "loss": 0.0602, + "step": 1607 + }, + { + "epoch": 2.1671159029649596, + "grad_norm": 13.923655895146352, + "learning_rate": 4.607432465988661e-06, + "loss": 0.0782, + "step": 1608 + }, + { + "epoch": 2.1684636118598384, + "grad_norm": 13.791083546996436, + "learning_rate": 4.601991605203558e-06, + "loss": 0.088, + "step": 1609 + }, + { + "epoch": 2.169811320754717, + "grad_norm": 8.402031361144637, + "learning_rate": 4.596551218671746e-06, + "loss": 0.1012, + "step": 1610 + }, + { + "epoch": 2.1711590296495955, + "grad_norm": 25.228936003827833, + "learning_rate": 4.5911113128758095e-06, + "loss": 0.1, + "step": 1611 + }, + { + "epoch": 2.1725067385444743, + "grad_norm": 13.87206628557552, + "learning_rate": 4.585671894297753e-06, + "loss": 0.0788, + "step": 1612 + }, + { + "epoch": 2.173854447439353, + "grad_norm": 12.365358414672217, + "learning_rate": 4.580232969419002e-06, + "loss": 0.0742, + "step": 1613 + }, + { + "epoch": 2.175202156334232, + "grad_norm": 2.185151226960825, + "learning_rate": 4.5747945447203985e-06, + "loss": 0.0704, + "step": 1614 + }, + { + "epoch": 2.1765498652291106, + "grad_norm": 11.455915244828502, + "learning_rate": 4.569356626682181e-06, + "loss": 0.0891, + "step": 1615 + }, + { + "epoch": 2.177897574123989, + "grad_norm": 3.907339179453856, + "learning_rate": 4.563919221783988e-06, + "loss": 0.0876, + "step": 1616 + }, + { + "epoch": 2.1792452830188678, + "grad_norm": 10.15914486516509, + "learning_rate": 4.55848233650485e-06, + "loss": 0.0756, + "step": 1617 + }, + { + "epoch": 2.1805929919137466, + "grad_norm": 5.069034259527786, + "learning_rate": 4.553045977323173e-06, + "loss": 0.0505, + "step": 1618 + }, + { + "epoch": 2.1819407008086253, + "grad_norm": 2.042606475357963, + "learning_rate": 4.5476101507167365e-06, + "loss": 0.059, + "step": 1619 + }, + { + "epoch": 2.183288409703504, + "grad_norm": 4.734915260668062, + "learning_rate": 4.54217486316269e-06, + "loss": 0.0772, + "step": 1620 + }, + { + "epoch": 2.184636118598383, + "grad_norm": 9.038663141982722, + "learning_rate": 4.536740121137536e-06, + "loss": 0.0927, + "step": 1621 + }, + { + "epoch": 2.1859838274932613, + "grad_norm": 8.479079939150896, + "learning_rate": 4.531305931117127e-06, + "loss": 0.0652, + "step": 1622 + }, + { + "epoch": 2.18733153638814, + "grad_norm": 3.954160743951541, + "learning_rate": 4.525872299576663e-06, + "loss": 0.0599, + "step": 1623 + }, + { + "epoch": 2.188679245283019, + "grad_norm": 3.5101200748629386, + "learning_rate": 4.520439232990674e-06, + "loss": 0.0798, + "step": 1624 + }, + { + "epoch": 2.1900269541778976, + "grad_norm": 14.518792566126788, + "learning_rate": 4.515006737833015e-06, + "loss": 0.0395, + "step": 1625 + }, + { + "epoch": 2.1913746630727764, + "grad_norm": 31.339554577302692, + "learning_rate": 4.5095748205768656e-06, + "loss": 0.0827, + "step": 1626 + }, + { + "epoch": 2.192722371967655, + "grad_norm": 11.539893233571627, + "learning_rate": 4.504143487694712e-06, + "loss": 0.0801, + "step": 1627 + }, + { + "epoch": 2.1940700808625335, + "grad_norm": 14.197131031368015, + "learning_rate": 4.498712745658348e-06, + "loss": 0.0669, + "step": 1628 + }, + { + "epoch": 2.1954177897574123, + "grad_norm": 20.529816646999013, + "learning_rate": 4.493282600938861e-06, + "loss": 0.0612, + "step": 1629 + }, + { + "epoch": 2.196765498652291, + "grad_norm": 18.199268608295785, + "learning_rate": 4.487853060006627e-06, + "loss": 0.0754, + "step": 1630 + }, + { + "epoch": 2.19811320754717, + "grad_norm": 19.045636615195317, + "learning_rate": 4.482424129331299e-06, + "loss": 0.0766, + "step": 1631 + }, + { + "epoch": 2.1994609164420487, + "grad_norm": 13.46377267694943, + "learning_rate": 4.476995815381815e-06, + "loss": 0.1282, + "step": 1632 + }, + { + "epoch": 2.2008086253369274, + "grad_norm": 30.707410412698984, + "learning_rate": 4.471568124626362e-06, + "loss": 0.0957, + "step": 1633 + }, + { + "epoch": 2.202156334231806, + "grad_norm": 36.959191968519725, + "learning_rate": 4.466141063532393e-06, + "loss": 0.088, + "step": 1634 + }, + { + "epoch": 2.2035040431266846, + "grad_norm": 2.890242703119733, + "learning_rate": 4.4607146385666145e-06, + "loss": 0.0628, + "step": 1635 + }, + { + "epoch": 2.2048517520215634, + "grad_norm": 8.739266018286754, + "learning_rate": 4.455288856194966e-06, + "loss": 0.0563, + "step": 1636 + }, + { + "epoch": 2.206199460916442, + "grad_norm": 1.756682428483566, + "learning_rate": 4.449863722882627e-06, + "loss": 0.0633, + "step": 1637 + }, + { + "epoch": 2.207547169811321, + "grad_norm": 7.8151326442335565, + "learning_rate": 4.444439245094003e-06, + "loss": 0.0911, + "step": 1638 + }, + { + "epoch": 2.2088948787061993, + "grad_norm": 13.563402516371703, + "learning_rate": 4.439015429292715e-06, + "loss": 0.0877, + "step": 1639 + }, + { + "epoch": 2.210242587601078, + "grad_norm": 16.225233323826735, + "learning_rate": 4.4335922819415996e-06, + "loss": 0.0523, + "step": 1640 + }, + { + "epoch": 2.211590296495957, + "grad_norm": 7.3157623680627735, + "learning_rate": 4.428169809502696e-06, + "loss": 0.0762, + "step": 1641 + }, + { + "epoch": 2.2129380053908356, + "grad_norm": 3.1514029199201508, + "learning_rate": 4.422748018437237e-06, + "loss": 0.0725, + "step": 1642 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 7.773773064543327, + "learning_rate": 4.417326915205643e-06, + "loss": 0.0502, + "step": 1643 + }, + { + "epoch": 2.215633423180593, + "grad_norm": 8.707255645113625, + "learning_rate": 4.411906506267521e-06, + "loss": 0.0678, + "step": 1644 + }, + { + "epoch": 2.2169811320754715, + "grad_norm": 14.367635977842182, + "learning_rate": 4.406486798081644e-06, + "loss": 0.0864, + "step": 1645 + }, + { + "epoch": 2.2183288409703503, + "grad_norm": 19.233334904720344, + "learning_rate": 4.40106779710595e-06, + "loss": 0.0618, + "step": 1646 + }, + { + "epoch": 2.219676549865229, + "grad_norm": 32.85060856435931, + "learning_rate": 4.395649509797541e-06, + "loss": 0.0667, + "step": 1647 + }, + { + "epoch": 2.221024258760108, + "grad_norm": 28.9682770641955, + "learning_rate": 4.390231942612662e-06, + "loss": 0.0798, + "step": 1648 + }, + { + "epoch": 2.2223719676549867, + "grad_norm": 41.164249963969034, + "learning_rate": 4.3848151020067024e-06, + "loss": 0.0895, + "step": 1649 + }, + { + "epoch": 2.223719676549865, + "grad_norm": 47.41009165533848, + "learning_rate": 4.379398994434184e-06, + "loss": 0.1284, + "step": 1650 + }, + { + "epoch": 2.225067385444744, + "grad_norm": 34.34248574839438, + "learning_rate": 4.37398362634876e-06, + "loss": 0.0949, + "step": 1651 + }, + { + "epoch": 2.2264150943396226, + "grad_norm": 32.62441554296755, + "learning_rate": 4.368569004203196e-06, + "loss": 0.0862, + "step": 1652 + }, + { + "epoch": 2.2277628032345014, + "grad_norm": 26.768911586507887, + "learning_rate": 4.363155134449374e-06, + "loss": 0.0679, + "step": 1653 + }, + { + "epoch": 2.22911051212938, + "grad_norm": 15.050861162781032, + "learning_rate": 4.357742023538277e-06, + "loss": 0.0786, + "step": 1654 + }, + { + "epoch": 2.230458221024259, + "grad_norm": 11.884128527871633, + "learning_rate": 4.352329677919983e-06, + "loss": 0.0701, + "step": 1655 + }, + { + "epoch": 2.2318059299191373, + "grad_norm": 10.952795801149861, + "learning_rate": 4.3469181040436614e-06, + "loss": 0.0546, + "step": 1656 + }, + { + "epoch": 2.233153638814016, + "grad_norm": 7.499300256410989, + "learning_rate": 4.341507308357559e-06, + "loss": 0.0314, + "step": 1657 + }, + { + "epoch": 2.234501347708895, + "grad_norm": 6.651804966927524, + "learning_rate": 4.336097297308994e-06, + "loss": 0.0736, + "step": 1658 + }, + { + "epoch": 2.2358490566037736, + "grad_norm": 5.106370773632856, + "learning_rate": 4.330688077344357e-06, + "loss": 0.049, + "step": 1659 + }, + { + "epoch": 2.2371967654986524, + "grad_norm": 23.706522710791706, + "learning_rate": 4.325279654909087e-06, + "loss": 0.0803, + "step": 1660 + }, + { + "epoch": 2.2385444743935308, + "grad_norm": 1.433315323461568, + "learning_rate": 4.319872036447677e-06, + "loss": 0.0427, + "step": 1661 + }, + { + "epoch": 2.2398921832884096, + "grad_norm": 4.88253088122149, + "learning_rate": 4.314465228403663e-06, + "loss": 0.0718, + "step": 1662 + }, + { + "epoch": 2.2412398921832883, + "grad_norm": 10.517901598244428, + "learning_rate": 4.309059237219613e-06, + "loss": 0.0939, + "step": 1663 + }, + { + "epoch": 2.242587601078167, + "grad_norm": 14.482913024940549, + "learning_rate": 4.303654069337121e-06, + "loss": 0.0608, + "step": 1664 + }, + { + "epoch": 2.243935309973046, + "grad_norm": 42.476294215387924, + "learning_rate": 4.2982497311968054e-06, + "loss": 0.0809, + "step": 1665 + }, + { + "epoch": 2.2452830188679247, + "grad_norm": 32.57147265773425, + "learning_rate": 4.29284622923829e-06, + "loss": 0.1025, + "step": 1666 + }, + { + "epoch": 2.246630727762803, + "grad_norm": 2.8989317741513427, + "learning_rate": 4.287443569900202e-06, + "loss": 0.0932, + "step": 1667 + }, + { + "epoch": 2.247978436657682, + "grad_norm": 12.270349902063913, + "learning_rate": 4.282041759620171e-06, + "loss": 0.0637, + "step": 1668 + }, + { + "epoch": 2.2493261455525606, + "grad_norm": 41.31335182045065, + "learning_rate": 4.276640804834809e-06, + "loss": 0.0652, + "step": 1669 + }, + { + "epoch": 2.2506738544474394, + "grad_norm": 17.96687668044286, + "learning_rate": 4.271240711979709e-06, + "loss": 0.0878, + "step": 1670 + }, + { + "epoch": 2.252021563342318, + "grad_norm": 14.624540582739247, + "learning_rate": 4.26584148748944e-06, + "loss": 0.0584, + "step": 1671 + }, + { + "epoch": 2.2533692722371965, + "grad_norm": 1.970426245493994, + "learning_rate": 4.2604431377975366e-06, + "loss": 0.0636, + "step": 1672 + }, + { + "epoch": 2.2547169811320753, + "grad_norm": 25.238262674867084, + "learning_rate": 4.255045669336484e-06, + "loss": 0.0895, + "step": 1673 + }, + { + "epoch": 2.256064690026954, + "grad_norm": 32.73880716803188, + "learning_rate": 4.249649088537727e-06, + "loss": 0.0785, + "step": 1674 + }, + { + "epoch": 2.257412398921833, + "grad_norm": 13.562269266007391, + "learning_rate": 4.244253401831646e-06, + "loss": 0.0838, + "step": 1675 + }, + { + "epoch": 2.2587601078167117, + "grad_norm": 26.283084762298486, + "learning_rate": 4.238858615647556e-06, + "loss": 0.0813, + "step": 1676 + }, + { + "epoch": 2.2601078167115904, + "grad_norm": 25.385131384259818, + "learning_rate": 4.233464736413706e-06, + "loss": 0.0842, + "step": 1677 + }, + { + "epoch": 2.2614555256064692, + "grad_norm": 4.221894338432704, + "learning_rate": 4.228071770557257e-06, + "loss": 0.0569, + "step": 1678 + }, + { + "epoch": 2.2628032345013476, + "grad_norm": 19.28722382361294, + "learning_rate": 4.222679724504282e-06, + "loss": 0.0997, + "step": 1679 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 4.187928456624233, + "learning_rate": 4.217288604679764e-06, + "loss": 0.0733, + "step": 1680 + }, + { + "epoch": 2.265498652291105, + "grad_norm": 5.922179875889065, + "learning_rate": 4.211898417507576e-06, + "loss": 0.0767, + "step": 1681 + }, + { + "epoch": 2.266846361185984, + "grad_norm": 7.695082638142993, + "learning_rate": 4.206509169410481e-06, + "loss": 0.0815, + "step": 1682 + }, + { + "epoch": 2.2681940700808627, + "grad_norm": 2.6084481441236167, + "learning_rate": 4.201120866810127e-06, + "loss": 0.0858, + "step": 1683 + }, + { + "epoch": 2.269541778975741, + "grad_norm": 8.335811739813751, + "learning_rate": 4.195733516127032e-06, + "loss": 0.0783, + "step": 1684 + }, + { + "epoch": 2.27088948787062, + "grad_norm": 17.370490400097516, + "learning_rate": 4.190347123780577e-06, + "loss": 0.1345, + "step": 1685 + }, + { + "epoch": 2.2722371967654986, + "grad_norm": 23.484378637420093, + "learning_rate": 4.184961696189008e-06, + "loss": 0.0674, + "step": 1686 + }, + { + "epoch": 2.2735849056603774, + "grad_norm": 35.983253322776356, + "learning_rate": 4.179577239769416e-06, + "loss": 0.1013, + "step": 1687 + }, + { + "epoch": 2.274932614555256, + "grad_norm": 25.88323042828285, + "learning_rate": 4.174193760937735e-06, + "loss": 0.0841, + "step": 1688 + }, + { + "epoch": 2.276280323450135, + "grad_norm": 24.153957713522097, + "learning_rate": 4.16881126610874e-06, + "loss": 0.061, + "step": 1689 + }, + { + "epoch": 2.2776280323450133, + "grad_norm": 26.057615909769613, + "learning_rate": 4.163429761696025e-06, + "loss": 0.1107, + "step": 1690 + }, + { + "epoch": 2.278975741239892, + "grad_norm": 22.955693065320595, + "learning_rate": 4.158049254112009e-06, + "loss": 0.1096, + "step": 1691 + }, + { + "epoch": 2.280323450134771, + "grad_norm": 32.49483095398299, + "learning_rate": 4.152669749767924e-06, + "loss": 0.087, + "step": 1692 + }, + { + "epoch": 2.2816711590296497, + "grad_norm": 33.47064687507144, + "learning_rate": 4.147291255073804e-06, + "loss": 0.0574, + "step": 1693 + }, + { + "epoch": 2.2830188679245285, + "grad_norm": 24.429324539270155, + "learning_rate": 4.141913776438478e-06, + "loss": 0.0824, + "step": 1694 + }, + { + "epoch": 2.284366576819407, + "grad_norm": 41.62617089804322, + "learning_rate": 4.136537320269571e-06, + "loss": 0.0965, + "step": 1695 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 48.63446397752065, + "learning_rate": 4.1311618929734846e-06, + "loss": 0.1301, + "step": 1696 + }, + { + "epoch": 2.2870619946091644, + "grad_norm": 45.660380726590304, + "learning_rate": 4.1257875009553925e-06, + "loss": 0.1092, + "step": 1697 + }, + { + "epoch": 2.288409703504043, + "grad_norm": 11.089307668152347, + "learning_rate": 4.1204141506192415e-06, + "loss": 0.0646, + "step": 1698 + }, + { + "epoch": 2.289757412398922, + "grad_norm": 7.951664336893197, + "learning_rate": 4.115041848367732e-06, + "loss": 0.0595, + "step": 1699 + }, + { + "epoch": 2.2911051212938007, + "grad_norm": 14.095404059391193, + "learning_rate": 4.109670600602316e-06, + "loss": 0.0645, + "step": 1700 + }, + { + "epoch": 2.292452830188679, + "grad_norm": 12.03123440081043, + "learning_rate": 4.104300413723192e-06, + "loss": 0.0726, + "step": 1701 + }, + { + "epoch": 2.293800539083558, + "grad_norm": 4.163845469529417, + "learning_rate": 4.098931294129293e-06, + "loss": 0.0843, + "step": 1702 + }, + { + "epoch": 2.2951482479784366, + "grad_norm": 23.613238156251743, + "learning_rate": 4.093563248218274e-06, + "loss": 0.0847, + "step": 1703 + }, + { + "epoch": 2.2964959568733154, + "grad_norm": 7.408545880173188, + "learning_rate": 4.088196282386523e-06, + "loss": 0.0811, + "step": 1704 + }, + { + "epoch": 2.297843665768194, + "grad_norm": 20.873710903543937, + "learning_rate": 4.082830403029132e-06, + "loss": 0.059, + "step": 1705 + }, + { + "epoch": 2.2991913746630726, + "grad_norm": 25.88729574436525, + "learning_rate": 4.0774656165399e-06, + "loss": 0.0964, + "step": 1706 + }, + { + "epoch": 2.3005390835579513, + "grad_norm": 18.721986475938635, + "learning_rate": 4.072101929311325e-06, + "loss": 0.0953, + "step": 1707 + }, + { + "epoch": 2.30188679245283, + "grad_norm": 13.762125344105979, + "learning_rate": 4.066739347734595e-06, + "loss": 0.0952, + "step": 1708 + }, + { + "epoch": 2.303234501347709, + "grad_norm": 45.7561868319531, + "learning_rate": 4.061377878199579e-06, + "loss": 0.0756, + "step": 1709 + }, + { + "epoch": 2.3045822102425877, + "grad_norm": 26.52439626223674, + "learning_rate": 4.056017527094824e-06, + "loss": 0.0864, + "step": 1710 + }, + { + "epoch": 2.3059299191374665, + "grad_norm": 19.350214378346372, + "learning_rate": 4.050658300807541e-06, + "loss": 0.073, + "step": 1711 + }, + { + "epoch": 2.3072776280323453, + "grad_norm": 26.108200295424602, + "learning_rate": 4.0453002057236e-06, + "loss": 0.0709, + "step": 1712 + }, + { + "epoch": 2.3086253369272236, + "grad_norm": 24.372415288609485, + "learning_rate": 4.03994324822753e-06, + "loss": 0.0773, + "step": 1713 + }, + { + "epoch": 2.3099730458221024, + "grad_norm": 30.187173309497698, + "learning_rate": 4.034587434702497e-06, + "loss": 0.0517, + "step": 1714 + }, + { + "epoch": 2.311320754716981, + "grad_norm": 38.86127986271493, + "learning_rate": 4.029232771530306e-06, + "loss": 0.1258, + "step": 1715 + }, + { + "epoch": 2.31266846361186, + "grad_norm": 46.29096761313643, + "learning_rate": 4.023879265091394e-06, + "loss": 0.0857, + "step": 1716 + }, + { + "epoch": 2.3140161725067383, + "grad_norm": 44.43573176743274, + "learning_rate": 4.018526921764817e-06, + "loss": 0.101, + "step": 1717 + }, + { + "epoch": 2.315363881401617, + "grad_norm": 7.782478098602791, + "learning_rate": 4.0131757479282416e-06, + "loss": 0.078, + "step": 1718 + }, + { + "epoch": 2.316711590296496, + "grad_norm": 46.47204817686339, + "learning_rate": 4.007825749957951e-06, + "loss": 0.1101, + "step": 1719 + }, + { + "epoch": 2.3180592991913747, + "grad_norm": 13.962112583149745, + "learning_rate": 4.00247693422882e-06, + "loss": 0.0823, + "step": 1720 + }, + { + "epoch": 2.3194070080862534, + "grad_norm": 7.607088280904576, + "learning_rate": 3.997129307114311e-06, + "loss": 0.0767, + "step": 1721 + }, + { + "epoch": 2.3207547169811322, + "grad_norm": 31.77975494630974, + "learning_rate": 3.991782874986481e-06, + "loss": 0.0805, + "step": 1722 + }, + { + "epoch": 2.322102425876011, + "grad_norm": 43.030668960733315, + "learning_rate": 3.986437644215954e-06, + "loss": 0.0792, + "step": 1723 + }, + { + "epoch": 2.3234501347708894, + "grad_norm": 4.250893172416862, + "learning_rate": 3.9810936211719235e-06, + "loss": 0.0597, + "step": 1724 + }, + { + "epoch": 2.324797843665768, + "grad_norm": 5.80276319243837, + "learning_rate": 3.97575081222215e-06, + "loss": 0.0444, + "step": 1725 + }, + { + "epoch": 2.326145552560647, + "grad_norm": 8.136311607288327, + "learning_rate": 3.970409223732942e-06, + "loss": 0.072, + "step": 1726 + }, + { + "epoch": 2.3274932614555257, + "grad_norm": 4.513900144040238, + "learning_rate": 3.965068862069153e-06, + "loss": 0.0777, + "step": 1727 + }, + { + "epoch": 2.3288409703504045, + "grad_norm": 6.851286292575998, + "learning_rate": 3.959729733594181e-06, + "loss": 0.0619, + "step": 1728 + }, + { + "epoch": 2.330188679245283, + "grad_norm": 4.9045737436928585, + "learning_rate": 3.954391844669946e-06, + "loss": 0.0722, + "step": 1729 + }, + { + "epoch": 2.3315363881401616, + "grad_norm": 3.017972329573049, + "learning_rate": 3.949055201656896e-06, + "loss": 0.0629, + "step": 1730 + }, + { + "epoch": 2.3328840970350404, + "grad_norm": 10.836296844734363, + "learning_rate": 3.9437198109139965e-06, + "loss": 0.0821, + "step": 1731 + }, + { + "epoch": 2.334231805929919, + "grad_norm": 3.3700782458570204, + "learning_rate": 3.938385678798715e-06, + "loss": 0.0706, + "step": 1732 + }, + { + "epoch": 2.335579514824798, + "grad_norm": 10.32208707238705, + "learning_rate": 3.933052811667023e-06, + "loss": 0.0718, + "step": 1733 + }, + { + "epoch": 2.3369272237196768, + "grad_norm": 17.14384086769449, + "learning_rate": 3.927721215873386e-06, + "loss": 0.0515, + "step": 1734 + }, + { + "epoch": 2.338274932614555, + "grad_norm": 5.977139693630598, + "learning_rate": 3.92239089777075e-06, + "loss": 0.0594, + "step": 1735 + }, + { + "epoch": 2.339622641509434, + "grad_norm": 20.044780867207532, + "learning_rate": 3.91706186371054e-06, + "loss": 0.1136, + "step": 1736 + }, + { + "epoch": 2.3409703504043127, + "grad_norm": 23.774765255248628, + "learning_rate": 3.911734120042656e-06, + "loss": 0.0977, + "step": 1737 + }, + { + "epoch": 2.3423180592991915, + "grad_norm": 13.937305827368407, + "learning_rate": 3.9064076731154554e-06, + "loss": 0.0541, + "step": 1738 + }, + { + "epoch": 2.3436657681940702, + "grad_norm": 16.329509996659738, + "learning_rate": 3.9010825292757485e-06, + "loss": 0.0632, + "step": 1739 + }, + { + "epoch": 2.3450134770889486, + "grad_norm": 7.831518308895083, + "learning_rate": 3.8957586948687995e-06, + "loss": 0.0558, + "step": 1740 + }, + { + "epoch": 2.3463611859838274, + "grad_norm": 14.938870875488048, + "learning_rate": 3.890436176238308e-06, + "loss": 0.0631, + "step": 1741 + }, + { + "epoch": 2.347708894878706, + "grad_norm": 16.65716251941151, + "learning_rate": 3.885114979726403e-06, + "loss": 0.0821, + "step": 1742 + }, + { + "epoch": 2.349056603773585, + "grad_norm": 28.590531458792576, + "learning_rate": 3.879795111673647e-06, + "loss": 0.082, + "step": 1743 + }, + { + "epoch": 2.3504043126684637, + "grad_norm": 27.684782022964136, + "learning_rate": 3.874476578419013e-06, + "loss": 0.0613, + "step": 1744 + }, + { + "epoch": 2.3517520215633425, + "grad_norm": 13.794012271723894, + "learning_rate": 3.869159386299882e-06, + "loss": 0.0722, + "step": 1745 + }, + { + "epoch": 2.353099730458221, + "grad_norm": 16.988657778098656, + "learning_rate": 3.863843541652042e-06, + "loss": 0.0837, + "step": 1746 + }, + { + "epoch": 2.3544474393530996, + "grad_norm": 10.36490621454436, + "learning_rate": 3.858529050809672e-06, + "loss": 0.0521, + "step": 1747 + }, + { + "epoch": 2.3557951482479784, + "grad_norm": 21.96836453673217, + "learning_rate": 3.853215920105337e-06, + "loss": 0.0839, + "step": 1748 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 8.216581735110383, + "learning_rate": 3.8479041558699875e-06, + "loss": 0.0867, + "step": 1749 + }, + { + "epoch": 2.358490566037736, + "grad_norm": 19.323116244911496, + "learning_rate": 3.842593764432939e-06, + "loss": 0.1061, + "step": 1750 + }, + { + "epoch": 2.3598382749326143, + "grad_norm": 9.808612875898087, + "learning_rate": 3.837284752121872e-06, + "loss": 0.0664, + "step": 1751 + }, + { + "epoch": 2.361185983827493, + "grad_norm": 21.545799938824295, + "learning_rate": 3.83197712526283e-06, + "loss": 0.0785, + "step": 1752 + }, + { + "epoch": 2.362533692722372, + "grad_norm": 3.4436195589512155, + "learning_rate": 3.826670890180197e-06, + "loss": 0.0793, + "step": 1753 + }, + { + "epoch": 2.3638814016172507, + "grad_norm": 17.48579357954061, + "learning_rate": 3.821366053196703e-06, + "loss": 0.077, + "step": 1754 + }, + { + "epoch": 2.3652291105121295, + "grad_norm": 26.67022532084765, + "learning_rate": 3.816062620633414e-06, + "loss": 0.086, + "step": 1755 + }, + { + "epoch": 2.3665768194070083, + "grad_norm": 45.15543994521633, + "learning_rate": 3.810760598809719e-06, + "loss": 0.0687, + "step": 1756 + }, + { + "epoch": 2.3679245283018866, + "grad_norm": 24.083745688781388, + "learning_rate": 3.8054599940433263e-06, + "loss": 0.097, + "step": 1757 + }, + { + "epoch": 2.3692722371967654, + "grad_norm": 11.984694767036656, + "learning_rate": 3.800160812650258e-06, + "loss": 0.0961, + "step": 1758 + }, + { + "epoch": 2.370619946091644, + "grad_norm": 22.600438545895383, + "learning_rate": 3.7948630609448383e-06, + "loss": 0.0789, + "step": 1759 + }, + { + "epoch": 2.371967654986523, + "grad_norm": 14.394653308293327, + "learning_rate": 3.7895667452396863e-06, + "loss": 0.0599, + "step": 1760 + }, + { + "epoch": 2.3733153638814017, + "grad_norm": 15.963764786614192, + "learning_rate": 3.784271871845715e-06, + "loss": 0.0991, + "step": 1761 + }, + { + "epoch": 2.37466307277628, + "grad_norm": 30.612647620110096, + "learning_rate": 3.778978447072116e-06, + "loss": 0.0698, + "step": 1762 + }, + { + "epoch": 2.376010781671159, + "grad_norm": 17.492396204803352, + "learning_rate": 3.7736864772263504e-06, + "loss": 0.1026, + "step": 1763 + }, + { + "epoch": 2.3773584905660377, + "grad_norm": 3.416164581617303, + "learning_rate": 3.7683959686141548e-06, + "loss": 0.0791, + "step": 1764 + }, + { + "epoch": 2.3787061994609164, + "grad_norm": 20.728425303878705, + "learning_rate": 3.763106927539517e-06, + "loss": 0.0458, + "step": 1765 + }, + { + "epoch": 2.3800539083557952, + "grad_norm": 11.66693483088628, + "learning_rate": 3.757819360304678e-06, + "loss": 0.086, + "step": 1766 + }, + { + "epoch": 2.381401617250674, + "grad_norm": 16.34742701209728, + "learning_rate": 3.7525332732101272e-06, + "loss": 0.0673, + "step": 1767 + }, + { + "epoch": 2.382749326145553, + "grad_norm": 3.453091708687257, + "learning_rate": 3.7472486725545832e-06, + "loss": 0.0842, + "step": 1768 + }, + { + "epoch": 2.384097035040431, + "grad_norm": 9.929381944452368, + "learning_rate": 3.7419655646349972e-06, + "loss": 0.0536, + "step": 1769 + }, + { + "epoch": 2.38544474393531, + "grad_norm": 11.793570843087945, + "learning_rate": 3.7366839557465427e-06, + "loss": 0.1116, + "step": 1770 + }, + { + "epoch": 2.3867924528301887, + "grad_norm": 12.498860615218057, + "learning_rate": 3.731403852182606e-06, + "loss": 0.0718, + "step": 1771 + }, + { + "epoch": 2.3881401617250675, + "grad_norm": 2.5366660660475384, + "learning_rate": 3.726125260234774e-06, + "loss": 0.0472, + "step": 1772 + }, + { + "epoch": 2.3894878706199463, + "grad_norm": 20.541416973242285, + "learning_rate": 3.7208481861928445e-06, + "loss": 0.0957, + "step": 1773 + }, + { + "epoch": 2.3908355795148246, + "grad_norm": 5.151707495087112, + "learning_rate": 3.715572636344797e-06, + "loss": 0.0805, + "step": 1774 + }, + { + "epoch": 2.3921832884097034, + "grad_norm": 7.767196431798118, + "learning_rate": 3.7102986169767954e-06, + "loss": 0.051, + "step": 1775 + }, + { + "epoch": 2.393530997304582, + "grad_norm": 16.937873029740953, + "learning_rate": 3.7050261343731864e-06, + "loss": 0.0794, + "step": 1776 + }, + { + "epoch": 2.394878706199461, + "grad_norm": 16.62243169703727, + "learning_rate": 3.699755194816479e-06, + "loss": 0.0613, + "step": 1777 + }, + { + "epoch": 2.3962264150943398, + "grad_norm": 16.77821234300901, + "learning_rate": 3.694485804587344e-06, + "loss": 0.0382, + "step": 1778 + }, + { + "epoch": 2.3975741239892185, + "grad_norm": 13.069931650537256, + "learning_rate": 3.6892179699646126e-06, + "loss": 0.0788, + "step": 1779 + }, + { + "epoch": 2.398921832884097, + "grad_norm": 20.84362538245794, + "learning_rate": 3.6839516972252542e-06, + "loss": 0.0655, + "step": 1780 + }, + { + "epoch": 2.4002695417789757, + "grad_norm": 13.13685811107039, + "learning_rate": 3.6786869926443814e-06, + "loss": 0.0608, + "step": 1781 + }, + { + "epoch": 2.4016172506738545, + "grad_norm": 14.77446604415077, + "learning_rate": 3.6734238624952388e-06, + "loss": 0.0703, + "step": 1782 + }, + { + "epoch": 2.4029649595687332, + "grad_norm": 3.5803245026241006, + "learning_rate": 3.6681623130491917e-06, + "loss": 0.0676, + "step": 1783 + }, + { + "epoch": 2.404312668463612, + "grad_norm": 17.278465909138884, + "learning_rate": 3.662902350575723e-06, + "loss": 0.0683, + "step": 1784 + }, + { + "epoch": 2.4056603773584904, + "grad_norm": 3.700880985669733, + "learning_rate": 3.6576439813424293e-06, + "loss": 0.0688, + "step": 1785 + }, + { + "epoch": 2.407008086253369, + "grad_norm": 9.074710758339394, + "learning_rate": 3.652387211615003e-06, + "loss": 0.0995, + "step": 1786 + }, + { + "epoch": 2.408355795148248, + "grad_norm": 3.6038682323717843, + "learning_rate": 3.647132047657229e-06, + "loss": 0.0883, + "step": 1787 + }, + { + "epoch": 2.4097035040431267, + "grad_norm": 2.4669816166725895, + "learning_rate": 3.6418784957309884e-06, + "loss": 0.0501, + "step": 1788 + }, + { + "epoch": 2.4110512129380055, + "grad_norm": 18.897031009212103, + "learning_rate": 3.6366265620962315e-06, + "loss": 0.0687, + "step": 1789 + }, + { + "epoch": 2.4123989218328843, + "grad_norm": 1.7543915044044542, + "learning_rate": 3.631376253010983e-06, + "loss": 0.0618, + "step": 1790 + }, + { + "epoch": 2.4137466307277626, + "grad_norm": 3.4619305016757247, + "learning_rate": 3.6261275747313373e-06, + "loss": 0.0469, + "step": 1791 + }, + { + "epoch": 2.4150943396226414, + "grad_norm": 13.573747148822067, + "learning_rate": 3.6208805335114393e-06, + "loss": 0.063, + "step": 1792 + }, + { + "epoch": 2.41644204851752, + "grad_norm": 9.86875844240667, + "learning_rate": 3.6156351356034837e-06, + "loss": 0.0972, + "step": 1793 + }, + { + "epoch": 2.417789757412399, + "grad_norm": 9.658881782560327, + "learning_rate": 3.610391387257711e-06, + "loss": 0.0987, + "step": 1794 + }, + { + "epoch": 2.4191374663072778, + "grad_norm": 14.035958453846794, + "learning_rate": 3.605149294722392e-06, + "loss": 0.1202, + "step": 1795 + }, + { + "epoch": 2.420485175202156, + "grad_norm": 3.637738614200157, + "learning_rate": 3.5999088642438252e-06, + "loss": 0.0545, + "step": 1796 + }, + { + "epoch": 2.421832884097035, + "grad_norm": 3.100464070514725, + "learning_rate": 3.594670102066333e-06, + "loss": 0.0541, + "step": 1797 + }, + { + "epoch": 2.4231805929919137, + "grad_norm": 3.6556862781173782, + "learning_rate": 3.589433014432245e-06, + "loss": 0.0526, + "step": 1798 + }, + { + "epoch": 2.4245283018867925, + "grad_norm": 6.653694410088033, + "learning_rate": 3.5841976075818945e-06, + "loss": 0.0437, + "step": 1799 + }, + { + "epoch": 2.4258760107816713, + "grad_norm": 20.504752879645736, + "learning_rate": 3.578963887753619e-06, + "loss": 0.0796, + "step": 1800 + }, + { + "epoch": 2.42722371967655, + "grad_norm": 17.209457547751732, + "learning_rate": 3.57373186118374e-06, + "loss": 0.0474, + "step": 1801 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 16.755584743274373, + "learning_rate": 3.5685015341065594e-06, + "loss": 0.0575, + "step": 1802 + }, + { + "epoch": 2.429919137466307, + "grad_norm": 1.5424891488575292, + "learning_rate": 3.563272912754362e-06, + "loss": 0.0407, + "step": 1803 + }, + { + "epoch": 2.431266846361186, + "grad_norm": 13.546565415763883, + "learning_rate": 3.5580460033573943e-06, + "loss": 0.1208, + "step": 1804 + }, + { + "epoch": 2.4326145552560647, + "grad_norm": 1.5047693464429686, + "learning_rate": 3.5528208121438624e-06, + "loss": 0.0597, + "step": 1805 + }, + { + "epoch": 2.4339622641509435, + "grad_norm": 6.542749527918897, + "learning_rate": 3.547597345339928e-06, + "loss": 0.0753, + "step": 1806 + }, + { + "epoch": 2.435309973045822, + "grad_norm": 2.900619111351923, + "learning_rate": 3.542375609169698e-06, + "loss": 0.0458, + "step": 1807 + }, + { + "epoch": 2.4366576819407006, + "grad_norm": 2.546515358986534, + "learning_rate": 3.537155609855212e-06, + "loss": 0.0696, + "step": 1808 + }, + { + "epoch": 2.4380053908355794, + "grad_norm": 16.17165276288412, + "learning_rate": 3.531937353616448e-06, + "loss": 0.0562, + "step": 1809 + }, + { + "epoch": 2.439353099730458, + "grad_norm": 17.3505846067881, + "learning_rate": 3.5267208466713025e-06, + "loss": 0.0943, + "step": 1810 + }, + { + "epoch": 2.440700808625337, + "grad_norm": 18.341462545733346, + "learning_rate": 3.521506095235585e-06, + "loss": 0.0645, + "step": 1811 + }, + { + "epoch": 2.442048517520216, + "grad_norm": 5.190791046550758, + "learning_rate": 3.516293105523021e-06, + "loss": 0.0612, + "step": 1812 + }, + { + "epoch": 2.4433962264150946, + "grad_norm": 4.8717635367785865, + "learning_rate": 3.511081883745229e-06, + "loss": 0.0834, + "step": 1813 + }, + { + "epoch": 2.444743935309973, + "grad_norm": 12.413018594171387, + "learning_rate": 3.5058724361117234e-06, + "loss": 0.0784, + "step": 1814 + }, + { + "epoch": 2.4460916442048517, + "grad_norm": 26.10258715682471, + "learning_rate": 3.500664768829908e-06, + "loss": 0.0885, + "step": 1815 + }, + { + "epoch": 2.4474393530997305, + "grad_norm": 14.47174104307549, + "learning_rate": 3.495458888105061e-06, + "loss": 0.0706, + "step": 1816 + }, + { + "epoch": 2.4487870619946093, + "grad_norm": 10.770038434940501, + "learning_rate": 3.4902548001403316e-06, + "loss": 0.0887, + "step": 1817 + }, + { + "epoch": 2.450134770889488, + "grad_norm": 11.861747908726969, + "learning_rate": 3.4850525111367366e-06, + "loss": 0.0806, + "step": 1818 + }, + { + "epoch": 2.4514824797843664, + "grad_norm": 2.1281981508143746, + "learning_rate": 3.4798520272931467e-06, + "loss": 0.0748, + "step": 1819 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 11.476679233443168, + "learning_rate": 3.4746533548062787e-06, + "loss": 0.0546, + "step": 1820 + }, + { + "epoch": 2.454177897574124, + "grad_norm": 9.32512251353016, + "learning_rate": 3.4694564998706993e-06, + "loss": 0.0712, + "step": 1821 + }, + { + "epoch": 2.4555256064690028, + "grad_norm": 4.289265214016654, + "learning_rate": 3.4642614686788025e-06, + "loss": 0.1038, + "step": 1822 + }, + { + "epoch": 2.4568733153638815, + "grad_norm": 1.9694657425399524, + "learning_rate": 3.4590682674208075e-06, + "loss": 0.0644, + "step": 1823 + }, + { + "epoch": 2.4582210242587603, + "grad_norm": 2.3068880944060957, + "learning_rate": 3.453876902284763e-06, + "loss": 0.0782, + "step": 1824 + }, + { + "epoch": 2.4595687331536387, + "grad_norm": 7.817147442933776, + "learning_rate": 3.4486873794565196e-06, + "loss": 0.0631, + "step": 1825 + }, + { + "epoch": 2.4609164420485174, + "grad_norm": 8.52474929742177, + "learning_rate": 3.443499705119735e-06, + "loss": 0.0504, + "step": 1826 + }, + { + "epoch": 2.4622641509433962, + "grad_norm": 4.465797765232699, + "learning_rate": 3.4383138854558706e-06, + "loss": 0.0888, + "step": 1827 + }, + { + "epoch": 2.463611859838275, + "grad_norm": 22.092408488988657, + "learning_rate": 3.433129926644171e-06, + "loss": 0.0732, + "step": 1828 + }, + { + "epoch": 2.464959568733154, + "grad_norm": 21.00684782293308, + "learning_rate": 3.4279478348616637e-06, + "loss": 0.0788, + "step": 1829 + }, + { + "epoch": 2.466307277628032, + "grad_norm": 25.722393862891266, + "learning_rate": 3.422767616283156e-06, + "loss": 0.0809, + "step": 1830 + }, + { + "epoch": 2.467654986522911, + "grad_norm": 16.675886912624748, + "learning_rate": 3.4175892770812187e-06, + "loss": 0.0704, + "step": 1831 + }, + { + "epoch": 2.4690026954177897, + "grad_norm": 12.04000161055776, + "learning_rate": 3.412412823426184e-06, + "loss": 0.0543, + "step": 1832 + }, + { + "epoch": 2.4703504043126685, + "grad_norm": 4.280497964990709, + "learning_rate": 3.4072382614861422e-06, + "loss": 0.0803, + "step": 1833 + }, + { + "epoch": 2.4716981132075473, + "grad_norm": 21.379699623679624, + "learning_rate": 3.402065597426923e-06, + "loss": 0.0621, + "step": 1834 + }, + { + "epoch": 2.473045822102426, + "grad_norm": 13.76095729615768, + "learning_rate": 3.3968948374120958e-06, + "loss": 0.068, + "step": 1835 + }, + { + "epoch": 2.4743935309973044, + "grad_norm": 6.0587874617331705, + "learning_rate": 3.391725987602967e-06, + "loss": 0.0671, + "step": 1836 + }, + { + "epoch": 2.475741239892183, + "grad_norm": 19.69156634640957, + "learning_rate": 3.38655905415856e-06, + "loss": 0.0826, + "step": 1837 + }, + { + "epoch": 2.477088948787062, + "grad_norm": 15.070009160094306, + "learning_rate": 3.3813940432356175e-06, + "loss": 0.0812, + "step": 1838 + }, + { + "epoch": 2.4784366576819408, + "grad_norm": 12.971629820895267, + "learning_rate": 3.376230960988591e-06, + "loss": 0.07, + "step": 1839 + }, + { + "epoch": 2.4797843665768196, + "grad_norm": 7.9918085287714336, + "learning_rate": 3.3710698135696346e-06, + "loss": 0.0761, + "step": 1840 + }, + { + "epoch": 2.481132075471698, + "grad_norm": 13.587341487611935, + "learning_rate": 3.3659106071285956e-06, + "loss": 0.0889, + "step": 1841 + }, + { + "epoch": 2.4824797843665767, + "grad_norm": 20.498884958712367, + "learning_rate": 3.3607533478130105e-06, + "loss": 0.0707, + "step": 1842 + }, + { + "epoch": 2.4838274932614555, + "grad_norm": 3.479978851977824, + "learning_rate": 3.3555980417680947e-06, + "loss": 0.086, + "step": 1843 + }, + { + "epoch": 2.4851752021563343, + "grad_norm": 3.788051515437025, + "learning_rate": 3.350444695136732e-06, + "loss": 0.0606, + "step": 1844 + }, + { + "epoch": 2.486522911051213, + "grad_norm": 2.492064241207019, + "learning_rate": 3.34529331405948e-06, + "loss": 0.07, + "step": 1845 + }, + { + "epoch": 2.487870619946092, + "grad_norm": 6.041213814858029, + "learning_rate": 3.3401439046745487e-06, + "loss": 0.0506, + "step": 1846 + }, + { + "epoch": 2.48921832884097, + "grad_norm": 7.606294974577943, + "learning_rate": 3.3349964731177957e-06, + "loss": 0.0992, + "step": 1847 + }, + { + "epoch": 2.490566037735849, + "grad_norm": 12.929551193076811, + "learning_rate": 3.3298510255227313e-06, + "loss": 0.1051, + "step": 1848 + }, + { + "epoch": 2.4919137466307277, + "grad_norm": 1.9073239085715576, + "learning_rate": 3.324707568020493e-06, + "loss": 0.0846, + "step": 1849 + }, + { + "epoch": 2.4932614555256065, + "grad_norm": 5.583775661221358, + "learning_rate": 3.31956610673985e-06, + "loss": 0.0717, + "step": 1850 + }, + { + "epoch": 2.4946091644204853, + "grad_norm": 12.37248124036358, + "learning_rate": 3.314426647807194e-06, + "loss": 0.0803, + "step": 1851 + }, + { + "epoch": 2.4959568733153636, + "grad_norm": 14.915378432091542, + "learning_rate": 3.3092891973465304e-06, + "loss": 0.0759, + "step": 1852 + }, + { + "epoch": 2.4973045822102424, + "grad_norm": 16.901562063952497, + "learning_rate": 3.3041537614794684e-06, + "loss": 0.0547, + "step": 1853 + }, + { + "epoch": 2.498652291105121, + "grad_norm": 30.033570530182498, + "learning_rate": 3.2990203463252225e-06, + "loss": 0.0719, + "step": 1854 + }, + { + "epoch": 2.5, + "grad_norm": 14.587125855702567, + "learning_rate": 3.2938889580005932e-06, + "loss": 0.0603, + "step": 1855 + }, + { + "epoch": 2.501347708894879, + "grad_norm": 12.881733162954589, + "learning_rate": 3.2887596026199675e-06, + "loss": 0.0777, + "step": 1856 + }, + { + "epoch": 2.5026954177897576, + "grad_norm": 9.008742546001686, + "learning_rate": 3.283632286295316e-06, + "loss": 0.0841, + "step": 1857 + }, + { + "epoch": 2.5040431266846364, + "grad_norm": 5.731208252670657, + "learning_rate": 3.2785070151361713e-06, + "loss": 0.1101, + "step": 1858 + }, + { + "epoch": 2.5053908355795147, + "grad_norm": 9.715828289997201, + "learning_rate": 3.2733837952496317e-06, + "loss": 0.0777, + "step": 1859 + }, + { + "epoch": 2.5067385444743935, + "grad_norm": 2.8753615042868588, + "learning_rate": 3.2682626327403547e-06, + "loss": 0.076, + "step": 1860 + }, + { + "epoch": 2.5080862533692723, + "grad_norm": 15.08765509473148, + "learning_rate": 3.2631435337105433e-06, + "loss": 0.0556, + "step": 1861 + }, + { + "epoch": 2.509433962264151, + "grad_norm": 3.615123459130305, + "learning_rate": 3.25802650425994e-06, + "loss": 0.0616, + "step": 1862 + }, + { + "epoch": 2.5107816711590294, + "grad_norm": 7.9265326363596715, + "learning_rate": 3.2529115504858255e-06, + "loss": 0.0631, + "step": 1863 + }, + { + "epoch": 2.512129380053908, + "grad_norm": 25.463312897806876, + "learning_rate": 3.247798678483005e-06, + "loss": 0.0868, + "step": 1864 + }, + { + "epoch": 2.513477088948787, + "grad_norm": 1.6445344881024193, + "learning_rate": 3.2426878943438024e-06, + "loss": 0.0742, + "step": 1865 + }, + { + "epoch": 2.5148247978436657, + "grad_norm": 20.072626454266633, + "learning_rate": 3.237579204158055e-06, + "loss": 0.0748, + "step": 1866 + }, + { + "epoch": 2.5161725067385445, + "grad_norm": 17.873089215871172, + "learning_rate": 3.232472614013105e-06, + "loss": 0.0832, + "step": 1867 + }, + { + "epoch": 2.5175202156334233, + "grad_norm": 13.77638874973307, + "learning_rate": 3.2273681299937887e-06, + "loss": 0.067, + "step": 1868 + }, + { + "epoch": 2.518867924528302, + "grad_norm": 12.71060166702959, + "learning_rate": 3.2222657581824413e-06, + "loss": 0.0715, + "step": 1869 + }, + { + "epoch": 2.5202156334231804, + "grad_norm": 26.058027442609614, + "learning_rate": 3.2171655046588736e-06, + "loss": 0.0498, + "step": 1870 + }, + { + "epoch": 2.5215633423180592, + "grad_norm": 28.56092483050514, + "learning_rate": 3.2120673755003716e-06, + "loss": 0.0529, + "step": 1871 + }, + { + "epoch": 2.522911051212938, + "grad_norm": 23.704064330551617, + "learning_rate": 3.2069713767816974e-06, + "loss": 0.1061, + "step": 1872 + }, + { + "epoch": 2.524258760107817, + "grad_norm": 21.61996688473815, + "learning_rate": 3.2018775145750686e-06, + "loss": 0.115, + "step": 1873 + }, + { + "epoch": 2.525606469002695, + "grad_norm": 29.023329472548372, + "learning_rate": 3.1967857949501566e-06, + "loss": 0.0732, + "step": 1874 + }, + { + "epoch": 2.526954177897574, + "grad_norm": 22.6680423046756, + "learning_rate": 3.191696223974084e-06, + "loss": 0.0935, + "step": 1875 + }, + { + "epoch": 2.5283018867924527, + "grad_norm": 2.236853018809813, + "learning_rate": 3.186608807711411e-06, + "loss": 0.0307, + "step": 1876 + }, + { + "epoch": 2.5296495956873315, + "grad_norm": 40.55670263624725, + "learning_rate": 3.1815235522241277e-06, + "loss": 0.0911, + "step": 1877 + }, + { + "epoch": 2.5309973045822103, + "grad_norm": 5.6881872354564464, + "learning_rate": 3.1764404635716546e-06, + "loss": 0.0711, + "step": 1878 + }, + { + "epoch": 2.532345013477089, + "grad_norm": 11.972858439870398, + "learning_rate": 3.1713595478108262e-06, + "loss": 0.0989, + "step": 1879 + }, + { + "epoch": 2.533692722371968, + "grad_norm": 12.123414666441441, + "learning_rate": 3.1662808109958877e-06, + "loss": 0.0457, + "step": 1880 + }, + { + "epoch": 2.535040431266846, + "grad_norm": 4.441706448048429, + "learning_rate": 3.161204259178493e-06, + "loss": 0.0627, + "step": 1881 + }, + { + "epoch": 2.536388140161725, + "grad_norm": 16.54125464998275, + "learning_rate": 3.1561298984076875e-06, + "loss": 0.0614, + "step": 1882 + }, + { + "epoch": 2.5377358490566038, + "grad_norm": 26.48500175172064, + "learning_rate": 3.151057734729905e-06, + "loss": 0.0706, + "step": 1883 + }, + { + "epoch": 2.5390835579514826, + "grad_norm": 2.1650358998192716, + "learning_rate": 3.145987774188967e-06, + "loss": 0.0647, + "step": 1884 + }, + { + "epoch": 2.5404312668463613, + "grad_norm": 10.713429377114707, + "learning_rate": 3.1409200228260654e-06, + "loss": 0.076, + "step": 1885 + }, + { + "epoch": 2.5417789757412397, + "grad_norm": 8.46398515220757, + "learning_rate": 3.135854486679759e-06, + "loss": 0.0637, + "step": 1886 + }, + { + "epoch": 2.5431266846361185, + "grad_norm": 6.233227015277, + "learning_rate": 3.1307911717859695e-06, + "loss": 0.0556, + "step": 1887 + }, + { + "epoch": 2.5444743935309972, + "grad_norm": 23.4422748669712, + "learning_rate": 3.125730084177973e-06, + "loss": 0.063, + "step": 1888 + }, + { + "epoch": 2.545822102425876, + "grad_norm": 32.82939316209632, + "learning_rate": 3.120671229886387e-06, + "loss": 0.1031, + "step": 1889 + }, + { + "epoch": 2.547169811320755, + "grad_norm": 37.59042239382625, + "learning_rate": 3.115614614939173e-06, + "loss": 0.0568, + "step": 1890 + }, + { + "epoch": 2.5485175202156336, + "grad_norm": 8.4547675793951, + "learning_rate": 3.11056024536162e-06, + "loss": 0.0806, + "step": 1891 + }, + { + "epoch": 2.5498652291105124, + "grad_norm": 4.753041489151086, + "learning_rate": 3.105508127176342e-06, + "loss": 0.0883, + "step": 1892 + }, + { + "epoch": 2.5512129380053907, + "grad_norm": 25.64029298620067, + "learning_rate": 3.1004582664032756e-06, + "loss": 0.087, + "step": 1893 + }, + { + "epoch": 2.5525606469002695, + "grad_norm": 32.32506142997821, + "learning_rate": 3.0954106690596604e-06, + "loss": 0.0748, + "step": 1894 + }, + { + "epoch": 2.5539083557951483, + "grad_norm": 27.26912664795796, + "learning_rate": 3.090365341160041e-06, + "loss": 0.0658, + "step": 1895 + }, + { + "epoch": 2.555256064690027, + "grad_norm": 15.187277590016786, + "learning_rate": 3.085322288716263e-06, + "loss": 0.0758, + "step": 1896 + }, + { + "epoch": 2.5566037735849054, + "grad_norm": 21.89897721162333, + "learning_rate": 3.0802815177374533e-06, + "loss": 0.0832, + "step": 1897 + }, + { + "epoch": 2.557951482479784, + "grad_norm": 30.344531625837995, + "learning_rate": 3.075243034230024e-06, + "loss": 0.0632, + "step": 1898 + }, + { + "epoch": 2.559299191374663, + "grad_norm": 5.634558784906762, + "learning_rate": 3.0702068441976608e-06, + "loss": 0.0785, + "step": 1899 + }, + { + "epoch": 2.560646900269542, + "grad_norm": 18.164828368573215, + "learning_rate": 3.0651729536413186e-06, + "loss": 0.0751, + "step": 1900 + }, + { + "epoch": 2.5619946091644206, + "grad_norm": 5.121373773643143, + "learning_rate": 3.0601413685592085e-06, + "loss": 0.0845, + "step": 1901 + }, + { + "epoch": 2.5633423180592994, + "grad_norm": 19.933091617122166, + "learning_rate": 3.0551120949467984e-06, + "loss": 0.0991, + "step": 1902 + }, + { + "epoch": 2.564690026954178, + "grad_norm": 11.136270384842069, + "learning_rate": 3.0500851387967987e-06, + "loss": 0.0686, + "step": 1903 + }, + { + "epoch": 2.5660377358490565, + "grad_norm": 17.288051861678436, + "learning_rate": 3.045060506099158e-06, + "loss": 0.0712, + "step": 1904 + }, + { + "epoch": 2.5673854447439353, + "grad_norm": 4.192250408930114, + "learning_rate": 3.0400382028410618e-06, + "loss": 0.0436, + "step": 1905 + }, + { + "epoch": 2.568733153638814, + "grad_norm": 23.868683100393685, + "learning_rate": 3.0350182350069147e-06, + "loss": 0.0538, + "step": 1906 + }, + { + "epoch": 2.570080862533693, + "grad_norm": 2.218608126880435, + "learning_rate": 3.0300006085783375e-06, + "loss": 0.076, + "step": 1907 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 18.406317950160894, + "learning_rate": 3.0249853295341677e-06, + "loss": 0.0655, + "step": 1908 + }, + { + "epoch": 2.57277628032345, + "grad_norm": 12.285276552523104, + "learning_rate": 3.019972403850439e-06, + "loss": 0.1025, + "step": 1909 + }, + { + "epoch": 2.5741239892183287, + "grad_norm": 4.160267164526286, + "learning_rate": 3.014961837500383e-06, + "loss": 0.0697, + "step": 1910 + }, + { + "epoch": 2.5754716981132075, + "grad_norm": 7.002520446325397, + "learning_rate": 3.0099536364544225e-06, + "loss": 0.0812, + "step": 1911 + }, + { + "epoch": 2.5768194070080863, + "grad_norm": 17.52354205621105, + "learning_rate": 3.004947806680159e-06, + "loss": 0.0658, + "step": 1912 + }, + { + "epoch": 2.578167115902965, + "grad_norm": 26.172544569730185, + "learning_rate": 2.999944354142369e-06, + "loss": 0.0543, + "step": 1913 + }, + { + "epoch": 2.579514824797844, + "grad_norm": 11.334005224737801, + "learning_rate": 2.9949432848029968e-06, + "loss": 0.0625, + "step": 1914 + }, + { + "epoch": 2.5808625336927222, + "grad_norm": 31.042976895284674, + "learning_rate": 2.989944604621148e-06, + "loss": 0.0788, + "step": 1915 + }, + { + "epoch": 2.582210242587601, + "grad_norm": 32.20892104198085, + "learning_rate": 2.984948319553077e-06, + "loss": 0.098, + "step": 1916 + }, + { + "epoch": 2.58355795148248, + "grad_norm": 10.200848237284491, + "learning_rate": 2.9799544355521916e-06, + "loss": 0.0505, + "step": 1917 + }, + { + "epoch": 2.5849056603773586, + "grad_norm": 24.33345808455334, + "learning_rate": 2.974962958569032e-06, + "loss": 0.0955, + "step": 1918 + }, + { + "epoch": 2.586253369272237, + "grad_norm": 13.159028251596805, + "learning_rate": 2.9699738945512722e-06, + "loss": 0.086, + "step": 1919 + }, + { + "epoch": 2.5876010781671157, + "grad_norm": 9.962683612540534, + "learning_rate": 2.964987249443715e-06, + "loss": 0.045, + "step": 1920 + }, + { + "epoch": 2.5889487870619945, + "grad_norm": 17.34116131146196, + "learning_rate": 2.960003029188274e-06, + "loss": 0.1002, + "step": 1921 + }, + { + "epoch": 2.5902964959568733, + "grad_norm": 2.7131084506417977, + "learning_rate": 2.9550212397239774e-06, + "loss": 0.0539, + "step": 1922 + }, + { + "epoch": 2.591644204851752, + "grad_norm": 4.0157819946092435, + "learning_rate": 2.9500418869869584e-06, + "loss": 0.0728, + "step": 1923 + }, + { + "epoch": 2.592991913746631, + "grad_norm": 22.044084602848336, + "learning_rate": 2.945064976910442e-06, + "loss": 0.0806, + "step": 1924 + }, + { + "epoch": 2.5943396226415096, + "grad_norm": 17.42952887466711, + "learning_rate": 2.940090515424746e-06, + "loss": 0.1032, + "step": 1925 + }, + { + "epoch": 2.595687331536388, + "grad_norm": 8.920042831025944, + "learning_rate": 2.935118508457272e-06, + "loss": 0.0477, + "step": 1926 + }, + { + "epoch": 2.5970350404312668, + "grad_norm": 16.894054970620033, + "learning_rate": 2.9301489619324937e-06, + "loss": 0.0575, + "step": 1927 + }, + { + "epoch": 2.5983827493261455, + "grad_norm": 12.511672980529472, + "learning_rate": 2.9251818817719513e-06, + "loss": 0.0684, + "step": 1928 + }, + { + "epoch": 2.5997304582210243, + "grad_norm": 3.1644117623816475, + "learning_rate": 2.9202172738942524e-06, + "loss": 0.0686, + "step": 1929 + }, + { + "epoch": 2.601078167115903, + "grad_norm": 0.9927318012731358, + "learning_rate": 2.9152551442150534e-06, + "loss": 0.0447, + "step": 1930 + }, + { + "epoch": 2.6024258760107815, + "grad_norm": 3.727656032408163, + "learning_rate": 2.910295498647061e-06, + "loss": 0.0563, + "step": 1931 + }, + { + "epoch": 2.6037735849056602, + "grad_norm": 1.8488032305830748, + "learning_rate": 2.905338343100021e-06, + "loss": 0.0408, + "step": 1932 + }, + { + "epoch": 2.605121293800539, + "grad_norm": 6.718039269805489, + "learning_rate": 2.9003836834807086e-06, + "loss": 0.061, + "step": 1933 + }, + { + "epoch": 2.606469002695418, + "grad_norm": 18.433683126040684, + "learning_rate": 2.8954315256929294e-06, + "loss": 0.0679, + "step": 1934 + }, + { + "epoch": 2.6078167115902966, + "grad_norm": 11.813079132206694, + "learning_rate": 2.8904818756375076e-06, + "loss": 0.1007, + "step": 1935 + }, + { + "epoch": 2.6091644204851754, + "grad_norm": 1.9676741711114634, + "learning_rate": 2.885534739212279e-06, + "loss": 0.051, + "step": 1936 + }, + { + "epoch": 2.610512129380054, + "grad_norm": 3.8282366220867745, + "learning_rate": 2.880590122312078e-06, + "loss": 0.0727, + "step": 1937 + }, + { + "epoch": 2.6118598382749325, + "grad_norm": 2.8889771045714068, + "learning_rate": 2.8756480308287506e-06, + "loss": 0.0855, + "step": 1938 + }, + { + "epoch": 2.6132075471698113, + "grad_norm": 1.8639645821794921, + "learning_rate": 2.870708470651118e-06, + "loss": 0.0815, + "step": 1939 + }, + { + "epoch": 2.61455525606469, + "grad_norm": 10.543715374648736, + "learning_rate": 2.8657714476649963e-06, + "loss": 0.0778, + "step": 1940 + }, + { + "epoch": 2.615902964959569, + "grad_norm": 2.9668023214910497, + "learning_rate": 2.8608369677531755e-06, + "loss": 0.0804, + "step": 1941 + }, + { + "epoch": 2.617250673854447, + "grad_norm": 20.792080336847633, + "learning_rate": 2.8559050367954098e-06, + "loss": 0.1025, + "step": 1942 + }, + { + "epoch": 2.618598382749326, + "grad_norm": 16.078013413094475, + "learning_rate": 2.8509756606684235e-06, + "loss": 0.065, + "step": 1943 + }, + { + "epoch": 2.6199460916442048, + "grad_norm": 23.640467165284832, + "learning_rate": 2.846048845245894e-06, + "loss": 0.0905, + "step": 1944 + }, + { + "epoch": 2.6212938005390836, + "grad_norm": 32.38684266604632, + "learning_rate": 2.841124596398449e-06, + "loss": 0.0679, + "step": 1945 + }, + { + "epoch": 2.6226415094339623, + "grad_norm": 6.944467531483668, + "learning_rate": 2.8362029199936503e-06, + "loss": 0.0679, + "step": 1946 + }, + { + "epoch": 2.623989218328841, + "grad_norm": 1.7362943694422919, + "learning_rate": 2.831283821896008e-06, + "loss": 0.066, + "step": 1947 + }, + { + "epoch": 2.62533692722372, + "grad_norm": 2.6457705086880794, + "learning_rate": 2.8263673079669472e-06, + "loss": 0.0525, + "step": 1948 + }, + { + "epoch": 2.6266846361185983, + "grad_norm": 1.72121411102171, + "learning_rate": 2.8214533840648208e-06, + "loss": 0.0497, + "step": 1949 + }, + { + "epoch": 2.628032345013477, + "grad_norm": 8.087241616172259, + "learning_rate": 2.816542056044893e-06, + "loss": 0.0899, + "step": 1950 + }, + { + "epoch": 2.629380053908356, + "grad_norm": 3.9700817821415044, + "learning_rate": 2.8116333297593383e-06, + "loss": 0.0767, + "step": 1951 + }, + { + "epoch": 2.6307277628032346, + "grad_norm": 4.6659540944767555, + "learning_rate": 2.8067272110572246e-06, + "loss": 0.0818, + "step": 1952 + }, + { + "epoch": 2.632075471698113, + "grad_norm": 3.925414352108396, + "learning_rate": 2.8018237057845176e-06, + "loss": 0.0882, + "step": 1953 + }, + { + "epoch": 2.6334231805929917, + "grad_norm": 20.196425433837398, + "learning_rate": 2.7969228197840685e-06, + "loss": 0.0657, + "step": 1954 + }, + { + "epoch": 2.6347708894878705, + "grad_norm": 19.69593797748419, + "learning_rate": 2.792024558895606e-06, + "loss": 0.0873, + "step": 1955 + }, + { + "epoch": 2.6361185983827493, + "grad_norm": 15.070224038514798, + "learning_rate": 2.7871289289557347e-06, + "loss": 0.0599, + "step": 1956 + }, + { + "epoch": 2.637466307277628, + "grad_norm": 14.700257150561658, + "learning_rate": 2.782235935797915e-06, + "loss": 0.0826, + "step": 1957 + }, + { + "epoch": 2.638814016172507, + "grad_norm": 30.40106967685524, + "learning_rate": 2.7773455852524757e-06, + "loss": 0.0614, + "step": 1958 + }, + { + "epoch": 2.6401617250673857, + "grad_norm": 15.688504465270983, + "learning_rate": 2.7724578831465904e-06, + "loss": 0.0696, + "step": 1959 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 6.605075496627804, + "learning_rate": 2.7675728353042824e-06, + "loss": 0.0736, + "step": 1960 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 28.858587981835857, + "learning_rate": 2.762690447546403e-06, + "loss": 0.101, + "step": 1961 + }, + { + "epoch": 2.6442048517520216, + "grad_norm": 2.284857294665442, + "learning_rate": 2.7578107256906473e-06, + "loss": 0.0895, + "step": 1962 + }, + { + "epoch": 2.6455525606469004, + "grad_norm": 1.6988981828415328, + "learning_rate": 2.7529336755515203e-06, + "loss": 0.0579, + "step": 1963 + }, + { + "epoch": 2.6469002695417787, + "grad_norm": 3.2851882711855906, + "learning_rate": 2.74805930294035e-06, + "loss": 0.0593, + "step": 1964 + }, + { + "epoch": 2.6482479784366575, + "grad_norm": 11.059241823930716, + "learning_rate": 2.743187613665278e-06, + "loss": 0.0796, + "step": 1965 + }, + { + "epoch": 2.6495956873315363, + "grad_norm": 17.412509880059567, + "learning_rate": 2.7383186135312385e-06, + "loss": 0.0893, + "step": 1966 + }, + { + "epoch": 2.650943396226415, + "grad_norm": 12.116458533375496, + "learning_rate": 2.733452308339969e-06, + "loss": 0.0743, + "step": 1967 + }, + { + "epoch": 2.652291105121294, + "grad_norm": 2.7865412976900594, + "learning_rate": 2.7285887038899926e-06, + "loss": 0.0649, + "step": 1968 + }, + { + "epoch": 2.6536388140161726, + "grad_norm": 7.037291876487776, + "learning_rate": 2.7237278059766186e-06, + "loss": 0.0975, + "step": 1969 + }, + { + "epoch": 2.6549865229110514, + "grad_norm": 21.14124148306024, + "learning_rate": 2.718869620391922e-06, + "loss": 0.1008, + "step": 1970 + }, + { + "epoch": 2.6563342318059298, + "grad_norm": 7.062409358207251, + "learning_rate": 2.7140141529247582e-06, + "loss": 0.07, + "step": 1971 + }, + { + "epoch": 2.6576819407008085, + "grad_norm": 1.7318693299987877, + "learning_rate": 2.709161409360733e-06, + "loss": 0.0501, + "step": 1972 + }, + { + "epoch": 2.6590296495956873, + "grad_norm": 6.668923062822522, + "learning_rate": 2.7043113954822125e-06, + "loss": 0.0944, + "step": 1973 + }, + { + "epoch": 2.660377358490566, + "grad_norm": 11.15837313372785, + "learning_rate": 2.6994641170683085e-06, + "loss": 0.0911, + "step": 1974 + }, + { + "epoch": 2.661725067385445, + "grad_norm": 21.245958379065318, + "learning_rate": 2.6946195798948755e-06, + "loss": 0.1097, + "step": 1975 + }, + { + "epoch": 2.6630727762803232, + "grad_norm": 4.294760328423536, + "learning_rate": 2.6897777897344956e-06, + "loss": 0.0395, + "step": 1976 + }, + { + "epoch": 2.664420485175202, + "grad_norm": 11.365403032332624, + "learning_rate": 2.684938752356483e-06, + "loss": 0.1218, + "step": 1977 + }, + { + "epoch": 2.665768194070081, + "grad_norm": 2.136750196375148, + "learning_rate": 2.680102473526871e-06, + "loss": 0.0836, + "step": 1978 + }, + { + "epoch": 2.6671159029649596, + "grad_norm": 13.229477365593308, + "learning_rate": 2.6752689590084057e-06, + "loss": 0.0721, + "step": 1979 + }, + { + "epoch": 2.6684636118598384, + "grad_norm": 9.140290391624168, + "learning_rate": 2.67043821456054e-06, + "loss": 0.07, + "step": 1980 + }, + { + "epoch": 2.669811320754717, + "grad_norm": 18.838343455657714, + "learning_rate": 2.665610245939422e-06, + "loss": 0.0585, + "step": 1981 + }, + { + "epoch": 2.671159029649596, + "grad_norm": 2.9341267878931343, + "learning_rate": 2.6607850588978962e-06, + "loss": 0.0526, + "step": 1982 + }, + { + "epoch": 2.6725067385444743, + "grad_norm": 2.5901431504898413, + "learning_rate": 2.6559626591854924e-06, + "loss": 0.0771, + "step": 1983 + }, + { + "epoch": 2.673854447439353, + "grad_norm": 20.083762833187485, + "learning_rate": 2.6511430525484193e-06, + "loss": 0.0628, + "step": 1984 + }, + { + "epoch": 2.675202156334232, + "grad_norm": 3.0733974024528528, + "learning_rate": 2.6463262447295523e-06, + "loss": 0.0539, + "step": 1985 + }, + { + "epoch": 2.6765498652291106, + "grad_norm": 29.02608325567151, + "learning_rate": 2.6415122414684434e-06, + "loss": 0.0574, + "step": 1986 + }, + { + "epoch": 2.677897574123989, + "grad_norm": 10.871384989867334, + "learning_rate": 2.636701048501289e-06, + "loss": 0.0916, + "step": 1987 + }, + { + "epoch": 2.6792452830188678, + "grad_norm": 21.112230116860278, + "learning_rate": 2.6318926715609454e-06, + "loss": 0.0769, + "step": 1988 + }, + { + "epoch": 2.6805929919137466, + "grad_norm": 20.286209727977155, + "learning_rate": 2.627087116376914e-06, + "loss": 0.0737, + "step": 1989 + }, + { + "epoch": 2.6819407008086253, + "grad_norm": 8.2254540213248, + "learning_rate": 2.6222843886753262e-06, + "loss": 0.0552, + "step": 1990 + }, + { + "epoch": 2.683288409703504, + "grad_norm": 1.7132080743339009, + "learning_rate": 2.6174844941789524e-06, + "loss": 0.0667, + "step": 1991 + }, + { + "epoch": 2.684636118598383, + "grad_norm": 12.726947385296285, + "learning_rate": 2.6126874386071832e-06, + "loss": 0.0616, + "step": 1992 + }, + { + "epoch": 2.6859838274932617, + "grad_norm": 1.7381395723269477, + "learning_rate": 2.60789322767603e-06, + "loss": 0.0485, + "step": 1993 + }, + { + "epoch": 2.68733153638814, + "grad_norm": 11.910101373388704, + "learning_rate": 2.6031018670981053e-06, + "loss": 0.0525, + "step": 1994 + }, + { + "epoch": 2.688679245283019, + "grad_norm": 20.60825682089179, + "learning_rate": 2.598313362582639e-06, + "loss": 0.0635, + "step": 1995 + }, + { + "epoch": 2.6900269541778976, + "grad_norm": 18.585226658470255, + "learning_rate": 2.5935277198354456e-06, + "loss": 0.0795, + "step": 1996 + }, + { + "epoch": 2.6913746630727764, + "grad_norm": 18.199083786179905, + "learning_rate": 2.588744944558936e-06, + "loss": 0.0674, + "step": 1997 + }, + { + "epoch": 2.6927223719676547, + "grad_norm": 14.208129209900266, + "learning_rate": 2.5839650424521036e-06, + "loss": 0.0579, + "step": 1998 + }, + { + "epoch": 2.6940700808625335, + "grad_norm": 8.386500651954666, + "learning_rate": 2.579188019210519e-06, + "loss": 0.1094, + "step": 1999 + }, + { + "epoch": 2.6954177897574123, + "grad_norm": 7.200854160277697, + "learning_rate": 2.5744138805263164e-06, + "loss": 0.0833, + "step": 2000 + }, + { + "epoch": 2.696765498652291, + "grad_norm": 11.192385250703634, + "learning_rate": 2.5696426320882003e-06, + "loss": 0.059, + "step": 2001 + }, + { + "epoch": 2.69811320754717, + "grad_norm": 12.183585651552391, + "learning_rate": 2.5648742795814273e-06, + "loss": 0.079, + "step": 2002 + }, + { + "epoch": 2.6994609164420487, + "grad_norm": 6.036757754997978, + "learning_rate": 2.560108828687806e-06, + "loss": 0.0653, + "step": 2003 + }, + { + "epoch": 2.7008086253369274, + "grad_norm": 16.51426938763599, + "learning_rate": 2.555346285085687e-06, + "loss": 0.0585, + "step": 2004 + }, + { + "epoch": 2.702156334231806, + "grad_norm": 17.53964311732567, + "learning_rate": 2.550586654449951e-06, + "loss": 0.0849, + "step": 2005 + }, + { + "epoch": 2.7035040431266846, + "grad_norm": 3.464633695715914, + "learning_rate": 2.545829942452015e-06, + "loss": 0.0674, + "step": 2006 + }, + { + "epoch": 2.7048517520215634, + "grad_norm": 12.119645947172911, + "learning_rate": 2.5410761547598163e-06, + "loss": 0.0661, + "step": 2007 + }, + { + "epoch": 2.706199460916442, + "grad_norm": 16.28030470631774, + "learning_rate": 2.5363252970378073e-06, + "loss": 0.0672, + "step": 2008 + }, + { + "epoch": 2.7075471698113205, + "grad_norm": 2.9778540697813103, + "learning_rate": 2.531577374946944e-06, + "loss": 0.0368, + "step": 2009 + }, + { + "epoch": 2.7088948787061993, + "grad_norm": 16.005083380466075, + "learning_rate": 2.5268323941446966e-06, + "loss": 0.0732, + "step": 2010 + }, + { + "epoch": 2.710242587601078, + "grad_norm": 3.9224671354608645, + "learning_rate": 2.522090360285018e-06, + "loss": 0.0569, + "step": 2011 + }, + { + "epoch": 2.711590296495957, + "grad_norm": 24.04028166202447, + "learning_rate": 2.517351279018355e-06, + "loss": 0.0928, + "step": 2012 + }, + { + "epoch": 2.7129380053908356, + "grad_norm": 19.667528411153224, + "learning_rate": 2.51261515599164e-06, + "loss": 0.1027, + "step": 2013 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 12.59049735525708, + "learning_rate": 2.5078819968482714e-06, + "loss": 0.0659, + "step": 2014 + }, + { + "epoch": 2.715633423180593, + "grad_norm": 4.074686189942641, + "learning_rate": 2.5031518072281236e-06, + "loss": 0.0765, + "step": 2015 + }, + { + "epoch": 2.7169811320754715, + "grad_norm": 9.386018596603849, + "learning_rate": 2.4984245927675287e-06, + "loss": 0.0701, + "step": 2016 + }, + { + "epoch": 2.7183288409703503, + "grad_norm": 10.678004222482402, + "learning_rate": 2.4937003590992787e-06, + "loss": 0.1116, + "step": 2017 + }, + { + "epoch": 2.719676549865229, + "grad_norm": 15.208325973641516, + "learning_rate": 2.4889791118526026e-06, + "loss": 0.0735, + "step": 2018 + }, + { + "epoch": 2.721024258760108, + "grad_norm": 12.189470334743396, + "learning_rate": 2.4842608566531873e-06, + "loss": 0.0604, + "step": 2019 + }, + { + "epoch": 2.7223719676549867, + "grad_norm": 24.910549884130678, + "learning_rate": 2.479545599123139e-06, + "loss": 0.0865, + "step": 2020 + }, + { + "epoch": 2.723719676549865, + "grad_norm": 2.044492785394739, + "learning_rate": 2.4748333448810013e-06, + "loss": 0.044, + "step": 2021 + }, + { + "epoch": 2.725067385444744, + "grad_norm": 21.227419797744272, + "learning_rate": 2.4701240995417353e-06, + "loss": 0.0761, + "step": 2022 + }, + { + "epoch": 2.7264150943396226, + "grad_norm": 11.84226862143415, + "learning_rate": 2.465417868716721e-06, + "loss": 0.0644, + "step": 2023 + }, + { + "epoch": 2.7277628032345014, + "grad_norm": 1.9195031878389617, + "learning_rate": 2.460714658013738e-06, + "loss": 0.0615, + "step": 2024 + }, + { + "epoch": 2.72911051212938, + "grad_norm": 9.85679984000092, + "learning_rate": 2.4560144730369757e-06, + "loss": 0.074, + "step": 2025 + }, + { + "epoch": 2.730458221024259, + "grad_norm": 3.6903618475393367, + "learning_rate": 2.4513173193870165e-06, + "loss": 0.0839, + "step": 2026 + }, + { + "epoch": 2.7318059299191377, + "grad_norm": 4.08620913463286, + "learning_rate": 2.4466232026608234e-06, + "loss": 0.0459, + "step": 2027 + }, + { + "epoch": 2.733153638814016, + "grad_norm": 10.269110006492419, + "learning_rate": 2.4419321284517544e-06, + "loss": 0.078, + "step": 2028 + }, + { + "epoch": 2.734501347708895, + "grad_norm": 13.216153626692229, + "learning_rate": 2.437244102349528e-06, + "loss": 0.0582, + "step": 2029 + }, + { + "epoch": 2.7358490566037736, + "grad_norm": 19.045440792313098, + "learning_rate": 2.43255912994024e-06, + "loss": 0.1129, + "step": 2030 + }, + { + "epoch": 2.7371967654986524, + "grad_norm": 13.423644368910283, + "learning_rate": 2.4278772168063436e-06, + "loss": 0.0719, + "step": 2031 + }, + { + "epoch": 2.7385444743935308, + "grad_norm": 10.936748271746845, + "learning_rate": 2.42319836852665e-06, + "loss": 0.094, + "step": 2032 + }, + { + "epoch": 2.7398921832884096, + "grad_norm": 7.638487010894083, + "learning_rate": 2.4185225906763086e-06, + "loss": 0.0559, + "step": 2033 + }, + { + "epoch": 2.7412398921832883, + "grad_norm": 6.771679973643614, + "learning_rate": 2.413849888826828e-06, + "loss": 0.0319, + "step": 2034 + }, + { + "epoch": 2.742587601078167, + "grad_norm": 2.6367756073724915, + "learning_rate": 2.4091802685460336e-06, + "loss": 0.0794, + "step": 2035 + }, + { + "epoch": 2.743935309973046, + "grad_norm": 4.563171620615371, + "learning_rate": 2.4045137353980885e-06, + "loss": 0.0979, + "step": 2036 + }, + { + "epoch": 2.7452830188679247, + "grad_norm": 4.7550991172247965, + "learning_rate": 2.399850294943477e-06, + "loss": 0.0644, + "step": 2037 + }, + { + "epoch": 2.7466307277628035, + "grad_norm": 5.327839412694109, + "learning_rate": 2.395189952738994e-06, + "loss": 0.0558, + "step": 2038 + }, + { + "epoch": 2.747978436657682, + "grad_norm": 14.566112662273918, + "learning_rate": 2.3905327143377448e-06, + "loss": 0.0803, + "step": 2039 + }, + { + "epoch": 2.7493261455525606, + "grad_norm": 10.471213151041947, + "learning_rate": 2.385878585289138e-06, + "loss": 0.0521, + "step": 2040 + }, + { + "epoch": 2.7506738544474394, + "grad_norm": 2.5073758117639975, + "learning_rate": 2.3812275711388777e-06, + "loss": 0.0636, + "step": 2041 + }, + { + "epoch": 2.752021563342318, + "grad_norm": 5.023334331593219, + "learning_rate": 2.3765796774289486e-06, + "loss": 0.0578, + "step": 2042 + }, + { + "epoch": 2.7533692722371965, + "grad_norm": 3.883467096709631, + "learning_rate": 2.3719349096976303e-06, + "loss": 0.0636, + "step": 2043 + }, + { + "epoch": 2.7547169811320753, + "grad_norm": 31.744270485198346, + "learning_rate": 2.367293273479465e-06, + "loss": 0.098, + "step": 2044 + }, + { + "epoch": 2.756064690026954, + "grad_norm": 5.496601213690694, + "learning_rate": 2.362654774305271e-06, + "loss": 0.0607, + "step": 2045 + }, + { + "epoch": 2.757412398921833, + "grad_norm": 6.987970533624463, + "learning_rate": 2.3580194177021252e-06, + "loss": 0.073, + "step": 2046 + }, + { + "epoch": 2.7587601078167117, + "grad_norm": 3.226452501490859, + "learning_rate": 2.353387209193365e-06, + "loss": 0.0588, + "step": 2047 + }, + { + "epoch": 2.7601078167115904, + "grad_norm": 6.843739530324069, + "learning_rate": 2.3487581542985676e-06, + "loss": 0.0611, + "step": 2048 + }, + { + "epoch": 2.7614555256064692, + "grad_norm": 2.0287693211904236, + "learning_rate": 2.34413225853356e-06, + "loss": 0.0574, + "step": 2049 + }, + { + "epoch": 2.7628032345013476, + "grad_norm": 12.538588423232184, + "learning_rate": 2.339509527410405e-06, + "loss": 0.0669, + "step": 2050 + }, + { + "epoch": 2.7641509433962264, + "grad_norm": 11.103032401735364, + "learning_rate": 2.334889966437386e-06, + "loss": 0.0779, + "step": 2051 + }, + { + "epoch": 2.765498652291105, + "grad_norm": 8.302328049448331, + "learning_rate": 2.3302735811190227e-06, + "loss": 0.0862, + "step": 2052 + }, + { + "epoch": 2.766846361185984, + "grad_norm": 5.668250321233335, + "learning_rate": 2.3256603769560366e-06, + "loss": 0.047, + "step": 2053 + }, + { + "epoch": 2.7681940700808623, + "grad_norm": 3.9857026030846447, + "learning_rate": 2.3210503594453684e-06, + "loss": 0.0812, + "step": 2054 + }, + { + "epoch": 2.769541778975741, + "grad_norm": 3.216116336623931, + "learning_rate": 2.3164435340801574e-06, + "loss": 0.0645, + "step": 2055 + }, + { + "epoch": 2.77088948787062, + "grad_norm": 14.015309876570198, + "learning_rate": 2.311839906349743e-06, + "loss": 0.0734, + "step": 2056 + }, + { + "epoch": 2.7722371967654986, + "grad_norm": 5.779778130507297, + "learning_rate": 2.3072394817396458e-06, + "loss": 0.0462, + "step": 2057 + }, + { + "epoch": 2.7735849056603774, + "grad_norm": 1.8195596770100049, + "learning_rate": 2.3026422657315833e-06, + "loss": 0.0803, + "step": 2058 + }, + { + "epoch": 2.774932614555256, + "grad_norm": 4.8468230160439205, + "learning_rate": 2.298048263803436e-06, + "loss": 0.0775, + "step": 2059 + }, + { + "epoch": 2.776280323450135, + "grad_norm": 9.702808651937595, + "learning_rate": 2.2934574814292627e-06, + "loss": 0.0746, + "step": 2060 + }, + { + "epoch": 2.7776280323450133, + "grad_norm": 5.276969700631638, + "learning_rate": 2.288869924079286e-06, + "loss": 0.074, + "step": 2061 + }, + { + "epoch": 2.778975741239892, + "grad_norm": 6.817848959735604, + "learning_rate": 2.2842855972198796e-06, + "loss": 0.0325, + "step": 2062 + }, + { + "epoch": 2.780323450134771, + "grad_norm": 2.468472393907127, + "learning_rate": 2.2797045063135737e-06, + "loss": 0.081, + "step": 2063 + }, + { + "epoch": 2.7816711590296497, + "grad_norm": 8.961334511197487, + "learning_rate": 2.2751266568190404e-06, + "loss": 0.0802, + "step": 2064 + }, + { + "epoch": 2.7830188679245285, + "grad_norm": 2.920229845516263, + "learning_rate": 2.2705520541910917e-06, + "loss": 0.057, + "step": 2065 + }, + { + "epoch": 2.784366576819407, + "grad_norm": 1.9984743108035141, + "learning_rate": 2.2659807038806644e-06, + "loss": 0.0682, + "step": 2066 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 13.73634222200607, + "learning_rate": 2.26141261133483e-06, + "loss": 0.08, + "step": 2067 + }, + { + "epoch": 2.7870619946091644, + "grad_norm": 4.664517334559496, + "learning_rate": 2.2568477819967678e-06, + "loss": 0.0473, + "step": 2068 + }, + { + "epoch": 2.788409703504043, + "grad_norm": 2.9294432531625687, + "learning_rate": 2.2522862213057754e-06, + "loss": 0.0837, + "step": 2069 + }, + { + "epoch": 2.789757412398922, + "grad_norm": 10.360568499850563, + "learning_rate": 2.247727934697254e-06, + "loss": 0.0748, + "step": 2070 + }, + { + "epoch": 2.7911051212938007, + "grad_norm": 9.28000483433563, + "learning_rate": 2.2431729276027043e-06, + "loss": 0.114, + "step": 2071 + }, + { + "epoch": 2.7924528301886795, + "grad_norm": 4.9527162838365815, + "learning_rate": 2.2386212054497146e-06, + "loss": 0.0703, + "step": 2072 + }, + { + "epoch": 2.793800539083558, + "grad_norm": 6.892020804208241, + "learning_rate": 2.2340727736619644e-06, + "loss": 0.0885, + "step": 2073 + }, + { + "epoch": 2.7951482479784366, + "grad_norm": 12.675723728998907, + "learning_rate": 2.229527637659213e-06, + "loss": 0.0978, + "step": 2074 + }, + { + "epoch": 2.7964959568733154, + "grad_norm": 3.076520016738387, + "learning_rate": 2.224985802857284e-06, + "loss": 0.0729, + "step": 2075 + }, + { + "epoch": 2.797843665768194, + "grad_norm": 3.7706436066815106, + "learning_rate": 2.2204472746680817e-06, + "loss": 0.0486, + "step": 2076 + }, + { + "epoch": 2.7991913746630726, + "grad_norm": 18.521865414848808, + "learning_rate": 2.2159120584995556e-06, + "loss": 0.0962, + "step": 2077 + }, + { + "epoch": 2.8005390835579513, + "grad_norm": 7.9318201798660475, + "learning_rate": 2.2113801597557184e-06, + "loss": 0.0829, + "step": 2078 + }, + { + "epoch": 2.80188679245283, + "grad_norm": 16.595898735928625, + "learning_rate": 2.2068515838366257e-06, + "loss": 0.0712, + "step": 2079 + }, + { + "epoch": 2.803234501347709, + "grad_norm": 11.012953530110384, + "learning_rate": 2.202326336138377e-06, + "loss": 0.0631, + "step": 2080 + }, + { + "epoch": 2.8045822102425877, + "grad_norm": 15.632301087614819, + "learning_rate": 2.1978044220530993e-06, + "loss": 0.0676, + "step": 2081 + }, + { + "epoch": 2.8059299191374665, + "grad_norm": 2.884814122068789, + "learning_rate": 2.193285846968958e-06, + "loss": 0.096, + "step": 2082 + }, + { + "epoch": 2.8072776280323453, + "grad_norm": 13.10736664813874, + "learning_rate": 2.1887706162701292e-06, + "loss": 0.0574, + "step": 2083 + }, + { + "epoch": 2.8086253369272236, + "grad_norm": 8.15508398760302, + "learning_rate": 2.18425873533681e-06, + "loss": 0.0678, + "step": 2084 + }, + { + "epoch": 2.8099730458221024, + "grad_norm": 3.6045107084597983, + "learning_rate": 2.1797502095452063e-06, + "loss": 0.052, + "step": 2085 + }, + { + "epoch": 2.811320754716981, + "grad_norm": 24.78006817865326, + "learning_rate": 2.1752450442675204e-06, + "loss": 0.0743, + "step": 2086 + }, + { + "epoch": 2.81266846361186, + "grad_norm": 6.155333166680704, + "learning_rate": 2.170743244871957e-06, + "loss": 0.0612, + "step": 2087 + }, + { + "epoch": 2.8140161725067383, + "grad_norm": 7.08088448550691, + "learning_rate": 2.1662448167227068e-06, + "loss": 0.0654, + "step": 2088 + }, + { + "epoch": 2.815363881401617, + "grad_norm": 5.818681094653226, + "learning_rate": 2.161749765179946e-06, + "loss": 0.0955, + "step": 2089 + }, + { + "epoch": 2.816711590296496, + "grad_norm": 15.048485786706832, + "learning_rate": 2.1572580955998202e-06, + "loss": 0.0977, + "step": 2090 + }, + { + "epoch": 2.8180592991913747, + "grad_norm": 20.578741256547936, + "learning_rate": 2.1527698133344578e-06, + "loss": 0.0879, + "step": 2091 + }, + { + "epoch": 2.8194070080862534, + "grad_norm": 27.963989710914447, + "learning_rate": 2.148284923731938e-06, + "loss": 0.112, + "step": 2092 + }, + { + "epoch": 2.8207547169811322, + "grad_norm": 6.962581420957632, + "learning_rate": 2.1438034321363044e-06, + "loss": 0.0702, + "step": 2093 + }, + { + "epoch": 2.822102425876011, + "grad_norm": 1.7862654070727138, + "learning_rate": 2.139325343887551e-06, + "loss": 0.0515, + "step": 2094 + }, + { + "epoch": 2.8234501347708894, + "grad_norm": 12.013520743800186, + "learning_rate": 2.134850664321617e-06, + "loss": 0.0887, + "step": 2095 + }, + { + "epoch": 2.824797843665768, + "grad_norm": 23.406244254394807, + "learning_rate": 2.130379398770375e-06, + "loss": 0.1039, + "step": 2096 + }, + { + "epoch": 2.826145552560647, + "grad_norm": 2.9584757382636813, + "learning_rate": 2.125911552561636e-06, + "loss": 0.0542, + "step": 2097 + }, + { + "epoch": 2.8274932614555257, + "grad_norm": 16.048032714358783, + "learning_rate": 2.121447131019134e-06, + "loss": 0.0664, + "step": 2098 + }, + { + "epoch": 2.828840970350404, + "grad_norm": 5.942210143929107, + "learning_rate": 2.1169861394625186e-06, + "loss": 0.0638, + "step": 2099 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 7.471861253679736, + "learning_rate": 2.1125285832073623e-06, + "loss": 0.0538, + "step": 2100 + }, + { + "epoch": 2.8315363881401616, + "grad_norm": 23.02237872760605, + "learning_rate": 2.108074467565132e-06, + "loss": 0.0353, + "step": 2101 + }, + { + "epoch": 2.8328840970350404, + "grad_norm": 10.580659892727125, + "learning_rate": 2.1036237978432034e-06, + "loss": 0.1139, + "step": 2102 + }, + { + "epoch": 2.834231805929919, + "grad_norm": 13.56523689961846, + "learning_rate": 2.099176579344843e-06, + "loss": 0.0857, + "step": 2103 + }, + { + "epoch": 2.835579514824798, + "grad_norm": 7.001429874937545, + "learning_rate": 2.094732817369207e-06, + "loss": 0.0843, + "step": 2104 + }, + { + "epoch": 2.8369272237196768, + "grad_norm": 13.529066375939529, + "learning_rate": 2.090292517211326e-06, + "loss": 0.0506, + "step": 2105 + }, + { + "epoch": 2.838274932614555, + "grad_norm": 15.283120800163262, + "learning_rate": 2.0858556841621187e-06, + "loss": 0.0513, + "step": 2106 + }, + { + "epoch": 2.839622641509434, + "grad_norm": 18.218203359922025, + "learning_rate": 2.081422323508358e-06, + "loss": 0.0654, + "step": 2107 + }, + { + "epoch": 2.8409703504043127, + "grad_norm": 2.274118786277428, + "learning_rate": 2.0769924405326896e-06, + "loss": 0.0426, + "step": 2108 + }, + { + "epoch": 2.8423180592991915, + "grad_norm": 6.209809542599675, + "learning_rate": 2.0725660405136123e-06, + "loss": 0.0446, + "step": 2109 + }, + { + "epoch": 2.8436657681940702, + "grad_norm": 6.414394828033482, + "learning_rate": 2.068143128725471e-06, + "loss": 0.0633, + "step": 2110 + }, + { + "epoch": 2.8450134770889486, + "grad_norm": 19.850815581405037, + "learning_rate": 2.063723710438459e-06, + "loss": 0.1061, + "step": 2111 + }, + { + "epoch": 2.8463611859838274, + "grad_norm": 17.92195958739503, + "learning_rate": 2.0593077909186047e-06, + "loss": 0.0594, + "step": 2112 + }, + { + "epoch": 2.847708894878706, + "grad_norm": 17.488921471281333, + "learning_rate": 2.05489537542777e-06, + "loss": 0.0947, + "step": 2113 + }, + { + "epoch": 2.849056603773585, + "grad_norm": 16.308876659954787, + "learning_rate": 2.050486469223634e-06, + "loss": 0.0626, + "step": 2114 + }, + { + "epoch": 2.8504043126684637, + "grad_norm": 13.256741466642033, + "learning_rate": 2.046081077559707e-06, + "loss": 0.0876, + "step": 2115 + }, + { + "epoch": 2.8517520215633425, + "grad_norm": 13.090900461638812, + "learning_rate": 2.0416792056852985e-06, + "loss": 0.0698, + "step": 2116 + }, + { + "epoch": 2.8530997304582213, + "grad_norm": 6.381402983010651, + "learning_rate": 2.0372808588455318e-06, + "loss": 0.0679, + "step": 2117 + }, + { + "epoch": 2.8544474393530996, + "grad_norm": 11.020836353290653, + "learning_rate": 2.032886042281327e-06, + "loss": 0.0516, + "step": 2118 + }, + { + "epoch": 2.8557951482479784, + "grad_norm": 3.7604728513825774, + "learning_rate": 2.0284947612294016e-06, + "loss": 0.0762, + "step": 2119 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 1.9868919124329782, + "learning_rate": 2.024107020922252e-06, + "loss": 0.0741, + "step": 2120 + }, + { + "epoch": 2.858490566037736, + "grad_norm": 8.67428163441566, + "learning_rate": 2.0197228265881622e-06, + "loss": 0.0778, + "step": 2121 + }, + { + "epoch": 2.8598382749326143, + "grad_norm": 2.249889001460133, + "learning_rate": 2.0153421834511927e-06, + "loss": 0.0629, + "step": 2122 + }, + { + "epoch": 2.861185983827493, + "grad_norm": 13.386181180407547, + "learning_rate": 2.010965096731163e-06, + "loss": 0.0483, + "step": 2123 + }, + { + "epoch": 2.862533692722372, + "grad_norm": 22.204057777568526, + "learning_rate": 2.0065915716436675e-06, + "loss": 0.0792, + "step": 2124 + }, + { + "epoch": 2.8638814016172507, + "grad_norm": 9.273835053641399, + "learning_rate": 2.0022216134000456e-06, + "loss": 0.0578, + "step": 2125 + }, + { + "epoch": 2.8652291105121295, + "grad_norm": 12.053633531667248, + "learning_rate": 1.997855227207393e-06, + "loss": 0.0755, + "step": 2126 + }, + { + "epoch": 2.8665768194070083, + "grad_norm": 15.137075162118652, + "learning_rate": 1.9934924182685474e-06, + "loss": 0.0672, + "step": 2127 + }, + { + "epoch": 2.867924528301887, + "grad_norm": 3.0694374573016265, + "learning_rate": 1.989133191782085e-06, + "loss": 0.0536, + "step": 2128 + }, + { + "epoch": 2.8692722371967654, + "grad_norm": 1.9899417998195532, + "learning_rate": 1.9847775529423076e-06, + "loss": 0.0454, + "step": 2129 + }, + { + "epoch": 2.870619946091644, + "grad_norm": 8.726965443256283, + "learning_rate": 1.980425506939253e-06, + "loss": 0.1004, + "step": 2130 + }, + { + "epoch": 2.871967654986523, + "grad_norm": 3.740544080430832, + "learning_rate": 1.9760770589586664e-06, + "loss": 0.0652, + "step": 2131 + }, + { + "epoch": 2.8733153638814017, + "grad_norm": 19.678639969474187, + "learning_rate": 1.971732214182013e-06, + "loss": 0.0897, + "step": 2132 + }, + { + "epoch": 2.87466307277628, + "grad_norm": 12.65606000720933, + "learning_rate": 1.967390977786463e-06, + "loss": 0.0599, + "step": 2133 + }, + { + "epoch": 2.876010781671159, + "grad_norm": 18.162662608789805, + "learning_rate": 1.963053354944884e-06, + "loss": 0.0813, + "step": 2134 + }, + { + "epoch": 2.8773584905660377, + "grad_norm": 10.594673685005686, + "learning_rate": 1.9587193508258415e-06, + "loss": 0.0691, + "step": 2135 + }, + { + "epoch": 2.8787061994609164, + "grad_norm": 1.4771370419334375, + "learning_rate": 1.9543889705935874e-06, + "loss": 0.0693, + "step": 2136 + }, + { + "epoch": 2.8800539083557952, + "grad_norm": 5.444469611389185, + "learning_rate": 1.950062219408058e-06, + "loss": 0.0771, + "step": 2137 + }, + { + "epoch": 2.881401617250674, + "grad_norm": 7.277887524043918, + "learning_rate": 1.9457391024248578e-06, + "loss": 0.0719, + "step": 2138 + }, + { + "epoch": 2.882749326145553, + "grad_norm": 6.815433555801158, + "learning_rate": 1.941419624795273e-06, + "loss": 0.0436, + "step": 2139 + }, + { + "epoch": 2.884097035040431, + "grad_norm": 11.90249055342095, + "learning_rate": 1.9371037916662417e-06, + "loss": 0.0951, + "step": 2140 + }, + { + "epoch": 2.88544474393531, + "grad_norm": 24.019291707840743, + "learning_rate": 1.9327916081803655e-06, + "loss": 0.0639, + "step": 2141 + }, + { + "epoch": 2.8867924528301887, + "grad_norm": 8.503012687884413, + "learning_rate": 1.9284830794758957e-06, + "loss": 0.0693, + "step": 2142 + }, + { + "epoch": 2.8881401617250675, + "grad_norm": 19.30798001917515, + "learning_rate": 1.924178210686731e-06, + "loss": 0.0955, + "step": 2143 + }, + { + "epoch": 2.889487870619946, + "grad_norm": 7.323450170022368, + "learning_rate": 1.919877006942404e-06, + "loss": 0.0463, + "step": 2144 + }, + { + "epoch": 2.8908355795148246, + "grad_norm": 16.477058739319627, + "learning_rate": 1.915579473368083e-06, + "loss": 0.1136, + "step": 2145 + }, + { + "epoch": 2.8921832884097034, + "grad_norm": 13.214829351053602, + "learning_rate": 1.911285615084567e-06, + "loss": 0.0602, + "step": 2146 + }, + { + "epoch": 2.893530997304582, + "grad_norm": 9.07227641006568, + "learning_rate": 1.906995437208265e-06, + "loss": 0.0732, + "step": 2147 + }, + { + "epoch": 2.894878706199461, + "grad_norm": 20.659956882757495, + "learning_rate": 1.9027089448512154e-06, + "loss": 0.0751, + "step": 2148 + }, + { + "epoch": 2.8962264150943398, + "grad_norm": 1.7086720792745933, + "learning_rate": 1.8984261431210505e-06, + "loss": 0.0767, + "step": 2149 + }, + { + "epoch": 2.8975741239892185, + "grad_norm": 3.247115474743604, + "learning_rate": 1.8941470371210146e-06, + "loss": 0.0775, + "step": 2150 + }, + { + "epoch": 2.898921832884097, + "grad_norm": 2.272201986902991, + "learning_rate": 1.8898716319499443e-06, + "loss": 0.06, + "step": 2151 + }, + { + "epoch": 2.9002695417789757, + "grad_norm": 16.377741381670834, + "learning_rate": 1.8855999327022695e-06, + "loss": 0.0609, + "step": 2152 + }, + { + "epoch": 2.9016172506738545, + "grad_norm": 6.2990796127661115, + "learning_rate": 1.8813319444679962e-06, + "loss": 0.0683, + "step": 2153 + }, + { + "epoch": 2.9029649595687332, + "grad_norm": 9.696748161629063, + "learning_rate": 1.8770676723327214e-06, + "loss": 0.0531, + "step": 2154 + }, + { + "epoch": 2.904312668463612, + "grad_norm": 14.50655541002873, + "learning_rate": 1.8728071213776028e-06, + "loss": 0.0591, + "step": 2155 + }, + { + "epoch": 2.9056603773584904, + "grad_norm": 24.097700711626334, + "learning_rate": 1.8685502966793684e-06, + "loss": 0.1073, + "step": 2156 + }, + { + "epoch": 2.907008086253369, + "grad_norm": 8.40624073868737, + "learning_rate": 1.864297203310309e-06, + "loss": 0.0715, + "step": 2157 + }, + { + "epoch": 2.908355795148248, + "grad_norm": 6.898445876215753, + "learning_rate": 1.8600478463382627e-06, + "loss": 0.0593, + "step": 2158 + }, + { + "epoch": 2.9097035040431267, + "grad_norm": 3.5354427601721636, + "learning_rate": 1.8558022308266204e-06, + "loss": 0.0536, + "step": 2159 + }, + { + "epoch": 2.9110512129380055, + "grad_norm": 2.940356786969647, + "learning_rate": 1.8515603618343131e-06, + "loss": 0.0664, + "step": 2160 + }, + { + "epoch": 2.9123989218328843, + "grad_norm": 17.377774251996353, + "learning_rate": 1.8473222444158107e-06, + "loss": 0.0783, + "step": 2161 + }, + { + "epoch": 2.913746630727763, + "grad_norm": 14.586581951963673, + "learning_rate": 1.8430878836211036e-06, + "loss": 0.054, + "step": 2162 + }, + { + "epoch": 2.9150943396226414, + "grad_norm": 2.7186775569057033, + "learning_rate": 1.8388572844957202e-06, + "loss": 0.0746, + "step": 2163 + }, + { + "epoch": 2.91644204851752, + "grad_norm": 5.758164403994922, + "learning_rate": 1.8346304520806936e-06, + "loss": 0.0722, + "step": 2164 + }, + { + "epoch": 2.917789757412399, + "grad_norm": 2.571795811319892, + "learning_rate": 1.8304073914125752e-06, + "loss": 0.0441, + "step": 2165 + }, + { + "epoch": 2.9191374663072778, + "grad_norm": 17.376030165506776, + "learning_rate": 1.8261881075234212e-06, + "loss": 0.069, + "step": 2166 + }, + { + "epoch": 2.920485175202156, + "grad_norm": 5.66509110864994, + "learning_rate": 1.8219726054407876e-06, + "loss": 0.0649, + "step": 2167 + }, + { + "epoch": 2.921832884097035, + "grad_norm": 7.058906313558427, + "learning_rate": 1.817760890187722e-06, + "loss": 0.0629, + "step": 2168 + }, + { + "epoch": 2.9231805929919137, + "grad_norm": 8.549647898448992, + "learning_rate": 1.813552966782761e-06, + "loss": 0.0769, + "step": 2169 + }, + { + "epoch": 2.9245283018867925, + "grad_norm": 2.4387449598481665, + "learning_rate": 1.8093488402399266e-06, + "loss": 0.0716, + "step": 2170 + }, + { + "epoch": 2.9258760107816713, + "grad_norm": 8.328777503839065, + "learning_rate": 1.805148515568708e-06, + "loss": 0.0593, + "step": 2171 + }, + { + "epoch": 2.92722371967655, + "grad_norm": 3.0120787758447243, + "learning_rate": 1.800951997774076e-06, + "loss": 0.0843, + "step": 2172 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 9.275219636262008, + "learning_rate": 1.796759291856453e-06, + "loss": 0.0785, + "step": 2173 + }, + { + "epoch": 2.929919137466307, + "grad_norm": 2.2148976135620537, + "learning_rate": 1.7925704028117275e-06, + "loss": 0.0668, + "step": 2174 + }, + { + "epoch": 2.931266846361186, + "grad_norm": 10.206706413488453, + "learning_rate": 1.7883853356312375e-06, + "loss": 0.1002, + "step": 2175 + }, + { + "epoch": 2.9326145552560647, + "grad_norm": 2.37407560749916, + "learning_rate": 1.7842040953017685e-06, + "loss": 0.0846, + "step": 2176 + }, + { + "epoch": 2.9339622641509435, + "grad_norm": 11.218100395719704, + "learning_rate": 1.7800266868055393e-06, + "loss": 0.0672, + "step": 2177 + }, + { + "epoch": 2.935309973045822, + "grad_norm": 2.193879796541628, + "learning_rate": 1.7758531151202157e-06, + "loss": 0.071, + "step": 2178 + }, + { + "epoch": 2.9366576819407006, + "grad_norm": 4.990136547931498, + "learning_rate": 1.771683385218878e-06, + "loss": 0.0594, + "step": 2179 + }, + { + "epoch": 2.9380053908355794, + "grad_norm": 20.454122745965, + "learning_rate": 1.7675175020700363e-06, + "loss": 0.0903, + "step": 2180 + }, + { + "epoch": 2.939353099730458, + "grad_norm": 14.818429329343411, + "learning_rate": 1.7633554706376182e-06, + "loss": 0.0473, + "step": 2181 + }, + { + "epoch": 2.940700808625337, + "grad_norm": 1.2451533753899264, + "learning_rate": 1.7591972958809556e-06, + "loss": 0.0648, + "step": 2182 + }, + { + "epoch": 2.942048517520216, + "grad_norm": 8.992569746404866, + "learning_rate": 1.7550429827547894e-06, + "loss": 0.0765, + "step": 2183 + }, + { + "epoch": 2.9433962264150946, + "grad_norm": 17.42693163149274, + "learning_rate": 1.7508925362092587e-06, + "loss": 0.0882, + "step": 2184 + }, + { + "epoch": 2.944743935309973, + "grad_norm": 4.761697495892718, + "learning_rate": 1.7467459611898962e-06, + "loss": 0.0786, + "step": 2185 + }, + { + "epoch": 2.9460916442048517, + "grad_norm": 13.971957969306356, + "learning_rate": 1.7426032626376145e-06, + "loss": 0.0787, + "step": 2186 + }, + { + "epoch": 2.9474393530997305, + "grad_norm": 5.415307889340228, + "learning_rate": 1.73846444548872e-06, + "loss": 0.0749, + "step": 2187 + }, + { + "epoch": 2.9487870619946093, + "grad_norm": 11.884852525268485, + "learning_rate": 1.734329514674881e-06, + "loss": 0.0863, + "step": 2188 + }, + { + "epoch": 2.9501347708894876, + "grad_norm": 22.79551760940323, + "learning_rate": 1.7301984751231432e-06, + "loss": 0.0584, + "step": 2189 + }, + { + "epoch": 2.9514824797843664, + "grad_norm": 3.6366615685875767, + "learning_rate": 1.7260713317559125e-06, + "loss": 0.0607, + "step": 2190 + }, + { + "epoch": 2.952830188679245, + "grad_norm": 1.7395789987643748, + "learning_rate": 1.7219480894909545e-06, + "loss": 0.091, + "step": 2191 + }, + { + "epoch": 2.954177897574124, + "grad_norm": 13.14914469119441, + "learning_rate": 1.7178287532413818e-06, + "loss": 0.085, + "step": 2192 + }, + { + "epoch": 2.9555256064690028, + "grad_norm": 14.8067428318028, + "learning_rate": 1.713713327915657e-06, + "loss": 0.0666, + "step": 2193 + }, + { + "epoch": 2.9568733153638815, + "grad_norm": 11.337801665750103, + "learning_rate": 1.7096018184175827e-06, + "loss": 0.0751, + "step": 2194 + }, + { + "epoch": 2.9582210242587603, + "grad_norm": 11.597880989024869, + "learning_rate": 1.7054942296462895e-06, + "loss": 0.0733, + "step": 2195 + }, + { + "epoch": 2.9595687331536387, + "grad_norm": 7.049368864419899, + "learning_rate": 1.7013905664962472e-06, + "loss": 0.0665, + "step": 2196 + }, + { + "epoch": 2.9609164420485174, + "grad_norm": 8.672669490490879, + "learning_rate": 1.6972908338572364e-06, + "loss": 0.0615, + "step": 2197 + }, + { + "epoch": 2.9622641509433962, + "grad_norm": 4.7068481936225846, + "learning_rate": 1.6931950366143612e-06, + "loss": 0.0625, + "step": 2198 + }, + { + "epoch": 2.963611859838275, + "grad_norm": 12.397099096146526, + "learning_rate": 1.689103179648035e-06, + "loss": 0.0597, + "step": 2199 + }, + { + "epoch": 2.964959568733154, + "grad_norm": 18.633585745883796, + "learning_rate": 1.6850152678339765e-06, + "loss": 0.0526, + "step": 2200 + }, + { + "epoch": 2.966307277628032, + "grad_norm": 6.132505264235483, + "learning_rate": 1.6809313060431982e-06, + "loss": 0.0766, + "step": 2201 + }, + { + "epoch": 2.967654986522911, + "grad_norm": 13.517589839271304, + "learning_rate": 1.6768512991420165e-06, + "loss": 0.0769, + "step": 2202 + }, + { + "epoch": 2.9690026954177897, + "grad_norm": 1.385162000465888, + "learning_rate": 1.6727752519920249e-06, + "loss": 0.0639, + "step": 2203 + }, + { + "epoch": 2.9703504043126685, + "grad_norm": 20.475529961097198, + "learning_rate": 1.6687031694501037e-06, + "loss": 0.0927, + "step": 2204 + }, + { + "epoch": 2.9716981132075473, + "grad_norm": 5.787287417202116, + "learning_rate": 1.6646350563684104e-06, + "loss": 0.073, + "step": 2205 + }, + { + "epoch": 2.973045822102426, + "grad_norm": 3.217028308667301, + "learning_rate": 1.660570917594367e-06, + "loss": 0.057, + "step": 2206 + }, + { + "epoch": 2.974393530997305, + "grad_norm": 16.228641259789992, + "learning_rate": 1.6565107579706651e-06, + "loss": 0.0622, + "step": 2207 + }, + { + "epoch": 2.975741239892183, + "grad_norm": 1.8958619651711741, + "learning_rate": 1.6524545823352527e-06, + "loss": 0.0818, + "step": 2208 + }, + { + "epoch": 2.977088948787062, + "grad_norm": 2.843042015976897, + "learning_rate": 1.648402395521333e-06, + "loss": 0.0783, + "step": 2209 + }, + { + "epoch": 2.9784366576819408, + "grad_norm": 3.580544167711689, + "learning_rate": 1.6443542023573494e-06, + "loss": 0.0705, + "step": 2210 + }, + { + "epoch": 2.9797843665768196, + "grad_norm": 5.285492580903268, + "learning_rate": 1.6403100076669976e-06, + "loss": 0.0483, + "step": 2211 + }, + { + "epoch": 2.981132075471698, + "grad_norm": 2.084773572837163, + "learning_rate": 1.6362698162691982e-06, + "loss": 0.087, + "step": 2212 + }, + { + "epoch": 2.9824797843665767, + "grad_norm": 3.70344758443421, + "learning_rate": 1.6322336329781075e-06, + "loss": 0.0996, + "step": 2213 + }, + { + "epoch": 2.9838274932614555, + "grad_norm": 36.225030211735906, + "learning_rate": 1.628201462603105e-06, + "loss": 0.0873, + "step": 2214 + }, + { + "epoch": 2.9851752021563343, + "grad_norm": 2.3050642899514853, + "learning_rate": 1.6241733099487888e-06, + "loss": 0.1157, + "step": 2215 + }, + { + "epoch": 2.986522911051213, + "grad_norm": 1.9003188343602448, + "learning_rate": 1.6201491798149666e-06, + "loss": 0.0581, + "step": 2216 + }, + { + "epoch": 2.987870619946092, + "grad_norm": 1.9778180135177568, + "learning_rate": 1.6161290769966565e-06, + "loss": 0.0798, + "step": 2217 + }, + { + "epoch": 2.9892183288409706, + "grad_norm": 14.811162302686128, + "learning_rate": 1.6121130062840779e-06, + "loss": 0.0526, + "step": 2218 + }, + { + "epoch": 2.990566037735849, + "grad_norm": 10.409537804386465, + "learning_rate": 1.6081009724626395e-06, + "loss": 0.0676, + "step": 2219 + }, + { + "epoch": 2.9919137466307277, + "grad_norm": 15.855820777040574, + "learning_rate": 1.6040929803129513e-06, + "loss": 0.0784, + "step": 2220 + }, + { + "epoch": 2.9932614555256065, + "grad_norm": 2.696378169304497, + "learning_rate": 1.600089034610796e-06, + "loss": 0.0655, + "step": 2221 + }, + { + "epoch": 2.9946091644204853, + "grad_norm": 3.4225187104515618, + "learning_rate": 1.5960891401271412e-06, + "loss": 0.0791, + "step": 2222 + }, + { + "epoch": 2.9959568733153636, + "grad_norm": 3.466536632368798, + "learning_rate": 1.5920933016281242e-06, + "loss": 0.0688, + "step": 2223 + }, + { + "epoch": 2.9973045822102424, + "grad_norm": 3.5461698705792593, + "learning_rate": 1.5881015238750536e-06, + "loss": 0.0876, + "step": 2224 + }, + { + "epoch": 2.998652291105121, + "grad_norm": 10.385657561238977, + "learning_rate": 1.5841138116243927e-06, + "loss": 0.0506, + "step": 2225 + }, + { + "epoch": 3.0, + "grad_norm": 7.905881247585874, + "learning_rate": 1.5801301696277643e-06, + "loss": 0.0558, + "step": 2226 + }, + { + "epoch": 3.001347708894879, + "grad_norm": 11.847657078363293, + "learning_rate": 1.576150602631943e-06, + "loss": 0.0386, + "step": 2227 + }, + { + "epoch": 3.0026954177897576, + "grad_norm": 1.823871532788187, + "learning_rate": 1.5721751153788444e-06, + "loss": 0.0708, + "step": 2228 + }, + { + "epoch": 3.004043126684636, + "grad_norm": 28.694038000300903, + "learning_rate": 1.5682037126055267e-06, + "loss": 0.0394, + "step": 2229 + }, + { + "epoch": 3.0053908355795147, + "grad_norm": 1.5892711026575215, + "learning_rate": 1.5642363990441745e-06, + "loss": 0.0574, + "step": 2230 + }, + { + "epoch": 3.0067385444743935, + "grad_norm": 19.00624834117741, + "learning_rate": 1.560273179422106e-06, + "loss": 0.0691, + "step": 2231 + }, + { + "epoch": 3.0080862533692723, + "grad_norm": 34.15943665267077, + "learning_rate": 1.5563140584617592e-06, + "loss": 0.0598, + "step": 2232 + }, + { + "epoch": 3.009433962264151, + "grad_norm": 4.982234445510488, + "learning_rate": 1.5523590408806898e-06, + "loss": 0.0577, + "step": 2233 + }, + { + "epoch": 3.01078167115903, + "grad_norm": 15.106540153457408, + "learning_rate": 1.5484081313915577e-06, + "loss": 0.0651, + "step": 2234 + }, + { + "epoch": 3.012129380053908, + "grad_norm": 1.386888601074312, + "learning_rate": 1.5444613347021392e-06, + "loss": 0.0843, + "step": 2235 + }, + { + "epoch": 3.013477088948787, + "grad_norm": 15.382020243073875, + "learning_rate": 1.5405186555152983e-06, + "loss": 0.0439, + "step": 2236 + }, + { + "epoch": 3.0148247978436657, + "grad_norm": 4.776662838515192, + "learning_rate": 1.5365800985289992e-06, + "loss": 0.0511, + "step": 2237 + }, + { + "epoch": 3.0161725067385445, + "grad_norm": 9.89683265223634, + "learning_rate": 1.5326456684362923e-06, + "loss": 0.0466, + "step": 2238 + }, + { + "epoch": 3.0175202156334233, + "grad_norm": 2.2724007113894795, + "learning_rate": 1.5287153699253132e-06, + "loss": 0.07, + "step": 2239 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 18.25295640097969, + "learning_rate": 1.524789207679269e-06, + "loss": 0.0632, + "step": 2240 + }, + { + "epoch": 3.0202156334231804, + "grad_norm": 22.090263134779484, + "learning_rate": 1.5208671863764423e-06, + "loss": 0.06, + "step": 2241 + }, + { + "epoch": 3.0215633423180592, + "grad_norm": 22.997881378413954, + "learning_rate": 1.5169493106901834e-06, + "loss": 0.0441, + "step": 2242 + }, + { + "epoch": 3.022911051212938, + "grad_norm": 9.148099128307923, + "learning_rate": 1.5130355852888935e-06, + "loss": 0.0501, + "step": 2243 + }, + { + "epoch": 3.024258760107817, + "grad_norm": 5.70674044590301, + "learning_rate": 1.5091260148360425e-06, + "loss": 0.0385, + "step": 2244 + }, + { + "epoch": 3.0256064690026956, + "grad_norm": 14.867302870334347, + "learning_rate": 1.5052206039901367e-06, + "loss": 0.0626, + "step": 2245 + }, + { + "epoch": 3.026954177897574, + "grad_norm": 12.19612367040633, + "learning_rate": 1.501319357404733e-06, + "loss": 0.0411, + "step": 2246 + }, + { + "epoch": 3.0283018867924527, + "grad_norm": 17.73468465686651, + "learning_rate": 1.4974222797284243e-06, + "loss": 0.0751, + "step": 2247 + }, + { + "epoch": 3.0296495956873315, + "grad_norm": 6.750383437426294, + "learning_rate": 1.4935293756048376e-06, + "loss": 0.0527, + "step": 2248 + }, + { + "epoch": 3.0309973045822103, + "grad_norm": 11.879306511047423, + "learning_rate": 1.4896406496726217e-06, + "loss": 0.041, + "step": 2249 + }, + { + "epoch": 3.032345013477089, + "grad_norm": 3.5464186955994874, + "learning_rate": 1.4857561065654523e-06, + "loss": 0.0411, + "step": 2250 + }, + { + "epoch": 3.033692722371968, + "grad_norm": 1.8528651592891556, + "learning_rate": 1.4818757509120197e-06, + "loss": 0.0542, + "step": 2251 + }, + { + "epoch": 3.035040431266846, + "grad_norm": 3.1380991214009932, + "learning_rate": 1.477999587336023e-06, + "loss": 0.0786, + "step": 2252 + }, + { + "epoch": 3.036388140161725, + "grad_norm": 7.182088746343587, + "learning_rate": 1.4741276204561694e-06, + "loss": 0.0864, + "step": 2253 + }, + { + "epoch": 3.0377358490566038, + "grad_norm": 12.76043761499442, + "learning_rate": 1.4702598548861597e-06, + "loss": 0.0914, + "step": 2254 + }, + { + "epoch": 3.0390835579514826, + "grad_norm": 2.485699018593833, + "learning_rate": 1.4663962952346938e-06, + "loss": 0.0434, + "step": 2255 + }, + { + "epoch": 3.0404312668463613, + "grad_norm": 6.993402206417582, + "learning_rate": 1.4625369461054583e-06, + "loss": 0.0416, + "step": 2256 + }, + { + "epoch": 3.0417789757412397, + "grad_norm": 3.0697896099980793, + "learning_rate": 1.4586818120971225e-06, + "loss": 0.0551, + "step": 2257 + }, + { + "epoch": 3.0431266846361185, + "grad_norm": 3.086746327688781, + "learning_rate": 1.4548308978033337e-06, + "loss": 0.0468, + "step": 2258 + }, + { + "epoch": 3.0444743935309972, + "grad_norm": 7.416585214474408, + "learning_rate": 1.4509842078127111e-06, + "loss": 0.0576, + "step": 2259 + }, + { + "epoch": 3.045822102425876, + "grad_norm": 13.339825802774138, + "learning_rate": 1.4471417467088377e-06, + "loss": 0.0503, + "step": 2260 + }, + { + "epoch": 3.047169811320755, + "grad_norm": 10.710229661019131, + "learning_rate": 1.4433035190702616e-06, + "loss": 0.0413, + "step": 2261 + }, + { + "epoch": 3.0485175202156336, + "grad_norm": 6.619908250967464, + "learning_rate": 1.4394695294704837e-06, + "loss": 0.0542, + "step": 2262 + }, + { + "epoch": 3.049865229110512, + "grad_norm": 15.847408133807248, + "learning_rate": 1.435639782477956e-06, + "loss": 0.06, + "step": 2263 + }, + { + "epoch": 3.0512129380053907, + "grad_norm": 4.554493188361118, + "learning_rate": 1.4318142826560771e-06, + "loss": 0.0365, + "step": 2264 + }, + { + "epoch": 3.0525606469002695, + "grad_norm": 15.306675593507903, + "learning_rate": 1.4279930345631794e-06, + "loss": 0.055, + "step": 2265 + }, + { + "epoch": 3.0539083557951483, + "grad_norm": 7.922824175902406, + "learning_rate": 1.4241760427525337e-06, + "loss": 0.0408, + "step": 2266 + }, + { + "epoch": 3.055256064690027, + "grad_norm": 20.53119754011293, + "learning_rate": 1.4203633117723382e-06, + "loss": 0.0472, + "step": 2267 + }, + { + "epoch": 3.056603773584906, + "grad_norm": 2.5624735430018832, + "learning_rate": 1.4165548461657146e-06, + "loss": 0.053, + "step": 2268 + }, + { + "epoch": 3.057951482479784, + "grad_norm": 17.879254072028967, + "learning_rate": 1.4127506504706979e-06, + "loss": 0.0429, + "step": 2269 + }, + { + "epoch": 3.059299191374663, + "grad_norm": 10.00564175856931, + "learning_rate": 1.408950729220243e-06, + "loss": 0.0561, + "step": 2270 + }, + { + "epoch": 3.060646900269542, + "grad_norm": 2.475621801663553, + "learning_rate": 1.4051550869422043e-06, + "loss": 0.0668, + "step": 2271 + }, + { + "epoch": 3.0619946091644206, + "grad_norm": 12.606346580580643, + "learning_rate": 1.4013637281593406e-06, + "loss": 0.0478, + "step": 2272 + }, + { + "epoch": 3.0633423180592994, + "grad_norm": 8.45714101444673, + "learning_rate": 1.3975766573893085e-06, + "loss": 0.0453, + "step": 2273 + }, + { + "epoch": 3.0646900269541777, + "grad_norm": 10.4824973859782, + "learning_rate": 1.3937938791446493e-06, + "loss": 0.0393, + "step": 2274 + }, + { + "epoch": 3.0660377358490565, + "grad_norm": 15.38692910777839, + "learning_rate": 1.3900153979327951e-06, + "loss": 0.0478, + "step": 2275 + }, + { + "epoch": 3.0673854447439353, + "grad_norm": 2.6078270177090355, + "learning_rate": 1.386241218256056e-06, + "loss": 0.0595, + "step": 2276 + }, + { + "epoch": 3.068733153638814, + "grad_norm": 14.98873512465596, + "learning_rate": 1.3824713446116178e-06, + "loss": 0.0579, + "step": 2277 + }, + { + "epoch": 3.070080862533693, + "grad_norm": 9.934084931303401, + "learning_rate": 1.378705781491529e-06, + "loss": 0.0383, + "step": 2278 + }, + { + "epoch": 3.0714285714285716, + "grad_norm": 1.0990265961674182, + "learning_rate": 1.3749445333827132e-06, + "loss": 0.0518, + "step": 2279 + }, + { + "epoch": 3.07277628032345, + "grad_norm": 4.999239988385977, + "learning_rate": 1.3711876047669416e-06, + "loss": 0.046, + "step": 2280 + }, + { + "epoch": 3.0741239892183287, + "grad_norm": 1.3130418220032425, + "learning_rate": 1.3674350001208442e-06, + "loss": 0.0351, + "step": 2281 + }, + { + "epoch": 3.0754716981132075, + "grad_norm": 12.247936785425988, + "learning_rate": 1.363686723915897e-06, + "loss": 0.0718, + "step": 2282 + }, + { + "epoch": 3.0768194070080863, + "grad_norm": 3.943813133148743, + "learning_rate": 1.3599427806184207e-06, + "loss": 0.0438, + "step": 2283 + }, + { + "epoch": 3.078167115902965, + "grad_norm": 5.7029415463776125, + "learning_rate": 1.3562031746895677e-06, + "loss": 0.0484, + "step": 2284 + }, + { + "epoch": 3.079514824797844, + "grad_norm": 16.531822560011673, + "learning_rate": 1.3524679105853267e-06, + "loss": 0.0589, + "step": 2285 + }, + { + "epoch": 3.0808625336927222, + "grad_norm": 17.778610101895737, + "learning_rate": 1.3487369927565125e-06, + "loss": 0.054, + "step": 2286 + }, + { + "epoch": 3.082210242587601, + "grad_norm": 14.955956721693665, + "learning_rate": 1.3450104256487595e-06, + "loss": 0.0346, + "step": 2287 + }, + { + "epoch": 3.08355795148248, + "grad_norm": 8.724956990616572, + "learning_rate": 1.3412882137025201e-06, + "loss": 0.0409, + "step": 2288 + }, + { + "epoch": 3.0849056603773586, + "grad_norm": 9.440727895442082, + "learning_rate": 1.3375703613530527e-06, + "loss": 0.047, + "step": 2289 + }, + { + "epoch": 3.0862533692722374, + "grad_norm": 2.971879624715676, + "learning_rate": 1.3338568730304263e-06, + "loss": 0.0536, + "step": 2290 + }, + { + "epoch": 3.0876010781671157, + "grad_norm": 2.274409139706279, + "learning_rate": 1.3301477531595063e-06, + "loss": 0.0533, + "step": 2291 + }, + { + "epoch": 3.0889487870619945, + "grad_norm": 11.754024925829464, + "learning_rate": 1.3264430061599559e-06, + "loss": 0.0491, + "step": 2292 + }, + { + "epoch": 3.0902964959568733, + "grad_norm": 5.644020031326531, + "learning_rate": 1.322742636446222e-06, + "loss": 0.0542, + "step": 2293 + }, + { + "epoch": 3.091644204851752, + "grad_norm": 21.31976748028793, + "learning_rate": 1.3190466484275443e-06, + "loss": 0.0798, + "step": 2294 + }, + { + "epoch": 3.092991913746631, + "grad_norm": 13.343955324840456, + "learning_rate": 1.315355046507934e-06, + "loss": 0.0368, + "step": 2295 + }, + { + "epoch": 3.0943396226415096, + "grad_norm": 8.734046724248618, + "learning_rate": 1.3116678350861784e-06, + "loss": 0.0393, + "step": 2296 + }, + { + "epoch": 3.095687331536388, + "grad_norm": 3.0183495778065335, + "learning_rate": 1.3079850185558356e-06, + "loss": 0.0439, + "step": 2297 + }, + { + "epoch": 3.0970350404312668, + "grad_norm": 1.51368181667962, + "learning_rate": 1.3043066013052218e-06, + "loss": 0.0623, + "step": 2298 + }, + { + "epoch": 3.0983827493261455, + "grad_norm": 4.025212392257419, + "learning_rate": 1.3006325877174164e-06, + "loss": 0.0522, + "step": 2299 + }, + { + "epoch": 3.0997304582210243, + "grad_norm": 9.809798534051707, + "learning_rate": 1.296962982170248e-06, + "loss": 0.0462, + "step": 2300 + }, + { + "epoch": 3.101078167115903, + "grad_norm": 17.91813848960362, + "learning_rate": 1.2932977890362957e-06, + "loss": 0.058, + "step": 2301 + }, + { + "epoch": 3.1024258760107815, + "grad_norm": 4.256662665836097, + "learning_rate": 1.2896370126828755e-06, + "loss": 0.0334, + "step": 2302 + }, + { + "epoch": 3.1037735849056602, + "grad_norm": 22.36376948690144, + "learning_rate": 1.28598065747205e-06, + "loss": 0.0659, + "step": 2303 + }, + { + "epoch": 3.105121293800539, + "grad_norm": 4.667168907963304, + "learning_rate": 1.2823287277606029e-06, + "loss": 0.0442, + "step": 2304 + }, + { + "epoch": 3.106469002695418, + "grad_norm": 2.399419219577551, + "learning_rate": 1.278681227900052e-06, + "loss": 0.062, + "step": 2305 + }, + { + "epoch": 3.1078167115902966, + "grad_norm": 1.6906101133527092, + "learning_rate": 1.2750381622366337e-06, + "loss": 0.0535, + "step": 2306 + }, + { + "epoch": 3.1091644204851754, + "grad_norm": 7.216306421150174, + "learning_rate": 1.2713995351113028e-06, + "loss": 0.0642, + "step": 2307 + }, + { + "epoch": 3.1105121293800537, + "grad_norm": 1.463970217732986, + "learning_rate": 1.2677653508597215e-06, + "loss": 0.0475, + "step": 2308 + }, + { + "epoch": 3.1118598382749325, + "grad_norm": 5.155039177289986, + "learning_rate": 1.2641356138122612e-06, + "loss": 0.0587, + "step": 2309 + }, + { + "epoch": 3.1132075471698113, + "grad_norm": 22.340583364589023, + "learning_rate": 1.2605103282939952e-06, + "loss": 0.049, + "step": 2310 + }, + { + "epoch": 3.11455525606469, + "grad_norm": 7.997614428922442, + "learning_rate": 1.2568894986246866e-06, + "loss": 0.0711, + "step": 2311 + }, + { + "epoch": 3.115902964959569, + "grad_norm": 8.15467960552772, + "learning_rate": 1.2532731291187982e-06, + "loss": 0.0657, + "step": 2312 + }, + { + "epoch": 3.1172506738544477, + "grad_norm": 15.660588369253082, + "learning_rate": 1.2496612240854695e-06, + "loss": 0.0438, + "step": 2313 + }, + { + "epoch": 3.118598382749326, + "grad_norm": 7.803263739642561, + "learning_rate": 1.246053787828525e-06, + "loss": 0.0741, + "step": 2314 + }, + { + "epoch": 3.1199460916442048, + "grad_norm": 2.8768625384182567, + "learning_rate": 1.2424508246464635e-06, + "loss": 0.0542, + "step": 2315 + }, + { + "epoch": 3.1212938005390836, + "grad_norm": 3.284957464396023, + "learning_rate": 1.2388523388324547e-06, + "loss": 0.0309, + "step": 2316 + }, + { + "epoch": 3.1226415094339623, + "grad_norm": 1.2378573099635997, + "learning_rate": 1.235258334674328e-06, + "loss": 0.0513, + "step": 2317 + }, + { + "epoch": 3.123989218328841, + "grad_norm": 11.51505076057881, + "learning_rate": 1.2316688164545826e-06, + "loss": 0.0294, + "step": 2318 + }, + { + "epoch": 3.1253369272237195, + "grad_norm": 1.335666259504269, + "learning_rate": 1.2280837884503621e-06, + "loss": 0.0542, + "step": 2319 + }, + { + "epoch": 3.1266846361185983, + "grad_norm": 2.1033696636062613, + "learning_rate": 1.2245032549334661e-06, + "loss": 0.0508, + "step": 2320 + }, + { + "epoch": 3.128032345013477, + "grad_norm": 11.04585812254537, + "learning_rate": 1.2209272201703382e-06, + "loss": 0.0588, + "step": 2321 + }, + { + "epoch": 3.129380053908356, + "grad_norm": 13.403895193992128, + "learning_rate": 1.2173556884220562e-06, + "loss": 0.062, + "step": 2322 + }, + { + "epoch": 3.1307277628032346, + "grad_norm": 5.962566949477347, + "learning_rate": 1.2137886639443386e-06, + "loss": 0.0361, + "step": 2323 + }, + { + "epoch": 3.1320754716981134, + "grad_norm": 9.421441575995734, + "learning_rate": 1.2102261509875302e-06, + "loss": 0.0441, + "step": 2324 + }, + { + "epoch": 3.1334231805929917, + "grad_norm": 17.191856594359535, + "learning_rate": 1.2066681537966019e-06, + "loss": 0.0374, + "step": 2325 + }, + { + "epoch": 3.1347708894878705, + "grad_norm": 11.73766851078867, + "learning_rate": 1.2031146766111386e-06, + "loss": 0.0473, + "step": 2326 + }, + { + "epoch": 3.1361185983827493, + "grad_norm": 2.919812452816608, + "learning_rate": 1.199565723665348e-06, + "loss": 0.0357, + "step": 2327 + }, + { + "epoch": 3.137466307277628, + "grad_norm": 6.165800527800513, + "learning_rate": 1.1960212991880383e-06, + "loss": 0.0335, + "step": 2328 + }, + { + "epoch": 3.138814016172507, + "grad_norm": 2.315094131257654, + "learning_rate": 1.1924814074026263e-06, + "loss": 0.0336, + "step": 2329 + }, + { + "epoch": 3.1401617250673857, + "grad_norm": 4.473050341420013, + "learning_rate": 1.188946052527128e-06, + "loss": 0.064, + "step": 2330 + }, + { + "epoch": 3.141509433962264, + "grad_norm": 5.350904792791169, + "learning_rate": 1.1854152387741525e-06, + "loss": 0.0398, + "step": 2331 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 3.4360507651612133, + "learning_rate": 1.1818889703508951e-06, + "loss": 0.0555, + "step": 2332 + }, + { + "epoch": 3.1442048517520216, + "grad_norm": 20.74014338579958, + "learning_rate": 1.1783672514591388e-06, + "loss": 0.0599, + "step": 2333 + }, + { + "epoch": 3.1455525606469004, + "grad_norm": 19.15026882406858, + "learning_rate": 1.1748500862952466e-06, + "loss": 0.0544, + "step": 2334 + }, + { + "epoch": 3.146900269541779, + "grad_norm": 9.597284516711676, + "learning_rate": 1.171337479050148e-06, + "loss": 0.0435, + "step": 2335 + }, + { + "epoch": 3.1482479784366575, + "grad_norm": 5.4972555314609295, + "learning_rate": 1.1678294339093521e-06, + "loss": 0.0573, + "step": 2336 + }, + { + "epoch": 3.1495956873315363, + "grad_norm": 2.992127344996407, + "learning_rate": 1.1643259550529229e-06, + "loss": 0.0639, + "step": 2337 + }, + { + "epoch": 3.150943396226415, + "grad_norm": 1.8088762158773508, + "learning_rate": 1.1608270466554883e-06, + "loss": 0.0569, + "step": 2338 + }, + { + "epoch": 3.152291105121294, + "grad_norm": 3.809063619498572, + "learning_rate": 1.1573327128862277e-06, + "loss": 0.0326, + "step": 2339 + }, + { + "epoch": 3.1536388140161726, + "grad_norm": 4.944924992346102, + "learning_rate": 1.1538429579088733e-06, + "loss": 0.0446, + "step": 2340 + }, + { + "epoch": 3.1549865229110514, + "grad_norm": 2.8742860803929537, + "learning_rate": 1.1503577858816939e-06, + "loss": 0.0414, + "step": 2341 + }, + { + "epoch": 3.1563342318059298, + "grad_norm": 9.809092986825277, + "learning_rate": 1.1468772009575075e-06, + "loss": 0.0328, + "step": 2342 + }, + { + "epoch": 3.1576819407008085, + "grad_norm": 21.94852919797332, + "learning_rate": 1.143401207283657e-06, + "loss": 0.0457, + "step": 2343 + }, + { + "epoch": 3.1590296495956873, + "grad_norm": 6.834277061981874, + "learning_rate": 1.1399298090020205e-06, + "loss": 0.0425, + "step": 2344 + }, + { + "epoch": 3.160377358490566, + "grad_norm": 10.380011893537077, + "learning_rate": 1.1364630102489988e-06, + "loss": 0.0542, + "step": 2345 + }, + { + "epoch": 3.161725067385445, + "grad_norm": 13.497808203576458, + "learning_rate": 1.1330008151555088e-06, + "loss": 0.0483, + "step": 2346 + }, + { + "epoch": 3.1630727762803232, + "grad_norm": 4.639333906364389, + "learning_rate": 1.129543227846987e-06, + "loss": 0.0423, + "step": 2347 + }, + { + "epoch": 3.164420485175202, + "grad_norm": 3.920892376640188, + "learning_rate": 1.1260902524433765e-06, + "loss": 0.0714, + "step": 2348 + }, + { + "epoch": 3.165768194070081, + "grad_norm": 8.769190150729289, + "learning_rate": 1.1226418930591266e-06, + "loss": 0.0535, + "step": 2349 + }, + { + "epoch": 3.1671159029649596, + "grad_norm": 3.23634590165451, + "learning_rate": 1.119198153803182e-06, + "loss": 0.0495, + "step": 2350 + }, + { + "epoch": 3.1684636118598384, + "grad_norm": 7.700374811921183, + "learning_rate": 1.1157590387789902e-06, + "loss": 0.0696, + "step": 2351 + }, + { + "epoch": 3.169811320754717, + "grad_norm": 2.1174019701467657, + "learning_rate": 1.1123245520844806e-06, + "loss": 0.0484, + "step": 2352 + }, + { + "epoch": 3.1711590296495955, + "grad_norm": 7.754035996633191, + "learning_rate": 1.1088946978120713e-06, + "loss": 0.0357, + "step": 2353 + }, + { + "epoch": 3.1725067385444743, + "grad_norm": 7.551112621402877, + "learning_rate": 1.1054694800486609e-06, + "loss": 0.0357, + "step": 2354 + }, + { + "epoch": 3.173854447439353, + "grad_norm": 6.2160162167186135, + "learning_rate": 1.1020489028756243e-06, + "loss": 0.078, + "step": 2355 + }, + { + "epoch": 3.175202156334232, + "grad_norm": 3.0105565241199077, + "learning_rate": 1.098632970368802e-06, + "loss": 0.0599, + "step": 2356 + }, + { + "epoch": 3.1765498652291106, + "grad_norm": 1.5867677341975188, + "learning_rate": 1.0952216865985044e-06, + "loss": 0.0409, + "step": 2357 + }, + { + "epoch": 3.177897574123989, + "grad_norm": 10.444167243674373, + "learning_rate": 1.0918150556295032e-06, + "loss": 0.0377, + "step": 2358 + }, + { + "epoch": 3.1792452830188678, + "grad_norm": 11.679293241121863, + "learning_rate": 1.0884130815210199e-06, + "loss": 0.0705, + "step": 2359 + }, + { + "epoch": 3.1805929919137466, + "grad_norm": 16.314594730775966, + "learning_rate": 1.085015768326737e-06, + "loss": 0.0411, + "step": 2360 + }, + { + "epoch": 3.1819407008086253, + "grad_norm": 6.08241030685146, + "learning_rate": 1.081623120094773e-06, + "loss": 0.0484, + "step": 2361 + }, + { + "epoch": 3.183288409703504, + "grad_norm": 1.1327822295438397, + "learning_rate": 1.0782351408676945e-06, + "loss": 0.0335, + "step": 2362 + }, + { + "epoch": 3.184636118598383, + "grad_norm": 3.8247594860377845, + "learning_rate": 1.0748518346825021e-06, + "loss": 0.0354, + "step": 2363 + }, + { + "epoch": 3.1859838274932613, + "grad_norm": 8.717024054117074, + "learning_rate": 1.0714732055706301e-06, + "loss": 0.0476, + "step": 2364 + }, + { + "epoch": 3.18733153638814, + "grad_norm": 10.086730711476418, + "learning_rate": 1.0680992575579336e-06, + "loss": 0.0419, + "step": 2365 + }, + { + "epoch": 3.188679245283019, + "grad_norm": 1.8857212605413955, + "learning_rate": 1.064729994664701e-06, + "loss": 0.0393, + "step": 2366 + }, + { + "epoch": 3.1900269541778976, + "grad_norm": 11.370658989636857, + "learning_rate": 1.0613654209056273e-06, + "loss": 0.0589, + "step": 2367 + }, + { + "epoch": 3.1913746630727764, + "grad_norm": 9.422316872610203, + "learning_rate": 1.0580055402898249e-06, + "loss": 0.0528, + "step": 2368 + }, + { + "epoch": 3.192722371967655, + "grad_norm": 6.733280570575204, + "learning_rate": 1.0546503568208155e-06, + "loss": 0.0464, + "step": 2369 + }, + { + "epoch": 3.1940700808625335, + "grad_norm": 10.905644708636277, + "learning_rate": 1.0512998744965192e-06, + "loss": 0.0476, + "step": 2370 + }, + { + "epoch": 3.1954177897574123, + "grad_norm": 15.537445589118569, + "learning_rate": 1.0479540973092583e-06, + "loss": 0.0455, + "step": 2371 + }, + { + "epoch": 3.196765498652291, + "grad_norm": 10.812820453904068, + "learning_rate": 1.0446130292457468e-06, + "loss": 0.0458, + "step": 2372 + }, + { + "epoch": 3.19811320754717, + "grad_norm": 2.6410394504271597, + "learning_rate": 1.04127667428709e-06, + "loss": 0.0641, + "step": 2373 + }, + { + "epoch": 3.1994609164420487, + "grad_norm": 1.6193150550927065, + "learning_rate": 1.0379450364087713e-06, + "loss": 0.0532, + "step": 2374 + }, + { + "epoch": 3.2008086253369274, + "grad_norm": 1.6620991763524635, + "learning_rate": 1.0346181195806614e-06, + "loss": 0.0613, + "step": 2375 + }, + { + "epoch": 3.202156334231806, + "grad_norm": 1.7546532877032792, + "learning_rate": 1.0312959277669993e-06, + "loss": 0.0616, + "step": 2376 + }, + { + "epoch": 3.2035040431266846, + "grad_norm": 6.116370633283004, + "learning_rate": 1.0279784649263957e-06, + "loss": 0.0224, + "step": 2377 + }, + { + "epoch": 3.2048517520215634, + "grad_norm": 2.628964460766287, + "learning_rate": 1.0246657350118278e-06, + "loss": 0.0466, + "step": 2378 + }, + { + "epoch": 3.206199460916442, + "grad_norm": 6.118416581073213, + "learning_rate": 1.0213577419706333e-06, + "loss": 0.0502, + "step": 2379 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 2.128612295754462, + "learning_rate": 1.0180544897445011e-06, + "loss": 0.0542, + "step": 2380 + }, + { + "epoch": 3.2088948787061993, + "grad_norm": 11.622809066846543, + "learning_rate": 1.0147559822694763e-06, + "loss": 0.0461, + "step": 2381 + }, + { + "epoch": 3.210242587601078, + "grad_norm": 7.129891552561835, + "learning_rate": 1.0114622234759498e-06, + "loss": 0.0549, + "step": 2382 + }, + { + "epoch": 3.211590296495957, + "grad_norm": 2.181558090988755, + "learning_rate": 1.0081732172886482e-06, + "loss": 0.0362, + "step": 2383 + }, + { + "epoch": 3.2129380053908356, + "grad_norm": 4.892163383865211, + "learning_rate": 1.004888967626646e-06, + "loss": 0.0367, + "step": 2384 + }, + { + "epoch": 3.2142857142857144, + "grad_norm": 5.171421069582192, + "learning_rate": 1.0016094784033386e-06, + "loss": 0.0613, + "step": 2385 + }, + { + "epoch": 3.215633423180593, + "grad_norm": 8.091462380818143, + "learning_rate": 9.98334753526456e-07, + "loss": 0.039, + "step": 2386 + }, + { + "epoch": 3.2169811320754715, + "grad_norm": 1.182235706143124, + "learning_rate": 9.950647968980493e-07, + "loss": 0.0242, + "step": 2387 + }, + { + "epoch": 3.2183288409703503, + "grad_norm": 7.32856384116888, + "learning_rate": 9.917996124144884e-07, + "loss": 0.0402, + "step": 2388 + }, + { + "epoch": 3.219676549865229, + "grad_norm": 4.522258694299483, + "learning_rate": 9.885392039664527e-07, + "loss": 0.0709, + "step": 2389 + }, + { + "epoch": 3.221024258760108, + "grad_norm": 2.781597521681169, + "learning_rate": 9.8528357543894e-07, + "loss": 0.0549, + "step": 2390 + }, + { + "epoch": 3.2223719676549867, + "grad_norm": 6.2763997309252115, + "learning_rate": 9.820327307112421e-07, + "loss": 0.0572, + "step": 2391 + }, + { + "epoch": 3.223719676549865, + "grad_norm": 1.3063771694675672, + "learning_rate": 9.787866736569567e-07, + "loss": 0.0495, + "step": 2392 + }, + { + "epoch": 3.225067385444744, + "grad_norm": 11.703930969299094, + "learning_rate": 9.75545408143977e-07, + "loss": 0.0618, + "step": 2393 + }, + { + "epoch": 3.2264150943396226, + "grad_norm": 1.6343226881541972, + "learning_rate": 9.723089380344819e-07, + "loss": 0.0588, + "step": 2394 + }, + { + "epoch": 3.2277628032345014, + "grad_norm": 5.730101334121764, + "learning_rate": 9.690772671849403e-07, + "loss": 0.0566, + "step": 2395 + }, + { + "epoch": 3.22911051212938, + "grad_norm": 11.528299691037766, + "learning_rate": 9.65850399446102e-07, + "loss": 0.0585, + "step": 2396 + }, + { + "epoch": 3.230458221024259, + "grad_norm": 5.52802556104854, + "learning_rate": 9.626283386629947e-07, + "loss": 0.0418, + "step": 2397 + }, + { + "epoch": 3.2318059299191373, + "grad_norm": 18.431453450785032, + "learning_rate": 9.59411088674912e-07, + "loss": 0.0494, + "step": 2398 + }, + { + "epoch": 3.233153638814016, + "grad_norm": 9.530702208060998, + "learning_rate": 9.561986533154255e-07, + "loss": 0.0715, + "step": 2399 + }, + { + "epoch": 3.234501347708895, + "grad_norm": 1.3954501533618753, + "learning_rate": 9.529910364123601e-07, + "loss": 0.0455, + "step": 2400 + }, + { + "epoch": 3.2358490566037736, + "grad_norm": 6.06484553563748, + "learning_rate": 9.497882417878046e-07, + "loss": 0.0498, + "step": 2401 + }, + { + "epoch": 3.2371967654986524, + "grad_norm": 1.6137890840205795, + "learning_rate": 9.465902732581001e-07, + "loss": 0.0494, + "step": 2402 + }, + { + "epoch": 3.2385444743935308, + "grad_norm": 2.352864032037358, + "learning_rate": 9.433971346338383e-07, + "loss": 0.0309, + "step": 2403 + }, + { + "epoch": 3.2398921832884096, + "grad_norm": 2.3968152794451614, + "learning_rate": 9.40208829719853e-07, + "loss": 0.0437, + "step": 2404 + }, + { + "epoch": 3.2412398921832883, + "grad_norm": 7.368046510971631, + "learning_rate": 9.370253623152215e-07, + "loss": 0.0401, + "step": 2405 + }, + { + "epoch": 3.242587601078167, + "grad_norm": 2.75442783314109, + "learning_rate": 9.338467362132559e-07, + "loss": 0.032, + "step": 2406 + }, + { + "epoch": 3.243935309973046, + "grad_norm": 0.7416043488609677, + "learning_rate": 9.306729552014959e-07, + "loss": 0.028, + "step": 2407 + }, + { + "epoch": 3.2452830188679247, + "grad_norm": 2.2148705814715113, + "learning_rate": 9.275040230617161e-07, + "loss": 0.0497, + "step": 2408 + }, + { + "epoch": 3.246630727762803, + "grad_norm": 6.339609787637471, + "learning_rate": 9.243399435699052e-07, + "loss": 0.0616, + "step": 2409 + }, + { + "epoch": 3.247978436657682, + "grad_norm": 1.3239283410840965, + "learning_rate": 9.21180720496273e-07, + "loss": 0.0414, + "step": 2410 + }, + { + "epoch": 3.2493261455525606, + "grad_norm": 4.673497643443063, + "learning_rate": 9.180263576052439e-07, + "loss": 0.0347, + "step": 2411 + }, + { + "epoch": 3.2506738544474394, + "grad_norm": 9.675574408603005, + "learning_rate": 9.148768586554502e-07, + "loss": 0.0563, + "step": 2412 + }, + { + "epoch": 3.252021563342318, + "grad_norm": 10.769456623624775, + "learning_rate": 9.117322273997243e-07, + "loss": 0.0518, + "step": 2413 + }, + { + "epoch": 3.2533692722371965, + "grad_norm": 6.027794930516898, + "learning_rate": 9.085924675851066e-07, + "loss": 0.0492, + "step": 2414 + }, + { + "epoch": 3.2547169811320753, + "grad_norm": 2.1119880318970163, + "learning_rate": 9.054575829528251e-07, + "loss": 0.0281, + "step": 2415 + }, + { + "epoch": 3.256064690026954, + "grad_norm": 10.794865272123548, + "learning_rate": 9.023275772383033e-07, + "loss": 0.0402, + "step": 2416 + }, + { + "epoch": 3.257412398921833, + "grad_norm": 10.727271878104641, + "learning_rate": 8.992024541711502e-07, + "loss": 0.0418, + "step": 2417 + }, + { + "epoch": 3.2587601078167117, + "grad_norm": 5.333931150944049, + "learning_rate": 8.960822174751548e-07, + "loss": 0.0699, + "step": 2418 + }, + { + "epoch": 3.2601078167115904, + "grad_norm": 8.526011984828976, + "learning_rate": 8.929668708682864e-07, + "loss": 0.0487, + "step": 2419 + }, + { + "epoch": 3.2614555256064692, + "grad_norm": 3.4320204441364695, + "learning_rate": 8.898564180626857e-07, + "loss": 0.0607, + "step": 2420 + }, + { + "epoch": 3.2628032345013476, + "grad_norm": 11.570465742305476, + "learning_rate": 8.867508627646643e-07, + "loss": 0.0332, + "step": 2421 + }, + { + "epoch": 3.2641509433962264, + "grad_norm": 4.918322534698693, + "learning_rate": 8.836502086746924e-07, + "loss": 0.071, + "step": 2422 + }, + { + "epoch": 3.265498652291105, + "grad_norm": 1.8821680033742632, + "learning_rate": 8.805544594874094e-07, + "loss": 0.0442, + "step": 2423 + }, + { + "epoch": 3.266846361185984, + "grad_norm": 8.144068139047318, + "learning_rate": 8.774636188916014e-07, + "loss": 0.0664, + "step": 2424 + }, + { + "epoch": 3.2681940700808627, + "grad_norm": 4.694215741772379, + "learning_rate": 8.743776905702106e-07, + "loss": 0.0394, + "step": 2425 + }, + { + "epoch": 3.269541778975741, + "grad_norm": 7.038082916373995, + "learning_rate": 8.712966782003234e-07, + "loss": 0.0434, + "step": 2426 + }, + { + "epoch": 3.27088948787062, + "grad_norm": 8.548844656248336, + "learning_rate": 8.682205854531717e-07, + "loss": 0.0396, + "step": 2427 + }, + { + "epoch": 3.2722371967654986, + "grad_norm": 6.805751795044565, + "learning_rate": 8.651494159941204e-07, + "loss": 0.045, + "step": 2428 + }, + { + "epoch": 3.2735849056603774, + "grad_norm": 0.9546206858162617, + "learning_rate": 8.620831734826718e-07, + "loss": 0.0377, + "step": 2429 + }, + { + "epoch": 3.274932614555256, + "grad_norm": 12.927850677642708, + "learning_rate": 8.590218615724583e-07, + "loss": 0.0611, + "step": 2430 + }, + { + "epoch": 3.276280323450135, + "grad_norm": 10.684714985806686, + "learning_rate": 8.559654839112308e-07, + "loss": 0.044, + "step": 2431 + }, + { + "epoch": 3.2776280323450133, + "grad_norm": 9.791644822486226, + "learning_rate": 8.529140441408706e-07, + "loss": 0.0592, + "step": 2432 + }, + { + "epoch": 3.278975741239892, + "grad_norm": 4.389668450603624, + "learning_rate": 8.49867545897366e-07, + "loss": 0.0361, + "step": 2433 + }, + { + "epoch": 3.280323450134771, + "grad_norm": 14.361048253905286, + "learning_rate": 8.468259928108219e-07, + "loss": 0.0803, + "step": 2434 + }, + { + "epoch": 3.2816711590296497, + "grad_norm": 5.9378195303095795, + "learning_rate": 8.437893885054504e-07, + "loss": 0.0545, + "step": 2435 + }, + { + "epoch": 3.2830188679245285, + "grad_norm": 5.1516404295649805, + "learning_rate": 8.407577365995662e-07, + "loss": 0.0577, + "step": 2436 + }, + { + "epoch": 3.284366576819407, + "grad_norm": 1.7837589523207893, + "learning_rate": 8.3773104070558e-07, + "loss": 0.0532, + "step": 2437 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 7.137180453693677, + "learning_rate": 8.347093044300048e-07, + "loss": 0.0631, + "step": 2438 + }, + { + "epoch": 3.2870619946091644, + "grad_norm": 6.0214406001699965, + "learning_rate": 8.316925313734347e-07, + "loss": 0.0636, + "step": 2439 + }, + { + "epoch": 3.288409703504043, + "grad_norm": 8.45105544656702, + "learning_rate": 8.286807251305557e-07, + "loss": 0.0685, + "step": 2440 + }, + { + "epoch": 3.289757412398922, + "grad_norm": 5.903080756014483, + "learning_rate": 8.256738892901344e-07, + "loss": 0.0434, + "step": 2441 + }, + { + "epoch": 3.2911051212938007, + "grad_norm": 4.36272622189961, + "learning_rate": 8.226720274350136e-07, + "loss": 0.0433, + "step": 2442 + }, + { + "epoch": 3.292452830188679, + "grad_norm": 18.481641742520413, + "learning_rate": 8.196751431421096e-07, + "loss": 0.0533, + "step": 2443 + }, + { + "epoch": 3.293800539083558, + "grad_norm": 2.3760456176807807, + "learning_rate": 8.166832399824087e-07, + "loss": 0.0756, + "step": 2444 + }, + { + "epoch": 3.2951482479784366, + "grad_norm": 13.673557140411745, + "learning_rate": 8.13696321520962e-07, + "loss": 0.0534, + "step": 2445 + }, + { + "epoch": 3.2964959568733154, + "grad_norm": 14.428764013904072, + "learning_rate": 8.107143913168763e-07, + "loss": 0.0384, + "step": 2446 + }, + { + "epoch": 3.297843665768194, + "grad_norm": 7.795719926341731, + "learning_rate": 8.077374529233245e-07, + "loss": 0.0541, + "step": 2447 + }, + { + "epoch": 3.2991913746630726, + "grad_norm": 8.511022571133342, + "learning_rate": 8.047655098875206e-07, + "loss": 0.0613, + "step": 2448 + }, + { + "epoch": 3.3005390835579513, + "grad_norm": 1.9642150670388263, + "learning_rate": 8.017985657507322e-07, + "loss": 0.0289, + "step": 2449 + }, + { + "epoch": 3.30188679245283, + "grad_norm": 13.380993065610774, + "learning_rate": 7.988366240482698e-07, + "loss": 0.0776, + "step": 2450 + }, + { + "epoch": 3.303234501347709, + "grad_norm": 8.241033013967472, + "learning_rate": 7.958796883094838e-07, + "loss": 0.0374, + "step": 2451 + }, + { + "epoch": 3.3045822102425877, + "grad_norm": 17.54263310821673, + "learning_rate": 7.929277620577552e-07, + "loss": 0.0768, + "step": 2452 + }, + { + "epoch": 3.3059299191374665, + "grad_norm": 1.3775850216835168, + "learning_rate": 7.899808488105015e-07, + "loss": 0.0592, + "step": 2453 + }, + { + "epoch": 3.3072776280323453, + "grad_norm": 8.844128394846134, + "learning_rate": 7.87038952079165e-07, + "loss": 0.0534, + "step": 2454 + }, + { + "epoch": 3.3086253369272236, + "grad_norm": 7.0929774953446, + "learning_rate": 7.841020753692058e-07, + "loss": 0.0762, + "step": 2455 + }, + { + "epoch": 3.3099730458221024, + "grad_norm": 2.777724729143616, + "learning_rate": 7.811702221801127e-07, + "loss": 0.0399, + "step": 2456 + }, + { + "epoch": 3.311320754716981, + "grad_norm": 8.710462836659135, + "learning_rate": 7.782433960053781e-07, + "loss": 0.068, + "step": 2457 + }, + { + "epoch": 3.31266846361186, + "grad_norm": 16.795270556884, + "learning_rate": 7.753216003325098e-07, + "loss": 0.0438, + "step": 2458 + }, + { + "epoch": 3.3140161725067383, + "grad_norm": 7.268189897131109, + "learning_rate": 7.724048386430205e-07, + "loss": 0.0275, + "step": 2459 + }, + { + "epoch": 3.315363881401617, + "grad_norm": 9.428922162651164, + "learning_rate": 7.694931144124256e-07, + "loss": 0.0383, + "step": 2460 + }, + { + "epoch": 3.316711590296496, + "grad_norm": 10.24259534082004, + "learning_rate": 7.665864311102333e-07, + "loss": 0.0557, + "step": 2461 + }, + { + "epoch": 3.3180592991913747, + "grad_norm": 8.031009769480274, + "learning_rate": 7.636847921999541e-07, + "loss": 0.0499, + "step": 2462 + }, + { + "epoch": 3.3194070080862534, + "grad_norm": 1.4962369493039005, + "learning_rate": 7.607882011390777e-07, + "loss": 0.0296, + "step": 2463 + }, + { + "epoch": 3.3207547169811322, + "grad_norm": 6.974583621687253, + "learning_rate": 7.578966613790856e-07, + "loss": 0.0534, + "step": 2464 + }, + { + "epoch": 3.322102425876011, + "grad_norm": 6.281076512212255, + "learning_rate": 7.550101763654394e-07, + "loss": 0.0634, + "step": 2465 + }, + { + "epoch": 3.3234501347708894, + "grad_norm": 3.4631616705892156, + "learning_rate": 7.521287495375745e-07, + "loss": 0.039, + "step": 2466 + }, + { + "epoch": 3.324797843665768, + "grad_norm": 11.890013260318634, + "learning_rate": 7.492523843289024e-07, + "loss": 0.062, + "step": 2467 + }, + { + "epoch": 3.326145552560647, + "grad_norm": 4.343682743550125, + "learning_rate": 7.463810841668018e-07, + "loss": 0.0454, + "step": 2468 + }, + { + "epoch": 3.3274932614555257, + "grad_norm": 14.712882430602255, + "learning_rate": 7.435148524726188e-07, + "loss": 0.0707, + "step": 2469 + }, + { + "epoch": 3.3288409703504045, + "grad_norm": 2.5716967767732775, + "learning_rate": 7.406536926616531e-07, + "loss": 0.0343, + "step": 2470 + }, + { + "epoch": 3.330188679245283, + "grad_norm": 29.182946162233442, + "learning_rate": 7.37797608143171e-07, + "loss": 0.1164, + "step": 2471 + }, + { + "epoch": 3.3315363881401616, + "grad_norm": 29.622115559717024, + "learning_rate": 7.349466023203816e-07, + "loss": 0.0782, + "step": 2472 + }, + { + "epoch": 3.3328840970350404, + "grad_norm": 1.3360017138752665, + "learning_rate": 7.321006785904488e-07, + "loss": 0.0477, + "step": 2473 + }, + { + "epoch": 3.334231805929919, + "grad_norm": 7.125540960293245, + "learning_rate": 7.292598403444784e-07, + "loss": 0.0567, + "step": 2474 + }, + { + "epoch": 3.335579514824798, + "grad_norm": 12.629868450566315, + "learning_rate": 7.264240909675174e-07, + "loss": 0.0479, + "step": 2475 + }, + { + "epoch": 3.3369272237196768, + "grad_norm": 1.4484268455469653, + "learning_rate": 7.23593433838547e-07, + "loss": 0.0555, + "step": 2476 + }, + { + "epoch": 3.338274932614555, + "grad_norm": 5.587132131519826, + "learning_rate": 7.207678723304828e-07, + "loss": 0.036, + "step": 2477 + }, + { + "epoch": 3.339622641509434, + "grad_norm": 12.37815855252242, + "learning_rate": 7.179474098101691e-07, + "loss": 0.057, + "step": 2478 + }, + { + "epoch": 3.3409703504043127, + "grad_norm": 2.6574402805208606, + "learning_rate": 7.151320496383701e-07, + "loss": 0.0381, + "step": 2479 + }, + { + "epoch": 3.3423180592991915, + "grad_norm": 1.1212992252878287, + "learning_rate": 7.12321795169778e-07, + "loss": 0.0542, + "step": 2480 + }, + { + "epoch": 3.3436657681940702, + "grad_norm": 6.873733385552368, + "learning_rate": 7.095166497529937e-07, + "loss": 0.0492, + "step": 2481 + }, + { + "epoch": 3.3450134770889486, + "grad_norm": 9.679259636562033, + "learning_rate": 7.067166167305334e-07, + "loss": 0.0353, + "step": 2482 + }, + { + "epoch": 3.3463611859838274, + "grad_norm": 17.903375870904927, + "learning_rate": 7.039216994388215e-07, + "loss": 0.0508, + "step": 2483 + }, + { + "epoch": 3.347708894878706, + "grad_norm": 2.8542606226623124, + "learning_rate": 7.011319012081886e-07, + "loss": 0.0453, + "step": 2484 + }, + { + "epoch": 3.349056603773585, + "grad_norm": 23.143304183242684, + "learning_rate": 6.983472253628592e-07, + "loss": 0.0474, + "step": 2485 + }, + { + "epoch": 3.3504043126684637, + "grad_norm": 27.64952095919763, + "learning_rate": 6.955676752209639e-07, + "loss": 0.0807, + "step": 2486 + }, + { + "epoch": 3.3517520215633425, + "grad_norm": 23.844633065638973, + "learning_rate": 6.927932540945159e-07, + "loss": 0.0822, + "step": 2487 + }, + { + "epoch": 3.353099730458221, + "grad_norm": 26.369681327016266, + "learning_rate": 6.900239652894236e-07, + "loss": 0.0623, + "step": 2488 + }, + { + "epoch": 3.3544474393530996, + "grad_norm": 6.784123845912933, + "learning_rate": 6.87259812105478e-07, + "loss": 0.0408, + "step": 2489 + }, + { + "epoch": 3.3557951482479784, + "grad_norm": 2.118733225830236, + "learning_rate": 6.845007978363477e-07, + "loss": 0.0728, + "step": 2490 + }, + { + "epoch": 3.357142857142857, + "grad_norm": 13.059137874628322, + "learning_rate": 6.817469257695819e-07, + "loss": 0.0538, + "step": 2491 + }, + { + "epoch": 3.358490566037736, + "grad_norm": 17.691031127901557, + "learning_rate": 6.789981991866007e-07, + "loss": 0.0366, + "step": 2492 + }, + { + "epoch": 3.3598382749326143, + "grad_norm": 12.375970856854426, + "learning_rate": 6.762546213626953e-07, + "loss": 0.0467, + "step": 2493 + }, + { + "epoch": 3.361185983827493, + "grad_norm": 10.028332641567841, + "learning_rate": 6.735161955670161e-07, + "loss": 0.0455, + "step": 2494 + }, + { + "epoch": 3.362533692722372, + "grad_norm": 4.999256001476058, + "learning_rate": 6.707829250625825e-07, + "loss": 0.0564, + "step": 2495 + }, + { + "epoch": 3.3638814016172507, + "grad_norm": 12.235985463697927, + "learning_rate": 6.680548131062637e-07, + "loss": 0.0445, + "step": 2496 + }, + { + "epoch": 3.3652291105121295, + "grad_norm": 2.8022379291413984, + "learning_rate": 6.653318629487871e-07, + "loss": 0.04, + "step": 2497 + }, + { + "epoch": 3.3665768194070083, + "grad_norm": 7.207770307495059, + "learning_rate": 6.626140778347262e-07, + "loss": 0.0561, + "step": 2498 + }, + { + "epoch": 3.3679245283018866, + "grad_norm": 2.4101244629246907, + "learning_rate": 6.599014610025045e-07, + "loss": 0.0661, + "step": 2499 + }, + { + "epoch": 3.3692722371967654, + "grad_norm": 17.570232574749166, + "learning_rate": 6.571940156843803e-07, + "loss": 0.0716, + "step": 2500 + }, + { + "epoch": 3.370619946091644, + "grad_norm": 2.0403590554353763, + "learning_rate": 6.544917451064553e-07, + "loss": 0.0402, + "step": 2501 + }, + { + "epoch": 3.371967654986523, + "grad_norm": 10.267744184375136, + "learning_rate": 6.517946524886648e-07, + "loss": 0.0483, + "step": 2502 + }, + { + "epoch": 3.3733153638814017, + "grad_norm": 3.3393411364349297, + "learning_rate": 6.491027410447687e-07, + "loss": 0.0412, + "step": 2503 + }, + { + "epoch": 3.37466307277628, + "grad_norm": 10.136541636413481, + "learning_rate": 6.464160139823622e-07, + "loss": 0.0486, + "step": 2504 + }, + { + "epoch": 3.376010781671159, + "grad_norm": 4.28178538770121, + "learning_rate": 6.437344745028551e-07, + "loss": 0.0355, + "step": 2505 + }, + { + "epoch": 3.3773584905660377, + "grad_norm": 8.026462351344549, + "learning_rate": 6.410581258014798e-07, + "loss": 0.0449, + "step": 2506 + }, + { + "epoch": 3.3787061994609164, + "grad_norm": 9.770457141336513, + "learning_rate": 6.383869710672819e-07, + "loss": 0.0655, + "step": 2507 + }, + { + "epoch": 3.3800539083557952, + "grad_norm": 3.8003845177736393, + "learning_rate": 6.357210134831199e-07, + "loss": 0.0362, + "step": 2508 + }, + { + "epoch": 3.381401617250674, + "grad_norm": 3.3509358689243127, + "learning_rate": 6.330602562256572e-07, + "loss": 0.0541, + "step": 2509 + }, + { + "epoch": 3.382749326145553, + "grad_norm": 6.929151395875522, + "learning_rate": 6.30404702465362e-07, + "loss": 0.044, + "step": 2510 + }, + { + "epoch": 3.384097035040431, + "grad_norm": 2.975780087932479, + "learning_rate": 6.277543553665022e-07, + "loss": 0.0449, + "step": 2511 + }, + { + "epoch": 3.38544474393531, + "grad_norm": 6.256559213808033, + "learning_rate": 6.251092180871415e-07, + "loss": 0.0477, + "step": 2512 + }, + { + "epoch": 3.3867924528301887, + "grad_norm": 15.305268103491503, + "learning_rate": 6.224692937791366e-07, + "loss": 0.0767, + "step": 2513 + }, + { + "epoch": 3.3881401617250675, + "grad_norm": 1.4957161990748815, + "learning_rate": 6.198345855881299e-07, + "loss": 0.0461, + "step": 2514 + }, + { + "epoch": 3.3894878706199463, + "grad_norm": 7.670005736321975, + "learning_rate": 6.172050966535514e-07, + "loss": 0.0896, + "step": 2515 + }, + { + "epoch": 3.3908355795148246, + "grad_norm": 13.559351315554883, + "learning_rate": 6.145808301086104e-07, + "loss": 0.0579, + "step": 2516 + }, + { + "epoch": 3.3921832884097034, + "grad_norm": 4.9468267475374335, + "learning_rate": 6.119617890802953e-07, + "loss": 0.0356, + "step": 2517 + }, + { + "epoch": 3.393530997304582, + "grad_norm": 3.064922754341301, + "learning_rate": 6.093479766893628e-07, + "loss": 0.0451, + "step": 2518 + }, + { + "epoch": 3.394878706199461, + "grad_norm": 6.158603330892933, + "learning_rate": 6.067393960503476e-07, + "loss": 0.0352, + "step": 2519 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 16.14039125909886, + "learning_rate": 6.041360502715426e-07, + "loss": 0.0587, + "step": 2520 + }, + { + "epoch": 3.3975741239892185, + "grad_norm": 22.31331514040467, + "learning_rate": 6.015379424550078e-07, + "loss": 0.041, + "step": 2521 + }, + { + "epoch": 3.398921832884097, + "grad_norm": 7.2614218350347075, + "learning_rate": 5.989450756965593e-07, + "loss": 0.0485, + "step": 2522 + }, + { + "epoch": 3.4002695417789757, + "grad_norm": 12.142422980443886, + "learning_rate": 5.963574530857707e-07, + "loss": 0.0613, + "step": 2523 + }, + { + "epoch": 3.4016172506738545, + "grad_norm": 13.858837167351973, + "learning_rate": 5.937750777059637e-07, + "loss": 0.0455, + "step": 2524 + }, + { + "epoch": 3.4029649595687332, + "grad_norm": 1.4837357145138717, + "learning_rate": 5.911979526342093e-07, + "loss": 0.0361, + "step": 2525 + }, + { + "epoch": 3.404312668463612, + "grad_norm": 13.961874769108285, + "learning_rate": 5.886260809413236e-07, + "loss": 0.0377, + "step": 2526 + }, + { + "epoch": 3.4056603773584904, + "grad_norm": 1.5768278776739837, + "learning_rate": 5.860594656918589e-07, + "loss": 0.0385, + "step": 2527 + }, + { + "epoch": 3.407008086253369, + "grad_norm": 8.628226943367896, + "learning_rate": 5.834981099441106e-07, + "loss": 0.0697, + "step": 2528 + }, + { + "epoch": 3.408355795148248, + "grad_norm": 20.430734608756904, + "learning_rate": 5.809420167500995e-07, + "loss": 0.0564, + "step": 2529 + }, + { + "epoch": 3.4097035040431267, + "grad_norm": 23.057394263027458, + "learning_rate": 5.783911891555821e-07, + "loss": 0.0741, + "step": 2530 + }, + { + "epoch": 3.4110512129380055, + "grad_norm": 10.600947045700039, + "learning_rate": 5.758456302000365e-07, + "loss": 0.0496, + "step": 2531 + }, + { + "epoch": 3.4123989218328843, + "grad_norm": 24.540464262104653, + "learning_rate": 5.733053429166662e-07, + "loss": 0.0545, + "step": 2532 + }, + { + "epoch": 3.4137466307277626, + "grad_norm": 14.073123158285277, + "learning_rate": 5.707703303323891e-07, + "loss": 0.0433, + "step": 2533 + }, + { + "epoch": 3.4150943396226414, + "grad_norm": 17.8467209634199, + "learning_rate": 5.682405954678411e-07, + "loss": 0.0556, + "step": 2534 + }, + { + "epoch": 3.41644204851752, + "grad_norm": 13.473805709939224, + "learning_rate": 5.65716141337368e-07, + "loss": 0.0596, + "step": 2535 + }, + { + "epoch": 3.417789757412399, + "grad_norm": 4.055884877567275, + "learning_rate": 5.631969709490243e-07, + "loss": 0.0374, + "step": 2536 + }, + { + "epoch": 3.4191374663072778, + "grad_norm": 31.17093338918547, + "learning_rate": 5.606830873045687e-07, + "loss": 0.0492, + "step": 2537 + }, + { + "epoch": 3.420485175202156, + "grad_norm": 4.980934224562141, + "learning_rate": 5.58174493399457e-07, + "loss": 0.0553, + "step": 2538 + }, + { + "epoch": 3.421832884097035, + "grad_norm": 17.169309507311656, + "learning_rate": 5.556711922228469e-07, + "loss": 0.0809, + "step": 2539 + }, + { + "epoch": 3.4231805929919137, + "grad_norm": 5.617323785043908, + "learning_rate": 5.531731867575857e-07, + "loss": 0.0393, + "step": 2540 + }, + { + "epoch": 3.4245283018867925, + "grad_norm": 2.5405797476321013, + "learning_rate": 5.50680479980214e-07, + "loss": 0.043, + "step": 2541 + }, + { + "epoch": 3.4258760107816713, + "grad_norm": 5.8771019480113695, + "learning_rate": 5.481930748609532e-07, + "loss": 0.057, + "step": 2542 + }, + { + "epoch": 3.42722371967655, + "grad_norm": 10.591142414307008, + "learning_rate": 5.45710974363714e-07, + "loss": 0.0486, + "step": 2543 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 3.0050493401969303, + "learning_rate": 5.432341814460818e-07, + "loss": 0.0474, + "step": 2544 + }, + { + "epoch": 3.429919137466307, + "grad_norm": 2.8965956884961273, + "learning_rate": 5.407626990593184e-07, + "loss": 0.0499, + "step": 2545 + }, + { + "epoch": 3.431266846361186, + "grad_norm": 14.524694403897964, + "learning_rate": 5.382965301483589e-07, + "loss": 0.0578, + "step": 2546 + }, + { + "epoch": 3.4326145552560647, + "grad_norm": 2.6113307888123414, + "learning_rate": 5.358356776518076e-07, + "loss": 0.0463, + "step": 2547 + }, + { + "epoch": 3.4339622641509435, + "grad_norm": 10.09627298639463, + "learning_rate": 5.33380144501931e-07, + "loss": 0.049, + "step": 2548 + }, + { + "epoch": 3.435309973045822, + "grad_norm": 5.947500400248326, + "learning_rate": 5.309299336246593e-07, + "loss": 0.041, + "step": 2549 + }, + { + "epoch": 3.4366576819407006, + "grad_norm": 9.650690529854383, + "learning_rate": 5.28485047939582e-07, + "loss": 0.054, + "step": 2550 + }, + { + "epoch": 3.4380053908355794, + "grad_norm": 3.3520650924009168, + "learning_rate": 5.260454903599393e-07, + "loss": 0.061, + "step": 2551 + }, + { + "epoch": 3.439353099730458, + "grad_norm": 13.941632507482808, + "learning_rate": 5.236112637926288e-07, + "loss": 0.0692, + "step": 2552 + }, + { + "epoch": 3.440700808625337, + "grad_norm": 10.634860340048926, + "learning_rate": 5.211823711381892e-07, + "loss": 0.0675, + "step": 2553 + }, + { + "epoch": 3.442048517520216, + "grad_norm": 11.164316172651134, + "learning_rate": 5.187588152908079e-07, + "loss": 0.0632, + "step": 2554 + }, + { + "epoch": 3.4433962264150946, + "grad_norm": 6.97595151708757, + "learning_rate": 5.163405991383114e-07, + "loss": 0.0558, + "step": 2555 + }, + { + "epoch": 3.444743935309973, + "grad_norm": 8.966111490782325, + "learning_rate": 5.139277255621644e-07, + "loss": 0.0363, + "step": 2556 + }, + { + "epoch": 3.4460916442048517, + "grad_norm": 8.07964741728321, + "learning_rate": 5.115201974374646e-07, + "loss": 0.029, + "step": 2557 + }, + { + "epoch": 3.4474393530997305, + "grad_norm": 11.311495688475198, + "learning_rate": 5.091180176329413e-07, + "loss": 0.0369, + "step": 2558 + }, + { + "epoch": 3.4487870619946093, + "grad_norm": 8.140728538837585, + "learning_rate": 5.067211890109496e-07, + "loss": 0.0577, + "step": 2559 + }, + { + "epoch": 3.450134770889488, + "grad_norm": 1.5254148470552065, + "learning_rate": 5.0432971442747e-07, + "loss": 0.0306, + "step": 2560 + }, + { + "epoch": 3.4514824797843664, + "grad_norm": 8.078986080382222, + "learning_rate": 5.019435967321029e-07, + "loss": 0.027, + "step": 2561 + }, + { + "epoch": 3.452830188679245, + "grad_norm": 21.11125142551996, + "learning_rate": 4.995628387680635e-07, + "loss": 0.0511, + "step": 2562 + }, + { + "epoch": 3.454177897574124, + "grad_norm": 22.72908664519104, + "learning_rate": 4.97187443372183e-07, + "loss": 0.0695, + "step": 2563 + }, + { + "epoch": 3.4555256064690028, + "grad_norm": 21.64240125232225, + "learning_rate": 4.948174133749017e-07, + "loss": 0.0478, + "step": 2564 + }, + { + "epoch": 3.4568733153638815, + "grad_norm": 6.312663950995699, + "learning_rate": 4.924527516002686e-07, + "loss": 0.0696, + "step": 2565 + }, + { + "epoch": 3.4582210242587603, + "grad_norm": 28.54810615020798, + "learning_rate": 4.900934608659314e-07, + "loss": 0.0605, + "step": 2566 + }, + { + "epoch": 3.4595687331536387, + "grad_norm": 16.94262574326956, + "learning_rate": 4.877395439831439e-07, + "loss": 0.051, + "step": 2567 + }, + { + "epoch": 3.4609164420485174, + "grad_norm": 14.61400586329037, + "learning_rate": 4.853910037567511e-07, + "loss": 0.0346, + "step": 2568 + }, + { + "epoch": 3.4622641509433962, + "grad_norm": 14.203991195614918, + "learning_rate": 4.830478429851948e-07, + "loss": 0.0249, + "step": 2569 + }, + { + "epoch": 3.463611859838275, + "grad_norm": 8.367533305901386, + "learning_rate": 4.807100644605056e-07, + "loss": 0.0436, + "step": 2570 + }, + { + "epoch": 3.464959568733154, + "grad_norm": 21.89355384353067, + "learning_rate": 4.78377670968303e-07, + "loss": 0.0621, + "step": 2571 + }, + { + "epoch": 3.466307277628032, + "grad_norm": 11.577735680451763, + "learning_rate": 4.7605066528778443e-07, + "loss": 0.0341, + "step": 2572 + }, + { + "epoch": 3.467654986522911, + "grad_norm": 13.902060440930006, + "learning_rate": 4.737290501917335e-07, + "loss": 0.0455, + "step": 2573 + }, + { + "epoch": 3.4690026954177897, + "grad_norm": 8.313646507575726, + "learning_rate": 4.714128284465075e-07, + "loss": 0.0423, + "step": 2574 + }, + { + "epoch": 3.4703504043126685, + "grad_norm": 23.37523303699438, + "learning_rate": 4.6910200281203523e-07, + "loss": 0.0498, + "step": 2575 + }, + { + "epoch": 3.4716981132075473, + "grad_norm": 23.68360372044244, + "learning_rate": 4.667965760418225e-07, + "loss": 0.085, + "step": 2576 + }, + { + "epoch": 3.473045822102426, + "grad_norm": 2.330522305578351, + "learning_rate": 4.6449655088293353e-07, + "loss": 0.0408, + "step": 2577 + }, + { + "epoch": 3.4743935309973044, + "grad_norm": 7.6022490397787905, + "learning_rate": 4.622019300760028e-07, + "loss": 0.0554, + "step": 2578 + }, + { + "epoch": 3.475741239892183, + "grad_norm": 23.381620710188283, + "learning_rate": 4.5991271635522084e-07, + "loss": 0.0805, + "step": 2579 + }, + { + "epoch": 3.477088948787062, + "grad_norm": 7.663864373081252, + "learning_rate": 4.5762891244833906e-07, + "loss": 0.0437, + "step": 2580 + }, + { + "epoch": 3.4784366576819408, + "grad_norm": 11.470505584045046, + "learning_rate": 4.5535052107665844e-07, + "loss": 0.0348, + "step": 2581 + }, + { + "epoch": 3.4797843665768196, + "grad_norm": 6.617271337444769, + "learning_rate": 4.5307754495503395e-07, + "loss": 0.0458, + "step": 2582 + }, + { + "epoch": 3.481132075471698, + "grad_norm": 2.9127994002112456, + "learning_rate": 4.508099867918664e-07, + "loss": 0.0511, + "step": 2583 + }, + { + "epoch": 3.4824797843665767, + "grad_norm": 2.12775634381057, + "learning_rate": 4.4854784928910157e-07, + "loss": 0.0352, + "step": 2584 + }, + { + "epoch": 3.4838274932614555, + "grad_norm": 10.111810878477304, + "learning_rate": 4.462911351422267e-07, + "loss": 0.0683, + "step": 2585 + }, + { + "epoch": 3.4851752021563343, + "grad_norm": 1.5464282294833156, + "learning_rate": 4.4403984704026347e-07, + "loss": 0.0467, + "step": 2586 + }, + { + "epoch": 3.486522911051213, + "grad_norm": 3.88782121469357, + "learning_rate": 4.417939876657712e-07, + "loss": 0.0395, + "step": 2587 + }, + { + "epoch": 3.487870619946092, + "grad_norm": 6.337134680475258, + "learning_rate": 4.3955355969484027e-07, + "loss": 0.0481, + "step": 2588 + }, + { + "epoch": 3.48921832884097, + "grad_norm": 6.074472523103806, + "learning_rate": 4.373185657970891e-07, + "loss": 0.0549, + "step": 2589 + }, + { + "epoch": 3.490566037735849, + "grad_norm": 1.7421922258425464, + "learning_rate": 4.3508900863565795e-07, + "loss": 0.0436, + "step": 2590 + }, + { + "epoch": 3.4919137466307277, + "grad_norm": 22.172411761387927, + "learning_rate": 4.3286489086721507e-07, + "loss": 0.059, + "step": 2591 + }, + { + "epoch": 3.4932614555256065, + "grad_norm": 0.956850706489198, + "learning_rate": 4.3064621514194106e-07, + "loss": 0.0285, + "step": 2592 + }, + { + "epoch": 3.4946091644204853, + "grad_norm": 5.09984918573167, + "learning_rate": 4.2843298410353506e-07, + "loss": 0.0587, + "step": 2593 + }, + { + "epoch": 3.4959568733153636, + "grad_norm": 2.8378959076090484, + "learning_rate": 4.2622520038920976e-07, + "loss": 0.0447, + "step": 2594 + }, + { + "epoch": 3.4973045822102424, + "grad_norm": 2.4157873873869087, + "learning_rate": 4.240228666296825e-07, + "loss": 0.0417, + "step": 2595 + }, + { + "epoch": 3.498652291105121, + "grad_norm": 1.5126335135851918, + "learning_rate": 4.218259854491813e-07, + "loss": 0.0421, + "step": 2596 + }, + { + "epoch": 3.5, + "grad_norm": 7.4303224282028735, + "learning_rate": 4.1963455946543494e-07, + "loss": 0.0513, + "step": 2597 + }, + { + "epoch": 3.501347708894879, + "grad_norm": 11.83875562955197, + "learning_rate": 4.174485912896725e-07, + "loss": 0.0554, + "step": 2598 + }, + { + "epoch": 3.5026954177897576, + "grad_norm": 4.584502080161404, + "learning_rate": 4.152680835266176e-07, + "loss": 0.0551, + "step": 2599 + }, + { + "epoch": 3.5040431266846364, + "grad_norm": 4.384396317059229, + "learning_rate": 4.130930387744925e-07, + "loss": 0.029, + "step": 2600 + }, + { + "epoch": 3.5053908355795147, + "grad_norm": 3.0448366970585745, + "learning_rate": 4.109234596250039e-07, + "loss": 0.0499, + "step": 2601 + }, + { + "epoch": 3.5067385444743935, + "grad_norm": 10.954563291541938, + "learning_rate": 4.0875934866335007e-07, + "loss": 0.047, + "step": 2602 + }, + { + "epoch": 3.5080862533692723, + "grad_norm": 16.919796455592124, + "learning_rate": 4.066007084682111e-07, + "loss": 0.0567, + "step": 2603 + }, + { + "epoch": 3.509433962264151, + "grad_norm": 16.69756322447446, + "learning_rate": 4.0444754161175157e-07, + "loss": 0.0312, + "step": 2604 + }, + { + "epoch": 3.5107816711590294, + "grad_norm": 12.112414700748149, + "learning_rate": 4.022998506596093e-07, + "loss": 0.0369, + "step": 2605 + }, + { + "epoch": 3.512129380053908, + "grad_norm": 4.108373241786773, + "learning_rate": 4.0015763817090103e-07, + "loss": 0.0694, + "step": 2606 + }, + { + "epoch": 3.513477088948787, + "grad_norm": 12.390907419114166, + "learning_rate": 3.9802090669821494e-07, + "loss": 0.0316, + "step": 2607 + }, + { + "epoch": 3.5148247978436657, + "grad_norm": 8.015309767715797, + "learning_rate": 3.958896587876071e-07, + "loss": 0.0657, + "step": 2608 + }, + { + "epoch": 3.5161725067385445, + "grad_norm": 16.568361046214015, + "learning_rate": 3.937638969786012e-07, + "loss": 0.0367, + "step": 2609 + }, + { + "epoch": 3.5175202156334233, + "grad_norm": 14.467943365866718, + "learning_rate": 3.9164362380418154e-07, + "loss": 0.0506, + "step": 2610 + }, + { + "epoch": 3.518867924528302, + "grad_norm": 5.014260169304983, + "learning_rate": 3.895288417907939e-07, + "loss": 0.035, + "step": 2611 + }, + { + "epoch": 3.5202156334231804, + "grad_norm": 10.96364630517077, + "learning_rate": 3.8741955345834136e-07, + "loss": 0.033, + "step": 2612 + }, + { + "epoch": 3.5215633423180592, + "grad_norm": 6.509619470835854, + "learning_rate": 3.8531576132018024e-07, + "loss": 0.0595, + "step": 2613 + }, + { + "epoch": 3.522911051212938, + "grad_norm": 16.10228766357858, + "learning_rate": 3.832174678831163e-07, + "loss": 0.0453, + "step": 2614 + }, + { + "epoch": 3.524258760107817, + "grad_norm": 9.714829312502735, + "learning_rate": 3.8112467564740796e-07, + "loss": 0.0497, + "step": 2615 + }, + { + "epoch": 3.525606469002695, + "grad_norm": 11.31683014094856, + "learning_rate": 3.790373871067521e-07, + "loss": 0.049, + "step": 2616 + }, + { + "epoch": 3.526954177897574, + "grad_norm": 6.493641545529218, + "learning_rate": 3.769556047482925e-07, + "loss": 0.0336, + "step": 2617 + }, + { + "epoch": 3.5283018867924527, + "grad_norm": 1.2445107475845172, + "learning_rate": 3.748793310526111e-07, + "loss": 0.049, + "step": 2618 + }, + { + "epoch": 3.5296495956873315, + "grad_norm": 9.0675409907071, + "learning_rate": 3.728085684937233e-07, + "loss": 0.0484, + "step": 2619 + }, + { + "epoch": 3.5309973045822103, + "grad_norm": 9.101211163157878, + "learning_rate": 3.7074331953908085e-07, + "loss": 0.0379, + "step": 2620 + }, + { + "epoch": 3.532345013477089, + "grad_norm": 4.1875291353536115, + "learning_rate": 3.6868358664956307e-07, + "loss": 0.0294, + "step": 2621 + }, + { + "epoch": 3.533692722371968, + "grad_norm": 8.552961583621338, + "learning_rate": 3.6662937227948005e-07, + "loss": 0.0559, + "step": 2622 + }, + { + "epoch": 3.535040431266846, + "grad_norm": 1.8997269463895643, + "learning_rate": 3.645806788765599e-07, + "loss": 0.0421, + "step": 2623 + }, + { + "epoch": 3.536388140161725, + "grad_norm": 11.83587062369546, + "learning_rate": 3.6253750888196107e-07, + "loss": 0.0615, + "step": 2624 + }, + { + "epoch": 3.5377358490566038, + "grad_norm": 11.414837171584365, + "learning_rate": 3.604998647302521e-07, + "loss": 0.0271, + "step": 2625 + }, + { + "epoch": 3.5390835579514826, + "grad_norm": 8.871915476131745, + "learning_rate": 3.5846774884942146e-07, + "loss": 0.0459, + "step": 2626 + }, + { + "epoch": 3.5404312668463613, + "grad_norm": 4.34988308126616, + "learning_rate": 3.5644116366086947e-07, + "loss": 0.0522, + "step": 2627 + }, + { + "epoch": 3.5417789757412397, + "grad_norm": 17.28036823949265, + "learning_rate": 3.544201115794077e-07, + "loss": 0.0408, + "step": 2628 + }, + { + "epoch": 3.5431266846361185, + "grad_norm": 13.26630022503841, + "learning_rate": 3.524045950132504e-07, + "loss": 0.0459, + "step": 2629 + }, + { + "epoch": 3.5444743935309972, + "grad_norm": 1.3107648993228473, + "learning_rate": 3.5039461636402095e-07, + "loss": 0.0402, + "step": 2630 + }, + { + "epoch": 3.545822102425876, + "grad_norm": 7.935295810101607, + "learning_rate": 3.483901780267401e-07, + "loss": 0.0322, + "step": 2631 + }, + { + "epoch": 3.547169811320755, + "grad_norm": 1.15935009945019, + "learning_rate": 3.463912823898302e-07, + "loss": 0.0246, + "step": 2632 + }, + { + "epoch": 3.5485175202156336, + "grad_norm": 1.5863552194059136, + "learning_rate": 3.4439793183510704e-07, + "loss": 0.0524, + "step": 2633 + }, + { + "epoch": 3.5498652291105124, + "grad_norm": 5.0193265321956675, + "learning_rate": 3.424101287377779e-07, + "loss": 0.0321, + "step": 2634 + }, + { + "epoch": 3.5512129380053907, + "grad_norm": 4.761566968757623, + "learning_rate": 3.4042787546644305e-07, + "loss": 0.0398, + "step": 2635 + }, + { + "epoch": 3.5525606469002695, + "grad_norm": 4.367214467519872, + "learning_rate": 3.3845117438308763e-07, + "loss": 0.0415, + "step": 2636 + }, + { + "epoch": 3.5539083557951483, + "grad_norm": 16.792890642549313, + "learning_rate": 3.3648002784308297e-07, + "loss": 0.0506, + "step": 2637 + }, + { + "epoch": 3.555256064690027, + "grad_norm": 1.7102587267356633, + "learning_rate": 3.3451443819517704e-07, + "loss": 0.0419, + "step": 2638 + }, + { + "epoch": 3.5566037735849054, + "grad_norm": 6.000013232547283, + "learning_rate": 3.325544077815035e-07, + "loss": 0.038, + "step": 2639 + }, + { + "epoch": 3.557951482479784, + "grad_norm": 8.927342858439046, + "learning_rate": 3.3059993893756525e-07, + "loss": 0.0444, + "step": 2640 + }, + { + "epoch": 3.559299191374663, + "grad_norm": 7.830956496238555, + "learning_rate": 3.286510339922422e-07, + "loss": 0.0379, + "step": 2641 + }, + { + "epoch": 3.560646900269542, + "grad_norm": 7.927503555979656, + "learning_rate": 3.2670769526778443e-07, + "loss": 0.0605, + "step": 2642 + }, + { + "epoch": 3.5619946091644206, + "grad_norm": 3.1726075229825135, + "learning_rate": 3.2476992507980645e-07, + "loss": 0.0659, + "step": 2643 + }, + { + "epoch": 3.5633423180592994, + "grad_norm": 1.5307867370311554, + "learning_rate": 3.2283772573729e-07, + "loss": 0.0534, + "step": 2644 + }, + { + "epoch": 3.564690026954178, + "grad_norm": 6.7738259990757195, + "learning_rate": 3.209110995425785e-07, + "loss": 0.0539, + "step": 2645 + }, + { + "epoch": 3.5660377358490565, + "grad_norm": 10.249915798666127, + "learning_rate": 3.18990048791375e-07, + "loss": 0.0415, + "step": 2646 + }, + { + "epoch": 3.5673854447439353, + "grad_norm": 5.5118639269770675, + "learning_rate": 3.1707457577273613e-07, + "loss": 0.0293, + "step": 2647 + }, + { + "epoch": 3.568733153638814, + "grad_norm": 6.735003705452482, + "learning_rate": 3.15164682769078e-07, + "loss": 0.0495, + "step": 2648 + }, + { + "epoch": 3.570080862533693, + "grad_norm": 3.455364659644724, + "learning_rate": 3.132603720561611e-07, + "loss": 0.0566, + "step": 2649 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 1.4309739822788157, + "learning_rate": 3.113616459030988e-07, + "loss": 0.051, + "step": 2650 + }, + { + "epoch": 3.57277628032345, + "grad_norm": 11.748154583303789, + "learning_rate": 3.094685065723485e-07, + "loss": 0.0282, + "step": 2651 + }, + { + "epoch": 3.5741239892183287, + "grad_norm": 9.7736483760049, + "learning_rate": 3.075809563197119e-07, + "loss": 0.0452, + "step": 2652 + }, + { + "epoch": 3.5754716981132075, + "grad_norm": 15.557766685793641, + "learning_rate": 3.0569899739432804e-07, + "loss": 0.0572, + "step": 2653 + }, + { + "epoch": 3.5768194070080863, + "grad_norm": 5.7979012387555775, + "learning_rate": 3.0382263203867557e-07, + "loss": 0.0789, + "step": 2654 + }, + { + "epoch": 3.578167115902965, + "grad_norm": 10.946268934047174, + "learning_rate": 3.0195186248856866e-07, + "loss": 0.0388, + "step": 2655 + }, + { + "epoch": 3.579514824797844, + "grad_norm": 9.579836743314493, + "learning_rate": 3.00086690973152e-07, + "loss": 0.0852, + "step": 2656 + }, + { + "epoch": 3.5808625336927222, + "grad_norm": 2.4735754134477808, + "learning_rate": 2.9822711971490224e-07, + "loss": 0.0401, + "step": 2657 + }, + { + "epoch": 3.582210242587601, + "grad_norm": 6.513670187465349, + "learning_rate": 2.963731509296192e-07, + "loss": 0.0378, + "step": 2658 + }, + { + "epoch": 3.58355795148248, + "grad_norm": 6.182182933932895, + "learning_rate": 2.9452478682643005e-07, + "loss": 0.0513, + "step": 2659 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 14.009861212145449, + "learning_rate": 2.9268202960778256e-07, + "loss": 0.045, + "step": 2660 + }, + { + "epoch": 3.586253369272237, + "grad_norm": 12.954926668817716, + "learning_rate": 2.9084488146944477e-07, + "loss": 0.0396, + "step": 2661 + }, + { + "epoch": 3.5876010781671157, + "grad_norm": 14.143903524434416, + "learning_rate": 2.890133446004978e-07, + "loss": 0.054, + "step": 2662 + }, + { + "epoch": 3.5889487870619945, + "grad_norm": 13.053572410379958, + "learning_rate": 2.8718742118334143e-07, + "loss": 0.0419, + "step": 2663 + }, + { + "epoch": 3.5902964959568733, + "grad_norm": 27.761193753780503, + "learning_rate": 2.8536711339368194e-07, + "loss": 0.0469, + "step": 2664 + }, + { + "epoch": 3.591644204851752, + "grad_norm": 18.319995865027195, + "learning_rate": 2.8355242340053766e-07, + "loss": 0.0578, + "step": 2665 + }, + { + "epoch": 3.592991913746631, + "grad_norm": 2.8960511122386614, + "learning_rate": 2.817433533662317e-07, + "loss": 0.052, + "step": 2666 + }, + { + "epoch": 3.5943396226415096, + "grad_norm": 17.258455106750684, + "learning_rate": 2.799399054463886e-07, + "loss": 0.0553, + "step": 2667 + }, + { + "epoch": 3.595687331536388, + "grad_norm": 6.140262382404533, + "learning_rate": 2.7814208178993716e-07, + "loss": 0.0325, + "step": 2668 + }, + { + "epoch": 3.5970350404312668, + "grad_norm": 3.9822954528130428, + "learning_rate": 2.763498845391033e-07, + "loss": 0.054, + "step": 2669 + }, + { + "epoch": 3.5983827493261455, + "grad_norm": 9.84708304461305, + "learning_rate": 2.745633158294081e-07, + "loss": 0.0233, + "step": 2670 + }, + { + "epoch": 3.5997304582210243, + "grad_norm": 8.109786343873653, + "learning_rate": 2.7278237778966487e-07, + "loss": 0.0698, + "step": 2671 + }, + { + "epoch": 3.601078167115903, + "grad_norm": 15.013560800066395, + "learning_rate": 2.7100707254198166e-07, + "loss": 0.0743, + "step": 2672 + }, + { + "epoch": 3.6024258760107815, + "grad_norm": 12.125867194426423, + "learning_rate": 2.692374022017491e-07, + "loss": 0.0469, + "step": 2673 + }, + { + "epoch": 3.6037735849056602, + "grad_norm": 19.18810737476993, + "learning_rate": 2.674733688776482e-07, + "loss": 0.0576, + "step": 2674 + }, + { + "epoch": 3.605121293800539, + "grad_norm": 11.568826433366464, + "learning_rate": 2.6571497467164033e-07, + "loss": 0.0338, + "step": 2675 + }, + { + "epoch": 3.606469002695418, + "grad_norm": 5.0289910790620445, + "learning_rate": 2.639622216789689e-07, + "loss": 0.0376, + "step": 2676 + }, + { + "epoch": 3.6078167115902966, + "grad_norm": 9.00314316783378, + "learning_rate": 2.6221511198815443e-07, + "loss": 0.0215, + "step": 2677 + }, + { + "epoch": 3.6091644204851754, + "grad_norm": 4.105304823085911, + "learning_rate": 2.6047364768099326e-07, + "loss": 0.0575, + "step": 2678 + }, + { + "epoch": 3.610512129380054, + "grad_norm": 1.7739675970293738, + "learning_rate": 2.587378308325561e-07, + "loss": 0.0543, + "step": 2679 + }, + { + "epoch": 3.6118598382749325, + "grad_norm": 1.665262085369456, + "learning_rate": 2.5700766351118236e-07, + "loss": 0.0381, + "step": 2680 + }, + { + "epoch": 3.6132075471698113, + "grad_norm": 3.337626568119621, + "learning_rate": 2.5528314777848175e-07, + "loss": 0.0267, + "step": 2681 + }, + { + "epoch": 3.61455525606469, + "grad_norm": 12.380980370341229, + "learning_rate": 2.5356428568932725e-07, + "loss": 0.055, + "step": 2682 + }, + { + "epoch": 3.615902964959569, + "grad_norm": 15.652133838974866, + "learning_rate": 2.518510792918577e-07, + "loss": 0.0742, + "step": 2683 + }, + { + "epoch": 3.617250673854447, + "grad_norm": 7.53246886054573, + "learning_rate": 2.501435306274719e-07, + "loss": 0.0511, + "step": 2684 + }, + { + "epoch": 3.618598382749326, + "grad_norm": 21.169751988108466, + "learning_rate": 2.4844164173082605e-07, + "loss": 0.0508, + "step": 2685 + }, + { + "epoch": 3.6199460916442048, + "grad_norm": 3.974687744281765, + "learning_rate": 2.4674541462983316e-07, + "loss": 0.0511, + "step": 2686 + }, + { + "epoch": 3.6212938005390836, + "grad_norm": 3.3452582253911705, + "learning_rate": 2.4505485134566076e-07, + "loss": 0.0453, + "step": 2687 + }, + { + "epoch": 3.6226415094339623, + "grad_norm": 11.628667113632783, + "learning_rate": 2.433699538927259e-07, + "loss": 0.039, + "step": 2688 + }, + { + "epoch": 3.623989218328841, + "grad_norm": 2.7878607660370096, + "learning_rate": 2.4169072427869535e-07, + "loss": 0.0446, + "step": 2689 + }, + { + "epoch": 3.62533692722372, + "grad_norm": 8.008377415748589, + "learning_rate": 2.4001716450448296e-07, + "loss": 0.053, + "step": 2690 + }, + { + "epoch": 3.6266846361185983, + "grad_norm": 4.093426667789267, + "learning_rate": 2.3834927656424423e-07, + "loss": 0.0442, + "step": 2691 + }, + { + "epoch": 3.628032345013477, + "grad_norm": 7.832705595342025, + "learning_rate": 2.3668706244537876e-07, + "loss": 0.0396, + "step": 2692 + }, + { + "epoch": 3.629380053908356, + "grad_norm": 18.39327095688308, + "learning_rate": 2.3503052412852388e-07, + "loss": 0.0554, + "step": 2693 + }, + { + "epoch": 3.6307277628032346, + "grad_norm": 7.674156972188003, + "learning_rate": 2.3337966358755572e-07, + "loss": 0.0698, + "step": 2694 + }, + { + "epoch": 3.632075471698113, + "grad_norm": 2.190195337877111, + "learning_rate": 2.3173448278958178e-07, + "loss": 0.0604, + "step": 2695 + }, + { + "epoch": 3.6334231805929917, + "grad_norm": 4.564417703201633, + "learning_rate": 2.3009498369494565e-07, + "loss": 0.0499, + "step": 2696 + }, + { + "epoch": 3.6347708894878705, + "grad_norm": 1.250359711467463, + "learning_rate": 2.2846116825721688e-07, + "loss": 0.0473, + "step": 2697 + }, + { + "epoch": 3.6361185983827493, + "grad_norm": 3.5906858234105705, + "learning_rate": 2.2683303842319593e-07, + "loss": 0.0521, + "step": 2698 + }, + { + "epoch": 3.637466307277628, + "grad_norm": 4.533543681071234, + "learning_rate": 2.2521059613290596e-07, + "loss": 0.0379, + "step": 2699 + }, + { + "epoch": 3.638814016172507, + "grad_norm": 4.473092340690641, + "learning_rate": 2.2359384331959556e-07, + "loss": 0.0536, + "step": 2700 + }, + { + "epoch": 3.6401617250673857, + "grad_norm": 7.238929214788152, + "learning_rate": 2.2198278190973145e-07, + "loss": 0.0358, + "step": 2701 + }, + { + "epoch": 3.641509433962264, + "grad_norm": 5.631586811874394, + "learning_rate": 2.2037741382299916e-07, + "loss": 0.0439, + "step": 2702 + }, + { + "epoch": 3.642857142857143, + "grad_norm": 9.926928357080039, + "learning_rate": 2.1877774097230296e-07, + "loss": 0.0517, + "step": 2703 + }, + { + "epoch": 3.6442048517520216, + "grad_norm": 7.664063431452046, + "learning_rate": 2.171837652637554e-07, + "loss": 0.0422, + "step": 2704 + }, + { + "epoch": 3.6455525606469004, + "grad_norm": 10.320041430688827, + "learning_rate": 2.1559548859668766e-07, + "loss": 0.0527, + "step": 2705 + }, + { + "epoch": 3.6469002695417787, + "grad_norm": 12.3887890309736, + "learning_rate": 2.1401291286363312e-07, + "loss": 0.0521, + "step": 2706 + }, + { + "epoch": 3.6482479784366575, + "grad_norm": 4.3182990113119795, + "learning_rate": 2.1243603995033668e-07, + "loss": 0.0464, + "step": 2707 + }, + { + "epoch": 3.6495956873315363, + "grad_norm": 11.69084582161753, + "learning_rate": 2.10864871735747e-07, + "loss": 0.0582, + "step": 2708 + }, + { + "epoch": 3.650943396226415, + "grad_norm": 7.989832637716855, + "learning_rate": 2.0929941009201425e-07, + "loss": 0.0336, + "step": 2709 + }, + { + "epoch": 3.652291105121294, + "grad_norm": 1.9948518116752096, + "learning_rate": 2.0773965688448861e-07, + "loss": 0.0649, + "step": 2710 + }, + { + "epoch": 3.6536388140161726, + "grad_norm": 20.290820237499823, + "learning_rate": 2.0618561397172055e-07, + "loss": 0.0821, + "step": 2711 + }, + { + "epoch": 3.6549865229110514, + "grad_norm": 14.302827015889248, + "learning_rate": 2.0463728320545385e-07, + "loss": 0.0439, + "step": 2712 + }, + { + "epoch": 3.6563342318059298, + "grad_norm": 7.703227118587633, + "learning_rate": 2.030946664306277e-07, + "loss": 0.0436, + "step": 2713 + }, + { + "epoch": 3.6576819407008085, + "grad_norm": 2.882238048112119, + "learning_rate": 2.015577654853712e-07, + "loss": 0.035, + "step": 2714 + }, + { + "epoch": 3.6590296495956873, + "grad_norm": 17.30829193459423, + "learning_rate": 2.0002658220100334e-07, + "loss": 0.0417, + "step": 2715 + }, + { + "epoch": 3.660377358490566, + "grad_norm": 1.7158883247328076, + "learning_rate": 1.9850111840203023e-07, + "loss": 0.0462, + "step": 2716 + }, + { + "epoch": 3.661725067385445, + "grad_norm": 22.467856673125493, + "learning_rate": 1.9698137590614287e-07, + "loss": 0.0435, + "step": 2717 + }, + { + "epoch": 3.6630727762803232, + "grad_norm": 9.29331990902879, + "learning_rate": 1.9546735652421544e-07, + "loss": 0.05, + "step": 2718 + }, + { + "epoch": 3.664420485175202, + "grad_norm": 3.8497198739937892, + "learning_rate": 1.9395906206030047e-07, + "loss": 0.0304, + "step": 2719 + }, + { + "epoch": 3.665768194070081, + "grad_norm": 3.664391717100003, + "learning_rate": 1.9245649431163248e-07, + "loss": 0.0551, + "step": 2720 + }, + { + "epoch": 3.6671159029649596, + "grad_norm": 1.9299982569563479, + "learning_rate": 1.9095965506861825e-07, + "loss": 0.0518, + "step": 2721 + }, + { + "epoch": 3.6684636118598384, + "grad_norm": 3.2863594179353277, + "learning_rate": 1.8946854611484156e-07, + "loss": 0.0519, + "step": 2722 + }, + { + "epoch": 3.669811320754717, + "grad_norm": 23.847866472099838, + "learning_rate": 1.879831692270573e-07, + "loss": 0.0509, + "step": 2723 + }, + { + "epoch": 3.671159029649596, + "grad_norm": 1.2053780387602213, + "learning_rate": 1.8650352617519075e-07, + "loss": 0.0474, + "step": 2724 + }, + { + "epoch": 3.6725067385444743, + "grad_norm": 1.6651262552485462, + "learning_rate": 1.850296187223327e-07, + "loss": 0.0452, + "step": 2725 + }, + { + "epoch": 3.673854447439353, + "grad_norm": 2.9947946339380214, + "learning_rate": 1.8356144862474222e-07, + "loss": 0.0719, + "step": 2726 + }, + { + "epoch": 3.675202156334232, + "grad_norm": 10.588349844764204, + "learning_rate": 1.8209901763184156e-07, + "loss": 0.0566, + "step": 2727 + }, + { + "epoch": 3.6765498652291106, + "grad_norm": 5.044862568895696, + "learning_rate": 1.806423274862118e-07, + "loss": 0.0718, + "step": 2728 + }, + { + "epoch": 3.677897574123989, + "grad_norm": 3.1361105095126174, + "learning_rate": 1.7919137992359835e-07, + "loss": 0.0358, + "step": 2729 + }, + { + "epoch": 3.6792452830188678, + "grad_norm": 6.564517447035489, + "learning_rate": 1.7774617667289828e-07, + "loss": 0.0464, + "step": 2730 + }, + { + "epoch": 3.6805929919137466, + "grad_norm": 10.357711537766997, + "learning_rate": 1.7630671945616851e-07, + "loss": 0.0486, + "step": 2731 + }, + { + "epoch": 3.6819407008086253, + "grad_norm": 2.1232999964817147, + "learning_rate": 1.74873009988617e-07, + "loss": 0.0578, + "step": 2732 + }, + { + "epoch": 3.683288409703504, + "grad_norm": 4.175158893738561, + "learning_rate": 1.734450499786039e-07, + "loss": 0.065, + "step": 2733 + }, + { + "epoch": 3.684636118598383, + "grad_norm": 1.7710280635084814, + "learning_rate": 1.720228411276359e-07, + "loss": 0.0355, + "step": 2734 + }, + { + "epoch": 3.6859838274932617, + "grad_norm": 2.072255427274556, + "learning_rate": 1.7060638513037076e-07, + "loss": 0.0395, + "step": 2735 + }, + { + "epoch": 3.68733153638814, + "grad_norm": 2.9413926825835706, + "learning_rate": 1.6919568367460837e-07, + "loss": 0.0413, + "step": 2736 + }, + { + "epoch": 3.688679245283019, + "grad_norm": 1.9790933648959819, + "learning_rate": 1.6779073844129358e-07, + "loss": 0.0506, + "step": 2737 + }, + { + "epoch": 3.6900269541778976, + "grad_norm": 12.341313180143125, + "learning_rate": 1.6639155110451056e-07, + "loss": 0.0475, + "step": 2738 + }, + { + "epoch": 3.6913746630727764, + "grad_norm": 1.5153231682297466, + "learning_rate": 1.6499812333148346e-07, + "loss": 0.0393, + "step": 2739 + }, + { + "epoch": 3.6927223719676547, + "grad_norm": 3.032602734073951, + "learning_rate": 1.6361045678257414e-07, + "loss": 0.0579, + "step": 2740 + }, + { + "epoch": 3.6940700808625335, + "grad_norm": 8.102868735883801, + "learning_rate": 1.6222855311127827e-07, + "loss": 0.0517, + "step": 2741 + }, + { + "epoch": 3.6954177897574123, + "grad_norm": 3.110092772307966, + "learning_rate": 1.6085241396422647e-07, + "loss": 0.0419, + "step": 2742 + }, + { + "epoch": 3.696765498652291, + "grad_norm": 1.578986960556491, + "learning_rate": 1.594820409811776e-07, + "loss": 0.0333, + "step": 2743 + }, + { + "epoch": 3.69811320754717, + "grad_norm": 1.0758514723415253, + "learning_rate": 1.581174357950238e-07, + "loss": 0.046, + "step": 2744 + }, + { + "epoch": 3.6994609164420487, + "grad_norm": 5.268825797484257, + "learning_rate": 1.5675860003178056e-07, + "loss": 0.0633, + "step": 2745 + }, + { + "epoch": 3.7008086253369274, + "grad_norm": 4.123253507846095, + "learning_rate": 1.5540553531059043e-07, + "loss": 0.0256, + "step": 2746 + }, + { + "epoch": 3.702156334231806, + "grad_norm": 1.7842267487939687, + "learning_rate": 1.54058243243721e-07, + "loss": 0.0563, + "step": 2747 + }, + { + "epoch": 3.7035040431266846, + "grad_norm": 2.9721581315038508, + "learning_rate": 1.5271672543655857e-07, + "loss": 0.0302, + "step": 2748 + }, + { + "epoch": 3.7048517520215634, + "grad_norm": 5.1189408915287, + "learning_rate": 1.5138098348761065e-07, + "loss": 0.047, + "step": 2749 + }, + { + "epoch": 3.706199460916442, + "grad_norm": 3.039944284614734, + "learning_rate": 1.5005101898850128e-07, + "loss": 0.0255, + "step": 2750 + }, + { + "epoch": 3.7075471698113205, + "grad_norm": 9.94024690638682, + "learning_rate": 1.487268335239722e-07, + "loss": 0.0505, + "step": 2751 + }, + { + "epoch": 3.7088948787061993, + "grad_norm": 4.686069695791377, + "learning_rate": 1.4740842867187578e-07, + "loss": 0.0613, + "step": 2752 + }, + { + "epoch": 3.710242587601078, + "grad_norm": 3.367213437990566, + "learning_rate": 1.460958060031814e-07, + "loss": 0.0504, + "step": 2753 + }, + { + "epoch": 3.711590296495957, + "grad_norm": 1.6236320382037188, + "learning_rate": 1.4478896708196354e-07, + "loss": 0.0261, + "step": 2754 + }, + { + "epoch": 3.7129380053908356, + "grad_norm": 1.2325452232814358, + "learning_rate": 1.434879134654077e-07, + "loss": 0.0392, + "step": 2755 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 3.989534372519983, + "learning_rate": 1.421926467038054e-07, + "loss": 0.055, + "step": 2756 + }, + { + "epoch": 3.715633423180593, + "grad_norm": 1.9768547875046119, + "learning_rate": 1.4090316834055262e-07, + "loss": 0.043, + "step": 2757 + }, + { + "epoch": 3.7169811320754715, + "grad_norm": 1.6785077196459035, + "learning_rate": 1.3961947991214698e-07, + "loss": 0.0298, + "step": 2758 + }, + { + "epoch": 3.7183288409703503, + "grad_norm": 5.308858753837068, + "learning_rate": 1.3834158294818988e-07, + "loss": 0.072, + "step": 2759 + }, + { + "epoch": 3.719676549865229, + "grad_norm": 3.5585030017462063, + "learning_rate": 1.3706947897137834e-07, + "loss": 0.0559, + "step": 2760 + }, + { + "epoch": 3.721024258760108, + "grad_norm": 10.39283097266443, + "learning_rate": 1.358031694975087e-07, + "loss": 0.0386, + "step": 2761 + }, + { + "epoch": 3.7223719676549867, + "grad_norm": 5.663438199494803, + "learning_rate": 1.345426560354729e-07, + "loss": 0.0624, + "step": 2762 + }, + { + "epoch": 3.723719676549865, + "grad_norm": 3.7197317794473475, + "learning_rate": 1.3328794008725555e-07, + "loss": 0.0707, + "step": 2763 + }, + { + "epoch": 3.725067385444744, + "grad_norm": 5.184223938111806, + "learning_rate": 1.320390231479335e-07, + "loss": 0.0652, + "step": 2764 + }, + { + "epoch": 3.7264150943396226, + "grad_norm": 2.517942931224687, + "learning_rate": 1.3079590670567356e-07, + "loss": 0.0368, + "step": 2765 + }, + { + "epoch": 3.7277628032345014, + "grad_norm": 5.590204940140623, + "learning_rate": 1.2955859224173251e-07, + "loss": 0.0463, + "step": 2766 + }, + { + "epoch": 3.72911051212938, + "grad_norm": 4.024760802616726, + "learning_rate": 1.283270812304499e-07, + "loss": 0.0457, + "step": 2767 + }, + { + "epoch": 3.730458221024259, + "grad_norm": 9.824090933298285, + "learning_rate": 1.2710137513925468e-07, + "loss": 0.0579, + "step": 2768 + }, + { + "epoch": 3.7318059299191377, + "grad_norm": 10.959249596988972, + "learning_rate": 1.2588147542865525e-07, + "loss": 0.0453, + "step": 2769 + }, + { + "epoch": 3.733153638814016, + "grad_norm": 6.182859601439868, + "learning_rate": 1.2466738355224327e-07, + "loss": 0.0453, + "step": 2770 + }, + { + "epoch": 3.734501347708895, + "grad_norm": 4.983340134416494, + "learning_rate": 1.2345910095668934e-07, + "loss": 0.0899, + "step": 2771 + }, + { + "epoch": 3.7358490566037736, + "grad_norm": 2.27428727444909, + "learning_rate": 1.222566290817423e-07, + "loss": 0.0492, + "step": 2772 + }, + { + "epoch": 3.7371967654986524, + "grad_norm": 1.3415084311226164, + "learning_rate": 1.2105996936022545e-07, + "loss": 0.0377, + "step": 2773 + }, + { + "epoch": 3.7385444743935308, + "grad_norm": 10.777290336854273, + "learning_rate": 1.1986912321803935e-07, + "loss": 0.0411, + "step": 2774 + }, + { + "epoch": 3.7398921832884096, + "grad_norm": 2.6916725184672776, + "learning_rate": 1.186840920741561e-07, + "loss": 0.0203, + "step": 2775 + }, + { + "epoch": 3.7412398921832883, + "grad_norm": 5.028089088145755, + "learning_rate": 1.1750487734061677e-07, + "loss": 0.0368, + "step": 2776 + }, + { + "epoch": 3.742587601078167, + "grad_norm": 13.457374829254713, + "learning_rate": 1.1633148042253516e-07, + "loss": 0.0446, + "step": 2777 + }, + { + "epoch": 3.743935309973046, + "grad_norm": 2.7649564967725517, + "learning_rate": 1.1516390271809063e-07, + "loss": 0.0389, + "step": 2778 + }, + { + "epoch": 3.7452830188679247, + "grad_norm": 1.6940422945330769, + "learning_rate": 1.1400214561852973e-07, + "loss": 0.0422, + "step": 2779 + }, + { + "epoch": 3.7466307277628035, + "grad_norm": 12.39956848341851, + "learning_rate": 1.128462105081618e-07, + "loss": 0.0739, + "step": 2780 + }, + { + "epoch": 3.747978436657682, + "grad_norm": 4.13878420993068, + "learning_rate": 1.1169609876436061e-07, + "loss": 0.0351, + "step": 2781 + }, + { + "epoch": 3.7493261455525606, + "grad_norm": 4.223243840986754, + "learning_rate": 1.1055181175755992e-07, + "loss": 0.0484, + "step": 2782 + }, + { + "epoch": 3.7506738544474394, + "grad_norm": 2.664483947266017, + "learning_rate": 1.0941335085125349e-07, + "loss": 0.0419, + "step": 2783 + }, + { + "epoch": 3.752021563342318, + "grad_norm": 7.738390402565279, + "learning_rate": 1.0828071740199286e-07, + "loss": 0.0339, + "step": 2784 + }, + { + "epoch": 3.7533692722371965, + "grad_norm": 7.911235841603667, + "learning_rate": 1.0715391275938513e-07, + "loss": 0.0343, + "step": 2785 + }, + { + "epoch": 3.7547169811320753, + "grad_norm": 6.881538176445077, + "learning_rate": 1.0603293826609296e-07, + "loss": 0.0257, + "step": 2786 + }, + { + "epoch": 3.756064690026954, + "grad_norm": 1.249340325109362, + "learning_rate": 1.0491779525783119e-07, + "loss": 0.0552, + "step": 2787 + }, + { + "epoch": 3.757412398921833, + "grad_norm": 2.8050310239382066, + "learning_rate": 1.0380848506336639e-07, + "loss": 0.0588, + "step": 2788 + }, + { + "epoch": 3.7587601078167117, + "grad_norm": 2.7240911596497623, + "learning_rate": 1.0270500900451453e-07, + "loss": 0.0447, + "step": 2789 + }, + { + "epoch": 3.7601078167115904, + "grad_norm": 2.724885022311007, + "learning_rate": 1.016073683961416e-07, + "loss": 0.0427, + "step": 2790 + }, + { + "epoch": 3.7614555256064692, + "grad_norm": 6.507653824005585, + "learning_rate": 1.0051556454615696e-07, + "loss": 0.0636, + "step": 2791 + }, + { + "epoch": 3.7628032345013476, + "grad_norm": 9.290307681228247, + "learning_rate": 9.942959875551883e-08, + "loss": 0.0376, + "step": 2792 + }, + { + "epoch": 3.7641509433962264, + "grad_norm": 8.216471481913903, + "learning_rate": 9.8349472318226e-08, + "loss": 0.0413, + "step": 2793 + }, + { + "epoch": 3.765498652291105, + "grad_norm": 12.575983398255376, + "learning_rate": 9.727518652132062e-08, + "loss": 0.0481, + "step": 2794 + }, + { + "epoch": 3.766846361185984, + "grad_norm": 3.1157167320260806, + "learning_rate": 9.620674264488594e-08, + "loss": 0.0394, + "step": 2795 + }, + { + "epoch": 3.7681940700808623, + "grad_norm": 9.113974873234556, + "learning_rate": 9.514414196204302e-08, + "loss": 0.0365, + "step": 2796 + }, + { + "epoch": 3.769541778975741, + "grad_norm": 6.557678140358577, + "learning_rate": 9.408738573895015e-08, + "loss": 0.0466, + "step": 2797 + }, + { + "epoch": 3.77088948787062, + "grad_norm": 2.6035916037771605, + "learning_rate": 9.30364752348023e-08, + "loss": 0.0711, + "step": 2798 + }, + { + "epoch": 3.7722371967654986, + "grad_norm": 12.000588946208916, + "learning_rate": 9.199141170183001e-08, + "loss": 0.0415, + "step": 2799 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 2.1538596247218535, + "learning_rate": 9.095219638529385e-08, + "loss": 0.0433, + "step": 2800 + }, + { + "epoch": 3.774932614555256, + "grad_norm": 5.84451600504022, + "learning_rate": 8.991883052348883e-08, + "loss": 0.027, + "step": 2801 + }, + { + "epoch": 3.776280323450135, + "grad_norm": 1.8422279710639362, + "learning_rate": 8.889131534773776e-08, + "loss": 0.0358, + "step": 2802 + }, + { + "epoch": 3.7776280323450133, + "grad_norm": 2.6313399823503008, + "learning_rate": 8.786965208239296e-08, + "loss": 0.057, + "step": 2803 + }, + { + "epoch": 3.778975741239892, + "grad_norm": 9.916156105033927, + "learning_rate": 8.685384194483448e-08, + "loss": 0.0413, + "step": 2804 + }, + { + "epoch": 3.780323450134771, + "grad_norm": 4.752259653104604, + "learning_rate": 8.58438861454669e-08, + "loss": 0.0502, + "step": 2805 + }, + { + "epoch": 3.7816711590296497, + "grad_norm": 6.62531760956988, + "learning_rate": 8.483978588771758e-08, + "loss": 0.0389, + "step": 2806 + }, + { + "epoch": 3.7830188679245285, + "grad_norm": 2.697006268907918, + "learning_rate": 8.384154236804109e-08, + "loss": 0.0447, + "step": 2807 + }, + { + "epoch": 3.784366576819407, + "grad_norm": 4.496563555065174, + "learning_rate": 8.284915677590877e-08, + "loss": 0.0571, + "step": 2808 + }, + { + "epoch": 3.7857142857142856, + "grad_norm": 3.09908001930826, + "learning_rate": 8.186263029381358e-08, + "loss": 0.0594, + "step": 2809 + }, + { + "epoch": 3.7870619946091644, + "grad_norm": 10.038859856801103, + "learning_rate": 8.088196409726801e-08, + "loss": 0.0474, + "step": 2810 + }, + { + "epoch": 3.788409703504043, + "grad_norm": 9.536723322493089, + "learning_rate": 7.990715935479953e-08, + "loss": 0.0396, + "step": 2811 + }, + { + "epoch": 3.789757412398922, + "grad_norm": 12.013085978828316, + "learning_rate": 7.893821722795292e-08, + "loss": 0.0369, + "step": 2812 + }, + { + "epoch": 3.7911051212938007, + "grad_norm": 5.6119292208173395, + "learning_rate": 7.797513887128683e-08, + "loss": 0.0246, + "step": 2813 + }, + { + "epoch": 3.7924528301886795, + "grad_norm": 2.278737637861351, + "learning_rate": 7.701792543237275e-08, + "loss": 0.0467, + "step": 2814 + }, + { + "epoch": 3.793800539083558, + "grad_norm": 3.8501967236660968, + "learning_rate": 7.606657805179274e-08, + "loss": 0.0502, + "step": 2815 + }, + { + "epoch": 3.7951482479784366, + "grad_norm": 1.078323636352407, + "learning_rate": 7.51210978631417e-08, + "loss": 0.0406, + "step": 2816 + }, + { + "epoch": 3.7964959568733154, + "grad_norm": 6.791392296985932, + "learning_rate": 7.418148599302066e-08, + "loss": 0.0439, + "step": 2817 + }, + { + "epoch": 3.797843665768194, + "grad_norm": 6.651339554510167, + "learning_rate": 7.324774356103958e-08, + "loss": 0.0351, + "step": 2818 + }, + { + "epoch": 3.7991913746630726, + "grad_norm": 6.9181201299126185, + "learning_rate": 7.231987167981347e-08, + "loss": 0.0448, + "step": 2819 + }, + { + "epoch": 3.8005390835579513, + "grad_norm": 1.1143531846323622, + "learning_rate": 7.139787145496457e-08, + "loss": 0.058, + "step": 2820 + }, + { + "epoch": 3.80188679245283, + "grad_norm": 3.5744434423807627, + "learning_rate": 7.048174398511576e-08, + "loss": 0.0477, + "step": 2821 + }, + { + "epoch": 3.803234501347709, + "grad_norm": 11.807970340286998, + "learning_rate": 6.957149036189325e-08, + "loss": 0.0409, + "step": 2822 + }, + { + "epoch": 3.8045822102425877, + "grad_norm": 5.624235874570568, + "learning_rate": 6.866711166992557e-08, + "loss": 0.0369, + "step": 2823 + }, + { + "epoch": 3.8059299191374665, + "grad_norm": 1.3725139300267497, + "learning_rate": 6.776860898683846e-08, + "loss": 0.0269, + "step": 2824 + }, + { + "epoch": 3.8072776280323453, + "grad_norm": 6.1894592587356785, + "learning_rate": 6.68759833832583e-08, + "loss": 0.0374, + "step": 2825 + }, + { + "epoch": 3.8086253369272236, + "grad_norm": 12.267007362741388, + "learning_rate": 6.598923592280648e-08, + "loss": 0.0515, + "step": 2826 + }, + { + "epoch": 3.8099730458221024, + "grad_norm": 5.088376265292601, + "learning_rate": 6.510836766210115e-08, + "loss": 0.0377, + "step": 2827 + }, + { + "epoch": 3.811320754716981, + "grad_norm": 7.772045202731622, + "learning_rate": 6.423337965075604e-08, + "loss": 0.0544, + "step": 2828 + }, + { + "epoch": 3.81266846361186, + "grad_norm": 5.561545903205886, + "learning_rate": 6.336427293137714e-08, + "loss": 0.039, + "step": 2829 + }, + { + "epoch": 3.8140161725067383, + "grad_norm": 5.125193852066618, + "learning_rate": 6.250104853956052e-08, + "loss": 0.0414, + "step": 2830 + }, + { + "epoch": 3.815363881401617, + "grad_norm": 5.502353876553165, + "learning_rate": 6.164370750389781e-08, + "loss": 0.0423, + "step": 2831 + }, + { + "epoch": 3.816711590296496, + "grad_norm": 9.699880623408758, + "learning_rate": 6.079225084596574e-08, + "loss": 0.0441, + "step": 2832 + }, + { + "epoch": 3.8180592991913747, + "grad_norm": 7.4867608530997005, + "learning_rate": 5.994667958033163e-08, + "loss": 0.061, + "step": 2833 + }, + { + "epoch": 3.8194070080862534, + "grad_norm": 3.375973748947476, + "learning_rate": 5.910699471455006e-08, + "loss": 0.063, + "step": 2834 + }, + { + "epoch": 3.8207547169811322, + "grad_norm": 4.132384763899708, + "learning_rate": 5.827319724915959e-08, + "loss": 0.0255, + "step": 2835 + }, + { + "epoch": 3.822102425876011, + "grad_norm": 2.7329215014592023, + "learning_rate": 5.744528817768602e-08, + "loss": 0.0244, + "step": 2836 + }, + { + "epoch": 3.8234501347708894, + "grad_norm": 15.825293099950747, + "learning_rate": 5.6623268486637464e-08, + "loss": 0.0649, + "step": 2837 + }, + { + "epoch": 3.824797843665768, + "grad_norm": 12.147336652604853, + "learning_rate": 5.5807139155505395e-08, + "loss": 0.0601, + "step": 2838 + }, + { + "epoch": 3.826145552560647, + "grad_norm": 1.8451026355526814, + "learning_rate": 5.4996901156760266e-08, + "loss": 0.0391, + "step": 2839 + }, + { + "epoch": 3.8274932614555257, + "grad_norm": 9.48587658853723, + "learning_rate": 5.419255545585533e-08, + "loss": 0.0529, + "step": 2840 + }, + { + "epoch": 3.828840970350404, + "grad_norm": 3.7636439662670806, + "learning_rate": 5.339410301122172e-08, + "loss": 0.0622, + "step": 2841 + }, + { + "epoch": 3.830188679245283, + "grad_norm": 6.29758206364095, + "learning_rate": 5.260154477426727e-08, + "loss": 0.042, + "step": 2842 + }, + { + "epoch": 3.8315363881401616, + "grad_norm": 4.90586044934946, + "learning_rate": 5.181488168937876e-08, + "loss": 0.0333, + "step": 2843 + }, + { + "epoch": 3.8328840970350404, + "grad_norm": 5.988016705480039, + "learning_rate": 5.103411469391639e-08, + "loss": 0.036, + "step": 2844 + }, + { + "epoch": 3.834231805929919, + "grad_norm": 1.0632997037969083, + "learning_rate": 5.0259244718215414e-08, + "loss": 0.0404, + "step": 2845 + }, + { + "epoch": 3.835579514824798, + "grad_norm": 12.126917850335584, + "learning_rate": 4.949027268558504e-08, + "loss": 0.0386, + "step": 2846 + }, + { + "epoch": 3.8369272237196768, + "grad_norm": 4.9569961500205375, + "learning_rate": 4.872719951230675e-08, + "loss": 0.0556, + "step": 2847 + }, + { + "epoch": 3.838274932614555, + "grad_norm": 4.217565241154412, + "learning_rate": 4.797002610763102e-08, + "loss": 0.0542, + "step": 2848 + }, + { + "epoch": 3.839622641509434, + "grad_norm": 3.254150492565113, + "learning_rate": 4.721875337378168e-08, + "loss": 0.0494, + "step": 2849 + }, + { + "epoch": 3.8409703504043127, + "grad_norm": 1.48903157813167, + "learning_rate": 4.647338220594932e-08, + "loss": 0.0264, + "step": 2850 + }, + { + "epoch": 3.8423180592991915, + "grad_norm": 14.586925963346257, + "learning_rate": 4.573391349229239e-08, + "loss": 0.0709, + "step": 2851 + }, + { + "epoch": 3.8436657681940702, + "grad_norm": 12.340587719966756, + "learning_rate": 4.5000348113937166e-08, + "loss": 0.054, + "step": 2852 + }, + { + "epoch": 3.8450134770889486, + "grad_norm": 4.091007288698401, + "learning_rate": 4.4272686944975e-08, + "loss": 0.0529, + "step": 2853 + }, + { + "epoch": 3.8463611859838274, + "grad_norm": 1.8278793121455683, + "learning_rate": 4.355093085246232e-08, + "loss": 0.0376, + "step": 2854 + }, + { + "epoch": 3.847708894878706, + "grad_norm": 1.061928583578243, + "learning_rate": 4.283508069641951e-08, + "loss": 0.0463, + "step": 2855 + }, + { + "epoch": 3.849056603773585, + "grad_norm": 1.4596766857576322, + "learning_rate": 4.212513732982926e-08, + "loss": 0.0553, + "step": 2856 + }, + { + "epoch": 3.8504043126684637, + "grad_norm": 2.181958709811321, + "learning_rate": 4.142110159863544e-08, + "loss": 0.0401, + "step": 2857 + }, + { + "epoch": 3.8517520215633425, + "grad_norm": 2.79316269624242, + "learning_rate": 4.072297434174366e-08, + "loss": 0.0758, + "step": 2858 + }, + { + "epoch": 3.8530997304582213, + "grad_norm": 10.990533239457145, + "learning_rate": 4.00307563910185e-08, + "loss": 0.0446, + "step": 2859 + }, + { + "epoch": 3.8544474393530996, + "grad_norm": 12.782963830075772, + "learning_rate": 3.934444857128295e-08, + "loss": 0.0487, + "step": 2860 + }, + { + "epoch": 3.8557951482479784, + "grad_norm": 9.446856024753524, + "learning_rate": 3.866405170031895e-08, + "loss": 0.0385, + "step": 2861 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 6.109742279925789, + "learning_rate": 3.7989566588863544e-08, + "loss": 0.0335, + "step": 2862 + }, + { + "epoch": 3.858490566037736, + "grad_norm": 5.992262458580537, + "learning_rate": 3.732099404061052e-08, + "loss": 0.0503, + "step": 2863 + }, + { + "epoch": 3.8598382749326143, + "grad_norm": 7.324240234266683, + "learning_rate": 3.665833485220927e-08, + "loss": 0.0546, + "step": 2864 + }, + { + "epoch": 3.861185983827493, + "grad_norm": 5.912611533658952, + "learning_rate": 3.6001589813260405e-08, + "loss": 0.0226, + "step": 2865 + }, + { + "epoch": 3.862533692722372, + "grad_norm": 4.79819262195424, + "learning_rate": 3.535075970631963e-08, + "loss": 0.0599, + "step": 2866 + }, + { + "epoch": 3.8638814016172507, + "grad_norm": 6.862303898201932, + "learning_rate": 3.47058453068938e-08, + "loss": 0.052, + "step": 2867 + }, + { + "epoch": 3.8652291105121295, + "grad_norm": 5.914860721429833, + "learning_rate": 3.4066847383442125e-08, + "loss": 0.0447, + "step": 2868 + }, + { + "epoch": 3.8665768194070083, + "grad_norm": 6.250281027171524, + "learning_rate": 3.3433766697371085e-08, + "loss": 0.0526, + "step": 2869 + }, + { + "epoch": 3.867924528301887, + "grad_norm": 1.4494907384058866, + "learning_rate": 3.2806604003039475e-08, + "loss": 0.0461, + "step": 2870 + }, + { + "epoch": 3.8692722371967654, + "grad_norm": 6.454907044753073, + "learning_rate": 3.2185360047752854e-08, + "loss": 0.0482, + "step": 2871 + }, + { + "epoch": 3.870619946091644, + "grad_norm": 2.388740575024787, + "learning_rate": 3.157003557176408e-08, + "loss": 0.0603, + "step": 2872 + }, + { + "epoch": 3.871967654986523, + "grad_norm": 10.086251968330647, + "learning_rate": 3.096063130827331e-08, + "loss": 0.0471, + "step": 2873 + }, + { + "epoch": 3.8733153638814017, + "grad_norm": 4.9919971675813954, + "learning_rate": 3.035714798342526e-08, + "loss": 0.0364, + "step": 2874 + }, + { + "epoch": 3.87466307277628, + "grad_norm": 5.356409545056359, + "learning_rate": 2.975958631631082e-08, + "loss": 0.0539, + "step": 2875 + }, + { + "epoch": 3.876010781671159, + "grad_norm": 4.828527003403077, + "learning_rate": 2.916794701896375e-08, + "loss": 0.0544, + "step": 2876 + }, + { + "epoch": 3.8773584905660377, + "grad_norm": 4.914931295471877, + "learning_rate": 2.8582230796362352e-08, + "loss": 0.0587, + "step": 2877 + }, + { + "epoch": 3.8787061994609164, + "grad_norm": 9.067140690320944, + "learning_rate": 2.8002438346424467e-08, + "loss": 0.0597, + "step": 2878 + }, + { + "epoch": 3.8800539083557952, + "grad_norm": 10.653124110444182, + "learning_rate": 2.7428570360013006e-08, + "loss": 0.0566, + "step": 2879 + }, + { + "epoch": 3.881401617250674, + "grad_norm": 3.4943980435069912, + "learning_rate": 2.686062752092822e-08, + "loss": 0.055, + "step": 2880 + }, + { + "epoch": 3.882749326145553, + "grad_norm": 6.624860314186624, + "learning_rate": 2.629861050591209e-08, + "loss": 0.0439, + "step": 2881 + }, + { + "epoch": 3.884097035040431, + "grad_norm": 5.794647551071399, + "learning_rate": 2.5742519984645053e-08, + "loss": 0.0513, + "step": 2882 + }, + { + "epoch": 3.88544474393531, + "grad_norm": 2.385001683178302, + "learning_rate": 2.519235661974484e-08, + "loss": 0.0406, + "step": 2883 + }, + { + "epoch": 3.8867924528301887, + "grad_norm": 4.007688788158112, + "learning_rate": 2.4648121066768728e-08, + "loss": 0.0346, + "step": 2884 + }, + { + "epoch": 3.8881401617250675, + "grad_norm": 7.806082835147591, + "learning_rate": 2.4109813974208527e-08, + "loss": 0.0375, + "step": 2885 + }, + { + "epoch": 3.889487870619946, + "grad_norm": 2.8602686236631927, + "learning_rate": 2.357743598349338e-08, + "loss": 0.0243, + "step": 2886 + }, + { + "epoch": 3.8908355795148246, + "grad_norm": 16.05278438501454, + "learning_rate": 2.3050987728985286e-08, + "loss": 0.0546, + "step": 2887 + }, + { + "epoch": 3.8921832884097034, + "grad_norm": 19.382134102397636, + "learning_rate": 2.2530469837984125e-08, + "loss": 0.0633, + "step": 2888 + }, + { + "epoch": 3.893530997304582, + "grad_norm": 8.483000277639226, + "learning_rate": 2.2015882930720433e-08, + "loss": 0.0601, + "step": 2889 + }, + { + "epoch": 3.894878706199461, + "grad_norm": 0.8937294214039072, + "learning_rate": 2.1507227620358174e-08, + "loss": 0.0276, + "step": 2890 + }, + { + "epoch": 3.8962264150943398, + "grad_norm": 2.98542717994245, + "learning_rate": 2.100450451299363e-08, + "loss": 0.0451, + "step": 2891 + }, + { + "epoch": 3.8975741239892185, + "grad_norm": 1.316981826801314, + "learning_rate": 2.050771420765596e-08, + "loss": 0.0467, + "step": 2892 + }, + { + "epoch": 3.898921832884097, + "grad_norm": 2.100695860510542, + "learning_rate": 2.0016857296302207e-08, + "loss": 0.0467, + "step": 2893 + }, + { + "epoch": 3.9002695417789757, + "grad_norm": 4.92832641723696, + "learning_rate": 1.953193436382117e-08, + "loss": 0.0602, + "step": 2894 + }, + { + "epoch": 3.9016172506738545, + "grad_norm": 0.7698172237214321, + "learning_rate": 1.9052945988030648e-08, + "loss": 0.0194, + "step": 2895 + }, + { + "epoch": 3.9029649595687332, + "grad_norm": 5.679517670737282, + "learning_rate": 1.8579892739676865e-08, + "loss": 0.0357, + "step": 2896 + }, + { + "epoch": 3.904312668463612, + "grad_norm": 1.6342105977545285, + "learning_rate": 1.8112775182434485e-08, + "loss": 0.03, + "step": 2897 + }, + { + "epoch": 3.9056603773584904, + "grad_norm": 3.7358799249166603, + "learning_rate": 1.765159387290438e-08, + "loss": 0.0693, + "step": 2898 + }, + { + "epoch": 3.907008086253369, + "grad_norm": 3.7908511150619675, + "learning_rate": 1.719634936061476e-08, + "loss": 0.0249, + "step": 2899 + }, + { + "epoch": 3.908355795148248, + "grad_norm": 7.876226827346509, + "learning_rate": 1.6747042188018925e-08, + "loss": 0.0599, + "step": 2900 + }, + { + "epoch": 3.9097035040431267, + "grad_norm": 9.892494081790367, + "learning_rate": 1.6303672890497503e-08, + "loss": 0.0364, + "step": 2901 + }, + { + "epoch": 3.9110512129380055, + "grad_norm": 9.892494081790367, + "learning_rate": 1.6303672890497503e-08, + "loss": 0.0938, + "step": 2902 + }, + { + "epoch": 3.9123989218328843, + "grad_norm": 5.280640353746535, + "learning_rate": 1.5866241996352893e-08, + "loss": 0.0416, + "step": 2903 + }, + { + "epoch": 3.913746630727763, + "grad_norm": 13.37458285774401, + "learning_rate": 1.5434750026813717e-08, + "loss": 0.0468, + "step": 2904 + }, + { + "epoch": 3.9150943396226414, + "grad_norm": 19.35308457234627, + "learning_rate": 1.5009197496030358e-08, + "loss": 0.0564, + "step": 2905 + }, + { + "epoch": 3.91644204851752, + "grad_norm": 3.0589181577722604, + "learning_rate": 1.4589584911077759e-08, + "loss": 0.0412, + "step": 2906 + }, + { + "epoch": 3.917789757412399, + "grad_norm": 7.274684386497305, + "learning_rate": 1.4175912771951517e-08, + "loss": 0.0376, + "step": 2907 + }, + { + "epoch": 3.9191374663072778, + "grad_norm": 3.8957147535889436, + "learning_rate": 1.3768181571569006e-08, + "loss": 0.0456, + "step": 2908 + }, + { + "epoch": 3.920485175202156, + "grad_norm": 9.325790518726736, + "learning_rate": 1.3366391795769373e-08, + "loss": 0.042, + "step": 2909 + }, + { + "epoch": 3.921832884097035, + "grad_norm": 2.9156382122616935, + "learning_rate": 1.2970543923311319e-08, + "loss": 0.047, + "step": 2910 + }, + { + "epoch": 3.9231805929919137, + "grad_norm": 2.4338957752110257, + "learning_rate": 1.2580638425874204e-08, + "loss": 0.0409, + "step": 2911 + }, + { + "epoch": 3.9245283018867925, + "grad_norm": 0.7264537342654596, + "learning_rate": 1.2196675768055832e-08, + "loss": 0.0233, + "step": 2912 + }, + { + "epoch": 3.9258760107816713, + "grad_norm": 5.372308706075504, + "learning_rate": 1.1818656407373008e-08, + "loss": 0.0382, + "step": 2913 + }, + { + "epoch": 3.92722371967655, + "grad_norm": 11.712349543248509, + "learning_rate": 1.1446580794260975e-08, + "loss": 0.0719, + "step": 2914 + }, + { + "epoch": 3.928571428571429, + "grad_norm": 10.002885103997789, + "learning_rate": 1.1080449372072311e-08, + "loss": 0.0399, + "step": 2915 + }, + { + "epoch": 3.929919137466307, + "grad_norm": 1.8034723565556012, + "learning_rate": 1.0720262577076923e-08, + "loss": 0.0486, + "step": 2916 + }, + { + "epoch": 3.931266846361186, + "grad_norm": 8.227450038263035, + "learning_rate": 1.03660208384615e-08, + "loss": 0.0388, + "step": 2917 + }, + { + "epoch": 3.9326145552560647, + "grad_norm": 12.567939181994017, + "learning_rate": 1.0017724578327281e-08, + "loss": 0.0558, + "step": 2918 + }, + { + "epoch": 3.9339622641509435, + "grad_norm": 3.241615396707216, + "learning_rate": 9.6753742116934e-09, + "loss": 0.0414, + "step": 2919 + }, + { + "epoch": 3.935309973045822, + "grad_norm": 10.451394034602382, + "learning_rate": 9.338970146492431e-09, + "loss": 0.0278, + "step": 2920 + }, + { + "epoch": 3.9366576819407006, + "grad_norm": 13.056541853123605, + "learning_rate": 9.008512783572066e-09, + "loss": 0.0518, + "step": 2921 + }, + { + "epoch": 3.9380053908355794, + "grad_norm": 4.617194780252319, + "learning_rate": 8.684002516694546e-09, + "loss": 0.0563, + "step": 2922 + }, + { + "epoch": 3.939353099730458, + "grad_norm": 5.842583169353607, + "learning_rate": 8.365439732534453e-09, + "loss": 0.0599, + "step": 2923 + }, + { + "epoch": 3.940700808625337, + "grad_norm": 7.374951583562156, + "learning_rate": 8.05282481068148e-09, + "loss": 0.0357, + "step": 2924 + }, + { + "epoch": 3.942048517520216, + "grad_norm": 7.058054759676774, + "learning_rate": 7.746158123635994e-09, + "loss": 0.0391, + "step": 2925 + }, + { + "epoch": 3.9433962264150946, + "grad_norm": 7.540835084766388, + "learning_rate": 7.4454400368118015e-09, + "loss": 0.053, + "step": 2926 + }, + { + "epoch": 3.944743935309973, + "grad_norm": 4.170454421859463, + "learning_rate": 7.150670908535051e-09, + "loss": 0.0292, + "step": 2927 + }, + { + "epoch": 3.9460916442048517, + "grad_norm": 10.113051458278116, + "learning_rate": 6.8618510900414495e-09, + "loss": 0.0433, + "step": 2928 + }, + { + "epoch": 3.9474393530997305, + "grad_norm": 14.249415757402923, + "learning_rate": 6.578980925479594e-09, + "loss": 0.0374, + "step": 2929 + }, + { + "epoch": 3.9487870619946093, + "grad_norm": 8.076212954389966, + "learning_rate": 6.302060751908201e-09, + "loss": 0.0476, + "step": 2930 + }, + { + "epoch": 3.9501347708894876, + "grad_norm": 3.6067483733652783, + "learning_rate": 6.0310908992955444e-09, + "loss": 0.0431, + "step": 2931 + }, + { + "epoch": 3.9514824797843664, + "grad_norm": 10.71326681321002, + "learning_rate": 5.7660716905205696e-09, + "loss": 0.0665, + "step": 2932 + }, + { + "epoch": 3.952830188679245, + "grad_norm": 12.272539276954214, + "learning_rate": 5.507003441370673e-09, + "loss": 0.0324, + "step": 2933 + }, + { + "epoch": 3.954177897574124, + "grad_norm": 5.069527736229834, + "learning_rate": 5.253886460542257e-09, + "loss": 0.0312, + "step": 2934 + }, + { + "epoch": 3.9555256064690028, + "grad_norm": 22.604732398949245, + "learning_rate": 5.0067210496423935e-09, + "loss": 0.0735, + "step": 2935 + }, + { + "epoch": 3.9568733153638815, + "grad_norm": 1.7392452730641985, + "learning_rate": 4.76550750318383e-09, + "loss": 0.0452, + "step": 2936 + }, + { + "epoch": 3.9582210242587603, + "grad_norm": 2.849447444790203, + "learning_rate": 4.530246108588876e-09, + "loss": 0.0253, + "step": 2937 + }, + { + "epoch": 3.9595687331536387, + "grad_norm": 10.441013546292712, + "learning_rate": 4.3009371461871785e-09, + "loss": 0.0528, + "step": 2938 + }, + { + "epoch": 3.9609164420485174, + "grad_norm": 10.42186603766956, + "learning_rate": 4.077580889215171e-09, + "loss": 0.0466, + "step": 2939 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 7.287719196266435, + "learning_rate": 3.8601776038166286e-09, + "loss": 0.0496, + "step": 2940 + }, + { + "epoch": 3.963611859838275, + "grad_norm": 2.2482739385187975, + "learning_rate": 3.648727549042108e-09, + "loss": 0.03, + "step": 2941 + }, + { + "epoch": 3.964959568733154, + "grad_norm": 7.522954706783293, + "learning_rate": 3.4432309768483994e-09, + "loss": 0.0466, + "step": 2942 + }, + { + "epoch": 3.966307277628032, + "grad_norm": 3.0310051082996146, + "learning_rate": 3.2436881320974113e-09, + "loss": 0.0732, + "step": 2943 + }, + { + "epoch": 3.967654986522911, + "grad_norm": 6.330173272441354, + "learning_rate": 3.0500992525589467e-09, + "loss": 0.0393, + "step": 2944 + }, + { + "epoch": 3.9690026954177897, + "grad_norm": 7.699889835107648, + "learning_rate": 2.8624645689062645e-09, + "loss": 0.0581, + "step": 2945 + }, + { + "epoch": 3.9703504043126685, + "grad_norm": 3.760414170348352, + "learning_rate": 2.680784304718298e-09, + "loss": 0.0218, + "step": 2946 + }, + { + "epoch": 3.9716981132075473, + "grad_norm": 3.326682093597665, + "learning_rate": 2.5050586764790995e-09, + "loss": 0.0418, + "step": 2947 + }, + { + "epoch": 3.973045822102426, + "grad_norm": 13.538031303569204, + "learning_rate": 2.3352878935778424e-09, + "loss": 0.0527, + "step": 2948 + }, + { + "epoch": 3.974393530997305, + "grad_norm": 1.4679967626483512, + "learning_rate": 2.171472158307153e-09, + "loss": 0.0497, + "step": 2949 + }, + { + "epoch": 3.975741239892183, + "grad_norm": 10.159380656941977, + "learning_rate": 2.0136116658642233e-09, + "loss": 0.0495, + "step": 2950 + }, + { + "epoch": 3.977088948787062, + "grad_norm": 12.186249528346002, + "learning_rate": 1.8617066043508103e-09, + "loss": 0.0671, + "step": 2951 + }, + { + "epoch": 3.9784366576819408, + "grad_norm": 9.241474126244983, + "learning_rate": 1.715757154771569e-09, + "loss": 0.0603, + "step": 2952 + }, + { + "epoch": 3.9797843665768196, + "grad_norm": 4.904003574840321, + "learning_rate": 1.5757634910351648e-09, + "loss": 0.0405, + "step": 2953 + }, + { + "epoch": 3.981132075471698, + "grad_norm": 13.682193054981768, + "learning_rate": 1.4417257799526075e-09, + "loss": 0.0706, + "step": 2954 + }, + { + "epoch": 3.9824797843665767, + "grad_norm": 4.007384017471423, + "learning_rate": 1.3136441812389156e-09, + "loss": 0.0569, + "step": 2955 + }, + { + "epoch": 3.9838274932614555, + "grad_norm": 2.7963316485289567, + "learning_rate": 1.1915188475125627e-09, + "loss": 0.047, + "step": 2956 + }, + { + "epoch": 3.9851752021563343, + "grad_norm": 13.615769512085542, + "learning_rate": 1.0753499242927012e-09, + "loss": 0.0542, + "step": 2957 + }, + { + "epoch": 3.986522911051213, + "grad_norm": 6.170968487949596, + "learning_rate": 9.65137550003048e-10, + "loss": 0.0556, + "step": 2958 + }, + { + "epoch": 3.987870619946092, + "grad_norm": 2.228883869127259, + "learning_rate": 8.60881855969109e-10, + "loss": 0.0387, + "step": 2959 + }, + { + "epoch": 3.9892183288409706, + "grad_norm": 12.528169559270145, + "learning_rate": 7.625829664176243e-10, + "loss": 0.051, + "step": 2960 + }, + { + "epoch": 3.990566037735849, + "grad_norm": 14.741673606068362, + "learning_rate": 6.702409984793434e-10, + "loss": 0.061, + "step": 2961 + }, + { + "epoch": 3.9919137466307277, + "grad_norm": 1.785357437935009, + "learning_rate": 5.838560621845845e-10, + "loss": 0.0661, + "step": 2962 + }, + { + "epoch": 3.9932614555256065, + "grad_norm": 5.913202065209953, + "learning_rate": 5.034282604676755e-10, + "loss": 0.0359, + "step": 2963 + }, + { + "epoch": 3.9946091644204853, + "grad_norm": 7.329961476742552, + "learning_rate": 4.289576891630676e-10, + "loss": 0.0392, + "step": 2964 + }, + { + "epoch": 3.9959568733153636, + "grad_norm": 6.03293656461464, + "learning_rate": 3.604444370075566e-10, + "loss": 0.0251, + "step": 2965 + }, + { + "epoch": 3.9973045822102424, + "grad_norm": 7.301215703621023, + "learning_rate": 2.9788858563917223e-10, + "loss": 0.0594, + "step": 2966 + }, + { + "epoch": 3.998652291105121, + "grad_norm": 1.8669618708051277, + "learning_rate": 2.412902095971781e-10, + "loss": 0.0365, + "step": 2967 + }, + { + "epoch": 4.0, + "grad_norm": 3.6948236471902964, + "learning_rate": 1.9064937632318203e-10, + "loss": 0.0695, + "step": 2968 + }, + { + "epoch": 4.0, + "step": 2968, + "total_flos": 653837712506880.0, + "train_loss": 0.11306895002361257, + "train_runtime": 14531.8108, + "train_samples_per_second": 13.07, + "train_steps_per_second": 0.204 + } + ], + "logging_steps": 1.0, + "max_steps": 2968, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 150.0, + "total_flos": 653837712506880.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}