diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,33633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.2278725729175424, + "eval_steps": 500, + "global_step": 4800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007263977481669807, + "grad_norm": 63.2759549108439, + "learning_rate": 3.333333333333334e-08, + "loss": 7.3622, + "step": 1 + }, + { + "epoch": 0.0014527954963339613, + "grad_norm": 63.41948831357221, + "learning_rate": 6.666666666666668e-08, + "loss": 7.3563, + "step": 2 + }, + { + "epoch": 0.002179193244500942, + "grad_norm": 63.20335169729121, + "learning_rate": 1.0000000000000001e-07, + "loss": 7.3613, + "step": 3 + }, + { + "epoch": 0.0029055909926679227, + "grad_norm": 63.088082938073725, + "learning_rate": 1.3333333333333336e-07, + "loss": 7.3498, + "step": 4 + }, + { + "epoch": 0.0036319887408349035, + "grad_norm": 66.62993398865497, + "learning_rate": 1.6666666666666668e-07, + "loss": 7.4933, + "step": 5 + }, + { + "epoch": 0.004358386489001884, + "grad_norm": 64.28207217419981, + "learning_rate": 2.0000000000000002e-07, + "loss": 7.3739, + "step": 6 + }, + { + "epoch": 0.005084784237168865, + "grad_norm": 64.86547058325266, + "learning_rate": 2.3333333333333336e-07, + "loss": 7.4356, + "step": 7 + }, + { + "epoch": 0.005811181985335845, + "grad_norm": 62.520037864532036, + "learning_rate": 2.666666666666667e-07, + "loss": 7.3069, + "step": 8 + }, + { + "epoch": 0.006537579733502826, + "grad_norm": 64.86566194865063, + "learning_rate": 3.0000000000000004e-07, + "loss": 7.4228, + "step": 9 + }, + { + "epoch": 0.007263977481669807, + "grad_norm": 62.970846908905436, + "learning_rate": 3.3333333333333335e-07, + "loss": 7.3558, + "step": 10 + }, + { + "epoch": 0.007990375229836788, + "grad_norm": 63.631610429531904, + "learning_rate": 3.666666666666667e-07, + "loss": 7.3535, + "step": 11 + }, + { + "epoch": 0.008716772978003768, + "grad_norm": 63.25811371761601, + "learning_rate": 4.0000000000000003e-07, + "loss": 7.3819, + "step": 12 + }, + { + "epoch": 0.00944317072617075, + "grad_norm": 63.66259077849509, + "learning_rate": 4.333333333333334e-07, + "loss": 7.3795, + "step": 13 + }, + { + "epoch": 0.01016956847433773, + "grad_norm": 64.15289940053654, + "learning_rate": 4.666666666666667e-07, + "loss": 7.3683, + "step": 14 + }, + { + "epoch": 0.01089596622250471, + "grad_norm": 64.59626603451477, + "learning_rate": 5.000000000000001e-07, + "loss": 7.3799, + "step": 15 + }, + { + "epoch": 0.01162236397067169, + "grad_norm": 63.26977989702329, + "learning_rate": 5.333333333333335e-07, + "loss": 7.3623, + "step": 16 + }, + { + "epoch": 0.012348761718838672, + "grad_norm": 60.067823922331, + "learning_rate": 5.666666666666667e-07, + "loss": 7.1842, + "step": 17 + }, + { + "epoch": 0.013075159467005652, + "grad_norm": 63.99871264593463, + "learning_rate": 6.000000000000001e-07, + "loss": 7.236, + "step": 18 + }, + { + "epoch": 0.013801557215172632, + "grad_norm": 59.629646935909086, + "learning_rate": 6.333333333333334e-07, + "loss": 7.1603, + "step": 19 + }, + { + "epoch": 0.014527954963339614, + "grad_norm": 62.1060954445667, + "learning_rate": 6.666666666666667e-07, + "loss": 7.283, + "step": 20 + }, + { + "epoch": 0.015254352711506594, + "grad_norm": 60.05206589269812, + "learning_rate": 7.000000000000001e-07, + "loss": 7.186, + "step": 21 + }, + { + "epoch": 0.015980750459673575, 
+ "grad_norm": 59.645318014317375, + "learning_rate": 7.333333333333334e-07, + "loss": 7.1921, + "step": 22 + }, + { + "epoch": 0.016707148207840555, + "grad_norm": 57.48650185086973, + "learning_rate": 7.666666666666667e-07, + "loss": 6.9704, + "step": 23 + }, + { + "epoch": 0.017433545956007535, + "grad_norm": 53.42583321263901, + "learning_rate": 8.000000000000001e-07, + "loss": 6.8416, + "step": 24 + }, + { + "epoch": 0.01815994370417452, + "grad_norm": 53.633682206139234, + "learning_rate": 8.333333333333333e-07, + "loss": 6.8657, + "step": 25 + }, + { + "epoch": 0.0188863414523415, + "grad_norm": 53.611957520088204, + "learning_rate": 8.666666666666668e-07, + "loss": 6.8391, + "step": 26 + }, + { + "epoch": 0.01961273920050848, + "grad_norm": 53.90927959387481, + "learning_rate": 9.000000000000001e-07, + "loss": 6.8457, + "step": 27 + }, + { + "epoch": 0.02033913694867546, + "grad_norm": 52.34299208362781, + "learning_rate": 9.333333333333334e-07, + "loss": 6.7246, + "step": 28 + }, + { + "epoch": 0.021065534696842438, + "grad_norm": 52.44583643778421, + "learning_rate": 9.666666666666668e-07, + "loss": 6.7693, + "step": 29 + }, + { + "epoch": 0.02179193244500942, + "grad_norm": 50.904404917626735, + "learning_rate": 1.0000000000000002e-06, + "loss": 6.6598, + "step": 30 + }, + { + "epoch": 0.0225183301931764, + "grad_norm": 51.76963606843732, + "learning_rate": 1.0333333333333333e-06, + "loss": 6.5898, + "step": 31 + }, + { + "epoch": 0.02324472794134338, + "grad_norm": 51.843062752094205, + "learning_rate": 1.066666666666667e-06, + "loss": 6.4639, + "step": 32 + }, + { + "epoch": 0.02397112568951036, + "grad_norm": 48.38768669965513, + "learning_rate": 1.1e-06, + "loss": 6.1057, + "step": 33 + }, + { + "epoch": 0.024697523437677345, + "grad_norm": 47.75728492688787, + "learning_rate": 1.1333333333333334e-06, + "loss": 6.0672, + "step": 34 + }, + { + "epoch": 0.025423921185844325, + "grad_norm": 47.1983740094125, + "learning_rate": 1.1666666666666668e-06, + "loss": 5.9977, + "step": 35 + }, + { + "epoch": 0.026150318934011305, + "grad_norm": 47.80807555427671, + "learning_rate": 1.2000000000000002e-06, + "loss": 6.0389, + "step": 36 + }, + { + "epoch": 0.026876716682178284, + "grad_norm": 46.78515259105002, + "learning_rate": 1.2333333333333335e-06, + "loss": 5.9493, + "step": 37 + }, + { + "epoch": 0.027603114430345264, + "grad_norm": 46.44941548286642, + "learning_rate": 1.2666666666666669e-06, + "loss": 5.9096, + "step": 38 + }, + { + "epoch": 0.028329512178512248, + "grad_norm": 47.87743744982369, + "learning_rate": 1.3e-06, + "loss": 5.957, + "step": 39 + }, + { + "epoch": 0.029055909926679228, + "grad_norm": 47.69020729829381, + "learning_rate": 1.3333333333333334e-06, + "loss": 5.9052, + "step": 40 + }, + { + "epoch": 0.029782307674846208, + "grad_norm": 45.65840416434142, + "learning_rate": 1.3666666666666668e-06, + "loss": 5.7094, + "step": 41 + }, + { + "epoch": 0.030508705423013188, + "grad_norm": 46.52695760405757, + "learning_rate": 1.4000000000000001e-06, + "loss": 5.6376, + "step": 42 + }, + { + "epoch": 0.03123510317118017, + "grad_norm": 47.27139026128274, + "learning_rate": 1.4333333333333335e-06, + "loss": 5.5706, + "step": 43 + }, + { + "epoch": 0.03196150091934715, + "grad_norm": 46.74277727155727, + "learning_rate": 1.4666666666666669e-06, + "loss": 5.4399, + "step": 44 + }, + { + "epoch": 0.03268789866751413, + "grad_norm": 46.31302781518561, + "learning_rate": 1.5e-06, + "loss": 5.305, + "step": 45 + }, + { + "epoch": 0.03341429641568111, + "grad_norm": 
47.29508573577366, + "learning_rate": 1.5333333333333334e-06, + "loss": 5.2508, + "step": 46 + }, + { + "epoch": 0.034140694163848094, + "grad_norm": 46.5497242137799, + "learning_rate": 1.566666666666667e-06, + "loss": 5.1042, + "step": 47 + }, + { + "epoch": 0.03486709191201507, + "grad_norm": 47.310997403673, + "learning_rate": 1.6000000000000001e-06, + "loss": 5.0298, + "step": 48 + }, + { + "epoch": 0.035593489660182054, + "grad_norm": 47.85952489756574, + "learning_rate": 1.6333333333333335e-06, + "loss": 4.9936, + "step": 49 + }, + { + "epoch": 0.03631988740834904, + "grad_norm": 48.28390256638122, + "learning_rate": 1.6666666666666667e-06, + "loss": 4.9282, + "step": 50 + }, + { + "epoch": 0.037046285156516014, + "grad_norm": 47.36438783340036, + "learning_rate": 1.7000000000000002e-06, + "loss": 4.7947, + "step": 51 + }, + { + "epoch": 0.037772682904683, + "grad_norm": 47.97444500743773, + "learning_rate": 1.7333333333333336e-06, + "loss": 4.7598, + "step": 52 + }, + { + "epoch": 0.038499080652849973, + "grad_norm": 48.032965049328915, + "learning_rate": 1.7666666666666668e-06, + "loss": 4.7174, + "step": 53 + }, + { + "epoch": 0.03922547840101696, + "grad_norm": 48.54449670206776, + "learning_rate": 1.8000000000000001e-06, + "loss": 4.6641, + "step": 54 + }, + { + "epoch": 0.03995187614918394, + "grad_norm": 48.098423617340245, + "learning_rate": 1.8333333333333333e-06, + "loss": 4.5518, + "step": 55 + }, + { + "epoch": 0.04067827389735092, + "grad_norm": 48.398816364766674, + "learning_rate": 1.8666666666666669e-06, + "loss": 4.4794, + "step": 56 + }, + { + "epoch": 0.0414046716455179, + "grad_norm": 49.27527728561581, + "learning_rate": 1.9000000000000002e-06, + "loss": 4.4303, + "step": 57 + }, + { + "epoch": 0.042131069393684877, + "grad_norm": 48.371954013728626, + "learning_rate": 1.9333333333333336e-06, + "loss": 4.3185, + "step": 58 + }, + { + "epoch": 0.04285746714185186, + "grad_norm": 48.5608446730172, + "learning_rate": 1.9666666666666668e-06, + "loss": 4.2362, + "step": 59 + }, + { + "epoch": 0.04358386489001884, + "grad_norm": 48.368964459448684, + "learning_rate": 2.0000000000000003e-06, + "loss": 4.1383, + "step": 60 + }, + { + "epoch": 0.04431026263818582, + "grad_norm": 48.57285815251858, + "learning_rate": 2.0333333333333335e-06, + "loss": 4.0723, + "step": 61 + }, + { + "epoch": 0.0450366603863528, + "grad_norm": 48.76734862112584, + "learning_rate": 2.0666666666666666e-06, + "loss": 4.0063, + "step": 62 + }, + { + "epoch": 0.04576305813451978, + "grad_norm": 49.26145502427768, + "learning_rate": 2.1000000000000002e-06, + "loss": 3.957, + "step": 63 + }, + { + "epoch": 0.04648945588268676, + "grad_norm": 49.50243947973019, + "learning_rate": 2.133333333333334e-06, + "loss": 3.8785, + "step": 64 + }, + { + "epoch": 0.047215853630853746, + "grad_norm": 49.828107092810185, + "learning_rate": 2.166666666666667e-06, + "loss": 3.8176, + "step": 65 + }, + { + "epoch": 0.04794225137902072, + "grad_norm": 49.26466685923202, + "learning_rate": 2.2e-06, + "loss": 3.7495, + "step": 66 + }, + { + "epoch": 0.048668649127187706, + "grad_norm": 48.62847142711153, + "learning_rate": 2.2333333333333333e-06, + "loss": 3.6826, + "step": 67 + }, + { + "epoch": 0.04939504687535469, + "grad_norm": 49.05074040352798, + "learning_rate": 2.266666666666667e-06, + "loss": 3.6296, + "step": 68 + }, + { + "epoch": 0.050121444623521666, + "grad_norm": 49.862009737363984, + "learning_rate": 2.3000000000000004e-06, + "loss": 3.5896, + "step": 69 + }, + { + "epoch": 0.05084784237168865, + 
"grad_norm": 50.69198753633318, + "learning_rate": 2.3333333333333336e-06, + "loss": 3.5724, + "step": 70 + }, + { + "epoch": 0.051574240119855626, + "grad_norm": 50.03195157652246, + "learning_rate": 2.3666666666666667e-06, + "loss": 3.5113, + "step": 71 + }, + { + "epoch": 0.05230063786802261, + "grad_norm": 49.23258661177582, + "learning_rate": 2.4000000000000003e-06, + "loss": 3.4314, + "step": 72 + }, + { + "epoch": 0.05302703561618959, + "grad_norm": 50.580583101959995, + "learning_rate": 2.4333333333333335e-06, + "loss": 3.411, + "step": 73 + }, + { + "epoch": 0.05375343336435657, + "grad_norm": 50.72488608293446, + "learning_rate": 2.466666666666667e-06, + "loss": 3.3533, + "step": 74 + }, + { + "epoch": 0.05447983111252355, + "grad_norm": 50.6273218587908, + "learning_rate": 2.5e-06, + "loss": 3.3145, + "step": 75 + }, + { + "epoch": 0.05520622886069053, + "grad_norm": 50.73713708166495, + "learning_rate": 2.5333333333333338e-06, + "loss": 3.2674, + "step": 76 + }, + { + "epoch": 0.05593262660885751, + "grad_norm": 50.698736069610405, + "learning_rate": 2.566666666666667e-06, + "loss": 3.2224, + "step": 77 + }, + { + "epoch": 0.056659024357024496, + "grad_norm": 49.95915915848869, + "learning_rate": 2.6e-06, + "loss": 3.1458, + "step": 78 + }, + { + "epoch": 0.05738542210519147, + "grad_norm": 50.41144470850246, + "learning_rate": 2.6333333333333332e-06, + "loss": 3.1436, + "step": 79 + }, + { + "epoch": 0.058111819853358455, + "grad_norm": 50.69282224043935, + "learning_rate": 2.666666666666667e-06, + "loss": 3.0908, + "step": 80 + }, + { + "epoch": 0.05883821760152543, + "grad_norm": 49.800949375863084, + "learning_rate": 2.7000000000000004e-06, + "loss": 3.0296, + "step": 81 + }, + { + "epoch": 0.059564615349692415, + "grad_norm": 51.039912830570024, + "learning_rate": 2.7333333333333336e-06, + "loss": 2.9943, + "step": 82 + }, + { + "epoch": 0.0602910130978594, + "grad_norm": 49.70036616878881, + "learning_rate": 2.766666666666667e-06, + "loss": 2.9383, + "step": 83 + }, + { + "epoch": 0.061017410846026375, + "grad_norm": 50.93003291238077, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.9363, + "step": 84 + }, + { + "epoch": 0.06174380859419336, + "grad_norm": 50.34453682192858, + "learning_rate": 2.8333333333333335e-06, + "loss": 2.8795, + "step": 85 + }, + { + "epoch": 0.06247020634236034, + "grad_norm": 50.87658557775924, + "learning_rate": 2.866666666666667e-06, + "loss": 2.8499, + "step": 86 + }, + { + "epoch": 0.06319660409052733, + "grad_norm": 49.93087461716983, + "learning_rate": 2.9e-06, + "loss": 2.8086, + "step": 87 + }, + { + "epoch": 0.0639230018386943, + "grad_norm": 50.674279843425246, + "learning_rate": 2.9333333333333338e-06, + "loss": 2.7729, + "step": 88 + }, + { + "epoch": 0.06464939958686128, + "grad_norm": 50.663286782284274, + "learning_rate": 2.9666666666666673e-06, + "loss": 2.7499, + "step": 89 + }, + { + "epoch": 0.06537579733502825, + "grad_norm": 50.35183232947615, + "learning_rate": 3e-06, + "loss": 2.7004, + "step": 90 + }, + { + "epoch": 0.06610219508319524, + "grad_norm": 50.447339204931495, + "learning_rate": 3.0333333333333337e-06, + "loss": 2.6782, + "step": 91 + }, + { + "epoch": 0.06682859283136222, + "grad_norm": 50.08671877556182, + "learning_rate": 3.066666666666667e-06, + "loss": 2.6386, + "step": 92 + }, + { + "epoch": 0.0675549905795292, + "grad_norm": 50.85016696537039, + "learning_rate": 3.1000000000000004e-06, + "loss": 2.6105, + "step": 93 + }, + { + "epoch": 0.06828138832769619, + "grad_norm": 50.432484997573745, + 
"learning_rate": 3.133333333333334e-06, + "loss": 2.579, + "step": 94 + }, + { + "epoch": 0.06900778607586316, + "grad_norm": 49.822703178081824, + "learning_rate": 3.1666666666666667e-06, + "loss": 2.5516, + "step": 95 + }, + { + "epoch": 0.06973418382403014, + "grad_norm": 49.52642010085431, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.4905, + "step": 96 + }, + { + "epoch": 0.07046058157219713, + "grad_norm": 49.393946360680815, + "learning_rate": 3.2333333333333334e-06, + "loss": 2.4803, + "step": 97 + }, + { + "epoch": 0.07118697932036411, + "grad_norm": 48.897469338046086, + "learning_rate": 3.266666666666667e-06, + "loss": 2.4483, + "step": 98 + }, + { + "epoch": 0.07191337706853108, + "grad_norm": 49.378935763876314, + "learning_rate": 3.3000000000000006e-06, + "loss": 2.4292, + "step": 99 + }, + { + "epoch": 0.07263977481669807, + "grad_norm": 50.41695005032472, + "learning_rate": 3.3333333333333333e-06, + "loss": 2.4022, + "step": 100 + }, + { + "epoch": 0.07336617256486505, + "grad_norm": 50.2049353156204, + "learning_rate": 3.366666666666667e-06, + "loss": 2.3749, + "step": 101 + }, + { + "epoch": 0.07409257031303203, + "grad_norm": 49.693566201239186, + "learning_rate": 3.4000000000000005e-06, + "loss": 2.3459, + "step": 102 + }, + { + "epoch": 0.074818968061199, + "grad_norm": 48.84685424655744, + "learning_rate": 3.4333333333333336e-06, + "loss": 2.334, + "step": 103 + }, + { + "epoch": 0.075545365809366, + "grad_norm": 47.62798075643798, + "learning_rate": 3.4666666666666672e-06, + "loss": 2.2763, + "step": 104 + }, + { + "epoch": 0.07627176355753297, + "grad_norm": 48.19573541336029, + "learning_rate": 3.5e-06, + "loss": 2.265, + "step": 105 + }, + { + "epoch": 0.07699816130569995, + "grad_norm": 49.45197671503261, + "learning_rate": 3.5333333333333335e-06, + "loss": 2.2395, + "step": 106 + }, + { + "epoch": 0.07772455905386694, + "grad_norm": 47.40823487468658, + "learning_rate": 3.566666666666667e-06, + "loss": 2.2103, + "step": 107 + }, + { + "epoch": 0.07845095680203391, + "grad_norm": 47.671328668080086, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.1863, + "step": 108 + }, + { + "epoch": 0.07917735455020089, + "grad_norm": 47.279964152668605, + "learning_rate": 3.633333333333334e-06, + "loss": 2.1527, + "step": 109 + }, + { + "epoch": 0.07990375229836788, + "grad_norm": 46.06802031513997, + "learning_rate": 3.6666666666666666e-06, + "loss": 2.1353, + "step": 110 + }, + { + "epoch": 0.08063015004653486, + "grad_norm": 46.73470181620622, + "learning_rate": 3.7e-06, + "loss": 2.1074, + "step": 111 + }, + { + "epoch": 0.08135654779470183, + "grad_norm": 47.34253122512511, + "learning_rate": 3.7333333333333337e-06, + "loss": 2.0864, + "step": 112 + }, + { + "epoch": 0.08208294554286882, + "grad_norm": 46.64992121387836, + "learning_rate": 3.766666666666667e-06, + "loss": 2.072, + "step": 113 + }, + { + "epoch": 0.0828093432910358, + "grad_norm": 46.30918717601416, + "learning_rate": 3.8000000000000005e-06, + "loss": 2.0391, + "step": 114 + }, + { + "epoch": 0.08353574103920278, + "grad_norm": 46.61354559014084, + "learning_rate": 3.833333333333334e-06, + "loss": 2.014, + "step": 115 + }, + { + "epoch": 0.08426213878736975, + "grad_norm": 45.95154255629814, + "learning_rate": 3.866666666666667e-06, + "loss": 2.0111, + "step": 116 + }, + { + "epoch": 0.08498853653553674, + "grad_norm": 45.17776725607547, + "learning_rate": 3.900000000000001e-06, + "loss": 1.9856, + "step": 117 + }, + { + "epoch": 0.08571493428370372, + "grad_norm": 45.131837668001495, + 
"learning_rate": 3.9333333333333335e-06, + "loss": 1.9698, + "step": 118 + }, + { + "epoch": 0.0864413320318707, + "grad_norm": 44.877128843538, + "learning_rate": 3.966666666666667e-06, + "loss": 1.9418, + "step": 119 + }, + { + "epoch": 0.08716772978003769, + "grad_norm": 43.42147842225612, + "learning_rate": 4.000000000000001e-06, + "loss": 1.9271, + "step": 120 + }, + { + "epoch": 0.08789412752820466, + "grad_norm": 43.9251239135038, + "learning_rate": 4.033333333333333e-06, + "loss": 1.9074, + "step": 121 + }, + { + "epoch": 0.08862052527637164, + "grad_norm": 45.52313461713586, + "learning_rate": 4.066666666666667e-06, + "loss": 1.8972, + "step": 122 + }, + { + "epoch": 0.08934692302453863, + "grad_norm": 43.96000925438749, + "learning_rate": 4.1e-06, + "loss": 1.867, + "step": 123 + }, + { + "epoch": 0.0900733207727056, + "grad_norm": 43.27066380698389, + "learning_rate": 4.133333333333333e-06, + "loss": 1.8483, + "step": 124 + }, + { + "epoch": 0.09079971852087258, + "grad_norm": 42.70094664578436, + "learning_rate": 4.166666666666667e-06, + "loss": 1.8224, + "step": 125 + }, + { + "epoch": 0.09152611626903956, + "grad_norm": 41.5753156714769, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.8262, + "step": 126 + }, + { + "epoch": 0.09225251401720655, + "grad_norm": 41.898261769934656, + "learning_rate": 4.233333333333334e-06, + "loss": 1.787, + "step": 127 + }, + { + "epoch": 0.09297891176537353, + "grad_norm": 42.0872491446625, + "learning_rate": 4.266666666666668e-06, + "loss": 1.7713, + "step": 128 + }, + { + "epoch": 0.0937053095135405, + "grad_norm": 41.19380423975876, + "learning_rate": 4.3e-06, + "loss": 1.7568, + "step": 129 + }, + { + "epoch": 0.09443170726170749, + "grad_norm": 40.680968528405074, + "learning_rate": 4.333333333333334e-06, + "loss": 1.7495, + "step": 130 + }, + { + "epoch": 0.09515810500987447, + "grad_norm": 40.16939074899239, + "learning_rate": 4.366666666666667e-06, + "loss": 1.7404, + "step": 131 + }, + { + "epoch": 0.09588450275804145, + "grad_norm": 39.72263478772674, + "learning_rate": 4.4e-06, + "loss": 1.7212, + "step": 132 + }, + { + "epoch": 0.09661090050620844, + "grad_norm": 39.49181775364284, + "learning_rate": 4.433333333333334e-06, + "loss": 1.6969, + "step": 133 + }, + { + "epoch": 0.09733729825437541, + "grad_norm": 39.04439034347531, + "learning_rate": 4.4666666666666665e-06, + "loss": 1.6875, + "step": 134 + }, + { + "epoch": 0.09806369600254239, + "grad_norm": 38.443498008849005, + "learning_rate": 4.5e-06, + "loss": 1.693, + "step": 135 + }, + { + "epoch": 0.09879009375070938, + "grad_norm": 38.36227491759409, + "learning_rate": 4.533333333333334e-06, + "loss": 1.659, + "step": 136 + }, + { + "epoch": 0.09951649149887636, + "grad_norm": 37.3629590490201, + "learning_rate": 4.566666666666667e-06, + "loss": 1.6459, + "step": 137 + }, + { + "epoch": 0.10024288924704333, + "grad_norm": 37.54793761307316, + "learning_rate": 4.600000000000001e-06, + "loss": 1.6365, + "step": 138 + }, + { + "epoch": 0.10096928699521031, + "grad_norm": 36.18933616176955, + "learning_rate": 4.633333333333334e-06, + "loss": 1.631, + "step": 139 + }, + { + "epoch": 0.1016956847433773, + "grad_norm": 36.69815961845787, + "learning_rate": 4.666666666666667e-06, + "loss": 1.6211, + "step": 140 + }, + { + "epoch": 0.10242208249154428, + "grad_norm": 36.12103329793505, + "learning_rate": 4.7e-06, + "loss": 1.6008, + "step": 141 + }, + { + "epoch": 0.10314848023971125, + "grad_norm": 34.75085594638176, + "learning_rate": 4.7333333333333335e-06, + "loss": 
1.5941, + "step": 142 + }, + { + "epoch": 0.10387487798787824, + "grad_norm": 35.264396011673654, + "learning_rate": 4.766666666666667e-06, + "loss": 1.5712, + "step": 143 + }, + { + "epoch": 0.10460127573604522, + "grad_norm": 35.15237015020466, + "learning_rate": 4.800000000000001e-06, + "loss": 1.5638, + "step": 144 + }, + { + "epoch": 0.1053276734842122, + "grad_norm": 34.263602593586796, + "learning_rate": 4.833333333333333e-06, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.10605407123237919, + "grad_norm": 33.753618127877616, + "learning_rate": 4.866666666666667e-06, + "loss": 1.5283, + "step": 146 + }, + { + "epoch": 0.10678046898054616, + "grad_norm": 33.839298156814806, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.5213, + "step": 147 + }, + { + "epoch": 0.10750686672871314, + "grad_norm": 33.52850494542813, + "learning_rate": 4.933333333333334e-06, + "loss": 1.5176, + "step": 148 + }, + { + "epoch": 0.10823326447688013, + "grad_norm": 32.69335919755698, + "learning_rate": 4.966666666666667e-06, + "loss": 1.5137, + "step": 149 + }, + { + "epoch": 0.1089596622250471, + "grad_norm": 32.83997712477471, + "learning_rate": 5e-06, + "loss": 1.4922, + "step": 150 + }, + { + "epoch": 0.10968605997321408, + "grad_norm": 31.70298573457259, + "learning_rate": 5.033333333333333e-06, + "loss": 1.4865, + "step": 151 + }, + { + "epoch": 0.11041245772138106, + "grad_norm": 31.601502772523528, + "learning_rate": 5.0666666666666676e-06, + "loss": 1.4676, + "step": 152 + }, + { + "epoch": 0.11113885546954805, + "grad_norm": 31.921384648076824, + "learning_rate": 5.1e-06, + "loss": 1.4523, + "step": 153 + }, + { + "epoch": 0.11186525321771502, + "grad_norm": 31.225511458265583, + "learning_rate": 5.133333333333334e-06, + "loss": 1.4455, + "step": 154 + }, + { + "epoch": 0.112591650965882, + "grad_norm": 30.32788022406298, + "learning_rate": 5.1666666666666675e-06, + "loss": 1.4522, + "step": 155 + }, + { + "epoch": 0.11331804871404899, + "grad_norm": 30.167607122123012, + "learning_rate": 5.2e-06, + "loss": 1.4346, + "step": 156 + }, + { + "epoch": 0.11404444646221597, + "grad_norm": 29.780741495847135, + "learning_rate": 5.233333333333334e-06, + "loss": 1.4242, + "step": 157 + }, + { + "epoch": 0.11477084421038294, + "grad_norm": 29.145633047690872, + "learning_rate": 5.2666666666666665e-06, + "loss": 1.4239, + "step": 158 + }, + { + "epoch": 0.11549724195854993, + "grad_norm": 28.905395314413873, + "learning_rate": 5.300000000000001e-06, + "loss": 1.4028, + "step": 159 + }, + { + "epoch": 0.11622363970671691, + "grad_norm": 27.95492252225459, + "learning_rate": 5.333333333333334e-06, + "loss": 1.4024, + "step": 160 + }, + { + "epoch": 0.11695003745488389, + "grad_norm": 28.364454949347795, + "learning_rate": 5.366666666666666e-06, + "loss": 1.3966, + "step": 161 + }, + { + "epoch": 0.11767643520305086, + "grad_norm": 27.595047874488912, + "learning_rate": 5.400000000000001e-06, + "loss": 1.3804, + "step": 162 + }, + { + "epoch": 0.11840283295121785, + "grad_norm": 27.224371964119193, + "learning_rate": 5.4333333333333335e-06, + "loss": 1.374, + "step": 163 + }, + { + "epoch": 0.11912923069938483, + "grad_norm": 26.950172876003883, + "learning_rate": 5.466666666666667e-06, + "loss": 1.3664, + "step": 164 + }, + { + "epoch": 0.1198556284475518, + "grad_norm": 26.807043896561844, + "learning_rate": 5.500000000000001e-06, + "loss": 1.3463, + "step": 165 + }, + { + "epoch": 0.1205820261957188, + "grad_norm": 25.770582681589264, + "learning_rate": 5.533333333333334e-06, + "loss": 1.3464, 
+ "step": 166 + }, + { + "epoch": 0.12130842394388577, + "grad_norm": 26.002994801093113, + "learning_rate": 5.566666666666667e-06, + "loss": 1.3223, + "step": 167 + }, + { + "epoch": 0.12203482169205275, + "grad_norm": 25.487072733988494, + "learning_rate": 5.600000000000001e-06, + "loss": 1.3284, + "step": 168 + }, + { + "epoch": 0.12276121944021974, + "grad_norm": 24.657995803131406, + "learning_rate": 5.633333333333334e-06, + "loss": 1.3275, + "step": 169 + }, + { + "epoch": 0.12348761718838672, + "grad_norm": 24.472525282080856, + "learning_rate": 5.666666666666667e-06, + "loss": 1.3198, + "step": 170 + }, + { + "epoch": 0.1242140149365537, + "grad_norm": 24.396214212939338, + "learning_rate": 5.7e-06, + "loss": 1.3064, + "step": 171 + }, + { + "epoch": 0.12494041268472068, + "grad_norm": 23.954881409211772, + "learning_rate": 5.733333333333334e-06, + "loss": 1.302, + "step": 172 + }, + { + "epoch": 0.12566681043288766, + "grad_norm": 23.260344993149822, + "learning_rate": 5.766666666666667e-06, + "loss": 1.2848, + "step": 173 + }, + { + "epoch": 0.12639320818105465, + "grad_norm": 23.023966618398276, + "learning_rate": 5.8e-06, + "loss": 1.284, + "step": 174 + }, + { + "epoch": 0.1271196059292216, + "grad_norm": 22.329009297749824, + "learning_rate": 5.833333333333334e-06, + "loss": 1.2843, + "step": 175 + }, + { + "epoch": 0.1278460036773886, + "grad_norm": 22.5351171358773, + "learning_rate": 5.8666666666666675e-06, + "loss": 1.2682, + "step": 176 + }, + { + "epoch": 0.1285724014255556, + "grad_norm": 22.009496715465836, + "learning_rate": 5.9e-06, + "loss": 1.2625, + "step": 177 + }, + { + "epoch": 0.12929879917372256, + "grad_norm": 21.965579290780184, + "learning_rate": 5.933333333333335e-06, + "loss": 1.2579, + "step": 178 + }, + { + "epoch": 0.13002519692188955, + "grad_norm": 21.682618987574696, + "learning_rate": 5.966666666666667e-06, + "loss": 1.2549, + "step": 179 + }, + { + "epoch": 0.1307515946700565, + "grad_norm": 21.080469805135827, + "learning_rate": 6e-06, + "loss": 1.2388, + "step": 180 + }, + { + "epoch": 0.1314779924182235, + "grad_norm": 20.967610245749125, + "learning_rate": 6.033333333333335e-06, + "loss": 1.2265, + "step": 181 + }, + { + "epoch": 0.1322043901663905, + "grad_norm": 20.1492358534322, + "learning_rate": 6.066666666666667e-06, + "loss": 1.2454, + "step": 182 + }, + { + "epoch": 0.13293078791455745, + "grad_norm": 19.607412502409314, + "learning_rate": 6.1e-06, + "loss": 1.2614, + "step": 183 + }, + { + "epoch": 0.13365718566272444, + "grad_norm": 19.346758211452215, + "learning_rate": 6.133333333333334e-06, + "loss": 1.2399, + "step": 184 + }, + { + "epoch": 0.13438358341089143, + "grad_norm": 19.257583489746565, + "learning_rate": 6.166666666666667e-06, + "loss": 1.232, + "step": 185 + }, + { + "epoch": 0.1351099811590584, + "grad_norm": 18.86454673675082, + "learning_rate": 6.200000000000001e-06, + "loss": 1.219, + "step": 186 + }, + { + "epoch": 0.13583637890722539, + "grad_norm": 18.46080931559179, + "learning_rate": 6.2333333333333335e-06, + "loss": 1.2205, + "step": 187 + }, + { + "epoch": 0.13656277665539238, + "grad_norm": 18.28704328355964, + "learning_rate": 6.266666666666668e-06, + "loss": 1.2031, + "step": 188 + }, + { + "epoch": 0.13728917440355934, + "grad_norm": 18.229753602356926, + "learning_rate": 6.300000000000001e-06, + "loss": 1.2197, + "step": 189 + }, + { + "epoch": 0.13801557215172633, + "grad_norm": 17.836618035828575, + "learning_rate": 6.333333333333333e-06, + "loss": 1.1986, + "step": 190 + }, + { + "epoch": 
0.13874196989989332, + "grad_norm": 17.524100548365034, + "learning_rate": 6.366666666666668e-06, + "loss": 1.2009, + "step": 191 + }, + { + "epoch": 0.13946836764806028, + "grad_norm": 17.365264025546907, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.1699, + "step": 192 + }, + { + "epoch": 0.14019476539622727, + "grad_norm": 17.146603082680905, + "learning_rate": 6.433333333333333e-06, + "loss": 1.1738, + "step": 193 + }, + { + "epoch": 0.14092116314439426, + "grad_norm": 16.859962202084272, + "learning_rate": 6.466666666666667e-06, + "loss": 1.1783, + "step": 194 + }, + { + "epoch": 0.14164756089256123, + "grad_norm": 16.420121457022596, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.1685, + "step": 195 + }, + { + "epoch": 0.14237395864072822, + "grad_norm": 16.275858682293666, + "learning_rate": 6.533333333333334e-06, + "loss": 1.1689, + "step": 196 + }, + { + "epoch": 0.1431003563888952, + "grad_norm": 15.72759862599527, + "learning_rate": 6.566666666666667e-06, + "loss": 1.1618, + "step": 197 + }, + { + "epoch": 0.14382675413706217, + "grad_norm": 15.213709034432256, + "learning_rate": 6.600000000000001e-06, + "loss": 1.1834, + "step": 198 + }, + { + "epoch": 0.14455315188522916, + "grad_norm": 15.44383632151497, + "learning_rate": 6.633333333333334e-06, + "loss": 1.1606, + "step": 199 + }, + { + "epoch": 0.14527954963339615, + "grad_norm": 15.0297402495118, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1643, + "step": 200 + }, + { + "epoch": 0.1460059473815631, + "grad_norm": 14.616753595774137, + "learning_rate": 6.700000000000001e-06, + "loss": 1.1572, + "step": 201 + }, + { + "epoch": 0.1467323451297301, + "grad_norm": 14.988102097685823, + "learning_rate": 6.733333333333334e-06, + "loss": 1.1223, + "step": 202 + }, + { + "epoch": 0.1474587428778971, + "grad_norm": 14.154002560143741, + "learning_rate": 6.7666666666666665e-06, + "loss": 1.1482, + "step": 203 + }, + { + "epoch": 0.14818514062606405, + "grad_norm": 14.070198384045403, + "learning_rate": 6.800000000000001e-06, + "loss": 1.137, + "step": 204 + }, + { + "epoch": 0.14891153837423105, + "grad_norm": 14.00683253820219, + "learning_rate": 6.833333333333334e-06, + "loss": 1.1349, + "step": 205 + }, + { + "epoch": 0.149637936122398, + "grad_norm": 13.651960917625363, + "learning_rate": 6.866666666666667e-06, + "loss": 1.1313, + "step": 206 + }, + { + "epoch": 0.150364333870565, + "grad_norm": 13.649166736813594, + "learning_rate": 6.9e-06, + "loss": 1.1372, + "step": 207 + }, + { + "epoch": 0.151090731618732, + "grad_norm": 13.321459953919788, + "learning_rate": 6.9333333333333344e-06, + "loss": 1.121, + "step": 208 + }, + { + "epoch": 0.15181712936689895, + "grad_norm": 13.18972666324515, + "learning_rate": 6.966666666666667e-06, + "loss": 1.1196, + "step": 209 + }, + { + "epoch": 0.15254352711506594, + "grad_norm": 13.023718213326685, + "learning_rate": 7e-06, + "loss": 1.1161, + "step": 210 + }, + { + "epoch": 0.15326992486323293, + "grad_norm": 12.778850791700851, + "learning_rate": 7.033333333333334e-06, + "loss": 1.1333, + "step": 211 + }, + { + "epoch": 0.1539963226113999, + "grad_norm": 12.636516960484471, + "learning_rate": 7.066666666666667e-06, + "loss": 1.1124, + "step": 212 + }, + { + "epoch": 0.15472272035956688, + "grad_norm": 12.497406739744827, + "learning_rate": 7.100000000000001e-06, + "loss": 1.1297, + "step": 213 + }, + { + "epoch": 0.15544911810773387, + "grad_norm": 12.253086279848734, + "learning_rate": 7.133333333333334e-06, + "loss": 1.108, + "step": 214 + }, + { + "epoch": 
0.15617551585590084, + "grad_norm": 12.301343331028344, + "learning_rate": 7.166666666666667e-06, + "loss": 1.0704, + "step": 215 + }, + { + "epoch": 0.15690191360406783, + "grad_norm": 11.856234190055382, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.106, + "step": 216 + }, + { + "epoch": 0.15762831135223482, + "grad_norm": 11.58489034219279, + "learning_rate": 7.233333333333334e-06, + "loss": 1.1039, + "step": 217 + }, + { + "epoch": 0.15835470910040178, + "grad_norm": 11.40452942412662, + "learning_rate": 7.266666666666668e-06, + "loss": 1.107, + "step": 218 + }, + { + "epoch": 0.15908110684856877, + "grad_norm": 11.492550763214666, + "learning_rate": 7.3e-06, + "loss": 1.0881, + "step": 219 + }, + { + "epoch": 0.15980750459673576, + "grad_norm": 11.269901967107916, + "learning_rate": 7.333333333333333e-06, + "loss": 1.0904, + "step": 220 + }, + { + "epoch": 0.16053390234490272, + "grad_norm": 10.96158775655295, + "learning_rate": 7.3666666666666676e-06, + "loss": 1.0952, + "step": 221 + }, + { + "epoch": 0.16126030009306971, + "grad_norm": 11.13901163124956, + "learning_rate": 7.4e-06, + "loss": 1.074, + "step": 222 + }, + { + "epoch": 0.1619866978412367, + "grad_norm": 10.84361490879539, + "learning_rate": 7.433333333333334e-06, + "loss": 1.0775, + "step": 223 + }, + { + "epoch": 0.16271309558940367, + "grad_norm": 10.809242398633671, + "learning_rate": 7.4666666666666675e-06, + "loss": 1.0599, + "step": 224 + }, + { + "epoch": 0.16343949333757066, + "grad_norm": 10.357828962160399, + "learning_rate": 7.500000000000001e-06, + "loss": 1.0719, + "step": 225 + }, + { + "epoch": 0.16416589108573765, + "grad_norm": 10.16294538034498, + "learning_rate": 7.533333333333334e-06, + "loss": 1.0737, + "step": 226 + }, + { + "epoch": 0.1648922888339046, + "grad_norm": 10.39061500039731, + "learning_rate": 7.566666666666667e-06, + "loss": 1.0556, + "step": 227 + }, + { + "epoch": 0.1656186865820716, + "grad_norm": 10.221493241719058, + "learning_rate": 7.600000000000001e-06, + "loss": 1.0476, + "step": 228 + }, + { + "epoch": 0.16634508433023856, + "grad_norm": 9.907815020465414, + "learning_rate": 7.633333333333334e-06, + "loss": 1.0553, + "step": 229 + }, + { + "epoch": 0.16707148207840555, + "grad_norm": 9.581041133597106, + "learning_rate": 7.666666666666667e-06, + "loss": 1.0775, + "step": 230 + }, + { + "epoch": 0.16779787982657254, + "grad_norm": 9.71168279973024, + "learning_rate": 7.7e-06, + "loss": 1.0639, + "step": 231 + }, + { + "epoch": 0.1685242775747395, + "grad_norm": 9.551789851749682, + "learning_rate": 7.733333333333334e-06, + "loss": 1.0503, + "step": 232 + }, + { + "epoch": 0.1692506753229065, + "grad_norm": 9.376979470265642, + "learning_rate": 7.766666666666666e-06, + "loss": 1.0605, + "step": 233 + }, + { + "epoch": 0.1699770730710735, + "grad_norm": 9.215487743984946, + "learning_rate": 7.800000000000002e-06, + "loss": 1.0529, + "step": 234 + }, + { + "epoch": 0.17070347081924045, + "grad_norm": 9.023321477961552, + "learning_rate": 7.833333333333333e-06, + "loss": 1.06, + "step": 235 + }, + { + "epoch": 0.17142986856740744, + "grad_norm": 8.978074772661731, + "learning_rate": 7.866666666666667e-06, + "loss": 1.0296, + "step": 236 + }, + { + "epoch": 0.17215626631557443, + "grad_norm": 8.837716165817618, + "learning_rate": 7.9e-06, + "loss": 1.0255, + "step": 237 + }, + { + "epoch": 0.1728826640637414, + "grad_norm": 8.634089579695464, + "learning_rate": 7.933333333333334e-06, + "loss": 1.0436, + "step": 238 + }, + { + "epoch": 0.17360906181190838, + "grad_norm": 
8.239794943770667, + "learning_rate": 7.966666666666668e-06, + "loss": 1.0565, + "step": 239 + }, + { + "epoch": 0.17433545956007537, + "grad_norm": 8.517474915317166, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0147, + "step": 240 + }, + { + "epoch": 0.17506185730824234, + "grad_norm": 8.270883045328144, + "learning_rate": 8.033333333333335e-06, + "loss": 1.0425, + "step": 241 + }, + { + "epoch": 0.17578825505640933, + "grad_norm": 8.19007825276564, + "learning_rate": 8.066666666666667e-06, + "loss": 1.0255, + "step": 242 + }, + { + "epoch": 0.17651465280457632, + "grad_norm": 8.020665101381212, + "learning_rate": 8.1e-06, + "loss": 1.0372, + "step": 243 + }, + { + "epoch": 0.17724105055274328, + "grad_norm": 8.078219951557776, + "learning_rate": 8.133333333333334e-06, + "loss": 1.023, + "step": 244 + }, + { + "epoch": 0.17796744830091027, + "grad_norm": 7.830702554013111, + "learning_rate": 8.166666666666668e-06, + "loss": 1.0366, + "step": 245 + }, + { + "epoch": 0.17869384604907726, + "grad_norm": 7.820391288845364, + "learning_rate": 8.2e-06, + "loss": 1.0234, + "step": 246 + }, + { + "epoch": 0.17942024379724422, + "grad_norm": 7.607710068511707, + "learning_rate": 8.233333333333335e-06, + "loss": 1.0226, + "step": 247 + }, + { + "epoch": 0.1801466415454112, + "grad_norm": 7.541222252515193, + "learning_rate": 8.266666666666667e-06, + "loss": 1.0205, + "step": 248 + }, + { + "epoch": 0.1808730392935782, + "grad_norm": 7.602914271454926, + "learning_rate": 8.3e-06, + "loss": 1.0057, + "step": 249 + }, + { + "epoch": 0.18159943704174517, + "grad_norm": 7.41561445398044, + "learning_rate": 8.333333333333334e-06, + "loss": 1.0091, + "step": 250 + }, + { + "epoch": 0.18232583478991216, + "grad_norm": 7.428385752483024, + "learning_rate": 8.366666666666667e-06, + "loss": 1.0091, + "step": 251 + }, + { + "epoch": 0.18305223253807912, + "grad_norm": 7.215990147777718, + "learning_rate": 8.400000000000001e-06, + "loss": 1.0165, + "step": 252 + }, + { + "epoch": 0.1837786302862461, + "grad_norm": 7.137875961690411, + "learning_rate": 8.433333333333334e-06, + "loss": 0.9942, + "step": 253 + }, + { + "epoch": 0.1845050280344131, + "grad_norm": 7.178095525054059, + "learning_rate": 8.466666666666668e-06, + "loss": 0.9913, + "step": 254 + }, + { + "epoch": 0.18523142578258006, + "grad_norm": 6.8632846366421285, + "learning_rate": 8.5e-06, + "loss": 1.0196, + "step": 255 + }, + { + "epoch": 0.18595782353074705, + "grad_norm": 6.91059218104985, + "learning_rate": 8.533333333333335e-06, + "loss": 1.0035, + "step": 256 + }, + { + "epoch": 0.18668422127891404, + "grad_norm": 6.756354737848418, + "learning_rate": 8.566666666666667e-06, + "loss": 1.0071, + "step": 257 + }, + { + "epoch": 0.187410619027081, + "grad_norm": 6.672245371900697, + "learning_rate": 8.6e-06, + "loss": 1.0146, + "step": 258 + }, + { + "epoch": 0.188137016775248, + "grad_norm": 6.664492580555999, + "learning_rate": 8.633333333333334e-06, + "loss": 1.0041, + "step": 259 + }, + { + "epoch": 0.18886341452341499, + "grad_norm": 6.630116579252964, + "learning_rate": 8.666666666666668e-06, + "loss": 0.9909, + "step": 260 + }, + { + "epoch": 0.18958981227158195, + "grad_norm": 6.540452718326813, + "learning_rate": 8.700000000000001e-06, + "loss": 0.9957, + "step": 261 + }, + { + "epoch": 0.19031621001974894, + "grad_norm": 6.341367627655762, + "learning_rate": 8.733333333333333e-06, + "loss": 1.0009, + "step": 262 + }, + { + "epoch": 0.19104260776791593, + "grad_norm": 6.234409930165991, + "learning_rate": 
8.766666666666669e-06, + "loss": 1.001, + "step": 263 + }, + { + "epoch": 0.1917690055160829, + "grad_norm": 6.305502441912889, + "learning_rate": 8.8e-06, + "loss": 0.9935, + "step": 264 + }, + { + "epoch": 0.19249540326424988, + "grad_norm": 6.225158071475369, + "learning_rate": 8.833333333333334e-06, + "loss": 1.0081, + "step": 265 + }, + { + "epoch": 0.19322180101241687, + "grad_norm": 6.058144207484882, + "learning_rate": 8.866666666666668e-06, + "loss": 0.9841, + "step": 266 + }, + { + "epoch": 0.19394819876058383, + "grad_norm": 5.98628355833425, + "learning_rate": 8.900000000000001e-06, + "loss": 0.9876, + "step": 267 + }, + { + "epoch": 0.19467459650875082, + "grad_norm": 5.943991083458489, + "learning_rate": 8.933333333333333e-06, + "loss": 0.997, + "step": 268 + }, + { + "epoch": 0.19540099425691781, + "grad_norm": 5.873982099655574, + "learning_rate": 8.966666666666667e-06, + "loss": 0.978, + "step": 269 + }, + { + "epoch": 0.19612739200508478, + "grad_norm": 5.895132355616301, + "learning_rate": 9e-06, + "loss": 0.9741, + "step": 270 + }, + { + "epoch": 0.19685378975325177, + "grad_norm": 5.769369464159913, + "learning_rate": 9.033333333333334e-06, + "loss": 0.9732, + "step": 271 + }, + { + "epoch": 0.19758018750141876, + "grad_norm": 5.576420099083153, + "learning_rate": 9.066666666666667e-06, + "loss": 0.9892, + "step": 272 + }, + { + "epoch": 0.19830658524958572, + "grad_norm": 5.674605536835416, + "learning_rate": 9.100000000000001e-06, + "loss": 0.9647, + "step": 273 + }, + { + "epoch": 0.1990329829977527, + "grad_norm": 5.689962851483039, + "learning_rate": 9.133333333333335e-06, + "loss": 0.9561, + "step": 274 + }, + { + "epoch": 0.1997593807459197, + "grad_norm": 5.451787863746172, + "learning_rate": 9.166666666666666e-06, + "loss": 0.9693, + "step": 275 + }, + { + "epoch": 0.20048577849408666, + "grad_norm": 5.508273579548314, + "learning_rate": 9.200000000000002e-06, + "loss": 0.9476, + "step": 276 + }, + { + "epoch": 0.20121217624225365, + "grad_norm": 5.346002804833987, + "learning_rate": 9.233333333333334e-06, + "loss": 0.9492, + "step": 277 + }, + { + "epoch": 0.20193857399042062, + "grad_norm": 5.315336149698185, + "learning_rate": 9.266666666666667e-06, + "loss": 0.9608, + "step": 278 + }, + { + "epoch": 0.2026649717385876, + "grad_norm": 5.215861439520431, + "learning_rate": 9.3e-06, + "loss": 0.9854, + "step": 279 + }, + { + "epoch": 0.2033913694867546, + "grad_norm": 5.193177826479856, + "learning_rate": 9.333333333333334e-06, + "loss": 0.9604, + "step": 280 + }, + { + "epoch": 0.20411776723492156, + "grad_norm": 5.123020615984874, + "learning_rate": 9.366666666666668e-06, + "loss": 0.9585, + "step": 281 + }, + { + "epoch": 0.20484416498308855, + "grad_norm": 5.029202178322807, + "learning_rate": 9.4e-06, + "loss": 0.9733, + "step": 282 + }, + { + "epoch": 0.20557056273125554, + "grad_norm": 5.013483166254676, + "learning_rate": 9.433333333333335e-06, + "loss": 0.9684, + "step": 283 + }, + { + "epoch": 0.2062969604794225, + "grad_norm": 4.887824924314841, + "learning_rate": 9.466666666666667e-06, + "loss": 0.9734, + "step": 284 + }, + { + "epoch": 0.2070233582275895, + "grad_norm": 4.975153535316991, + "learning_rate": 9.5e-06, + "loss": 0.9635, + "step": 285 + }, + { + "epoch": 0.20774975597575648, + "grad_norm": 4.856621018396769, + "learning_rate": 9.533333333333334e-06, + "loss": 0.9523, + "step": 286 + }, + { + "epoch": 0.20847615372392345, + "grad_norm": 4.921703565848544, + "learning_rate": 9.566666666666668e-06, + "loss": 0.939, + "step": 287 + }, + 
{ + "epoch": 0.20920255147209044, + "grad_norm": 4.759014226910551, + "learning_rate": 9.600000000000001e-06, + "loss": 0.9514, + "step": 288 + }, + { + "epoch": 0.20992894922025743, + "grad_norm": 4.850649945410091, + "learning_rate": 9.633333333333335e-06, + "loss": 0.9423, + "step": 289 + }, + { + "epoch": 0.2106553469684244, + "grad_norm": 4.7510300916535835, + "learning_rate": 9.666666666666667e-06, + "loss": 0.9375, + "step": 290 + }, + { + "epoch": 0.21138174471659138, + "grad_norm": 4.631844761995069, + "learning_rate": 9.7e-06, + "loss": 0.9498, + "step": 291 + }, + { + "epoch": 0.21210814246475837, + "grad_norm": 4.6840553260562094, + "learning_rate": 9.733333333333334e-06, + "loss": 0.9318, + "step": 292 + }, + { + "epoch": 0.21283454021292533, + "grad_norm": 4.64051963705808, + "learning_rate": 9.766666666666667e-06, + "loss": 0.935, + "step": 293 + }, + { + "epoch": 0.21356093796109232, + "grad_norm": 4.510960059913492, + "learning_rate": 9.800000000000001e-06, + "loss": 0.9431, + "step": 294 + }, + { + "epoch": 0.2142873357092593, + "grad_norm": 4.407943102592227, + "learning_rate": 9.833333333333333e-06, + "loss": 0.9538, + "step": 295 + }, + { + "epoch": 0.21501373345742628, + "grad_norm": 4.477665809742677, + "learning_rate": 9.866666666666668e-06, + "loss": 0.9414, + "step": 296 + }, + { + "epoch": 0.21574013120559327, + "grad_norm": 4.3658879124763885, + "learning_rate": 9.9e-06, + "loss": 0.9475, + "step": 297 + }, + { + "epoch": 0.21646652895376026, + "grad_norm": 4.3068037062213875, + "learning_rate": 9.933333333333334e-06, + "loss": 0.9423, + "step": 298 + }, + { + "epoch": 0.21719292670192722, + "grad_norm": 4.285180819974754, + "learning_rate": 9.966666666666667e-06, + "loss": 0.9399, + "step": 299 + }, + { + "epoch": 0.2179193244500942, + "grad_norm": 4.229079706272251, + "learning_rate": 1e-05, + "loss": 0.9482, + "step": 300 + }, + { + "epoch": 0.21864572219826117, + "grad_norm": 4.281554504080381, + "learning_rate": 9.99999943011404e-06, + "loss": 0.9175, + "step": 301 + }, + { + "epoch": 0.21937211994642816, + "grad_norm": 4.169777969455729, + "learning_rate": 9.999997720456285e-06, + "loss": 0.9479, + "step": 302 + }, + { + "epoch": 0.22009851769459515, + "grad_norm": 4.1699923475025855, + "learning_rate": 9.999994871027128e-06, + "loss": 0.925, + "step": 303 + }, + { + "epoch": 0.22082491544276212, + "grad_norm": 4.125376173668559, + "learning_rate": 9.999990881827218e-06, + "loss": 0.9263, + "step": 304 + }, + { + "epoch": 0.2215513131909291, + "grad_norm": 4.075622019823068, + "learning_rate": 9.999985752857464e-06, + "loss": 0.9415, + "step": 305 + }, + { + "epoch": 0.2222777109390961, + "grad_norm": 4.087232981051823, + "learning_rate": 9.999979484119035e-06, + "loss": 0.9023, + "step": 306 + }, + { + "epoch": 0.22300410868726306, + "grad_norm": 3.9626016761472567, + "learning_rate": 9.99997207561336e-06, + "loss": 0.9274, + "step": 307 + }, + { + "epoch": 0.22373050643543005, + "grad_norm": 3.9837800051798036, + "learning_rate": 9.999963527342128e-06, + "loss": 0.9256, + "step": 308 + }, + { + "epoch": 0.22445690418359704, + "grad_norm": 3.916777154550035, + "learning_rate": 9.999953839307287e-06, + "loss": 0.9217, + "step": 309 + }, + { + "epoch": 0.225183301931764, + "grad_norm": 3.8728958068319006, + "learning_rate": 9.999943011511045e-06, + "loss": 0.9275, + "step": 310 + }, + { + "epoch": 0.225909699679931, + "grad_norm": 3.8132974533118, + "learning_rate": 9.999931043955876e-06, + "loss": 0.9207, + "step": 311 + }, + { + "epoch": 
0.22663609742809798, + "grad_norm": 3.841173198735151, + "learning_rate": 9.999917936644498e-06, + "loss": 0.9086, + "step": 312 + }, + { + "epoch": 0.22736249517626494, + "grad_norm": 3.8065529809421452, + "learning_rate": 9.999903689579906e-06, + "loss": 0.9273, + "step": 313 + }, + { + "epoch": 0.22808889292443194, + "grad_norm": 3.7651904589221785, + "learning_rate": 9.999888302765347e-06, + "loss": 0.9189, + "step": 314 + }, + { + "epoch": 0.22881529067259893, + "grad_norm": 3.7601408395621334, + "learning_rate": 9.999871776204325e-06, + "loss": 0.9134, + "step": 315 + }, + { + "epoch": 0.2295416884207659, + "grad_norm": 3.670695667395263, + "learning_rate": 9.99985410990061e-06, + "loss": 0.916, + "step": 316 + }, + { + "epoch": 0.23026808616893288, + "grad_norm": 3.6252352338521305, + "learning_rate": 9.999835303858231e-06, + "loss": 0.9134, + "step": 317 + }, + { + "epoch": 0.23099448391709987, + "grad_norm": 3.6643475103199106, + "learning_rate": 9.99981535808147e-06, + "loss": 0.9056, + "step": 318 + }, + { + "epoch": 0.23172088166526683, + "grad_norm": 3.6490134106546113, + "learning_rate": 9.99979427257488e-06, + "loss": 0.908, + "step": 319 + }, + { + "epoch": 0.23244727941343382, + "grad_norm": 3.595270690577472, + "learning_rate": 9.999772047343259e-06, + "loss": 0.9089, + "step": 320 + }, + { + "epoch": 0.2331736771616008, + "grad_norm": 3.5270211803260803, + "learning_rate": 9.999748682391682e-06, + "loss": 0.9042, + "step": 321 + }, + { + "epoch": 0.23390007490976777, + "grad_norm": 3.526672305447795, + "learning_rate": 9.99972417772547e-06, + "loss": 0.9027, + "step": 322 + }, + { + "epoch": 0.23462647265793476, + "grad_norm": 3.4884435048204785, + "learning_rate": 9.99969853335021e-06, + "loss": 0.9147, + "step": 323 + }, + { + "epoch": 0.23535287040610173, + "grad_norm": 3.466461119074874, + "learning_rate": 9.99967174927175e-06, + "loss": 0.8831, + "step": 324 + }, + { + "epoch": 0.23607926815426872, + "grad_norm": 3.383397947245138, + "learning_rate": 9.99964382549619e-06, + "loss": 0.9125, + "step": 325 + }, + { + "epoch": 0.2368056659024357, + "grad_norm": 3.442604009677502, + "learning_rate": 9.999614762029901e-06, + "loss": 0.8934, + "step": 326 + }, + { + "epoch": 0.23753206365060267, + "grad_norm": 3.355588421676241, + "learning_rate": 9.999584558879507e-06, + "loss": 0.9061, + "step": 327 + }, + { + "epoch": 0.23825846139876966, + "grad_norm": 3.266053823556341, + "learning_rate": 9.999553216051892e-06, + "loss": 0.9182, + "step": 328 + }, + { + "epoch": 0.23898485914693665, + "grad_norm": 3.3662912679017603, + "learning_rate": 9.9995207335542e-06, + "loss": 0.9014, + "step": 329 + }, + { + "epoch": 0.2397112568951036, + "grad_norm": 3.286115029330833, + "learning_rate": 9.999487111393836e-06, + "loss": 0.9102, + "step": 330 + }, + { + "epoch": 0.2404376546432706, + "grad_norm": 3.337503631773769, + "learning_rate": 9.999452349578465e-06, + "loss": 0.9128, + "step": 331 + }, + { + "epoch": 0.2411640523914376, + "grad_norm": 3.1928364364917896, + "learning_rate": 9.999416448116011e-06, + "loss": 0.9107, + "step": 332 + }, + { + "epoch": 0.24189045013960456, + "grad_norm": 3.1773858850837833, + "learning_rate": 9.999379407014656e-06, + "loss": 0.9102, + "step": 333 + }, + { + "epoch": 0.24261684788777155, + "grad_norm": 3.1494952919561117, + "learning_rate": 9.999341226282848e-06, + "loss": 0.9053, + "step": 334 + }, + { + "epoch": 0.24334324563593854, + "grad_norm": 3.1585806465130704, + "learning_rate": 9.999301905929286e-06, + "loss": 0.9027, + "step": 335 + 
}, + { + "epoch": 0.2440696433841055, + "grad_norm": 3.153378709522304, + "learning_rate": 9.999261445962936e-06, + "loss": 0.8965, + "step": 336 + }, + { + "epoch": 0.2447960411322725, + "grad_norm": 3.1271021116928215, + "learning_rate": 9.999219846393018e-06, + "loss": 0.9022, + "step": 337 + }, + { + "epoch": 0.24552243888043948, + "grad_norm": 3.1009031288222775, + "learning_rate": 9.999177107229019e-06, + "loss": 0.9013, + "step": 338 + }, + { + "epoch": 0.24624883662860644, + "grad_norm": 3.1087383815587244, + "learning_rate": 9.999133228480679e-06, + "loss": 0.9037, + "step": 339 + }, + { + "epoch": 0.24697523437677343, + "grad_norm": 3.080267158750676, + "learning_rate": 9.999088210158001e-06, + "loss": 0.8849, + "step": 340 + }, + { + "epoch": 0.24770163212494042, + "grad_norm": 2.999486292773668, + "learning_rate": 9.999042052271247e-06, + "loss": 0.8937, + "step": 341 + }, + { + "epoch": 0.2484280298731074, + "grad_norm": 3.013895982835644, + "learning_rate": 9.99899475483094e-06, + "loss": 0.8888, + "step": 342 + }, + { + "epoch": 0.24915442762127438, + "grad_norm": 3.073758208806406, + "learning_rate": 9.998946317847857e-06, + "loss": 0.8602, + "step": 343 + }, + { + "epoch": 0.24988082536944137, + "grad_norm": 2.9286726313576112, + "learning_rate": 9.998896741333047e-06, + "loss": 0.9008, + "step": 344 + }, + { + "epoch": 0.25060722311760836, + "grad_norm": 2.910650348601522, + "learning_rate": 9.998846025297804e-06, + "loss": 0.895, + "step": 345 + }, + { + "epoch": 0.2513336208657753, + "grad_norm": 2.8993230794505838, + "learning_rate": 9.998794169753694e-06, + "loss": 0.8841, + "step": 346 + }, + { + "epoch": 0.2520600186139423, + "grad_norm": 2.9168647806374617, + "learning_rate": 9.998741174712534e-06, + "loss": 0.8989, + "step": 347 + }, + { + "epoch": 0.2527864163621093, + "grad_norm": 2.9351727444896283, + "learning_rate": 9.998687040186407e-06, + "loss": 0.8851, + "step": 348 + }, + { + "epoch": 0.25351281411027626, + "grad_norm": 2.823829131544092, + "learning_rate": 9.998631766187651e-06, + "loss": 0.8873, + "step": 349 + }, + { + "epoch": 0.2542392118584432, + "grad_norm": 2.8207151518990448, + "learning_rate": 9.998575352728869e-06, + "loss": 0.8808, + "step": 350 + }, + { + "epoch": 0.25496560960661024, + "grad_norm": 2.881827280089076, + "learning_rate": 9.99851779982292e-06, + "loss": 0.8792, + "step": 351 + }, + { + "epoch": 0.2556920073547772, + "grad_norm": 2.839308859851771, + "learning_rate": 9.998459107482922e-06, + "loss": 0.8716, + "step": 352 + }, + { + "epoch": 0.25641840510294417, + "grad_norm": 2.7473965822142743, + "learning_rate": 9.998399275722253e-06, + "loss": 0.8842, + "step": 353 + }, + { + "epoch": 0.2571448028511112, + "grad_norm": 2.762628380371553, + "learning_rate": 9.998338304554553e-06, + "loss": 0.8847, + "step": 354 + }, + { + "epoch": 0.25787120059927815, + "grad_norm": 2.7322944562152616, + "learning_rate": 9.998276193993723e-06, + "loss": 0.8605, + "step": 355 + }, + { + "epoch": 0.2585975983474451, + "grad_norm": 2.666398101224024, + "learning_rate": 9.99821294405392e-06, + "loss": 0.8867, + "step": 356 + }, + { + "epoch": 0.25932399609561213, + "grad_norm": 2.710835958236394, + "learning_rate": 9.998148554749561e-06, + "loss": 0.8634, + "step": 357 + }, + { + "epoch": 0.2600503938437791, + "grad_norm": 2.6670423652778346, + "learning_rate": 9.998083026095323e-06, + "loss": 0.8878, + "step": 358 + }, + { + "epoch": 0.26077679159194606, + "grad_norm": 2.681536519147625, + "learning_rate": 9.998016358106147e-06, + "loss": 
0.8686, + "step": 359 + }, + { + "epoch": 0.261503189340113, + "grad_norm": 2.597410074827163, + "learning_rate": 9.997948550797227e-06, + "loss": 0.8868, + "step": 360 + }, + { + "epoch": 0.26222958708828004, + "grad_norm": 2.621123678822491, + "learning_rate": 9.997879604184023e-06, + "loss": 0.887, + "step": 361 + }, + { + "epoch": 0.262955984836447, + "grad_norm": 2.604762010879466, + "learning_rate": 9.99780951828225e-06, + "loss": 0.8587, + "step": 362 + }, + { + "epoch": 0.26368238258461396, + "grad_norm": 2.653532685112411, + "learning_rate": 9.997738293107882e-06, + "loss": 0.8694, + "step": 363 + }, + { + "epoch": 0.264408780332781, + "grad_norm": 2.5929563189339575, + "learning_rate": 9.997665928677159e-06, + "loss": 0.8663, + "step": 364 + }, + { + "epoch": 0.26513517808094794, + "grad_norm": 2.511532491788452, + "learning_rate": 9.997592425006574e-06, + "loss": 0.8923, + "step": 365 + }, + { + "epoch": 0.2658615758291149, + "grad_norm": 2.5809881467050313, + "learning_rate": 9.997517782112885e-06, + "loss": 0.8544, + "step": 366 + }, + { + "epoch": 0.2665879735772819, + "grad_norm": 2.5451328703553964, + "learning_rate": 9.997442000013105e-06, + "loss": 0.869, + "step": 367 + }, + { + "epoch": 0.2673143713254489, + "grad_norm": 2.5718644330923253, + "learning_rate": 9.997365078724512e-06, + "loss": 0.8576, + "step": 368 + }, + { + "epoch": 0.26804076907361585, + "grad_norm": 2.5201758471763127, + "learning_rate": 9.997287018264637e-06, + "loss": 0.8471, + "step": 369 + }, + { + "epoch": 0.26876716682178287, + "grad_norm": 2.469545330155226, + "learning_rate": 9.997207818651273e-06, + "loss": 0.8716, + "step": 370 + }, + { + "epoch": 0.26949356456994983, + "grad_norm": 2.4963246591865556, + "learning_rate": 9.99712747990248e-06, + "loss": 0.8598, + "step": 371 + }, + { + "epoch": 0.2702199623181168, + "grad_norm": 2.4316216175071825, + "learning_rate": 9.997046002036566e-06, + "loss": 0.8687, + "step": 372 + }, + { + "epoch": 0.2709463600662838, + "grad_norm": 2.4339436736619677, + "learning_rate": 9.996963385072108e-06, + "loss": 0.8536, + "step": 373 + }, + { + "epoch": 0.27167275781445077, + "grad_norm": 2.447464955174159, + "learning_rate": 9.996879629027934e-06, + "loss": 0.8666, + "step": 374 + }, + { + "epoch": 0.27239915556261773, + "grad_norm": 2.408860632016767, + "learning_rate": 9.996794733923141e-06, + "loss": 0.858, + "step": 375 + }, + { + "epoch": 0.27312555331078475, + "grad_norm": 2.381748758234186, + "learning_rate": 9.996708699777081e-06, + "loss": 0.8671, + "step": 376 + }, + { + "epoch": 0.2738519510589517, + "grad_norm": 2.547484262667178, + "learning_rate": 9.996621526609364e-06, + "loss": 0.8385, + "step": 377 + }, + { + "epoch": 0.2745783488071187, + "grad_norm": 2.4047390006326905, + "learning_rate": 9.996533214439864e-06, + "loss": 0.8594, + "step": 378 + }, + { + "epoch": 0.2753047465552857, + "grad_norm": 2.392166393236436, + "learning_rate": 9.996443763288708e-06, + "loss": 0.855, + "step": 379 + }, + { + "epoch": 0.27603114430345266, + "grad_norm": 2.314177983313085, + "learning_rate": 9.99635317317629e-06, + "loss": 0.8613, + "step": 380 + }, + { + "epoch": 0.2767575420516196, + "grad_norm": 2.2953329879725706, + "learning_rate": 9.99626144412326e-06, + "loss": 0.8508, + "step": 381 + }, + { + "epoch": 0.27748393979978664, + "grad_norm": 2.373682548895982, + "learning_rate": 9.996168576150527e-06, + "loss": 0.8529, + "step": 382 + }, + { + "epoch": 0.2782103375479536, + "grad_norm": 2.272694418338483, + "learning_rate": 9.996074569279261e-06, + 
"loss": 0.8499, + "step": 383 + }, + { + "epoch": 0.27893673529612056, + "grad_norm": 2.27252146099723, + "learning_rate": 9.995979423530893e-06, + "loss": 0.8597, + "step": 384 + }, + { + "epoch": 0.2796631330442876, + "grad_norm": 2.2642442307360624, + "learning_rate": 9.995883138927109e-06, + "loss": 0.8642, + "step": 385 + }, + { + "epoch": 0.28038953079245454, + "grad_norm": 2.2782516681714635, + "learning_rate": 9.995785715489861e-06, + "loss": 0.8433, + "step": 386 + }, + { + "epoch": 0.2811159285406215, + "grad_norm": 2.2336277825210216, + "learning_rate": 9.995687153241353e-06, + "loss": 0.856, + "step": 387 + }, + { + "epoch": 0.2818423262887885, + "grad_norm": 2.195604639250854, + "learning_rate": 9.995587452204056e-06, + "loss": 0.8668, + "step": 388 + }, + { + "epoch": 0.2825687240369555, + "grad_norm": 2.1848982031790976, + "learning_rate": 9.995486612400695e-06, + "loss": 0.8638, + "step": 389 + }, + { + "epoch": 0.28329512178512245, + "grad_norm": 2.2079370575686292, + "learning_rate": 9.995384633854259e-06, + "loss": 0.8459, + "step": 390 + }, + { + "epoch": 0.28402151953328947, + "grad_norm": 2.4403396183778168, + "learning_rate": 9.995281516587992e-06, + "loss": 0.8442, + "step": 391 + }, + { + "epoch": 0.28474791728145643, + "grad_norm": 2.190782039428668, + "learning_rate": 9.995177260625401e-06, + "loss": 0.8571, + "step": 392 + }, + { + "epoch": 0.2854743150296234, + "grad_norm": 2.1783620914890447, + "learning_rate": 9.995071865990255e-06, + "loss": 0.8421, + "step": 393 + }, + { + "epoch": 0.2862007127777904, + "grad_norm": 2.160114786487807, + "learning_rate": 9.994965332706574e-06, + "loss": 0.8464, + "step": 394 + }, + { + "epoch": 0.2869271105259574, + "grad_norm": 2.1077114811406363, + "learning_rate": 9.994857660798644e-06, + "loss": 0.851, + "step": 395 + }, + { + "epoch": 0.28765350827412434, + "grad_norm": 2.130514190474774, + "learning_rate": 9.994748850291013e-06, + "loss": 0.8391, + "step": 396 + }, + { + "epoch": 0.28837990602229135, + "grad_norm": 2.305097773774775, + "learning_rate": 9.99463890120848e-06, + "loss": 0.8484, + "step": 397 + }, + { + "epoch": 0.2891063037704583, + "grad_norm": 2.3190227202687557, + "learning_rate": 9.994527813576111e-06, + "loss": 0.8687, + "step": 398 + }, + { + "epoch": 0.2898327015186253, + "grad_norm": 2.0697354319941184, + "learning_rate": 9.994415587419229e-06, + "loss": 0.871, + "step": 399 + }, + { + "epoch": 0.2905590992667923, + "grad_norm": 2.1179466292046802, + "learning_rate": 9.994302222763415e-06, + "loss": 0.8427, + "step": 400 + }, + { + "epoch": 0.29128549701495926, + "grad_norm": 2.0548380672609254, + "learning_rate": 9.994187719634512e-06, + "loss": 0.851, + "step": 401 + }, + { + "epoch": 0.2920118947631262, + "grad_norm": 2.004703885482171, + "learning_rate": 9.994072078058621e-06, + "loss": 0.8572, + "step": 402 + }, + { + "epoch": 0.29273829251129324, + "grad_norm": 2.040493240651109, + "learning_rate": 9.993955298062103e-06, + "loss": 0.8497, + "step": 403 + }, + { + "epoch": 0.2934646902594602, + "grad_norm": 2.0653918004675615, + "learning_rate": 9.99383737967158e-06, + "loss": 0.8361, + "step": 404 + }, + { + "epoch": 0.29419108800762717, + "grad_norm": 2.0162210298258088, + "learning_rate": 9.99371832291393e-06, + "loss": 0.8407, + "step": 405 + }, + { + "epoch": 0.2949174857557942, + "grad_norm": 2.039149148468403, + "learning_rate": 9.993598127816292e-06, + "loss": 0.8252, + "step": 406 + }, + { + "epoch": 0.29564388350396115, + "grad_norm": 1.9921985897639096, + "learning_rate": 
9.993476794406068e-06, + "loss": 0.8467, + "step": 407 + }, + { + "epoch": 0.2963702812521281, + "grad_norm": 1.9853296697955654, + "learning_rate": 9.993354322710914e-06, + "loss": 0.8447, + "step": 408 + }, + { + "epoch": 0.29709667900029507, + "grad_norm": 1.933388097861427, + "learning_rate": 9.993230712758748e-06, + "loss": 0.8443, + "step": 409 + }, + { + "epoch": 0.2978230767484621, + "grad_norm": 2.015700970579226, + "learning_rate": 9.99310596457775e-06, + "loss": 0.8273, + "step": 410 + }, + { + "epoch": 0.29854947449662905, + "grad_norm": 1.9666969573202417, + "learning_rate": 9.992980078196355e-06, + "loss": 0.8348, + "step": 411 + }, + { + "epoch": 0.299275872244796, + "grad_norm": 1.9594216972253362, + "learning_rate": 9.992853053643257e-06, + "loss": 0.832, + "step": 412 + }, + { + "epoch": 0.30000226999296303, + "grad_norm": 1.9446630419498114, + "learning_rate": 9.992724890947417e-06, + "loss": 0.8267, + "step": 413 + }, + { + "epoch": 0.30072866774113, + "grad_norm": 1.9688606092767582, + "learning_rate": 9.992595590138045e-06, + "loss": 0.8368, + "step": 414 + }, + { + "epoch": 0.30145506548929696, + "grad_norm": 1.9423435194261474, + "learning_rate": 9.99246515124462e-06, + "loss": 0.8242, + "step": 415 + }, + { + "epoch": 0.302181463237464, + "grad_norm": 1.9125718876356885, + "learning_rate": 9.992333574296872e-06, + "loss": 0.8276, + "step": 416 + }, + { + "epoch": 0.30290786098563094, + "grad_norm": 1.883649957859309, + "learning_rate": 9.992200859324799e-06, + "loss": 0.8298, + "step": 417 + }, + { + "epoch": 0.3036342587337979, + "grad_norm": 1.879036782487961, + "learning_rate": 9.992067006358651e-06, + "loss": 0.8353, + "step": 418 + }, + { + "epoch": 0.3043606564819649, + "grad_norm": 1.8719975868291152, + "learning_rate": 9.991932015428941e-06, + "loss": 0.8443, + "step": 419 + }, + { + "epoch": 0.3050870542301319, + "grad_norm": 1.874484854453761, + "learning_rate": 9.991795886566443e-06, + "loss": 0.8292, + "step": 420 + }, + { + "epoch": 0.30581345197829884, + "grad_norm": 1.8775152039378569, + "learning_rate": 9.991658619802183e-06, + "loss": 0.8252, + "step": 421 + }, + { + "epoch": 0.30653984972646586, + "grad_norm": 1.8441233018367729, + "learning_rate": 9.991520215167455e-06, + "loss": 0.8272, + "step": 422 + }, + { + "epoch": 0.3072662474746328, + "grad_norm": 1.8281278187371393, + "learning_rate": 9.99138067269381e-06, + "loss": 0.8333, + "step": 423 + }, + { + "epoch": 0.3079926452227998, + "grad_norm": 1.9152044763911662, + "learning_rate": 9.991239992413056e-06, + "loss": 0.8289, + "step": 424 + }, + { + "epoch": 0.3087190429709668, + "grad_norm": 1.853675901429331, + "learning_rate": 9.99109817435726e-06, + "loss": 0.8206, + "step": 425 + }, + { + "epoch": 0.30944544071913377, + "grad_norm": 1.828312370971533, + "learning_rate": 9.990955218558751e-06, + "loss": 0.8017, + "step": 426 + }, + { + "epoch": 0.31017183846730073, + "grad_norm": 1.828150007798247, + "learning_rate": 9.99081112505012e-06, + "loss": 0.8268, + "step": 427 + }, + { + "epoch": 0.31089823621546775, + "grad_norm": 2.086629705375154, + "learning_rate": 9.990665893864208e-06, + "loss": 0.8187, + "step": 428 + }, + { + "epoch": 0.3116246339636347, + "grad_norm": 1.7980079821122612, + "learning_rate": 9.990519525034125e-06, + "loss": 0.8181, + "step": 429 + }, + { + "epoch": 0.3123510317118017, + "grad_norm": 1.8022071581091539, + "learning_rate": 9.990372018593236e-06, + "loss": 0.8107, + "step": 430 + }, + { + "epoch": 0.3130774294599687, + "grad_norm": 1.7817457263756484, + 
"learning_rate": 9.990223374575165e-06, + "loss": 0.8223, + "step": 431 + }, + { + "epoch": 0.31380382720813566, + "grad_norm": 1.8186788986310414, + "learning_rate": 9.990073593013794e-06, + "loss": 0.8306, + "step": 432 + }, + { + "epoch": 0.3145302249563026, + "grad_norm": 1.78999729473262, + "learning_rate": 9.989922673943271e-06, + "loss": 0.8218, + "step": 433 + }, + { + "epoch": 0.31525662270446964, + "grad_norm": 1.7922498131927898, + "learning_rate": 9.989770617397994e-06, + "loss": 0.8177, + "step": 434 + }, + { + "epoch": 0.3159830204526366, + "grad_norm": 1.7407032579649269, + "learning_rate": 9.989617423412627e-06, + "loss": 0.8172, + "step": 435 + }, + { + "epoch": 0.31670941820080356, + "grad_norm": 1.7377083293660502, + "learning_rate": 9.989463092022093e-06, + "loss": 0.8097, + "step": 436 + }, + { + "epoch": 0.3174358159489706, + "grad_norm": 1.8106149891609342, + "learning_rate": 9.989307623261571e-06, + "loss": 0.8324, + "step": 437 + }, + { + "epoch": 0.31816221369713754, + "grad_norm": 2.0548792095096537, + "learning_rate": 9.989151017166498e-06, + "loss": 0.8281, + "step": 438 + }, + { + "epoch": 0.3188886114453045, + "grad_norm": 1.7202909871253904, + "learning_rate": 9.988993273772578e-06, + "loss": 0.8152, + "step": 439 + }, + { + "epoch": 0.3196150091934715, + "grad_norm": 1.9416483022054316, + "learning_rate": 9.988834393115768e-06, + "loss": 0.8116, + "step": 440 + }, + { + "epoch": 0.3203414069416385, + "grad_norm": 1.700305712466284, + "learning_rate": 9.98867437523228e-06, + "loss": 0.8192, + "step": 441 + }, + { + "epoch": 0.32106780468980545, + "grad_norm": 1.6986113649343069, + "learning_rate": 9.9885132201586e-06, + "loss": 0.8206, + "step": 442 + }, + { + "epoch": 0.32179420243797247, + "grad_norm": 1.7568165040916024, + "learning_rate": 9.988350927931455e-06, + "loss": 0.8159, + "step": 443 + }, + { + "epoch": 0.32252060018613943, + "grad_norm": 1.6638610925315405, + "learning_rate": 9.98818749858785e-06, + "loss": 0.8261, + "step": 444 + }, + { + "epoch": 0.3232469979343064, + "grad_norm": 1.6400851065971984, + "learning_rate": 9.988022932165029e-06, + "loss": 0.822, + "step": 445 + }, + { + "epoch": 0.3239733956824734, + "grad_norm": 1.6637785482086707, + "learning_rate": 9.987857228700512e-06, + "loss": 0.7977, + "step": 446 + }, + { + "epoch": 0.32469979343064037, + "grad_norm": 1.6527505249510377, + "learning_rate": 9.987690388232071e-06, + "loss": 0.8109, + "step": 447 + }, + { + "epoch": 0.32542619117880733, + "grad_norm": 1.6589931492071577, + "learning_rate": 9.987522410797736e-06, + "loss": 0.8122, + "step": 448 + }, + { + "epoch": 0.32615258892697435, + "grad_norm": 1.6112609442519945, + "learning_rate": 9.987353296435802e-06, + "loss": 0.8223, + "step": 449 + }, + { + "epoch": 0.3268789866751413, + "grad_norm": 1.6299975467243404, + "learning_rate": 9.987183045184815e-06, + "loss": 0.8148, + "step": 450 + }, + { + "epoch": 0.3276053844233083, + "grad_norm": 1.6911871741513431, + "learning_rate": 9.987011657083587e-06, + "loss": 0.812, + "step": 451 + }, + { + "epoch": 0.3283317821714753, + "grad_norm": 1.6301988035526893, + "learning_rate": 9.986839132171186e-06, + "loss": 0.7894, + "step": 452 + }, + { + "epoch": 0.32905817991964226, + "grad_norm": 1.6050588368442675, + "learning_rate": 9.98666547048694e-06, + "loss": 0.7994, + "step": 453 + }, + { + "epoch": 0.3297845776678092, + "grad_norm": 1.5935740513279069, + "learning_rate": 9.986490672070438e-06, + "loss": 0.8133, + "step": 454 + }, + { + "epoch": 0.33051097541597624, + 
"grad_norm": 1.5800491187854209, + "learning_rate": 9.986314736961522e-06, + "loss": 0.8057, + "step": 455 + }, + { + "epoch": 0.3312373731641432, + "grad_norm": 1.5998802898110331, + "learning_rate": 9.986137665200298e-06, + "loss": 0.7788, + "step": 456 + }, + { + "epoch": 0.33196377091231016, + "grad_norm": 1.5993757442652543, + "learning_rate": 9.985959456827132e-06, + "loss": 0.7947, + "step": 457 + }, + { + "epoch": 0.3326901686604771, + "grad_norm": 1.599986123833006, + "learning_rate": 9.985780111882648e-06, + "loss": 0.8124, + "step": 458 + }, + { + "epoch": 0.33341656640864414, + "grad_norm": 1.567732652499849, + "learning_rate": 9.985599630407727e-06, + "loss": 0.7983, + "step": 459 + }, + { + "epoch": 0.3341429641568111, + "grad_norm": 1.568753470069939, + "learning_rate": 9.98541801244351e-06, + "loss": 0.7993, + "step": 460 + }, + { + "epoch": 0.33486936190497807, + "grad_norm": 1.53872532512622, + "learning_rate": 9.9852352580314e-06, + "loss": 0.7954, + "step": 461 + }, + { + "epoch": 0.3355957596531451, + "grad_norm": 1.573735628732098, + "learning_rate": 9.985051367213055e-06, + "loss": 0.8028, + "step": 462 + }, + { + "epoch": 0.33632215740131205, + "grad_norm": 1.5562781601601243, + "learning_rate": 9.984866340030393e-06, + "loss": 0.7991, + "step": 463 + }, + { + "epoch": 0.337048555149479, + "grad_norm": 1.5286823486792622, + "learning_rate": 9.984680176525591e-06, + "loss": 0.8086, + "step": 464 + }, + { + "epoch": 0.33777495289764603, + "grad_norm": 1.5206427472940696, + "learning_rate": 9.98449287674109e-06, + "loss": 0.7931, + "step": 465 + }, + { + "epoch": 0.338501350645813, + "grad_norm": 1.529018033590309, + "learning_rate": 9.984304440719582e-06, + "loss": 0.7976, + "step": 466 + }, + { + "epoch": 0.33922774839397996, + "grad_norm": 1.4941622511227373, + "learning_rate": 9.984114868504023e-06, + "loss": 0.8079, + "step": 467 + }, + { + "epoch": 0.339954146142147, + "grad_norm": 1.4781959326957563, + "learning_rate": 9.983924160137627e-06, + "loss": 0.8022, + "step": 468 + }, + { + "epoch": 0.34068054389031394, + "grad_norm": 1.4674192733847702, + "learning_rate": 9.983732315663865e-06, + "loss": 0.8091, + "step": 469 + }, + { + "epoch": 0.3414069416384809, + "grad_norm": 1.4885758046839543, + "learning_rate": 9.983539335126473e-06, + "loss": 0.8014, + "step": 470 + }, + { + "epoch": 0.3421333393866479, + "grad_norm": 1.4639151595858675, + "learning_rate": 9.983345218569438e-06, + "loss": 0.8076, + "step": 471 + }, + { + "epoch": 0.3428597371348149, + "grad_norm": 1.4519543436282374, + "learning_rate": 9.98314996603701e-06, + "loss": 0.8147, + "step": 472 + }, + { + "epoch": 0.34358613488298184, + "grad_norm": 1.4770141552096316, + "learning_rate": 9.9829535775737e-06, + "loss": 0.7926, + "step": 473 + }, + { + "epoch": 0.34431253263114886, + "grad_norm": 1.461845780579588, + "learning_rate": 9.982756053224274e-06, + "loss": 0.8123, + "step": 474 + }, + { + "epoch": 0.3450389303793158, + "grad_norm": 1.503728720014433, + "learning_rate": 9.982557393033758e-06, + "loss": 0.779, + "step": 475 + }, + { + "epoch": 0.3457653281274828, + "grad_norm": 1.4849653618968792, + "learning_rate": 9.982357597047438e-06, + "loss": 0.7866, + "step": 476 + }, + { + "epoch": 0.3464917258756498, + "grad_norm": 1.4519069940854028, + "learning_rate": 9.98215666531086e-06, + "loss": 0.7858, + "step": 477 + }, + { + "epoch": 0.34721812362381677, + "grad_norm": 1.441735470802499, + "learning_rate": 9.981954597869825e-06, + "loss": 0.8038, + "step": 478 + }, + { + "epoch": 
0.34794452137198373, + "grad_norm": 1.4516397081642147, + "learning_rate": 9.981751394770395e-06, + "loss": 0.8039, + "step": 479 + }, + { + "epoch": 0.34867091912015075, + "grad_norm": 1.4238296243544586, + "learning_rate": 9.981547056058893e-06, + "loss": 0.8057, + "step": 480 + }, + { + "epoch": 0.3493973168683177, + "grad_norm": 1.4190132849441395, + "learning_rate": 9.981341581781899e-06, + "loss": 0.7899, + "step": 481 + }, + { + "epoch": 0.35012371461648467, + "grad_norm": 1.4326395273254544, + "learning_rate": 9.98113497198625e-06, + "loss": 0.7895, + "step": 482 + }, + { + "epoch": 0.3508501123646517, + "grad_norm": 1.399840454673465, + "learning_rate": 9.980927226719044e-06, + "loss": 0.7965, + "step": 483 + }, + { + "epoch": 0.35157651011281865, + "grad_norm": 1.4298889459244744, + "learning_rate": 9.980718346027636e-06, + "loss": 0.7716, + "step": 484 + }, + { + "epoch": 0.3523029078609856, + "grad_norm": 1.40248883066053, + "learning_rate": 9.980508329959646e-06, + "loss": 0.7886, + "step": 485 + }, + { + "epoch": 0.35302930560915263, + "grad_norm": 1.387451411929723, + "learning_rate": 9.980297178562943e-06, + "loss": 0.7865, + "step": 486 + }, + { + "epoch": 0.3537557033573196, + "grad_norm": 1.3729511131953516, + "learning_rate": 9.980084891885662e-06, + "loss": 0.7872, + "step": 487 + }, + { + "epoch": 0.35448210110548656, + "grad_norm": 1.3898531180126903, + "learning_rate": 9.979871469976197e-06, + "loss": 0.7692, + "step": 488 + }, + { + "epoch": 0.3552084988536536, + "grad_norm": 1.3687738845362856, + "learning_rate": 9.979656912883193e-06, + "loss": 0.7845, + "step": 489 + }, + { + "epoch": 0.35593489660182054, + "grad_norm": 1.381027925131146, + "learning_rate": 9.979441220655564e-06, + "loss": 0.7895, + "step": 490 + }, + { + "epoch": 0.3566612943499875, + "grad_norm": 1.346328819485624, + "learning_rate": 9.979224393342477e-06, + "loss": 0.8029, + "step": 491 + }, + { + "epoch": 0.3573876920981545, + "grad_norm": 1.384237345196671, + "learning_rate": 9.979006430993357e-06, + "loss": 0.8098, + "step": 492 + }, + { + "epoch": 0.3581140898463215, + "grad_norm": 1.3699677137536757, + "learning_rate": 9.97878733365789e-06, + "loss": 0.7804, + "step": 493 + }, + { + "epoch": 0.35884048759448844, + "grad_norm": 1.336258195408756, + "learning_rate": 9.978567101386021e-06, + "loss": 0.7782, + "step": 494 + }, + { + "epoch": 0.35956688534265546, + "grad_norm": 1.3486480096565705, + "learning_rate": 9.978345734227953e-06, + "loss": 0.791, + "step": 495 + }, + { + "epoch": 0.3602932830908224, + "grad_norm": 1.3402139669890587, + "learning_rate": 9.978123232234147e-06, + "loss": 0.7805, + "step": 496 + }, + { + "epoch": 0.3610196808389894, + "grad_norm": 1.3203806678784924, + "learning_rate": 9.977899595455324e-06, + "loss": 0.8059, + "step": 497 + }, + { + "epoch": 0.3617460785871564, + "grad_norm": 1.3240137088291317, + "learning_rate": 9.977674823942463e-06, + "loss": 0.7769, + "step": 498 + }, + { + "epoch": 0.36247247633532337, + "grad_norm": 1.3371647877624209, + "learning_rate": 9.9774489177468e-06, + "loss": 0.7828, + "step": 499 + }, + { + "epoch": 0.36319887408349033, + "grad_norm": 1.3435475467562, + "learning_rate": 9.977221876919833e-06, + "loss": 0.7942, + "step": 500 + }, + { + "epoch": 0.36392527183165735, + "grad_norm": 1.3153448965184218, + "learning_rate": 9.976993701513317e-06, + "loss": 0.7721, + "step": 501 + }, + { + "epoch": 0.3646516695798243, + "grad_norm": 1.314625926636882, + "learning_rate": 9.976764391579266e-06, + "loss": 0.7625, + "step": 502 + 
}, + { + "epoch": 0.3653780673279913, + "grad_norm": 1.2981750511740526, + "learning_rate": 9.976533947169948e-06, + "loss": 0.7834, + "step": 503 + }, + { + "epoch": 0.36610446507615824, + "grad_norm": 1.3305161678384152, + "learning_rate": 9.9763023683379e-06, + "loss": 0.7874, + "step": 504 + }, + { + "epoch": 0.36683086282432525, + "grad_norm": 1.3051284028811305, + "learning_rate": 9.976069655135906e-06, + "loss": 0.7657, + "step": 505 + }, + { + "epoch": 0.3675572605724922, + "grad_norm": 1.279969631814808, + "learning_rate": 9.975835807617019e-06, + "loss": 0.7822, + "step": 506 + }, + { + "epoch": 0.3682836583206592, + "grad_norm": 1.2960567138751116, + "learning_rate": 9.975600825834542e-06, + "loss": 0.7694, + "step": 507 + }, + { + "epoch": 0.3690100560688262, + "grad_norm": 1.306849416485967, + "learning_rate": 9.975364709842042e-06, + "loss": 0.786, + "step": 508 + }, + { + "epoch": 0.36973645381699316, + "grad_norm": 1.271523847952412, + "learning_rate": 9.97512745969334e-06, + "loss": 0.7761, + "step": 509 + }, + { + "epoch": 0.3704628515651601, + "grad_norm": 1.2914497316352134, + "learning_rate": 9.97488907544252e-06, + "loss": 0.7623, + "step": 510 + }, + { + "epoch": 0.37118924931332714, + "grad_norm": 1.309823954459763, + "learning_rate": 9.974649557143926e-06, + "loss": 0.777, + "step": 511 + }, + { + "epoch": 0.3719156470614941, + "grad_norm": 1.2827788113935663, + "learning_rate": 9.97440890485215e-06, + "loss": 0.7787, + "step": 512 + }, + { + "epoch": 0.37264204480966107, + "grad_norm": 1.3873355303110653, + "learning_rate": 9.974167118622056e-06, + "loss": 0.784, + "step": 513 + }, + { + "epoch": 0.3733684425578281, + "grad_norm": 1.3254270203436433, + "learning_rate": 9.973924198508758e-06, + "loss": 0.7701, + "step": 514 + }, + { + "epoch": 0.37409484030599505, + "grad_norm": 1.2944959469357342, + "learning_rate": 9.97368014456763e-06, + "loss": 0.76, + "step": 515 + }, + { + "epoch": 0.374821238054162, + "grad_norm": 1.2474253013121739, + "learning_rate": 9.973434956854306e-06, + "loss": 0.7666, + "step": 516 + }, + { + "epoch": 0.375547635802329, + "grad_norm": 1.2656514925156077, + "learning_rate": 9.97318863542468e-06, + "loss": 0.7684, + "step": 517 + }, + { + "epoch": 0.376274033550496, + "grad_norm": 1.2260343270931553, + "learning_rate": 9.972941180334896e-06, + "loss": 0.7752, + "step": 518 + }, + { + "epoch": 0.37700043129866295, + "grad_norm": 1.2365101027609489, + "learning_rate": 9.972692591641367e-06, + "loss": 0.7659, + "step": 519 + }, + { + "epoch": 0.37772682904682997, + "grad_norm": 1.2107248676220899, + "learning_rate": 9.97244286940076e-06, + "loss": 0.7958, + "step": 520 + }, + { + "epoch": 0.37845322679499693, + "grad_norm": 1.305074081461733, + "learning_rate": 9.972192013669998e-06, + "loss": 0.7608, + "step": 521 + }, + { + "epoch": 0.3791796245431639, + "grad_norm": 1.2188322690177842, + "learning_rate": 9.971940024506265e-06, + "loss": 0.7864, + "step": 522 + }, + { + "epoch": 0.3799060222913309, + "grad_norm": 1.2134567730823578, + "learning_rate": 9.971686901967006e-06, + "loss": 0.7777, + "step": 523 + }, + { + "epoch": 0.3806324200394979, + "grad_norm": 1.214745326766063, + "learning_rate": 9.971432646109919e-06, + "loss": 0.7643, + "step": 524 + }, + { + "epoch": 0.38135881778766484, + "grad_norm": 3.637168647480618, + "learning_rate": 9.971177256992961e-06, + "loss": 0.7689, + "step": 525 + }, + { + "epoch": 0.38208521553583186, + "grad_norm": 1.195297556379348, + "learning_rate": 9.970920734674355e-06, + "loss": 0.7732, + 
"step": 526 + }, + { + "epoch": 0.3828116132839988, + "grad_norm": 1.2164501623111703, + "learning_rate": 9.970663079212568e-06, + "loss": 0.7609, + "step": 527 + }, + { + "epoch": 0.3835380110321658, + "grad_norm": 1.2178209396112816, + "learning_rate": 9.97040429066634e-06, + "loss": 0.7654, + "step": 528 + }, + { + "epoch": 0.3842644087803328, + "grad_norm": 1.196664997899127, + "learning_rate": 9.970144369094663e-06, + "loss": 0.7604, + "step": 529 + }, + { + "epoch": 0.38499080652849976, + "grad_norm": 1.205518971317643, + "learning_rate": 9.969883314556784e-06, + "loss": 0.7621, + "step": 530 + }, + { + "epoch": 0.3857172042766667, + "grad_norm": 1.1951038596907404, + "learning_rate": 9.969621127112211e-06, + "loss": 0.7821, + "step": 531 + }, + { + "epoch": 0.38644360202483374, + "grad_norm": 1.1948561987593798, + "learning_rate": 9.969357806820714e-06, + "loss": 0.7613, + "step": 532 + }, + { + "epoch": 0.3871699997730007, + "grad_norm": 1.2101490112809141, + "learning_rate": 9.969093353742318e-06, + "loss": 0.7612, + "step": 533 + }, + { + "epoch": 0.38789639752116767, + "grad_norm": 1.171209913607891, + "learning_rate": 9.968827767937303e-06, + "loss": 0.7801, + "step": 534 + }, + { + "epoch": 0.3886227952693347, + "grad_norm": 1.2245907679426409, + "learning_rate": 9.968561049466214e-06, + "loss": 0.7712, + "step": 535 + }, + { + "epoch": 0.38934919301750165, + "grad_norm": 1.2097923964427286, + "learning_rate": 9.96829319838985e-06, + "loss": 0.7504, + "step": 536 + }, + { + "epoch": 0.3900755907656686, + "grad_norm": 1.2500277611138257, + "learning_rate": 9.968024214769265e-06, + "loss": 0.7662, + "step": 537 + }, + { + "epoch": 0.39080198851383563, + "grad_norm": 1.1852000915918435, + "learning_rate": 9.967754098665778e-06, + "loss": 0.7614, + "step": 538 + }, + { + "epoch": 0.3915283862620026, + "grad_norm": 1.194954947602848, + "learning_rate": 9.967482850140965e-06, + "loss": 0.7758, + "step": 539 + }, + { + "epoch": 0.39225478401016955, + "grad_norm": 1.1815956688439189, + "learning_rate": 9.967210469256657e-06, + "loss": 0.7598, + "step": 540 + }, + { + "epoch": 0.3929811817583366, + "grad_norm": 1.1881258495264189, + "learning_rate": 9.96693695607494e-06, + "loss": 0.7642, + "step": 541 + }, + { + "epoch": 0.39370757950650354, + "grad_norm": 1.1408535421371642, + "learning_rate": 9.96666231065817e-06, + "loss": 0.7645, + "step": 542 + }, + { + "epoch": 0.3944339772546705, + "grad_norm": 1.2935039433273636, + "learning_rate": 9.966386533068949e-06, + "loss": 0.7826, + "step": 543 + }, + { + "epoch": 0.3951603750028375, + "grad_norm": 1.2067339559583887, + "learning_rate": 9.96610962337014e-06, + "loss": 0.7424, + "step": 544 + }, + { + "epoch": 0.3958867727510045, + "grad_norm": 1.1225159513779392, + "learning_rate": 9.965831581624872e-06, + "loss": 0.7654, + "step": 545 + }, + { + "epoch": 0.39661317049917144, + "grad_norm": 1.180080430089498, + "learning_rate": 9.965552407896519e-06, + "loss": 0.7655, + "step": 546 + }, + { + "epoch": 0.39733956824733846, + "grad_norm": 1.1256118419038732, + "learning_rate": 9.965272102248726e-06, + "loss": 0.7581, + "step": 547 + }, + { + "epoch": 0.3980659659955054, + "grad_norm": 1.1436250068062523, + "learning_rate": 9.964990664745387e-06, + "loss": 0.7493, + "step": 548 + }, + { + "epoch": 0.3987923637436724, + "grad_norm": 1.125774487206082, + "learning_rate": 9.964708095450655e-06, + "loss": 0.7563, + "step": 549 + }, + { + "epoch": 0.3995187614918394, + "grad_norm": 1.136420520925349, + "learning_rate": 9.964424394428943e-06, + 
"loss": 0.7648, + "step": 550 + }, + { + "epoch": 0.40024515924000637, + "grad_norm": 1.1580527989641898, + "learning_rate": 9.964139561744927e-06, + "loss": 0.7468, + "step": 551 + }, + { + "epoch": 0.4009715569881733, + "grad_norm": 1.1434802807692184, + "learning_rate": 9.963853597463533e-06, + "loss": 0.7502, + "step": 552 + }, + { + "epoch": 0.4016979547363403, + "grad_norm": 1.1142997290828731, + "learning_rate": 9.963566501649946e-06, + "loss": 0.7355, + "step": 553 + }, + { + "epoch": 0.4024243524845073, + "grad_norm": 1.0993449364698016, + "learning_rate": 9.963278274369613e-06, + "loss": 0.7438, + "step": 554 + }, + { + "epoch": 0.40315075023267427, + "grad_norm": 1.0645134866074886, + "learning_rate": 9.962988915688234e-06, + "loss": 0.7581, + "step": 555 + }, + { + "epoch": 0.40387714798084123, + "grad_norm": 1.0874163008360718, + "learning_rate": 9.962698425671773e-06, + "loss": 0.7559, + "step": 556 + }, + { + "epoch": 0.40460354572900825, + "grad_norm": 1.0893436970495776, + "learning_rate": 9.962406804386447e-06, + "loss": 0.7392, + "step": 557 + }, + { + "epoch": 0.4053299434771752, + "grad_norm": 1.1791025461727056, + "learning_rate": 9.962114051898731e-06, + "loss": 0.7468, + "step": 558 + }, + { + "epoch": 0.4060563412253422, + "grad_norm": 1.0617478387032344, + "learning_rate": 9.96182016827536e-06, + "loss": 0.7549, + "step": 559 + }, + { + "epoch": 0.4067827389735092, + "grad_norm": 1.0774497292109935, + "learning_rate": 9.961525153583327e-06, + "loss": 0.7415, + "step": 560 + }, + { + "epoch": 0.40750913672167616, + "grad_norm": 1.0821338781986698, + "learning_rate": 9.961229007889883e-06, + "loss": 0.7307, + "step": 561 + }, + { + "epoch": 0.4082355344698431, + "grad_norm": 1.0892656942041017, + "learning_rate": 9.960931731262533e-06, + "loss": 0.7427, + "step": 562 + }, + { + "epoch": 0.40896193221801014, + "grad_norm": 1.062469625082668, + "learning_rate": 9.960633323769043e-06, + "loss": 0.7498, + "step": 563 + }, + { + "epoch": 0.4096883299661771, + "grad_norm": 1.0668837175759642, + "learning_rate": 9.960333785477437e-06, + "loss": 0.7478, + "step": 564 + }, + { + "epoch": 0.41041472771434406, + "grad_norm": 1.0441183230317423, + "learning_rate": 9.960033116455997e-06, + "loss": 0.7559, + "step": 565 + }, + { + "epoch": 0.4111411254625111, + "grad_norm": 1.4159202842843437, + "learning_rate": 9.95973131677326e-06, + "loss": 0.7347, + "step": 566 + }, + { + "epoch": 0.41186752321067804, + "grad_norm": 1.2970154589976308, + "learning_rate": 9.959428386498023e-06, + "loss": 0.7428, + "step": 567 + }, + { + "epoch": 0.412593920958845, + "grad_norm": 1.0635260939557447, + "learning_rate": 9.95912432569934e-06, + "loss": 0.7684, + "step": 568 + }, + { + "epoch": 0.413320318707012, + "grad_norm": 1.065670306952603, + "learning_rate": 9.958819134446527e-06, + "loss": 0.7305, + "step": 569 + }, + { + "epoch": 0.414046716455179, + "grad_norm": 1.051820184891295, + "learning_rate": 9.958512812809148e-06, + "loss": 0.7417, + "step": 570 + }, + { + "epoch": 0.41477311420334595, + "grad_norm": 1.0932314529080025, + "learning_rate": 9.958205360857031e-06, + "loss": 0.757, + "step": 571 + }, + { + "epoch": 0.41549951195151297, + "grad_norm": 1.0904112546627889, + "learning_rate": 9.957896778660266e-06, + "loss": 0.7325, + "step": 572 + }, + { + "epoch": 0.41622590969967993, + "grad_norm": 1.0710868590287677, + "learning_rate": 9.957587066289189e-06, + "loss": 0.7532, + "step": 573 + }, + { + "epoch": 0.4169523074478469, + "grad_norm": 1.029792935403967, + "learning_rate": 
9.957276223814405e-06, + "loss": 0.7487, + "step": 574 + }, + { + "epoch": 0.4176787051960139, + "grad_norm": 1.039615004460751, + "learning_rate": 9.95696425130677e-06, + "loss": 0.7484, + "step": 575 + }, + { + "epoch": 0.4184051029441809, + "grad_norm": 1.027265520457917, + "learning_rate": 9.956651148837402e-06, + "loss": 0.7431, + "step": 576 + }, + { + "epoch": 0.41913150069234784, + "grad_norm": 1.0445331233584851, + "learning_rate": 9.956336916477672e-06, + "loss": 0.7518, + "step": 577 + }, + { + "epoch": 0.41985789844051485, + "grad_norm": 1.0056629352503992, + "learning_rate": 9.95602155429921e-06, + "loss": 0.7442, + "step": 578 + }, + { + "epoch": 0.4205842961886818, + "grad_norm": 1.0364566761082272, + "learning_rate": 9.955705062373904e-06, + "loss": 0.7563, + "step": 579 + }, + { + "epoch": 0.4213106939368488, + "grad_norm": 1.0210209438240678, + "learning_rate": 9.955387440773902e-06, + "loss": 0.7484, + "step": 580 + }, + { + "epoch": 0.4220370916850158, + "grad_norm": 1.0282324113074215, + "learning_rate": 9.955068689571606e-06, + "loss": 0.7282, + "step": 581 + }, + { + "epoch": 0.42276348943318276, + "grad_norm": 1.1729764498651893, + "learning_rate": 9.954748808839675e-06, + "loss": 0.7542, + "step": 582 + }, + { + "epoch": 0.4234898871813497, + "grad_norm": 1.040462706700545, + "learning_rate": 9.95442779865103e-06, + "loss": 0.7194, + "step": 583 + }, + { + "epoch": 0.42421628492951674, + "grad_norm": 1.0062430347983087, + "learning_rate": 9.954105659078846e-06, + "loss": 0.7431, + "step": 584 + }, + { + "epoch": 0.4249426826776837, + "grad_norm": 1.0639035558270775, + "learning_rate": 9.953782390196554e-06, + "loss": 0.7492, + "step": 585 + }, + { + "epoch": 0.42566908042585067, + "grad_norm": 1.0425379187859118, + "learning_rate": 9.953457992077847e-06, + "loss": 0.7339, + "step": 586 + }, + { + "epoch": 0.4263954781740177, + "grad_norm": 1.0031591436611542, + "learning_rate": 9.953132464796674e-06, + "loss": 0.7358, + "step": 587 + }, + { + "epoch": 0.42712187592218465, + "grad_norm": 1.0015864996993507, + "learning_rate": 9.952805808427235e-06, + "loss": 0.7305, + "step": 588 + }, + { + "epoch": 0.4278482736703516, + "grad_norm": 1.8101689727197796, + "learning_rate": 9.952478023043999e-06, + "loss": 0.7501, + "step": 589 + }, + { + "epoch": 0.4285746714185186, + "grad_norm": 1.0253116313842865, + "learning_rate": 9.952149108721683e-06, + "loss": 0.7311, + "step": 590 + }, + { + "epoch": 0.4293010691666856, + "grad_norm": 0.9992846826039635, + "learning_rate": 9.951819065535265e-06, + "loss": 0.7172, + "step": 591 + }, + { + "epoch": 0.43002746691485255, + "grad_norm": 0.9817655480972577, + "learning_rate": 9.95148789355998e-06, + "loss": 0.7259, + "step": 592 + }, + { + "epoch": 0.43075386466301957, + "grad_norm": 1.063898145073969, + "learning_rate": 9.951155592871317e-06, + "loss": 0.7283, + "step": 593 + }, + { + "epoch": 0.43148026241118653, + "grad_norm": 0.9758454373620455, + "learning_rate": 9.950822163545032e-06, + "loss": 0.7472, + "step": 594 + }, + { + "epoch": 0.4322066601593535, + "grad_norm": 1.000367186605782, + "learning_rate": 9.950487605657126e-06, + "loss": 0.7452, + "step": 595 + }, + { + "epoch": 0.4329330579075205, + "grad_norm": 0.9540605112818284, + "learning_rate": 9.950151919283866e-06, + "loss": 0.742, + "step": 596 + }, + { + "epoch": 0.4336594556556875, + "grad_norm": 1.0039954162122182, + "learning_rate": 9.94981510450177e-06, + "loss": 0.734, + "step": 597 + }, + { + "epoch": 0.43438585340385444, + "grad_norm": 0.9744628011735277, 
+ "learning_rate": 9.949477161387619e-06, + "loss": 0.7395, + "step": 598 + }, + { + "epoch": 0.43511225115202146, + "grad_norm": 0.9773584355212287, + "learning_rate": 9.94913809001845e-06, + "loss": 0.7315, + "step": 599 + }, + { + "epoch": 0.4358386489001884, + "grad_norm": 0.9760546875526702, + "learning_rate": 9.948797890471552e-06, + "loss": 0.731, + "step": 600 + }, + { + "epoch": 0.4365650466483554, + "grad_norm": 0.9911326295240364, + "learning_rate": 9.948456562824478e-06, + "loss": 0.7209, + "step": 601 + }, + { + "epoch": 0.43729144439652234, + "grad_norm": 0.9630389794461729, + "learning_rate": 9.948114107155032e-06, + "loss": 0.7185, + "step": 602 + }, + { + "epoch": 0.43801784214468936, + "grad_norm": 0.9643561574474725, + "learning_rate": 9.947770523541283e-06, + "loss": 0.7312, + "step": 603 + }, + { + "epoch": 0.4387442398928563, + "grad_norm": 0.9457862591453264, + "learning_rate": 9.947425812061546e-06, + "loss": 0.7334, + "step": 604 + }, + { + "epoch": 0.4394706376410233, + "grad_norm": 0.9764292467518713, + "learning_rate": 9.947079972794405e-06, + "loss": 0.719, + "step": 605 + }, + { + "epoch": 0.4401970353891903, + "grad_norm": 0.9652820848231466, + "learning_rate": 9.946733005818694e-06, + "loss": 0.7614, + "step": 606 + }, + { + "epoch": 0.44092343313735727, + "grad_norm": 0.9789870034754022, + "learning_rate": 9.946384911213504e-06, + "loss": 0.7505, + "step": 607 + }, + { + "epoch": 0.44164983088552423, + "grad_norm": 0.97261896897398, + "learning_rate": 9.946035689058189e-06, + "loss": 0.7269, + "step": 608 + }, + { + "epoch": 0.44237622863369125, + "grad_norm": 0.9512662501729403, + "learning_rate": 9.945685339432349e-06, + "loss": 0.7275, + "step": 609 + }, + { + "epoch": 0.4431026263818582, + "grad_norm": 0.9193936544152037, + "learning_rate": 9.945333862415855e-06, + "loss": 0.734, + "step": 610 + }, + { + "epoch": 0.4438290241300252, + "grad_norm": 0.9437084533207984, + "learning_rate": 9.944981258088822e-06, + "loss": 0.7455, + "step": 611 + }, + { + "epoch": 0.4445554218781922, + "grad_norm": 0.9366898434836213, + "learning_rate": 9.94462752653163e-06, + "loss": 0.7141, + "step": 612 + }, + { + "epoch": 0.44528181962635915, + "grad_norm": 0.9649835362940479, + "learning_rate": 9.944272667824915e-06, + "loss": 0.7172, + "step": 613 + }, + { + "epoch": 0.4460082173745261, + "grad_norm": 0.9480294246476275, + "learning_rate": 9.943916682049564e-06, + "loss": 0.7282, + "step": 614 + }, + { + "epoch": 0.44673461512269314, + "grad_norm": 0.9575970196238945, + "learning_rate": 9.943559569286731e-06, + "loss": 0.731, + "step": 615 + }, + { + "epoch": 0.4474610128708601, + "grad_norm": 1.0501616602935506, + "learning_rate": 9.943201329617819e-06, + "loss": 0.7344, + "step": 616 + }, + { + "epoch": 0.44818741061902706, + "grad_norm": 0.9169524659520641, + "learning_rate": 9.94284196312449e-06, + "loss": 0.7241, + "step": 617 + }, + { + "epoch": 0.4489138083671941, + "grad_norm": 0.8988178774810287, + "learning_rate": 9.942481469888664e-06, + "loss": 0.7263, + "step": 618 + }, + { + "epoch": 0.44964020611536104, + "grad_norm": 0.9133573838776243, + "learning_rate": 9.942119849992515e-06, + "loss": 0.7176, + "step": 619 + }, + { + "epoch": 0.450366603863528, + "grad_norm": 0.9312725209626799, + "learning_rate": 9.94175710351848e-06, + "loss": 0.7199, + "step": 620 + }, + { + "epoch": 0.451093001611695, + "grad_norm": 0.9231425800341202, + "learning_rate": 9.941393230549245e-06, + "loss": 0.721, + "step": 621 + }, + { + "epoch": 0.451819399359862, + "grad_norm": 
0.8913757165093452, + "learning_rate": 9.941028231167756e-06, + "loss": 0.736, + "step": 622 + }, + { + "epoch": 0.45254579710802895, + "grad_norm": 0.9192511627739857, + "learning_rate": 9.940662105457219e-06, + "loss": 0.7311, + "step": 623 + }, + { + "epoch": 0.45327219485619596, + "grad_norm": 1.0126121327276585, + "learning_rate": 9.940294853501093e-06, + "loss": 0.7328, + "step": 624 + }, + { + "epoch": 0.4539985926043629, + "grad_norm": 0.9086059945770585, + "learning_rate": 9.939926475383093e-06, + "loss": 0.7306, + "step": 625 + }, + { + "epoch": 0.4547249903525299, + "grad_norm": 0.9006835415527965, + "learning_rate": 9.939556971187195e-06, + "loss": 0.7139, + "step": 626 + }, + { + "epoch": 0.4554513881006969, + "grad_norm": 0.8776531408479233, + "learning_rate": 9.939186340997629e-06, + "loss": 0.7256, + "step": 627 + }, + { + "epoch": 0.45617778584886387, + "grad_norm": 0.9131179128239798, + "learning_rate": 9.938814584898878e-06, + "loss": 0.7139, + "step": 628 + }, + { + "epoch": 0.45690418359703083, + "grad_norm": 0.911827974566791, + "learning_rate": 9.938441702975689e-06, + "loss": 0.7057, + "step": 629 + }, + { + "epoch": 0.45763058134519785, + "grad_norm": 0.9156346537960734, + "learning_rate": 9.938067695313062e-06, + "loss": 0.7087, + "step": 630 + }, + { + "epoch": 0.4583569790933648, + "grad_norm": 0.8880023303410371, + "learning_rate": 9.937692561996253e-06, + "loss": 0.718, + "step": 631 + }, + { + "epoch": 0.4590833768415318, + "grad_norm": 0.9401020700448064, + "learning_rate": 9.937316303110773e-06, + "loss": 0.7213, + "step": 632 + }, + { + "epoch": 0.4598097745896988, + "grad_norm": 0.8760498121504141, + "learning_rate": 9.936938918742396e-06, + "loss": 0.7087, + "step": 633 + }, + { + "epoch": 0.46053617233786576, + "grad_norm": 0.8471932856764717, + "learning_rate": 9.936560408977144e-06, + "loss": 0.7294, + "step": 634 + }, + { + "epoch": 0.4612625700860327, + "grad_norm": 0.8875415965820969, + "learning_rate": 9.936180773901305e-06, + "loss": 0.7232, + "step": 635 + }, + { + "epoch": 0.46198896783419974, + "grad_norm": 0.8747330864112066, + "learning_rate": 9.935800013601415e-06, + "loss": 0.7288, + "step": 636 + }, + { + "epoch": 0.4627153655823667, + "grad_norm": 0.8939346610383381, + "learning_rate": 9.935418128164271e-06, + "loss": 0.7422, + "step": 637 + }, + { + "epoch": 0.46344176333053366, + "grad_norm": 0.8639659321367306, + "learning_rate": 9.935035117676925e-06, + "loss": 0.7111, + "step": 638 + }, + { + "epoch": 0.4641681610787007, + "grad_norm": 0.8809990893660246, + "learning_rate": 9.934650982226687e-06, + "loss": 0.706, + "step": 639 + }, + { + "epoch": 0.46489455882686764, + "grad_norm": 0.8495844086453649, + "learning_rate": 9.93426572190112e-06, + "loss": 0.7274, + "step": 640 + }, + { + "epoch": 0.4656209565750346, + "grad_norm": 0.8457492178190025, + "learning_rate": 9.933879336788048e-06, + "loss": 0.7159, + "step": 641 + }, + { + "epoch": 0.4663473543232016, + "grad_norm": 0.872251965734634, + "learning_rate": 9.93349182697555e-06, + "loss": 0.7174, + "step": 642 + }, + { + "epoch": 0.4670737520713686, + "grad_norm": 0.8564380863748268, + "learning_rate": 9.933103192551958e-06, + "loss": 0.6997, + "step": 643 + }, + { + "epoch": 0.46780014981953555, + "grad_norm": 0.8539062885057994, + "learning_rate": 9.932713433605862e-06, + "loss": 0.7169, + "step": 644 + }, + { + "epoch": 0.46852654756770257, + "grad_norm": 0.8602195511545719, + "learning_rate": 9.932322550226114e-06, + "loss": 0.7267, + "step": 645 + }, + { + "epoch": 
0.46925294531586953, + "grad_norm": 0.8660259689062596, + "learning_rate": 9.931930542501813e-06, + "loss": 0.7058, + "step": 646 + }, + { + "epoch": 0.4699793430640365, + "grad_norm": 0.8412500932338813, + "learning_rate": 9.931537410522323e-06, + "loss": 0.7117, + "step": 647 + }, + { + "epoch": 0.47070574081220345, + "grad_norm": 0.847365844467039, + "learning_rate": 9.931143154377257e-06, + "loss": 0.7038, + "step": 648 + }, + { + "epoch": 0.4714321385603705, + "grad_norm": 0.863076023805393, + "learning_rate": 9.930747774156485e-06, + "loss": 0.7236, + "step": 649 + }, + { + "epoch": 0.47215853630853744, + "grad_norm": 0.8578990699179848, + "learning_rate": 9.930351269950144e-06, + "loss": 0.7249, + "step": 650 + }, + { + "epoch": 0.4728849340567044, + "grad_norm": 0.8468784620880256, + "learning_rate": 9.92995364184861e-06, + "loss": 0.7111, + "step": 651 + }, + { + "epoch": 0.4736113318048714, + "grad_norm": 0.8548540373716411, + "learning_rate": 9.92955488994253e-06, + "loss": 0.7054, + "step": 652 + }, + { + "epoch": 0.4743377295530384, + "grad_norm": 0.8263217534184496, + "learning_rate": 9.9291550143228e-06, + "loss": 0.7013, + "step": 653 + }, + { + "epoch": 0.47506412730120534, + "grad_norm": 0.8558868202871573, + "learning_rate": 9.928754015080571e-06, + "loss": 0.6998, + "step": 654 + }, + { + "epoch": 0.47579052504937236, + "grad_norm": 0.8527525987219964, + "learning_rate": 9.928351892307254e-06, + "loss": 0.706, + "step": 655 + }, + { + "epoch": 0.4765169227975393, + "grad_norm": 0.8096658723028995, + "learning_rate": 9.927948646094514e-06, + "loss": 0.7104, + "step": 656 + }, + { + "epoch": 0.4772433205457063, + "grad_norm": 0.825386383857699, + "learning_rate": 9.927544276534275e-06, + "loss": 0.7263, + "step": 657 + }, + { + "epoch": 0.4779697182938733, + "grad_norm": 0.8110031620274161, + "learning_rate": 9.927138783718713e-06, + "loss": 0.7327, + "step": 658 + }, + { + "epoch": 0.47869611604204027, + "grad_norm": 0.8725373219371803, + "learning_rate": 9.926732167740262e-06, + "loss": 0.7062, + "step": 659 + }, + { + "epoch": 0.4794225137902072, + "grad_norm": 0.84156185662018, + "learning_rate": 9.926324428691612e-06, + "loss": 0.7184, + "step": 660 + }, + { + "epoch": 0.48014891153837425, + "grad_norm": 0.8234352820652021, + "learning_rate": 9.92591556666571e-06, + "loss": 0.6966, + "step": 661 + }, + { + "epoch": 0.4808753092865412, + "grad_norm": 0.8392923258120455, + "learning_rate": 9.925505581755754e-06, + "loss": 0.7033, + "step": 662 + }, + { + "epoch": 0.48160170703470817, + "grad_norm": 0.8175945014983422, + "learning_rate": 9.925094474055207e-06, + "loss": 0.7089, + "step": 663 + }, + { + "epoch": 0.4823281047828752, + "grad_norm": 0.8911331780707599, + "learning_rate": 9.92468224365778e-06, + "loss": 0.6947, + "step": 664 + }, + { + "epoch": 0.48305450253104215, + "grad_norm": 0.8111671666726761, + "learning_rate": 9.924268890657443e-06, + "loss": 0.7013, + "step": 665 + }, + { + "epoch": 0.4837809002792091, + "grad_norm": 0.820218156032821, + "learning_rate": 9.923854415148423e-06, + "loss": 0.7069, + "step": 666 + }, + { + "epoch": 0.48450729802737613, + "grad_norm": 0.8368104072898442, + "learning_rate": 9.923438817225198e-06, + "loss": 0.7056, + "step": 667 + }, + { + "epoch": 0.4852336957755431, + "grad_norm": 0.8172858374676331, + "learning_rate": 9.92302209698251e-06, + "loss": 0.7113, + "step": 668 + }, + { + "epoch": 0.48596009352371006, + "grad_norm": 0.8111333167554251, + "learning_rate": 9.922604254515348e-06, + "loss": 0.6986, + "step": 669 + 
}, + { + "epoch": 0.4866864912718771, + "grad_norm": 0.7939369425854435, + "learning_rate": 9.922185289918965e-06, + "loss": 0.7197, + "step": 670 + }, + { + "epoch": 0.48741288902004404, + "grad_norm": 0.8073816864202008, + "learning_rate": 9.921765203288862e-06, + "loss": 0.6938, + "step": 671 + }, + { + "epoch": 0.488139286768211, + "grad_norm": 0.8146062858167089, + "learning_rate": 9.921343994720803e-06, + "loss": 0.6901, + "step": 672 + }, + { + "epoch": 0.488865684516378, + "grad_norm": 0.8073049273734886, + "learning_rate": 9.920921664310801e-06, + "loss": 0.689, + "step": 673 + }, + { + "epoch": 0.489592082264545, + "grad_norm": 0.8000480150131922, + "learning_rate": 9.920498212155132e-06, + "loss": 0.6939, + "step": 674 + }, + { + "epoch": 0.49031848001271194, + "grad_norm": 0.8068016192638968, + "learning_rate": 9.920073638350319e-06, + "loss": 0.7025, + "step": 675 + }, + { + "epoch": 0.49104487776087896, + "grad_norm": 0.8260934820137759, + "learning_rate": 9.91964794299315e-06, + "loss": 0.6908, + "step": 676 + }, + { + "epoch": 0.4917712755090459, + "grad_norm": 0.7997873997828585, + "learning_rate": 9.91922112618066e-06, + "loss": 0.7041, + "step": 677 + }, + { + "epoch": 0.4924976732572129, + "grad_norm": 2.174322927976191, + "learning_rate": 9.918793188010147e-06, + "loss": 0.7143, + "step": 678 + }, + { + "epoch": 0.4932240710053799, + "grad_norm": 0.8298841272183457, + "learning_rate": 9.91836412857916e-06, + "loss": 0.7002, + "step": 679 + }, + { + "epoch": 0.49395046875354687, + "grad_norm": 0.815955296740638, + "learning_rate": 9.917933947985508e-06, + "loss": 0.6999, + "step": 680 + }, + { + "epoch": 0.49467686650171383, + "grad_norm": 0.8141154640169115, + "learning_rate": 9.917502646327247e-06, + "loss": 0.7185, + "step": 681 + }, + { + "epoch": 0.49540326424988085, + "grad_norm": 0.8441227422366394, + "learning_rate": 9.917070223702697e-06, + "loss": 0.7182, + "step": 682 + }, + { + "epoch": 0.4961296619980478, + "grad_norm": 0.7881967510667903, + "learning_rate": 9.916636680210431e-06, + "loss": 0.7189, + "step": 683 + }, + { + "epoch": 0.4968560597462148, + "grad_norm": 0.787513266332936, + "learning_rate": 9.916202015949277e-06, + "loss": 0.6961, + "step": 684 + }, + { + "epoch": 0.4975824574943818, + "grad_norm": 0.8297981418322181, + "learning_rate": 9.915766231018317e-06, + "loss": 0.7032, + "step": 685 + }, + { + "epoch": 0.49830885524254875, + "grad_norm": 0.7882950365342019, + "learning_rate": 9.915329325516894e-06, + "loss": 0.6936, + "step": 686 + }, + { + "epoch": 0.4990352529907157, + "grad_norm": 0.8012742339373384, + "learning_rate": 9.914891299544599e-06, + "loss": 0.6999, + "step": 687 + }, + { + "epoch": 0.49976165073888273, + "grad_norm": 0.7831837037525278, + "learning_rate": 9.914452153201282e-06, + "loss": 0.7051, + "step": 688 + }, + { + "epoch": 0.5004880484870496, + "grad_norm": 0.7862014823443961, + "learning_rate": 9.91401188658705e-06, + "loss": 0.6818, + "step": 689 + }, + { + "epoch": 0.5012144462352167, + "grad_norm": 0.7812613534102408, + "learning_rate": 9.91357049980226e-06, + "loss": 0.7019, + "step": 690 + }, + { + "epoch": 0.5019408439833837, + "grad_norm": 0.7697961842553203, + "learning_rate": 9.913127992947534e-06, + "loss": 0.7026, + "step": 691 + }, + { + "epoch": 0.5026672417315506, + "grad_norm": 0.7706971322719415, + "learning_rate": 9.91268436612374e-06, + "loss": 0.6966, + "step": 692 + }, + { + "epoch": 0.5033936394797176, + "grad_norm": 0.8218604590916418, + "learning_rate": 9.912239619432004e-06, + "loss": 0.7078, 
+ "step": 693 + }, + { + "epoch": 0.5041200372278846, + "grad_norm": 0.8656142603168385, + "learning_rate": 9.911793752973711e-06, + "loss": 0.6914, + "step": 694 + }, + { + "epoch": 0.5048464349760515, + "grad_norm": 0.7750652898480885, + "learning_rate": 9.911346766850493e-06, + "loss": 0.6906, + "step": 695 + }, + { + "epoch": 0.5055728327242186, + "grad_norm": 0.7414098590179129, + "learning_rate": 9.910898661164249e-06, + "loss": 0.7109, + "step": 696 + }, + { + "epoch": 0.5062992304723856, + "grad_norm": 0.7569073529543336, + "learning_rate": 9.910449436017123e-06, + "loss": 0.7044, + "step": 697 + }, + { + "epoch": 0.5070256282205525, + "grad_norm": 0.7519787240625135, + "learning_rate": 9.909999091511516e-06, + "loss": 0.6964, + "step": 698 + }, + { + "epoch": 0.5077520259687195, + "grad_norm": 0.7733018395208994, + "learning_rate": 9.909547627750089e-06, + "loss": 0.7088, + "step": 699 + }, + { + "epoch": 0.5084784237168865, + "grad_norm": 0.7721039060567157, + "learning_rate": 9.909095044835755e-06, + "loss": 0.6913, + "step": 700 + }, + { + "epoch": 0.5092048214650534, + "grad_norm": 0.7385882659818571, + "learning_rate": 9.90864134287168e-06, + "loss": 0.693, + "step": 701 + }, + { + "epoch": 0.5099312192132205, + "grad_norm": 0.7421782318904324, + "learning_rate": 9.90818652196129e-06, + "loss": 0.7196, + "step": 702 + }, + { + "epoch": 0.5106576169613875, + "grad_norm": 0.7481323889267485, + "learning_rate": 9.907730582208261e-06, + "loss": 0.6999, + "step": 703 + }, + { + "epoch": 0.5113840147095544, + "grad_norm": 0.7601344672286628, + "learning_rate": 9.90727352371653e-06, + "loss": 0.698, + "step": 704 + }, + { + "epoch": 0.5121104124577214, + "grad_norm": 0.7483342620276054, + "learning_rate": 9.906815346590282e-06, + "loss": 0.7051, + "step": 705 + }, + { + "epoch": 0.5128368102058883, + "grad_norm": 0.818152958994249, + "learning_rate": 9.906356050933962e-06, + "loss": 0.6999, + "step": 706 + }, + { + "epoch": 0.5135632079540553, + "grad_norm": 0.7478510985456449, + "learning_rate": 9.905895636852268e-06, + "loss": 0.7063, + "step": 707 + }, + { + "epoch": 0.5142896057022224, + "grad_norm": 0.8360311926939497, + "learning_rate": 9.905434104450153e-06, + "loss": 0.7075, + "step": 708 + }, + { + "epoch": 0.5150160034503893, + "grad_norm": 0.728081494414813, + "learning_rate": 9.904971453832828e-06, + "loss": 0.6928, + "step": 709 + }, + { + "epoch": 0.5157424011985563, + "grad_norm": 0.7732823717838929, + "learning_rate": 9.904507685105753e-06, + "loss": 0.6804, + "step": 710 + }, + { + "epoch": 0.5164687989467233, + "grad_norm": 0.732175100079485, + "learning_rate": 9.904042798374647e-06, + "loss": 0.7015, + "step": 711 + }, + { + "epoch": 0.5171951966948902, + "grad_norm": 0.7521814228879359, + "learning_rate": 9.903576793745482e-06, + "loss": 0.6938, + "step": 712 + }, + { + "epoch": 0.5179215944430572, + "grad_norm": 0.727963567904356, + "learning_rate": 9.903109671324488e-06, + "loss": 0.6927, + "step": 713 + }, + { + "epoch": 0.5186479921912243, + "grad_norm": 0.8051400934341161, + "learning_rate": 9.902641431218149e-06, + "loss": 0.6845, + "step": 714 + }, + { + "epoch": 0.5193743899393912, + "grad_norm": 0.7399599754417058, + "learning_rate": 9.902172073533198e-06, + "loss": 0.702, + "step": 715 + }, + { + "epoch": 0.5201007876875582, + "grad_norm": 0.7177232225200838, + "learning_rate": 9.90170159837663e-06, + "loss": 0.6854, + "step": 716 + }, + { + "epoch": 0.5208271854357251, + "grad_norm": 0.7250491270529293, + "learning_rate": 9.90123000585569e-06, + 
"loss": 0.6718, + "step": 717 + }, + { + "epoch": 0.5215535831838921, + "grad_norm": 0.8226795700631545, + "learning_rate": 9.90075729607788e-06, + "loss": 0.6718, + "step": 718 + }, + { + "epoch": 0.5222799809320591, + "grad_norm": 0.7404005782833571, + "learning_rate": 9.900283469150959e-06, + "loss": 0.6894, + "step": 719 + }, + { + "epoch": 0.523006378680226, + "grad_norm": 0.7201992579567925, + "learning_rate": 9.899808525182935e-06, + "loss": 0.6878, + "step": 720 + }, + { + "epoch": 0.5237327764283931, + "grad_norm": 0.7085334339463191, + "learning_rate": 9.899332464282075e-06, + "loss": 0.6953, + "step": 721 + }, + { + "epoch": 0.5244591741765601, + "grad_norm": 0.7360701211727764, + "learning_rate": 9.898855286556897e-06, + "loss": 0.7087, + "step": 722 + }, + { + "epoch": 0.525185571924727, + "grad_norm": 0.7083495495920359, + "learning_rate": 9.898376992116179e-06, + "loss": 0.6957, + "step": 723 + }, + { + "epoch": 0.525911969672894, + "grad_norm": 0.7240208436144087, + "learning_rate": 9.897897581068948e-06, + "loss": 0.6986, + "step": 724 + }, + { + "epoch": 0.526638367421061, + "grad_norm": 0.7409795690878006, + "learning_rate": 9.897417053524487e-06, + "loss": 0.6898, + "step": 725 + }, + { + "epoch": 0.5273647651692279, + "grad_norm": 0.7292619216347412, + "learning_rate": 9.896935409592339e-06, + "loss": 0.6915, + "step": 726 + }, + { + "epoch": 0.528091162917395, + "grad_norm": 0.825328679283231, + "learning_rate": 9.896452649382291e-06, + "loss": 0.6867, + "step": 727 + }, + { + "epoch": 0.528817560665562, + "grad_norm": 0.731154122216054, + "learning_rate": 9.895968773004394e-06, + "loss": 0.6741, + "step": 728 + }, + { + "epoch": 0.5295439584137289, + "grad_norm": 0.7268817197791653, + "learning_rate": 9.895483780568949e-06, + "loss": 0.6728, + "step": 729 + }, + { + "epoch": 0.5302703561618959, + "grad_norm": 0.8850345181917018, + "learning_rate": 9.894997672186513e-06, + "loss": 0.6674, + "step": 730 + }, + { + "epoch": 0.5309967539100628, + "grad_norm": 0.7129326314830353, + "learning_rate": 9.894510447967893e-06, + "loss": 0.6949, + "step": 731 + }, + { + "epoch": 0.5317231516582298, + "grad_norm": 0.7445881485337487, + "learning_rate": 9.894022108024157e-06, + "loss": 0.6958, + "step": 732 + }, + { + "epoch": 0.5324495494063969, + "grad_norm": 0.7087267238053241, + "learning_rate": 9.893532652466625e-06, + "loss": 0.6826, + "step": 733 + }, + { + "epoch": 0.5331759471545638, + "grad_norm": 0.7029975399745358, + "learning_rate": 9.893042081406868e-06, + "loss": 0.691, + "step": 734 + }, + { + "epoch": 0.5339023449027308, + "grad_norm": 0.7357703371007258, + "learning_rate": 9.892550394956715e-06, + "loss": 0.6657, + "step": 735 + }, + { + "epoch": 0.5346287426508978, + "grad_norm": 0.7042415378243042, + "learning_rate": 9.892057593228248e-06, + "loss": 0.6669, + "step": 736 + }, + { + "epoch": 0.5353551403990647, + "grad_norm": 0.7537027415403048, + "learning_rate": 9.891563676333805e-06, + "loss": 0.7107, + "step": 737 + }, + { + "epoch": 0.5360815381472317, + "grad_norm": 0.716171888686636, + "learning_rate": 9.891068644385973e-06, + "loss": 0.679, + "step": 738 + }, + { + "epoch": 0.5368079358953988, + "grad_norm": 0.7988047089210955, + "learning_rate": 9.890572497497598e-06, + "loss": 0.6931, + "step": 739 + }, + { + "epoch": 0.5375343336435657, + "grad_norm": 0.7574454818586034, + "learning_rate": 9.89007523578178e-06, + "loss": 0.6924, + "step": 740 + }, + { + "epoch": 0.5382607313917327, + "grad_norm": 0.6985625682424247, + "learning_rate": 
9.889576859351873e-06, + "loss": 0.6986, + "step": 741 + }, + { + "epoch": 0.5389871291398997, + "grad_norm": 0.9035477970405281, + "learning_rate": 9.88907736832148e-06, + "loss": 0.6931, + "step": 742 + }, + { + "epoch": 0.5397135268880666, + "grad_norm": 0.6894579767672432, + "learning_rate": 9.888576762804465e-06, + "loss": 0.6933, + "step": 743 + }, + { + "epoch": 0.5404399246362336, + "grad_norm": 0.6913186724338037, + "learning_rate": 9.888075042914946e-06, + "loss": 0.7082, + "step": 744 + }, + { + "epoch": 0.5411663223844007, + "grad_norm": 0.6914903982190204, + "learning_rate": 9.887572208767287e-06, + "loss": 0.6932, + "step": 745 + }, + { + "epoch": 0.5418927201325676, + "grad_norm": 5.796056874784305, + "learning_rate": 9.887068260476112e-06, + "loss": 0.6791, + "step": 746 + }, + { + "epoch": 0.5426191178807346, + "grad_norm": 0.8081792780426444, + "learning_rate": 9.886563198156302e-06, + "loss": 0.69, + "step": 747 + }, + { + "epoch": 0.5433455156289015, + "grad_norm": 0.9851928484816941, + "learning_rate": 9.886057021922984e-06, + "loss": 0.6944, + "step": 748 + }, + { + "epoch": 0.5440719133770685, + "grad_norm": 1.0116190695537544, + "learning_rate": 9.885549731891547e-06, + "loss": 0.6921, + "step": 749 + }, + { + "epoch": 0.5447983111252355, + "grad_norm": 1.1344844671703262, + "learning_rate": 9.885041328177625e-06, + "loss": 0.7013, + "step": 750 + }, + { + "epoch": 0.5455247088734025, + "grad_norm": 0.8914720633431127, + "learning_rate": 9.884531810897115e-06, + "loss": 0.6885, + "step": 751 + }, + { + "epoch": 0.5462511066215695, + "grad_norm": 0.8559206929302882, + "learning_rate": 9.884021180166164e-06, + "loss": 0.683, + "step": 752 + }, + { + "epoch": 0.5469775043697365, + "grad_norm": 0.8223803501330558, + "learning_rate": 9.883509436101168e-06, + "loss": 0.6741, + "step": 753 + }, + { + "epoch": 0.5477039021179034, + "grad_norm": 0.8233029896285243, + "learning_rate": 9.882996578818786e-06, + "loss": 0.6931, + "step": 754 + }, + { + "epoch": 0.5484302998660704, + "grad_norm": 0.8167149294453134, + "learning_rate": 9.882482608435924e-06, + "loss": 0.6798, + "step": 755 + }, + { + "epoch": 0.5491566976142374, + "grad_norm": 0.8157887270833556, + "learning_rate": 9.881967525069745e-06, + "loss": 0.6746, + "step": 756 + }, + { + "epoch": 0.5498830953624044, + "grad_norm": 0.786640198971054, + "learning_rate": 9.881451328837663e-06, + "loss": 0.7005, + "step": 757 + }, + { + "epoch": 0.5506094931105714, + "grad_norm": 0.7642969715869354, + "learning_rate": 9.880934019857346e-06, + "loss": 0.6806, + "step": 758 + }, + { + "epoch": 0.5513358908587384, + "grad_norm": 0.7827436991540868, + "learning_rate": 9.880415598246722e-06, + "loss": 0.6789, + "step": 759 + }, + { + "epoch": 0.5520622886069053, + "grad_norm": 0.7550650650499403, + "learning_rate": 9.879896064123961e-06, + "loss": 0.6859, + "step": 760 + }, + { + "epoch": 0.5527886863550723, + "grad_norm": 0.7314415455404134, + "learning_rate": 9.879375417607498e-06, + "loss": 0.673, + "step": 761 + }, + { + "epoch": 0.5535150841032392, + "grad_norm": 0.725667106303544, + "learning_rate": 9.878853658816015e-06, + "loss": 0.6722, + "step": 762 + }, + { + "epoch": 0.5542414818514063, + "grad_norm": 0.7253088889076802, + "learning_rate": 9.878330787868448e-06, + "loss": 0.6831, + "step": 763 + }, + { + "epoch": 0.5549678795995733, + "grad_norm": 0.7118746003515296, + "learning_rate": 9.87780680488399e-06, + "loss": 0.6854, + "step": 764 + }, + { + "epoch": 0.5556942773477402, + "grad_norm": 0.7124344265859947, + 
"learning_rate": 9.877281709982082e-06, + "loss": 0.6668, + "step": 765 + }, + { + "epoch": 0.5564206750959072, + "grad_norm": 0.6870086540392398, + "learning_rate": 9.876755503282425e-06, + "loss": 0.669, + "step": 766 + }, + { + "epoch": 0.5571470728440742, + "grad_norm": 0.6921474230850522, + "learning_rate": 9.87622818490497e-06, + "loss": 0.6735, + "step": 767 + }, + { + "epoch": 0.5578734705922411, + "grad_norm": 0.6691283320223589, + "learning_rate": 9.875699754969919e-06, + "loss": 0.6788, + "step": 768 + }, + { + "epoch": 0.5585998683404081, + "grad_norm": 0.7111757456950618, + "learning_rate": 9.875170213597731e-06, + "loss": 0.68, + "step": 769 + }, + { + "epoch": 0.5593262660885752, + "grad_norm": 0.6912163957683205, + "learning_rate": 9.874639560909118e-06, + "loss": 0.6835, + "step": 770 + }, + { + "epoch": 0.5600526638367421, + "grad_norm": 0.7059079937216522, + "learning_rate": 9.874107797025045e-06, + "loss": 0.6836, + "step": 771 + }, + { + "epoch": 0.5607790615849091, + "grad_norm": 0.6943502146356935, + "learning_rate": 9.87357492206673e-06, + "loss": 0.688, + "step": 772 + }, + { + "epoch": 0.561505459333076, + "grad_norm": 0.663025793246858, + "learning_rate": 9.87304093615564e-06, + "loss": 0.6801, + "step": 773 + }, + { + "epoch": 0.562231857081243, + "grad_norm": 0.680073642969289, + "learning_rate": 9.872505839413504e-06, + "loss": 0.6752, + "step": 774 + }, + { + "epoch": 0.56295825482941, + "grad_norm": 0.8247131197237683, + "learning_rate": 9.8719696319623e-06, + "loss": 0.6897, + "step": 775 + }, + { + "epoch": 0.563684652577577, + "grad_norm": 0.6800749444531121, + "learning_rate": 9.871432313924255e-06, + "loss": 0.6944, + "step": 776 + }, + { + "epoch": 0.564411050325744, + "grad_norm": 0.6589468936996172, + "learning_rate": 9.870893885421856e-06, + "loss": 0.6891, + "step": 777 + }, + { + "epoch": 0.565137448073911, + "grad_norm": 0.6699789187345543, + "learning_rate": 9.870354346577839e-06, + "loss": 0.6689, + "step": 778 + }, + { + "epoch": 0.5658638458220779, + "grad_norm": 0.6407811046259845, + "learning_rate": 9.869813697515197e-06, + "loss": 0.6662, + "step": 779 + }, + { + "epoch": 0.5665902435702449, + "grad_norm": 0.6621715745362748, + "learning_rate": 9.869271938357168e-06, + "loss": 0.6789, + "step": 780 + }, + { + "epoch": 0.5673166413184119, + "grad_norm": 0.6559166860078044, + "learning_rate": 9.868729069227253e-06, + "loss": 0.6734, + "step": 781 + }, + { + "epoch": 0.5680430390665789, + "grad_norm": 0.6934565492823604, + "learning_rate": 9.868185090249198e-06, + "loss": 0.6946, + "step": 782 + }, + { + "epoch": 0.5687694368147459, + "grad_norm": 0.6727050469819293, + "learning_rate": 9.867640001547007e-06, + "loss": 0.6851, + "step": 783 + }, + { + "epoch": 0.5694958345629129, + "grad_norm": 0.6829495739508391, + "learning_rate": 9.867093803244935e-06, + "loss": 0.664, + "step": 784 + }, + { + "epoch": 0.5702222323110798, + "grad_norm": 0.6481246326312373, + "learning_rate": 9.866546495467493e-06, + "loss": 0.6779, + "step": 785 + }, + { + "epoch": 0.5709486300592468, + "grad_norm": 0.6439766189909788, + "learning_rate": 9.86599807833944e-06, + "loss": 0.6581, + "step": 786 + }, + { + "epoch": 0.5716750278074137, + "grad_norm": 0.687659375861452, + "learning_rate": 9.865448551985788e-06, + "loss": 0.6766, + "step": 787 + }, + { + "epoch": 0.5724014255555808, + "grad_norm": 0.6283754813822696, + "learning_rate": 9.864897916531807e-06, + "loss": 0.6718, + "step": 788 + }, + { + "epoch": 0.5731278233037478, + "grad_norm": 0.6560916613823546, + 
"learning_rate": 9.864346172103016e-06, + "loss": 0.6791, + "step": 789 + }, + { + "epoch": 0.5738542210519147, + "grad_norm": 0.6579158607817118, + "learning_rate": 9.863793318825186e-06, + "loss": 0.6719, + "step": 790 + }, + { + "epoch": 0.5745806188000817, + "grad_norm": 0.6209174204285736, + "learning_rate": 9.863239356824346e-06, + "loss": 0.6668, + "step": 791 + }, + { + "epoch": 0.5753070165482487, + "grad_norm": 0.6674035902894433, + "learning_rate": 9.862684286226769e-06, + "loss": 0.663, + "step": 792 + }, + { + "epoch": 0.5760334142964156, + "grad_norm": 0.6390144070498434, + "learning_rate": 9.86212810715899e-06, + "loss": 0.6734, + "step": 793 + }, + { + "epoch": 0.5767598120445827, + "grad_norm": 0.6627367196299603, + "learning_rate": 9.86157081974779e-06, + "loss": 0.6527, + "step": 794 + }, + { + "epoch": 0.5774862097927497, + "grad_norm": 0.6596657350941416, + "learning_rate": 9.861012424120207e-06, + "loss": 0.6535, + "step": 795 + }, + { + "epoch": 0.5782126075409166, + "grad_norm": 0.6778250210754974, + "learning_rate": 9.860452920403528e-06, + "loss": 0.6806, + "step": 796 + }, + { + "epoch": 0.5789390052890836, + "grad_norm": 0.7402223552090328, + "learning_rate": 9.859892308725296e-06, + "loss": 0.6659, + "step": 797 + }, + { + "epoch": 0.5796654030372506, + "grad_norm": 0.6521000937099793, + "learning_rate": 9.859330589213302e-06, + "loss": 0.6747, + "step": 798 + }, + { + "epoch": 0.5803918007854175, + "grad_norm": 0.6589709652600224, + "learning_rate": 9.858767761995597e-06, + "loss": 0.6516, + "step": 799 + }, + { + "epoch": 0.5811181985335846, + "grad_norm": 0.6221340208951421, + "learning_rate": 9.858203827200477e-06, + "loss": 0.6575, + "step": 800 + }, + { + "epoch": 0.5818445962817516, + "grad_norm": 1.4392475113310477, + "learning_rate": 9.857638784956492e-06, + "loss": 0.6735, + "step": 801 + }, + { + "epoch": 0.5825709940299185, + "grad_norm": 0.6345408144241541, + "learning_rate": 9.857072635392449e-06, + "loss": 0.6704, + "step": 802 + }, + { + "epoch": 0.5832973917780855, + "grad_norm": 0.7024596655943957, + "learning_rate": 9.856505378637402e-06, + "loss": 0.6829, + "step": 803 + }, + { + "epoch": 0.5840237895262524, + "grad_norm": 0.6292491947390643, + "learning_rate": 9.85593701482066e-06, + "loss": 0.6877, + "step": 804 + }, + { + "epoch": 0.5847501872744194, + "grad_norm": 0.6444928744841502, + "learning_rate": 9.855367544071785e-06, + "loss": 0.664, + "step": 805 + }, + { + "epoch": 0.5854765850225865, + "grad_norm": 0.6334451683246128, + "learning_rate": 9.85479696652059e-06, + "loss": 0.6696, + "step": 806 + }, + { + "epoch": 0.5862029827707534, + "grad_norm": 0.6809663655418011, + "learning_rate": 9.854225282297141e-06, + "loss": 0.6951, + "step": 807 + }, + { + "epoch": 0.5869293805189204, + "grad_norm": 0.6633437669132748, + "learning_rate": 9.853652491531756e-06, + "loss": 0.6822, + "step": 808 + }, + { + "epoch": 0.5876557782670874, + "grad_norm": 0.6756561406729703, + "learning_rate": 9.853078594355003e-06, + "loss": 0.6729, + "step": 809 + }, + { + "epoch": 0.5883821760152543, + "grad_norm": 0.6175772675068528, + "learning_rate": 9.852503590897708e-06, + "loss": 0.6654, + "step": 810 + }, + { + "epoch": 0.5891085737634213, + "grad_norm": 0.6342072664742784, + "learning_rate": 9.851927481290943e-06, + "loss": 0.6813, + "step": 811 + }, + { + "epoch": 0.5898349715115884, + "grad_norm": 0.615481235538996, + "learning_rate": 9.851350265666034e-06, + "loss": 0.6569, + "step": 812 + }, + { + "epoch": 0.5905613692597553, + "grad_norm": 
0.623506210218369, + "learning_rate": 9.850771944154563e-06, + "loss": 0.664, + "step": 813 + }, + { + "epoch": 0.5912877670079223, + "grad_norm": 0.6268532226267906, + "learning_rate": 9.850192516888357e-06, + "loss": 0.662, + "step": 814 + }, + { + "epoch": 0.5920141647560893, + "grad_norm": 0.6137453262606134, + "learning_rate": 9.849611983999503e-06, + "loss": 0.6481, + "step": 815 + }, + { + "epoch": 0.5927405625042562, + "grad_norm": 0.593844174928628, + "learning_rate": 9.849030345620333e-06, + "loss": 0.6664, + "step": 816 + }, + { + "epoch": 0.5934669602524232, + "grad_norm": 0.6066439569327111, + "learning_rate": 9.848447601883436e-06, + "loss": 0.6625, + "step": 817 + }, + { + "epoch": 0.5941933580005901, + "grad_norm": 0.6131220553241871, + "learning_rate": 9.847863752921649e-06, + "loss": 0.6769, + "step": 818 + }, + { + "epoch": 0.5949197557487572, + "grad_norm": 0.6127604757106416, + "learning_rate": 9.847278798868065e-06, + "loss": 0.6475, + "step": 819 + }, + { + "epoch": 0.5956461534969242, + "grad_norm": 0.6099319552963688, + "learning_rate": 9.846692739856023e-06, + "loss": 0.6714, + "step": 820 + }, + { + "epoch": 0.5963725512450911, + "grad_norm": 0.6400652371963013, + "learning_rate": 9.846105576019124e-06, + "loss": 0.6549, + "step": 821 + }, + { + "epoch": 0.5970989489932581, + "grad_norm": 0.6455148000021561, + "learning_rate": 9.84551730749121e-06, + "loss": 0.671, + "step": 822 + }, + { + "epoch": 0.5978253467414251, + "grad_norm": 0.6703898500769329, + "learning_rate": 9.844927934406381e-06, + "loss": 0.6672, + "step": 823 + }, + { + "epoch": 0.598551744489592, + "grad_norm": 0.6818434803159491, + "learning_rate": 9.844337456898985e-06, + "loss": 0.6559, + "step": 824 + }, + { + "epoch": 0.5992781422377591, + "grad_norm": 0.6307691400318461, + "learning_rate": 9.843745875103628e-06, + "loss": 0.6497, + "step": 825 + }, + { + "epoch": 0.6000045399859261, + "grad_norm": 0.605302882547688, + "learning_rate": 9.84315318915516e-06, + "loss": 0.6723, + "step": 826 + }, + { + "epoch": 0.600730937734093, + "grad_norm": 0.6250992899123806, + "learning_rate": 9.842559399188687e-06, + "loss": 0.6481, + "step": 827 + }, + { + "epoch": 0.60145733548226, + "grad_norm": 0.6090951991297758, + "learning_rate": 9.841964505339568e-06, + "loss": 0.6534, + "step": 828 + }, + { + "epoch": 0.602183733230427, + "grad_norm": 0.6731271428571752, + "learning_rate": 9.84136850774341e-06, + "loss": 0.6479, + "step": 829 + }, + { + "epoch": 0.6029101309785939, + "grad_norm": 0.6039650209903596, + "learning_rate": 9.840771406536073e-06, + "loss": 0.6731, + "step": 830 + }, + { + "epoch": 0.603636528726761, + "grad_norm": 0.6011413219903619, + "learning_rate": 9.84017320185367e-06, + "loss": 0.6838, + "step": 831 + }, + { + "epoch": 0.604362926474928, + "grad_norm": 0.6247705736995238, + "learning_rate": 9.839573893832564e-06, + "loss": 0.6723, + "step": 832 + }, + { + "epoch": 0.6050893242230949, + "grad_norm": 0.6220282560246873, + "learning_rate": 9.83897348260937e-06, + "loss": 0.6511, + "step": 833 + }, + { + "epoch": 0.6058157219712619, + "grad_norm": 0.6196889968424526, + "learning_rate": 9.838371968320951e-06, + "loss": 0.6669, + "step": 834 + }, + { + "epoch": 0.6065421197194288, + "grad_norm": 0.6183641181183734, + "learning_rate": 9.837769351104431e-06, + "loss": 0.6492, + "step": 835 + }, + { + "epoch": 0.6072685174675958, + "grad_norm": 0.7105892308887408, + "learning_rate": 9.837165631097177e-06, + "loss": 0.66, + "step": 836 + }, + { + "epoch": 0.6079949152157629, + 
"grad_norm": 0.5994749592259667, + "learning_rate": 9.836560808436806e-06, + "loss": 0.6766, + "step": 837 + }, + { + "epoch": 0.6087213129639298, + "grad_norm": 0.6002717353440816, + "learning_rate": 9.835954883261195e-06, + "loss": 0.6705, + "step": 838 + }, + { + "epoch": 0.6094477107120968, + "grad_norm": 0.6091493878958004, + "learning_rate": 9.835347855708464e-06, + "loss": 0.6582, + "step": 839 + }, + { + "epoch": 0.6101741084602638, + "grad_norm": 0.5968783813759632, + "learning_rate": 9.834739725916988e-06, + "loss": 0.6562, + "step": 840 + }, + { + "epoch": 0.6109005062084307, + "grad_norm": 0.7226406343809145, + "learning_rate": 9.834130494025395e-06, + "loss": 0.6682, + "step": 841 + }, + { + "epoch": 0.6116269039565977, + "grad_norm": 0.6236245151795735, + "learning_rate": 9.833520160172561e-06, + "loss": 0.6716, + "step": 842 + }, + { + "epoch": 0.6123533017047648, + "grad_norm": 0.649954848171266, + "learning_rate": 9.832908724497613e-06, + "loss": 0.6793, + "step": 843 + }, + { + "epoch": 0.6130796994529317, + "grad_norm": 0.8026775812212916, + "learning_rate": 9.83229618713993e-06, + "loss": 0.6482, + "step": 844 + }, + { + "epoch": 0.6138060972010987, + "grad_norm": 0.6549633112318536, + "learning_rate": 9.831682548239145e-06, + "loss": 0.6554, + "step": 845 + }, + { + "epoch": 0.6145324949492657, + "grad_norm": 0.6713749440634522, + "learning_rate": 9.83106780793514e-06, + "loss": 0.6501, + "step": 846 + }, + { + "epoch": 0.6152588926974326, + "grad_norm": 0.5976092035267176, + "learning_rate": 9.830451966368044e-06, + "loss": 0.6531, + "step": 847 + }, + { + "epoch": 0.6159852904455996, + "grad_norm": 0.6049813779727117, + "learning_rate": 9.829835023678243e-06, + "loss": 0.6511, + "step": 848 + }, + { + "epoch": 0.6167116881937666, + "grad_norm": 0.6075110205247508, + "learning_rate": 9.829216980006372e-06, + "loss": 0.6521, + "step": 849 + }, + { + "epoch": 0.6174380859419336, + "grad_norm": 0.6252951495072839, + "learning_rate": 9.828597835493319e-06, + "loss": 0.6458, + "step": 850 + }, + { + "epoch": 0.6181644836901006, + "grad_norm": 0.6655929334398094, + "learning_rate": 9.827977590280217e-06, + "loss": 0.6442, + "step": 851 + }, + { + "epoch": 0.6188908814382675, + "grad_norm": 0.5849309071329679, + "learning_rate": 9.827356244508455e-06, + "loss": 0.654, + "step": 852 + }, + { + "epoch": 0.6196172791864345, + "grad_norm": 0.6329417045286834, + "learning_rate": 9.82673379831967e-06, + "loss": 0.6541, + "step": 853 + }, + { + "epoch": 0.6203436769346015, + "grad_norm": 0.6746285553018466, + "learning_rate": 9.826110251855755e-06, + "loss": 0.6468, + "step": 854 + }, + { + "epoch": 0.6210700746827685, + "grad_norm": 0.6625458089858239, + "learning_rate": 9.825485605258846e-06, + "loss": 0.6475, + "step": 855 + }, + { + "epoch": 0.6217964724309355, + "grad_norm": 0.5924986665162189, + "learning_rate": 9.824859858671337e-06, + "loss": 0.6533, + "step": 856 + }, + { + "epoch": 0.6225228701791025, + "grad_norm": 0.5925928031908367, + "learning_rate": 9.824233012235869e-06, + "loss": 0.6523, + "step": 857 + }, + { + "epoch": 0.6232492679272694, + "grad_norm": 0.6605467068380393, + "learning_rate": 9.823605066095333e-06, + "loss": 0.6655, + "step": 858 + }, + { + "epoch": 0.6239756656754364, + "grad_norm": 0.5758378226313285, + "learning_rate": 9.822976020392874e-06, + "loss": 0.6527, + "step": 859 + }, + { + "epoch": 0.6247020634236033, + "grad_norm": 0.8187112632079937, + "learning_rate": 9.822345875271884e-06, + "loss": 0.6708, + "step": 860 + }, + { + "epoch": 
0.6254284611717704, + "grad_norm": 0.6091453080384412, + "learning_rate": 9.821714630876009e-06, + "loss": 0.6741, + "step": 861 + }, + { + "epoch": 0.6261548589199374, + "grad_norm": 0.6029200767372679, + "learning_rate": 9.821082287349142e-06, + "loss": 0.652, + "step": 862 + }, + { + "epoch": 0.6268812566681043, + "grad_norm": 0.6106188486403386, + "learning_rate": 9.820448844835428e-06, + "loss": 0.6535, + "step": 863 + }, + { + "epoch": 0.6276076544162713, + "grad_norm": 0.5798197726465589, + "learning_rate": 9.819814303479268e-06, + "loss": 0.6448, + "step": 864 + }, + { + "epoch": 0.6283340521644383, + "grad_norm": 0.5943359460871429, + "learning_rate": 9.819178663425302e-06, + "loss": 0.6593, + "step": 865 + }, + { + "epoch": 0.6290604499126052, + "grad_norm": 0.5711044824370614, + "learning_rate": 9.818541924818432e-06, + "loss": 0.6672, + "step": 866 + }, + { + "epoch": 0.6297868476607722, + "grad_norm": 0.6952982586972026, + "learning_rate": 9.817904087803802e-06, + "loss": 0.6609, + "step": 867 + }, + { + "epoch": 0.6305132454089393, + "grad_norm": 0.5666054412389501, + "learning_rate": 9.817265152526811e-06, + "loss": 0.6608, + "step": 868 + }, + { + "epoch": 0.6312396431571062, + "grad_norm": 0.607448748434137, + "learning_rate": 9.816625119133109e-06, + "loss": 0.6532, + "step": 869 + }, + { + "epoch": 0.6319660409052732, + "grad_norm": 0.6135073373387316, + "learning_rate": 9.81598398776859e-06, + "loss": 0.657, + "step": 870 + }, + { + "epoch": 0.6326924386534402, + "grad_norm": 0.5926396668336961, + "learning_rate": 9.815341758579407e-06, + "loss": 0.6576, + "step": 871 + }, + { + "epoch": 0.6334188364016071, + "grad_norm": 0.5597381238227889, + "learning_rate": 9.814698431711957e-06, + "loss": 0.6547, + "step": 872 + }, + { + "epoch": 0.6341452341497741, + "grad_norm": 0.5823788201141964, + "learning_rate": 9.814054007312888e-06, + "loss": 0.6552, + "step": 873 + }, + { + "epoch": 0.6348716318979412, + "grad_norm": 0.5756772937219624, + "learning_rate": 9.813408485529103e-06, + "loss": 0.664, + "step": 874 + }, + { + "epoch": 0.6355980296461081, + "grad_norm": 0.5809430277549485, + "learning_rate": 9.812761866507748e-06, + "loss": 0.665, + "step": 875 + }, + { + "epoch": 0.6363244273942751, + "grad_norm": 0.5656279015120884, + "learning_rate": 9.812114150396223e-06, + "loss": 0.6685, + "step": 876 + }, + { + "epoch": 0.637050825142442, + "grad_norm": 0.5645400936452617, + "learning_rate": 9.81146533734218e-06, + "loss": 0.6661, + "step": 877 + }, + { + "epoch": 0.637777222890609, + "grad_norm": 0.6038920581138288, + "learning_rate": 9.810815427493516e-06, + "loss": 0.6513, + "step": 878 + }, + { + "epoch": 0.638503620638776, + "grad_norm": 0.6287288278527075, + "learning_rate": 9.810164420998385e-06, + "loss": 0.6378, + "step": 879 + }, + { + "epoch": 0.639230018386943, + "grad_norm": 0.5533432856351014, + "learning_rate": 9.80951231800518e-06, + "loss": 0.6503, + "step": 880 + }, + { + "epoch": 0.63995641613511, + "grad_norm": 0.5470769024145264, + "learning_rate": 9.808859118662558e-06, + "loss": 0.6487, + "step": 881 + }, + { + "epoch": 0.640682813883277, + "grad_norm": 0.5735833277625101, + "learning_rate": 9.808204823119414e-06, + "loss": 0.6383, + "step": 882 + }, + { + "epoch": 0.6414092116314439, + "grad_norm": 0.556322686472196, + "learning_rate": 9.8075494315249e-06, + "loss": 0.6465, + "step": 883 + }, + { + "epoch": 0.6421356093796109, + "grad_norm": 0.5843117762599995, + "learning_rate": 9.806892944028414e-06, + "loss": 0.662, + "step": 884 + }, + { + 
"epoch": 0.6428620071277779, + "grad_norm": 0.5499746337779424, + "learning_rate": 9.806235360779605e-06, + "loss": 0.6469, + "step": 885 + }, + { + "epoch": 0.6435884048759449, + "grad_norm": 0.5773912530605604, + "learning_rate": 9.805576681928373e-06, + "loss": 0.6573, + "step": 886 + }, + { + "epoch": 0.6443148026241119, + "grad_norm": 0.5699879773421873, + "learning_rate": 9.804916907624865e-06, + "loss": 0.6674, + "step": 887 + }, + { + "epoch": 0.6450412003722789, + "grad_norm": 0.5789871539035835, + "learning_rate": 9.804256038019482e-06, + "loss": 0.6588, + "step": 888 + }, + { + "epoch": 0.6457675981204458, + "grad_norm": 0.5261604408640989, + "learning_rate": 9.80359407326287e-06, + "loss": 0.6562, + "step": 889 + }, + { + "epoch": 0.6464939958686128, + "grad_norm": 0.5557539019590454, + "learning_rate": 9.802931013505929e-06, + "loss": 0.6409, + "step": 890 + }, + { + "epoch": 0.6472203936167797, + "grad_norm": 0.5481978932889137, + "learning_rate": 9.802266858899802e-06, + "loss": 0.645, + "step": 891 + }, + { + "epoch": 0.6479467913649468, + "grad_norm": 0.5563785530002918, + "learning_rate": 9.801601609595892e-06, + "loss": 0.6453, + "step": 892 + }, + { + "epoch": 0.6486731891131138, + "grad_norm": 0.5694891433544842, + "learning_rate": 9.80093526574584e-06, + "loss": 0.6409, + "step": 893 + }, + { + "epoch": 0.6493995868612807, + "grad_norm": 0.6636903196693987, + "learning_rate": 9.800267827501548e-06, + "loss": 0.6245, + "step": 894 + }, + { + "epoch": 0.6501259846094477, + "grad_norm": 0.6284598297309115, + "learning_rate": 9.799599295015154e-06, + "loss": 0.6366, + "step": 895 + }, + { + "epoch": 0.6508523823576147, + "grad_norm": 0.5437492801963629, + "learning_rate": 9.798929668439059e-06, + "loss": 0.6618, + "step": 896 + }, + { + "epoch": 0.6515787801057816, + "grad_norm": 0.5833253680539345, + "learning_rate": 9.798258947925905e-06, + "loss": 0.6482, + "step": 897 + }, + { + "epoch": 0.6523051778539487, + "grad_norm": 0.639449411834674, + "learning_rate": 9.797587133628586e-06, + "loss": 0.6472, + "step": 898 + }, + { + "epoch": 0.6530315756021157, + "grad_norm": 0.535584341412608, + "learning_rate": 9.796914225700243e-06, + "loss": 0.6593, + "step": 899 + }, + { + "epoch": 0.6537579733502826, + "grad_norm": 0.547567869110903, + "learning_rate": 9.79624022429427e-06, + "loss": 0.6393, + "step": 900 + }, + { + "epoch": 0.6544843710984496, + "grad_norm": 0.5522922741935506, + "learning_rate": 9.79556512956431e-06, + "loss": 0.6572, + "step": 901 + }, + { + "epoch": 0.6552107688466166, + "grad_norm": 0.5893600739438992, + "learning_rate": 9.794888941664253e-06, + "loss": 0.6416, + "step": 902 + }, + { + "epoch": 0.6559371665947835, + "grad_norm": 0.5351461232055588, + "learning_rate": 9.794211660748239e-06, + "loss": 0.6421, + "step": 903 + }, + { + "epoch": 0.6566635643429506, + "grad_norm": 0.5945928584609042, + "learning_rate": 9.793533286970655e-06, + "loss": 0.6706, + "step": 904 + }, + { + "epoch": 0.6573899620911176, + "grad_norm": 0.5570387654513104, + "learning_rate": 9.79285382048614e-06, + "loss": 0.6413, + "step": 905 + }, + { + "epoch": 0.6581163598392845, + "grad_norm": 0.5846520230724995, + "learning_rate": 9.792173261449586e-06, + "loss": 0.653, + "step": 906 + }, + { + "epoch": 0.6588427575874515, + "grad_norm": 0.5526390975042158, + "learning_rate": 9.791491610016123e-06, + "loss": 0.65, + "step": 907 + }, + { + "epoch": 0.6595691553356184, + "grad_norm": 0.5574061987078701, + "learning_rate": 9.79080886634114e-06, + "loss": 0.6425, + "step": 908 + 
}, + { + "epoch": 0.6602955530837854, + "grad_norm": 0.5341499920110605, + "learning_rate": 9.79012503058027e-06, + "loss": 0.6521, + "step": 909 + }, + { + "epoch": 0.6610219508319525, + "grad_norm": 0.5791055056403942, + "learning_rate": 9.789440102889396e-06, + "loss": 0.658, + "step": 910 + }, + { + "epoch": 0.6617483485801194, + "grad_norm": 0.5260761897292887, + "learning_rate": 9.788754083424654e-06, + "loss": 0.6401, + "step": 911 + }, + { + "epoch": 0.6624747463282864, + "grad_norm": 0.5109150407815444, + "learning_rate": 9.788066972342421e-06, + "loss": 0.6506, + "step": 912 + }, + { + "epoch": 0.6632011440764534, + "grad_norm": 0.5319111825792613, + "learning_rate": 9.787378769799326e-06, + "loss": 0.652, + "step": 913 + }, + { + "epoch": 0.6639275418246203, + "grad_norm": 0.5240092882015561, + "learning_rate": 9.786689475952254e-06, + "loss": 0.6445, + "step": 914 + }, + { + "epoch": 0.6646539395727873, + "grad_norm": 0.5440903216518904, + "learning_rate": 9.785999090958326e-06, + "loss": 0.6308, + "step": 915 + }, + { + "epoch": 0.6653803373209543, + "grad_norm": 0.5182863646250728, + "learning_rate": 9.785307614974922e-06, + "loss": 0.6353, + "step": 916 + }, + { + "epoch": 0.6661067350691213, + "grad_norm": 0.5407517444632997, + "learning_rate": 9.784615048159665e-06, + "loss": 0.6567, + "step": 917 + }, + { + "epoch": 0.6668331328172883, + "grad_norm": 0.5595401196963885, + "learning_rate": 9.78392139067043e-06, + "loss": 0.6618, + "step": 918 + }, + { + "epoch": 0.6675595305654553, + "grad_norm": 0.5281983812603945, + "learning_rate": 9.783226642665338e-06, + "loss": 0.637, + "step": 919 + }, + { + "epoch": 0.6682859283136222, + "grad_norm": 0.588468629477807, + "learning_rate": 9.782530804302763e-06, + "loss": 0.6406, + "step": 920 + }, + { + "epoch": 0.6690123260617892, + "grad_norm": 0.5387289554693768, + "learning_rate": 9.78183387574132e-06, + "loss": 0.6327, + "step": 921 + }, + { + "epoch": 0.6697387238099561, + "grad_norm": 0.5521325969515342, + "learning_rate": 9.781135857139881e-06, + "loss": 0.6374, + "step": 922 + }, + { + "epoch": 0.6704651215581232, + "grad_norm": 0.5453121610390395, + "learning_rate": 9.780436748657559e-06, + "loss": 0.6742, + "step": 923 + }, + { + "epoch": 0.6711915193062902, + "grad_norm": 0.527286669061359, + "learning_rate": 9.77973655045372e-06, + "loss": 0.648, + "step": 924 + }, + { + "epoch": 0.6719179170544571, + "grad_norm": 0.5625960128189396, + "learning_rate": 9.779035262687976e-06, + "loss": 0.6288, + "step": 925 + }, + { + "epoch": 0.6726443148026241, + "grad_norm": 0.5362827537791287, + "learning_rate": 9.778332885520195e-06, + "loss": 0.6277, + "step": 926 + }, + { + "epoch": 0.6733707125507911, + "grad_norm": 0.745633309649563, + "learning_rate": 9.777629419110478e-06, + "loss": 0.636, + "step": 927 + }, + { + "epoch": 0.674097110298958, + "grad_norm": 0.5743650133346059, + "learning_rate": 9.776924863619187e-06, + "loss": 0.6313, + "step": 928 + }, + { + "epoch": 0.6748235080471251, + "grad_norm": 0.5339811952680448, + "learning_rate": 9.776219219206933e-06, + "loss": 0.6534, + "step": 929 + }, + { + "epoch": 0.6755499057952921, + "grad_norm": 0.5412542702478056, + "learning_rate": 9.775512486034564e-06, + "loss": 0.6439, + "step": 930 + }, + { + "epoch": 0.676276303543459, + "grad_norm": 0.5390435892309067, + "learning_rate": 9.774804664263184e-06, + "loss": 0.658, + "step": 931 + }, + { + "epoch": 0.677002701291626, + "grad_norm": 0.6095626038690631, + "learning_rate": 9.774095754054147e-06, + "loss": 0.6488, + "step": 
932 + }, + { + "epoch": 0.677729099039793, + "grad_norm": 0.5497445658882337, + "learning_rate": 9.773385755569052e-06, + "loss": 0.6394, + "step": 933 + }, + { + "epoch": 0.6784554967879599, + "grad_norm": 0.5966473206171481, + "learning_rate": 9.772674668969743e-06, + "loss": 0.6429, + "step": 934 + }, + { + "epoch": 0.679181894536127, + "grad_norm": 0.5590698378286874, + "learning_rate": 9.771962494418319e-06, + "loss": 0.6359, + "step": 935 + }, + { + "epoch": 0.679908292284294, + "grad_norm": 0.5339631288047351, + "learning_rate": 9.77124923207712e-06, + "loss": 0.6255, + "step": 936 + }, + { + "epoch": 0.6806346900324609, + "grad_norm": 0.5693835323242853, + "learning_rate": 9.77053488210874e-06, + "loss": 0.6454, + "step": 937 + }, + { + "epoch": 0.6813610877806279, + "grad_norm": 0.5136483877400199, + "learning_rate": 9.769819444676017e-06, + "loss": 0.6423, + "step": 938 + }, + { + "epoch": 0.6820874855287948, + "grad_norm": 0.5644953061878822, + "learning_rate": 9.769102919942038e-06, + "loss": 0.6327, + "step": 939 + }, + { + "epoch": 0.6828138832769618, + "grad_norm": 0.5243587189424204, + "learning_rate": 9.768385308070139e-06, + "loss": 0.6523, + "step": 940 + }, + { + "epoch": 0.6835402810251289, + "grad_norm": 0.53271625284346, + "learning_rate": 9.767666609223902e-06, + "loss": 0.6441, + "step": 941 + }, + { + "epoch": 0.6842666787732958, + "grad_norm": 0.5814209737223036, + "learning_rate": 9.766946823567157e-06, + "loss": 0.619, + "step": 942 + }, + { + "epoch": 0.6849930765214628, + "grad_norm": 0.5309212150196624, + "learning_rate": 9.766225951263982e-06, + "loss": 0.6108, + "step": 943 + }, + { + "epoch": 0.6857194742696298, + "grad_norm": 0.5374951063410888, + "learning_rate": 9.765503992478704e-06, + "loss": 0.627, + "step": 944 + }, + { + "epoch": 0.6864458720177967, + "grad_norm": 0.5142193404257767, + "learning_rate": 9.764780947375898e-06, + "loss": 0.65, + "step": 945 + }, + { + "epoch": 0.6871722697659637, + "grad_norm": 0.5385889304668647, + "learning_rate": 9.764056816120383e-06, + "loss": 0.6447, + "step": 946 + }, + { + "epoch": 0.6878986675141308, + "grad_norm": 0.5079349675047481, + "learning_rate": 9.763331598877229e-06, + "loss": 0.6252, + "step": 947 + }, + { + "epoch": 0.6886250652622977, + "grad_norm": 0.5252424116585818, + "learning_rate": 9.762605295811751e-06, + "loss": 0.6378, + "step": 948 + }, + { + "epoch": 0.6893514630104647, + "grad_norm": 0.5351541129299291, + "learning_rate": 9.761877907089515e-06, + "loss": 0.6316, + "step": 949 + }, + { + "epoch": 0.6900778607586316, + "grad_norm": 0.5213029147033851, + "learning_rate": 9.761149432876331e-06, + "loss": 0.6434, + "step": 950 + }, + { + "epoch": 0.6908042585067986, + "grad_norm": 0.5287068047571596, + "learning_rate": 9.760419873338261e-06, + "loss": 0.6434, + "step": 951 + }, + { + "epoch": 0.6915306562549656, + "grad_norm": 0.5534499293711467, + "learning_rate": 9.759689228641605e-06, + "loss": 0.6395, + "step": 952 + }, + { + "epoch": 0.6922570540031326, + "grad_norm": 0.586784866270694, + "learning_rate": 9.758957498952923e-06, + "loss": 0.653, + "step": 953 + }, + { + "epoch": 0.6929834517512996, + "grad_norm": 0.5266464417997495, + "learning_rate": 9.758224684439013e-06, + "loss": 0.6266, + "step": 954 + }, + { + "epoch": 0.6937098494994666, + "grad_norm": 0.5122612880356957, + "learning_rate": 9.757490785266924e-06, + "loss": 0.6347, + "step": 955 + }, + { + "epoch": 0.6944362472476335, + "grad_norm": 0.6375844075472203, + "learning_rate": 9.75675580160395e-06, + "loss": 0.625, + 
"step": 956 + }, + { + "epoch": 0.6951626449958005, + "grad_norm": 0.5212655025584345, + "learning_rate": 9.756019733617634e-06, + "loss": 0.6396, + "step": 957 + }, + { + "epoch": 0.6958890427439675, + "grad_norm": 0.5086286936385832, + "learning_rate": 9.755282581475769e-06, + "loss": 0.6342, + "step": 958 + }, + { + "epoch": 0.6966154404921344, + "grad_norm": 0.5128731180861688, + "learning_rate": 9.754544345346388e-06, + "loss": 0.6292, + "step": 959 + }, + { + "epoch": 0.6973418382403015, + "grad_norm": 0.4999617398047923, + "learning_rate": 9.75380502539778e-06, + "loss": 0.6339, + "step": 960 + }, + { + "epoch": 0.6980682359884685, + "grad_norm": 0.5282274063606959, + "learning_rate": 9.753064621798472e-06, + "loss": 0.6432, + "step": 961 + }, + { + "epoch": 0.6987946337366354, + "grad_norm": 0.5252127904810339, + "learning_rate": 9.752323134717244e-06, + "loss": 0.6281, + "step": 962 + }, + { + "epoch": 0.6995210314848024, + "grad_norm": 0.5082613730934245, + "learning_rate": 9.75158056432312e-06, + "loss": 0.6338, + "step": 963 + }, + { + "epoch": 0.7002474292329693, + "grad_norm": 0.4882829401463821, + "learning_rate": 9.750836910785372e-06, + "loss": 0.6304, + "step": 964 + }, + { + "epoch": 0.7009738269811363, + "grad_norm": 0.5538873366273053, + "learning_rate": 9.75009217427352e-06, + "loss": 0.6355, + "step": 965 + }, + { + "epoch": 0.7017002247293034, + "grad_norm": 0.7956754652954774, + "learning_rate": 9.749346354957331e-06, + "loss": 0.6462, + "step": 966 + }, + { + "epoch": 0.7024266224774703, + "grad_norm": 0.5219908542511834, + "learning_rate": 9.748599453006818e-06, + "loss": 0.6265, + "step": 967 + }, + { + "epoch": 0.7031530202256373, + "grad_norm": 0.5210128918221577, + "learning_rate": 9.74785146859224e-06, + "loss": 0.6537, + "step": 968 + }, + { + "epoch": 0.7038794179738043, + "grad_norm": 0.5040833546403364, + "learning_rate": 9.7471024018841e-06, + "loss": 0.6385, + "step": 969 + }, + { + "epoch": 0.7046058157219712, + "grad_norm": 0.5001664206030246, + "learning_rate": 9.746352253053155e-06, + "loss": 0.6313, + "step": 970 + }, + { + "epoch": 0.7053322134701382, + "grad_norm": 0.5098747327571058, + "learning_rate": 9.745601022270403e-06, + "loss": 0.633, + "step": 971 + }, + { + "epoch": 0.7060586112183053, + "grad_norm": 0.8219230135157451, + "learning_rate": 9.74484870970709e-06, + "loss": 0.6325, + "step": 972 + }, + { + "epoch": 0.7067850089664722, + "grad_norm": 0.5041778633169351, + "learning_rate": 9.744095315534711e-06, + "loss": 0.6515, + "step": 973 + }, + { + "epoch": 0.7075114067146392, + "grad_norm": 0.48370176730119585, + "learning_rate": 9.743340839925006e-06, + "loss": 0.6188, + "step": 974 + }, + { + "epoch": 0.7082378044628062, + "grad_norm": 0.5875403728810346, + "learning_rate": 9.742585283049957e-06, + "loss": 0.6616, + "step": 975 + }, + { + "epoch": 0.7089642022109731, + "grad_norm": 0.6310835977211383, + "learning_rate": 9.741828645081799e-06, + "loss": 0.6453, + "step": 976 + }, + { + "epoch": 0.7096905999591401, + "grad_norm": 0.545599500733757, + "learning_rate": 9.741070926193013e-06, + "loss": 0.6322, + "step": 977 + }, + { + "epoch": 0.7104169977073072, + "grad_norm": 0.5017837062704927, + "learning_rate": 9.74031212655632e-06, + "loss": 0.6512, + "step": 978 + }, + { + "epoch": 0.7111433954554741, + "grad_norm": 0.503250513244053, + "learning_rate": 9.739552246344692e-06, + "loss": 0.6547, + "step": 979 + }, + { + "epoch": 0.7118697932036411, + "grad_norm": 0.5163358870991519, + "learning_rate": 9.738791285731353e-06, + 
"loss": 0.6483, + "step": 980 + }, + { + "epoch": 0.712596190951808, + "grad_norm": 0.49564125514372964, + "learning_rate": 9.73802924488976e-06, + "loss": 0.6462, + "step": 981 + }, + { + "epoch": 0.713322588699975, + "grad_norm": 0.4913113083401416, + "learning_rate": 9.737266123993627e-06, + "loss": 0.6428, + "step": 982 + }, + { + "epoch": 0.714048986448142, + "grad_norm": 0.5095254384295473, + "learning_rate": 9.736501923216912e-06, + "loss": 0.6188, + "step": 983 + }, + { + "epoch": 0.714775384196309, + "grad_norm": 0.5189549653207551, + "learning_rate": 9.735736642733816e-06, + "loss": 0.6122, + "step": 984 + }, + { + "epoch": 0.715501781944476, + "grad_norm": 0.5112989759132592, + "learning_rate": 9.734970282718788e-06, + "loss": 0.6334, + "step": 985 + }, + { + "epoch": 0.716228179692643, + "grad_norm": 0.5107580279579825, + "learning_rate": 9.734202843346522e-06, + "loss": 0.6252, + "step": 986 + }, + { + "epoch": 0.7169545774408099, + "grad_norm": 0.5167080885659902, + "learning_rate": 9.733434324791962e-06, + "loss": 0.6401, + "step": 987 + }, + { + "epoch": 0.7176809751889769, + "grad_norm": 0.5023084975379272, + "learning_rate": 9.732664727230293e-06, + "loss": 0.6281, + "step": 988 + }, + { + "epoch": 0.7184073729371439, + "grad_norm": 0.537562102338342, + "learning_rate": 9.731894050836952e-06, + "loss": 0.6246, + "step": 989 + }, + { + "epoch": 0.7191337706853109, + "grad_norm": 0.48088299234652243, + "learning_rate": 9.731122295787611e-06, + "loss": 0.6354, + "step": 990 + }, + { + "epoch": 0.7198601684334779, + "grad_norm": 0.563544077811699, + "learning_rate": 9.730349462258202e-06, + "loss": 0.6261, + "step": 991 + }, + { + "epoch": 0.7205865661816448, + "grad_norm": 0.49876787412524776, + "learning_rate": 9.729575550424892e-06, + "loss": 0.6181, + "step": 992 + }, + { + "epoch": 0.7213129639298118, + "grad_norm": 0.527873118148596, + "learning_rate": 9.728800560464097e-06, + "loss": 0.6362, + "step": 993 + }, + { + "epoch": 0.7220393616779788, + "grad_norm": 0.6547177893128939, + "learning_rate": 9.728024492552484e-06, + "loss": 0.6312, + "step": 994 + }, + { + "epoch": 0.7227657594261457, + "grad_norm": 0.4938014707944413, + "learning_rate": 9.727247346866955e-06, + "loss": 0.6386, + "step": 995 + }, + { + "epoch": 0.7234921571743128, + "grad_norm": 0.4969962276566467, + "learning_rate": 9.726469123584668e-06, + "loss": 0.6301, + "step": 996 + }, + { + "epoch": 0.7242185549224798, + "grad_norm": 0.5610061497738755, + "learning_rate": 9.72568982288302e-06, + "loss": 0.6261, + "step": 997 + }, + { + "epoch": 0.7249449526706467, + "grad_norm": 0.48231769992974666, + "learning_rate": 9.724909444939657e-06, + "loss": 0.6321, + "step": 998 + }, + { + "epoch": 0.7256713504188137, + "grad_norm": 0.5566561152996569, + "learning_rate": 9.72412798993247e-06, + "loss": 0.6301, + "step": 999 + }, + { + "epoch": 0.7263977481669807, + "grad_norm": 0.4740045617404099, + "learning_rate": 9.723345458039595e-06, + "loss": 0.6333, + "step": 1000 + }, + { + "epoch": 0.7271241459151476, + "grad_norm": 0.4673201163985113, + "learning_rate": 9.722561849439414e-06, + "loss": 0.6273, + "step": 1001 + }, + { + "epoch": 0.7278505436633147, + "grad_norm": 0.5108892842569576, + "learning_rate": 9.72177716431055e-06, + "loss": 0.6239, + "step": 1002 + }, + { + "epoch": 0.7285769414114817, + "grad_norm": 0.5098263048545012, + "learning_rate": 9.720991402831883e-06, + "loss": 0.6012, + "step": 1003 + }, + { + "epoch": 0.7293033391596486, + "grad_norm": 0.5815800988061166, + "learning_rate": 
9.720204565182522e-06, + "loss": 0.6204, + "step": 1004 + }, + { + "epoch": 0.7300297369078156, + "grad_norm": 0.47676217623350187, + "learning_rate": 9.719416651541839e-06, + "loss": 0.6105, + "step": 1005 + }, + { + "epoch": 0.7307561346559825, + "grad_norm": 0.47753745865698777, + "learning_rate": 9.718627662089435e-06, + "loss": 0.6395, + "step": 1006 + }, + { + "epoch": 0.7314825324041495, + "grad_norm": 0.4879838290382639, + "learning_rate": 9.717837597005169e-06, + "loss": 0.639, + "step": 1007 + }, + { + "epoch": 0.7322089301523165, + "grad_norm": 0.5013716494090865, + "learning_rate": 9.717046456469135e-06, + "loss": 0.6325, + "step": 1008 + }, + { + "epoch": 0.7329353279004835, + "grad_norm": 0.4751272371204352, + "learning_rate": 9.71625424066168e-06, + "loss": 0.6241, + "step": 1009 + }, + { + "epoch": 0.7336617256486505, + "grad_norm": 0.4699052765127366, + "learning_rate": 9.715460949763393e-06, + "loss": 0.6285, + "step": 1010 + }, + { + "epoch": 0.7343881233968175, + "grad_norm": 0.5191334092088646, + "learning_rate": 9.714666583955108e-06, + "loss": 0.6228, + "step": 1011 + }, + { + "epoch": 0.7351145211449844, + "grad_norm": 0.49374498413992257, + "learning_rate": 9.713871143417902e-06, + "loss": 0.6441, + "step": 1012 + }, + { + "epoch": 0.7358409188931514, + "grad_norm": 0.4863582873661774, + "learning_rate": 9.713074628333102e-06, + "loss": 0.6254, + "step": 1013 + }, + { + "epoch": 0.7365673166413184, + "grad_norm": 0.4498108134181778, + "learning_rate": 9.712277038882274e-06, + "loss": 0.6301, + "step": 1014 + }, + { + "epoch": 0.7372937143894854, + "grad_norm": 0.5126199564731452, + "learning_rate": 9.711478375247237e-06, + "loss": 0.6335, + "step": 1015 + }, + { + "epoch": 0.7380201121376524, + "grad_norm": 0.49607691586084646, + "learning_rate": 9.710678637610045e-06, + "loss": 0.6268, + "step": 1016 + }, + { + "epoch": 0.7387465098858194, + "grad_norm": 0.458163972923393, + "learning_rate": 9.709877826153003e-06, + "loss": 0.6478, + "step": 1017 + }, + { + "epoch": 0.7394729076339863, + "grad_norm": 0.4874017595835269, + "learning_rate": 9.709075941058661e-06, + "loss": 0.6251, + "step": 1018 + }, + { + "epoch": 0.7401993053821533, + "grad_norm": 0.47017693490570733, + "learning_rate": 9.70827298250981e-06, + "loss": 0.6363, + "step": 1019 + }, + { + "epoch": 0.7409257031303202, + "grad_norm": 0.6968313114729715, + "learning_rate": 9.70746895068949e-06, + "loss": 0.6149, + "step": 1020 + }, + { + "epoch": 0.7416521008784873, + "grad_norm": 0.4965074758910424, + "learning_rate": 9.706663845780984e-06, + "loss": 0.6154, + "step": 1021 + }, + { + "epoch": 0.7423784986266543, + "grad_norm": 0.48551084123096894, + "learning_rate": 9.705857667967818e-06, + "loss": 0.6312, + "step": 1022 + }, + { + "epoch": 0.7431048963748212, + "grad_norm": 0.4866557666908839, + "learning_rate": 9.705050417433762e-06, + "loss": 0.6192, + "step": 1023 + }, + { + "epoch": 0.7438312941229882, + "grad_norm": 0.5025491441411499, + "learning_rate": 9.704242094362834e-06, + "loss": 0.6086, + "step": 1024 + }, + { + "epoch": 0.7445576918711552, + "grad_norm": 0.4786396836500014, + "learning_rate": 9.703432698939294e-06, + "loss": 0.6355, + "step": 1025 + }, + { + "epoch": 0.7452840896193221, + "grad_norm": 0.5195325928953927, + "learning_rate": 9.70262223134765e-06, + "loss": 0.6176, + "step": 1026 + }, + { + "epoch": 0.7460104873674892, + "grad_norm": 0.47770348548058295, + "learning_rate": 9.70181069177265e-06, + "loss": 0.6208, + "step": 1027 + }, + { + "epoch": 0.7467368851156562, + 
"grad_norm": 0.5669878887712634, + "learning_rate": 9.700998080399287e-06, + "loss": 0.6118, + "step": 1028 + }, + { + "epoch": 0.7474632828638231, + "grad_norm": 0.4697665700113132, + "learning_rate": 9.700184397412799e-06, + "loss": 0.6287, + "step": 1029 + }, + { + "epoch": 0.7481896806119901, + "grad_norm": 0.4642121247185043, + "learning_rate": 9.699369642998671e-06, + "loss": 0.627, + "step": 1030 + }, + { + "epoch": 0.7489160783601571, + "grad_norm": 0.48927006449177757, + "learning_rate": 9.698553817342627e-06, + "loss": 0.6185, + "step": 1031 + }, + { + "epoch": 0.749642476108324, + "grad_norm": 0.5077132474206864, + "learning_rate": 9.697736920630641e-06, + "loss": 0.6382, + "step": 1032 + }, + { + "epoch": 0.7503688738564911, + "grad_norm": 0.4849060148767091, + "learning_rate": 9.696918953048925e-06, + "loss": 0.6371, + "step": 1033 + }, + { + "epoch": 0.751095271604658, + "grad_norm": 0.4952415892198469, + "learning_rate": 9.69609991478394e-06, + "loss": 0.6219, + "step": 1034 + }, + { + "epoch": 0.751821669352825, + "grad_norm": 0.46870581026203334, + "learning_rate": 9.695279806022391e-06, + "loss": 0.6062, + "step": 1035 + }, + { + "epoch": 0.752548067100992, + "grad_norm": 0.44552062529184516, + "learning_rate": 9.694458626951223e-06, + "loss": 0.6336, + "step": 1036 + }, + { + "epoch": 0.7532744648491589, + "grad_norm": 0.4701788285387683, + "learning_rate": 9.693636377757628e-06, + "loss": 0.6348, + "step": 1037 + }, + { + "epoch": 0.7540008625973259, + "grad_norm": 0.4930902095299255, + "learning_rate": 9.692813058629041e-06, + "loss": 0.6309, + "step": 1038 + }, + { + "epoch": 0.754727260345493, + "grad_norm": 0.513865274475057, + "learning_rate": 9.691988669753141e-06, + "loss": 0.6313, + "step": 1039 + }, + { + "epoch": 0.7554536580936599, + "grad_norm": 0.5004488299102964, + "learning_rate": 9.691163211317853e-06, + "loss": 0.6186, + "step": 1040 + }, + { + "epoch": 0.7561800558418269, + "grad_norm": 0.4632958610592306, + "learning_rate": 9.69033668351134e-06, + "loss": 0.6335, + "step": 1041 + }, + { + "epoch": 0.7569064535899939, + "grad_norm": 0.5029063747150795, + "learning_rate": 9.689509086522019e-06, + "loss": 0.6247, + "step": 1042 + }, + { + "epoch": 0.7576328513381608, + "grad_norm": 0.4634748963399614, + "learning_rate": 9.688680420538539e-06, + "loss": 0.6309, + "step": 1043 + }, + { + "epoch": 0.7583592490863278, + "grad_norm": 0.4718510576080903, + "learning_rate": 9.6878506857498e-06, + "loss": 0.6371, + "step": 1044 + }, + { + "epoch": 0.7590856468344949, + "grad_norm": 0.5131883999829031, + "learning_rate": 9.687019882344942e-06, + "loss": 0.6227, + "step": 1045 + }, + { + "epoch": 0.7598120445826618, + "grad_norm": 0.5286803442259869, + "learning_rate": 9.68618801051335e-06, + "loss": 0.6347, + "step": 1046 + }, + { + "epoch": 0.7605384423308288, + "grad_norm": 0.4592287328761153, + "learning_rate": 9.685355070444658e-06, + "loss": 0.6289, + "step": 1047 + }, + { + "epoch": 0.7612648400789958, + "grad_norm": 0.46453734575529443, + "learning_rate": 9.684521062328734e-06, + "loss": 0.6167, + "step": 1048 + }, + { + "epoch": 0.7619912378271627, + "grad_norm": 0.554800979203622, + "learning_rate": 9.683685986355692e-06, + "loss": 0.6174, + "step": 1049 + }, + { + "epoch": 0.7627176355753297, + "grad_norm": 0.47868566678845503, + "learning_rate": 9.682849842715895e-06, + "loss": 0.6169, + "step": 1050 + }, + { + "epoch": 0.7634440333234968, + "grad_norm": 0.4752654997453638, + "learning_rate": 9.682012631599945e-06, + "loss": 0.6115, + "step": 1051 + }, 
+ { + "epoch": 0.7641704310716637, + "grad_norm": 0.4779615895205545, + "learning_rate": 9.681174353198687e-06, + "loss": 0.6145, + "step": 1052 + }, + { + "epoch": 0.7648968288198307, + "grad_norm": 0.48803918870499163, + "learning_rate": 9.68033500770321e-06, + "loss": 0.6149, + "step": 1053 + }, + { + "epoch": 0.7656232265679976, + "grad_norm": 0.4696941476074349, + "learning_rate": 9.679494595304848e-06, + "loss": 0.6327, + "step": 1054 + }, + { + "epoch": 0.7663496243161646, + "grad_norm": 0.4844539384776647, + "learning_rate": 9.678653116195174e-06, + "loss": 0.6239, + "step": 1055 + }, + { + "epoch": 0.7670760220643316, + "grad_norm": 0.6568435300988817, + "learning_rate": 9.677810570566011e-06, + "loss": 0.6181, + "step": 1056 + }, + { + "epoch": 0.7678024198124985, + "grad_norm": 0.5597089201336259, + "learning_rate": 9.676966958609417e-06, + "loss": 0.6265, + "step": 1057 + }, + { + "epoch": 0.7685288175606656, + "grad_norm": 0.5421661140844151, + "learning_rate": 9.676122280517699e-06, + "loss": 0.6243, + "step": 1058 + }, + { + "epoch": 0.7692552153088326, + "grad_norm": 0.45296333286927, + "learning_rate": 9.675276536483404e-06, + "loss": 0.6265, + "step": 1059 + }, + { + "epoch": 0.7699816130569995, + "grad_norm": 0.4950287756900467, + "learning_rate": 9.674429726699324e-06, + "loss": 0.6139, + "step": 1060 + }, + { + "epoch": 0.7707080108051665, + "grad_norm": 0.4530383471143353, + "learning_rate": 9.673581851358493e-06, + "loss": 0.613, + "step": 1061 + }, + { + "epoch": 0.7714344085533335, + "grad_norm": 0.46904027613626054, + "learning_rate": 9.672732910654187e-06, + "loss": 0.61, + "step": 1062 + }, + { + "epoch": 0.7721608063015004, + "grad_norm": 0.4562341083900913, + "learning_rate": 9.671882904779927e-06, + "loss": 0.6019, + "step": 1063 + }, + { + "epoch": 0.7728872040496675, + "grad_norm": 0.47972675828391576, + "learning_rate": 9.671031833929474e-06, + "loss": 0.6012, + "step": 1064 + }, + { + "epoch": 0.7736136017978344, + "grad_norm": 0.4889221372366856, + "learning_rate": 9.670179698296835e-06, + "loss": 0.6057, + "step": 1065 + }, + { + "epoch": 0.7743399995460014, + "grad_norm": 0.4998365163823747, + "learning_rate": 9.669326498076258e-06, + "loss": 0.6244, + "step": 1066 + }, + { + "epoch": 0.7750663972941684, + "grad_norm": 0.4461796783543215, + "learning_rate": 9.668472233462232e-06, + "loss": 0.5957, + "step": 1067 + }, + { + "epoch": 0.7757927950423353, + "grad_norm": 0.4653827741141469, + "learning_rate": 9.667616904649491e-06, + "loss": 0.6223, + "step": 1068 + }, + { + "epoch": 0.7765191927905023, + "grad_norm": 0.5101486161304896, + "learning_rate": 9.666760511833014e-06, + "loss": 0.6206, + "step": 1069 + }, + { + "epoch": 0.7772455905386694, + "grad_norm": 0.44938945973356537, + "learning_rate": 9.665903055208013e-06, + "loss": 0.614, + "step": 1070 + }, + { + "epoch": 0.7779719882868363, + "grad_norm": 0.48210108522020845, + "learning_rate": 9.665044534969957e-06, + "loss": 0.6294, + "step": 1071 + }, + { + "epoch": 0.7786983860350033, + "grad_norm": 0.536826645391065, + "learning_rate": 9.664184951314542e-06, + "loss": 0.6216, + "step": 1072 + }, + { + "epoch": 0.7794247837831703, + "grad_norm": 0.46290611415210603, + "learning_rate": 9.663324304437717e-06, + "loss": 0.6162, + "step": 1073 + }, + { + "epoch": 0.7801511815313372, + "grad_norm": 0.4389114689546384, + "learning_rate": 9.662462594535674e-06, + "loss": 0.6107, + "step": 1074 + }, + { + "epoch": 0.7808775792795042, + "grad_norm": 0.4572319500457869, + "learning_rate": 
9.661599821804836e-06, + "loss": 0.6241, + "step": 1075 + }, + { + "epoch": 0.7816039770276713, + "grad_norm": 0.4827522638295839, + "learning_rate": 9.660735986441881e-06, + "loss": 0.6122, + "step": 1076 + }, + { + "epoch": 0.7823303747758382, + "grad_norm": 0.4740115259109735, + "learning_rate": 9.659871088643724e-06, + "loss": 0.6115, + "step": 1077 + }, + { + "epoch": 0.7830567725240052, + "grad_norm": 0.4590734004873104, + "learning_rate": 9.659005128607518e-06, + "loss": 0.6185, + "step": 1078 + }, + { + "epoch": 0.7837831702721721, + "grad_norm": 0.8749921694047537, + "learning_rate": 9.65813810653067e-06, + "loss": 0.5945, + "step": 1079 + }, + { + "epoch": 0.7845095680203391, + "grad_norm": 0.7035666241156626, + "learning_rate": 9.657270022610814e-06, + "loss": 0.6091, + "step": 1080 + }, + { + "epoch": 0.7852359657685061, + "grad_norm": 0.5346422298106133, + "learning_rate": 9.656400877045836e-06, + "loss": 0.6152, + "step": 1081 + }, + { + "epoch": 0.7859623635166731, + "grad_norm": 0.4782527712539082, + "learning_rate": 9.655530670033866e-06, + "loss": 0.6225, + "step": 1082 + }, + { + "epoch": 0.7866887612648401, + "grad_norm": 0.47908779964086995, + "learning_rate": 9.654659401773264e-06, + "loss": 0.6161, + "step": 1083 + }, + { + "epoch": 0.7874151590130071, + "grad_norm": 0.4391207715095105, + "learning_rate": 9.653787072462644e-06, + "loss": 0.6166, + "step": 1084 + }, + { + "epoch": 0.788141556761174, + "grad_norm": 0.4423155828257749, + "learning_rate": 9.652913682300856e-06, + "loss": 0.5925, + "step": 1085 + }, + { + "epoch": 0.788867954509341, + "grad_norm": 0.4425248921573, + "learning_rate": 9.652039231486993e-06, + "loss": 0.6095, + "step": 1086 + }, + { + "epoch": 0.789594352257508, + "grad_norm": 0.4700153894358856, + "learning_rate": 9.651163720220391e-06, + "loss": 0.6084, + "step": 1087 + }, + { + "epoch": 0.790320750005675, + "grad_norm": 0.5178001098713066, + "learning_rate": 9.650287148700626e-06, + "loss": 0.6016, + "step": 1088 + }, + { + "epoch": 0.791047147753842, + "grad_norm": 0.47127932138489764, + "learning_rate": 9.649409517127515e-06, + "loss": 0.632, + "step": 1089 + }, + { + "epoch": 0.791773545502009, + "grad_norm": 0.5332912769127331, + "learning_rate": 9.648530825701118e-06, + "loss": 0.6172, + "step": 1090 + }, + { + "epoch": 0.7924999432501759, + "grad_norm": 0.5257897079993382, + "learning_rate": 9.647651074621741e-06, + "loss": 0.6212, + "step": 1091 + }, + { + "epoch": 0.7932263409983429, + "grad_norm": 0.5086619699814738, + "learning_rate": 9.646770264089921e-06, + "loss": 0.6206, + "step": 1092 + }, + { + "epoch": 0.7939527387465098, + "grad_norm": 0.47689364802107714, + "learning_rate": 9.645888394306448e-06, + "loss": 0.6112, + "step": 1093 + }, + { + "epoch": 0.7946791364946769, + "grad_norm": 0.5028782234896939, + "learning_rate": 9.645005465472345e-06, + "loss": 0.6258, + "step": 1094 + }, + { + "epoch": 0.7954055342428439, + "grad_norm": 0.48796076270457783, + "learning_rate": 9.644121477788879e-06, + "loss": 0.6159, + "step": 1095 + }, + { + "epoch": 0.7961319319910108, + "grad_norm": 0.4538044670324512, + "learning_rate": 9.643236431457561e-06, + "loss": 0.608, + "step": 1096 + }, + { + "epoch": 0.7968583297391778, + "grad_norm": 0.4616355979057923, + "learning_rate": 9.64235032668014e-06, + "loss": 0.6164, + "step": 1097 + }, + { + "epoch": 0.7975847274873448, + "grad_norm": 0.5297894983069743, + "learning_rate": 9.641463163658606e-06, + "loss": 0.6352, + "step": 1098 + }, + { + "epoch": 0.7983111252355117, + "grad_norm": 
0.4591303780851485, + "learning_rate": 9.640574942595195e-06, + "loss": 0.5976, + "step": 1099 + }, + { + "epoch": 0.7990375229836788, + "grad_norm": 0.465064202103376, + "learning_rate": 9.63968566369238e-06, + "loss": 0.614, + "step": 1100 + }, + { + "epoch": 0.7997639207318458, + "grad_norm": 0.4665514161467082, + "learning_rate": 9.638795327152872e-06, + "loss": 0.5919, + "step": 1101 + }, + { + "epoch": 0.8004903184800127, + "grad_norm": 0.44878678129345034, + "learning_rate": 9.637903933179633e-06, + "loss": 0.6059, + "step": 1102 + }, + { + "epoch": 0.8012167162281797, + "grad_norm": 0.4741833500595366, + "learning_rate": 9.637011481975858e-06, + "loss": 0.6378, + "step": 1103 + }, + { + "epoch": 0.8019431139763467, + "grad_norm": 0.45791949887831834, + "learning_rate": 9.636117973744982e-06, + "loss": 0.6179, + "step": 1104 + }, + { + "epoch": 0.8026695117245136, + "grad_norm": 0.5404917727674573, + "learning_rate": 9.635223408690688e-06, + "loss": 0.6081, + "step": 1105 + }, + { + "epoch": 0.8033959094726806, + "grad_norm": 0.4476699095118304, + "learning_rate": 9.634327787016894e-06, + "loss": 0.6212, + "step": 1106 + }, + { + "epoch": 0.8041223072208477, + "grad_norm": 0.4645084972315594, + "learning_rate": 9.633431108927764e-06, + "loss": 0.6103, + "step": 1107 + }, + { + "epoch": 0.8048487049690146, + "grad_norm": 0.4591418402004859, + "learning_rate": 9.632533374627695e-06, + "loss": 0.624, + "step": 1108 + }, + { + "epoch": 0.8055751027171816, + "grad_norm": 0.4264466790448802, + "learning_rate": 9.631634584321334e-06, + "loss": 0.604, + "step": 1109 + }, + { + "epoch": 0.8063015004653485, + "grad_norm": 0.462102047288957, + "learning_rate": 9.630734738213561e-06, + "loss": 0.6193, + "step": 1110 + }, + { + "epoch": 0.8070278982135155, + "grad_norm": 0.44591136659649727, + "learning_rate": 9.6298338365095e-06, + "loss": 0.5942, + "step": 1111 + }, + { + "epoch": 0.8077542959616825, + "grad_norm": 0.5135813110167959, + "learning_rate": 9.628931879414519e-06, + "loss": 0.6073, + "step": 1112 + }, + { + "epoch": 0.8084806937098495, + "grad_norm": 0.4301105343160582, + "learning_rate": 9.628028867134216e-06, + "loss": 0.5967, + "step": 1113 + }, + { + "epoch": 0.8092070914580165, + "grad_norm": 0.6578782656668021, + "learning_rate": 9.627124799874446e-06, + "loss": 0.6026, + "step": 1114 + }, + { + "epoch": 0.8099334892061835, + "grad_norm": 0.4958492633099008, + "learning_rate": 9.626219677841287e-06, + "loss": 0.6152, + "step": 1115 + }, + { + "epoch": 0.8106598869543504, + "grad_norm": 0.4756410241779951, + "learning_rate": 9.625313501241069e-06, + "loss": 0.5805, + "step": 1116 + }, + { + "epoch": 0.8113862847025174, + "grad_norm": 0.5433562306080713, + "learning_rate": 9.62440627028036e-06, + "loss": 0.626, + "step": 1117 + }, + { + "epoch": 0.8121126824506844, + "grad_norm": 0.48943135901184903, + "learning_rate": 9.623497985165964e-06, + "loss": 0.6052, + "step": 1118 + }, + { + "epoch": 0.8128390801988514, + "grad_norm": 0.5114674351446247, + "learning_rate": 9.622588646104934e-06, + "loss": 0.6023, + "step": 1119 + }, + { + "epoch": 0.8135654779470184, + "grad_norm": 0.5220309988554744, + "learning_rate": 9.62167825330455e-06, + "loss": 0.6, + "step": 1120 + }, + { + "epoch": 0.8142918756951854, + "grad_norm": 0.4994686649781941, + "learning_rate": 9.620766806972348e-06, + "loss": 0.5928, + "step": 1121 + }, + { + "epoch": 0.8150182734433523, + "grad_norm": 0.4514390687848239, + "learning_rate": 9.61985430731609e-06, + "loss": 0.6155, + "step": 1122 + }, + { + "epoch": 
0.8157446711915193, + "grad_norm": 0.4681036653080852, + "learning_rate": 9.618940754543788e-06, + "loss": 0.5874, + "step": 1123 + }, + { + "epoch": 0.8164710689396862, + "grad_norm": 0.4395085861388702, + "learning_rate": 9.618026148863689e-06, + "loss": 0.5947, + "step": 1124 + }, + { + "epoch": 0.8171974666878533, + "grad_norm": 0.5053784535742852, + "learning_rate": 9.617110490484281e-06, + "loss": 0.6001, + "step": 1125 + }, + { + "epoch": 0.8179238644360203, + "grad_norm": 0.4960217010563304, + "learning_rate": 9.616193779614294e-06, + "loss": 0.6125, + "step": 1126 + }, + { + "epoch": 0.8186502621841872, + "grad_norm": 0.4852233185765258, + "learning_rate": 9.615276016462694e-06, + "loss": 0.631, + "step": 1127 + }, + { + "epoch": 0.8193766599323542, + "grad_norm": 0.4550745865680181, + "learning_rate": 9.61435720123869e-06, + "loss": 0.6036, + "step": 1128 + }, + { + "epoch": 0.8201030576805212, + "grad_norm": 0.46274283671479666, + "learning_rate": 9.613437334151731e-06, + "loss": 0.6051, + "step": 1129 + }, + { + "epoch": 0.8208294554286881, + "grad_norm": 0.4768928920065255, + "learning_rate": 9.612516415411505e-06, + "loss": 0.6172, + "step": 1130 + }, + { + "epoch": 0.8215558531768552, + "grad_norm": 0.4382177071505045, + "learning_rate": 9.611594445227939e-06, + "loss": 0.6208, + "step": 1131 + }, + { + "epoch": 0.8222822509250222, + "grad_norm": 0.4637384188259406, + "learning_rate": 9.610671423811197e-06, + "loss": 0.6223, + "step": 1132 + }, + { + "epoch": 0.8230086486731891, + "grad_norm": 0.4480710315760765, + "learning_rate": 9.60974735137169e-06, + "loss": 0.6044, + "step": 1133 + }, + { + "epoch": 0.8237350464213561, + "grad_norm": 0.7018777620026823, + "learning_rate": 9.608822228120063e-06, + "loss": 0.6102, + "step": 1134 + }, + { + "epoch": 0.824461444169523, + "grad_norm": 0.4466079797386302, + "learning_rate": 9.607896054267201e-06, + "loss": 0.6009, + "step": 1135 + }, + { + "epoch": 0.82518784191769, + "grad_norm": 0.5051574955921055, + "learning_rate": 9.60696883002423e-06, + "loss": 0.6031, + "step": 1136 + }, + { + "epoch": 0.8259142396658571, + "grad_norm": 0.4809646087862845, + "learning_rate": 9.606040555602516e-06, + "loss": 0.5941, + "step": 1137 + }, + { + "epoch": 0.826640637414024, + "grad_norm": 0.44871620180448013, + "learning_rate": 9.60511123121366e-06, + "loss": 0.6103, + "step": 1138 + }, + { + "epoch": 0.827367035162191, + "grad_norm": 0.49068878161291896, + "learning_rate": 9.604180857069509e-06, + "loss": 0.6228, + "step": 1139 + }, + { + "epoch": 0.828093432910358, + "grad_norm": 0.4768595004531384, + "learning_rate": 9.603249433382145e-06, + "loss": 0.6194, + "step": 1140 + }, + { + "epoch": 0.8288198306585249, + "grad_norm": 0.48342908704288834, + "learning_rate": 9.602316960363889e-06, + "loss": 0.6033, + "step": 1141 + }, + { + "epoch": 0.8295462284066919, + "grad_norm": 0.6161075106997672, + "learning_rate": 9.601383438227303e-06, + "loss": 0.6043, + "step": 1142 + }, + { + "epoch": 0.830272626154859, + "grad_norm": 0.4365995879925137, + "learning_rate": 9.600448867185188e-06, + "loss": 0.5937, + "step": 1143 + }, + { + "epoch": 0.8309990239030259, + "grad_norm": 0.43924899950044805, + "learning_rate": 9.599513247450581e-06, + "loss": 0.6126, + "step": 1144 + }, + { + "epoch": 0.8317254216511929, + "grad_norm": 0.4606375387545376, + "learning_rate": 9.598576579236765e-06, + "loss": 0.6205, + "step": 1145 + }, + { + "epoch": 0.8324518193993599, + "grad_norm": 0.4370080723956695, + "learning_rate": 9.597638862757255e-06, + "loss": 
0.6222, + "step": 1146 + }, + { + "epoch": 0.8331782171475268, + "grad_norm": 0.45381509810289633, + "learning_rate": 9.596700098225806e-06, + "loss": 0.5969, + "step": 1147 + }, + { + "epoch": 0.8339046148956938, + "grad_norm": 0.45557634817071224, + "learning_rate": 9.595760285856418e-06, + "loss": 0.5993, + "step": 1148 + }, + { + "epoch": 0.8346310126438609, + "grad_norm": 0.4413227228028858, + "learning_rate": 9.59481942586332e-06, + "loss": 0.6113, + "step": 1149 + }, + { + "epoch": 0.8353574103920278, + "grad_norm": 0.45948096434027247, + "learning_rate": 9.593877518460988e-06, + "loss": 0.6003, + "step": 1150 + }, + { + "epoch": 0.8360838081401948, + "grad_norm": 0.4077180909410102, + "learning_rate": 9.592934563864136e-06, + "loss": 0.6172, + "step": 1151 + }, + { + "epoch": 0.8368102058883617, + "grad_norm": 0.43642720072650515, + "learning_rate": 9.59199056228771e-06, + "loss": 0.5942, + "step": 1152 + }, + { + "epoch": 0.8375366036365287, + "grad_norm": 0.4566716571217164, + "learning_rate": 9.591045513946904e-06, + "loss": 0.6288, + "step": 1153 + }, + { + "epoch": 0.8382630013846957, + "grad_norm": 0.4256375577799388, + "learning_rate": 9.590099419057142e-06, + "loss": 0.5925, + "step": 1154 + }, + { + "epoch": 0.8389893991328626, + "grad_norm": 0.43304644846968315, + "learning_rate": 9.589152277834093e-06, + "loss": 0.6201, + "step": 1155 + }, + { + "epoch": 0.8397157968810297, + "grad_norm": 0.4100228725712963, + "learning_rate": 9.58820409049366e-06, + "loss": 0.5943, + "step": 1156 + }, + { + "epoch": 0.8404421946291967, + "grad_norm": 0.4099528807583989, + "learning_rate": 9.587254857251988e-06, + "loss": 0.6109, + "step": 1157 + }, + { + "epoch": 0.8411685923773636, + "grad_norm": 0.491774985036076, + "learning_rate": 9.586304578325457e-06, + "loss": 0.6096, + "step": 1158 + }, + { + "epoch": 0.8418949901255306, + "grad_norm": 0.559738280848998, + "learning_rate": 9.585353253930692e-06, + "loss": 0.6069, + "step": 1159 + }, + { + "epoch": 0.8426213878736976, + "grad_norm": 0.40139731209421825, + "learning_rate": 9.584400884284546e-06, + "loss": 0.6131, + "step": 1160 + }, + { + "epoch": 0.8433477856218645, + "grad_norm": 0.4400340950055217, + "learning_rate": 9.58344746960412e-06, + "loss": 0.6129, + "step": 1161 + }, + { + "epoch": 0.8440741833700316, + "grad_norm": 0.4166850878820713, + "learning_rate": 9.582493010106744e-06, + "loss": 0.6041, + "step": 1162 + }, + { + "epoch": 0.8448005811181986, + "grad_norm": 0.5165919941194519, + "learning_rate": 9.581537506009996e-06, + "loss": 0.6115, + "step": 1163 + }, + { + "epoch": 0.8455269788663655, + "grad_norm": 0.4151737437790665, + "learning_rate": 9.580580957531685e-06, + "loss": 0.5974, + "step": 1164 + }, + { + "epoch": 0.8462533766145325, + "grad_norm": 0.4550231106805591, + "learning_rate": 9.579623364889863e-06, + "loss": 0.6238, + "step": 1165 + }, + { + "epoch": 0.8469797743626994, + "grad_norm": 0.4506499145111714, + "learning_rate": 9.578664728302813e-06, + "loss": 0.608, + "step": 1166 + }, + { + "epoch": 0.8477061721108664, + "grad_norm": 0.5499080210212972, + "learning_rate": 9.577705047989065e-06, + "loss": 0.6023, + "step": 1167 + }, + { + "epoch": 0.8484325698590335, + "grad_norm": 0.625121033765801, + "learning_rate": 9.57674432416738e-06, + "loss": 0.5984, + "step": 1168 + }, + { + "epoch": 0.8491589676072004, + "grad_norm": 1.1862443305257935, + "learning_rate": 9.575782557056759e-06, + "loss": 0.6052, + "step": 1169 + }, + { + "epoch": 0.8498853653553674, + "grad_norm": 0.47769553696473244, + 
"learning_rate": 9.57481974687644e-06, + "loss": 0.5912, + "step": 1170 + }, + { + "epoch": 0.8506117631035344, + "grad_norm": 0.4080077932005349, + "learning_rate": 9.573855893845905e-06, + "loss": 0.6235, + "step": 1171 + }, + { + "epoch": 0.8513381608517013, + "grad_norm": 0.4761375945040686, + "learning_rate": 9.572890998184862e-06, + "loss": 0.606, + "step": 1172 + }, + { + "epoch": 0.8520645585998683, + "grad_norm": 0.5766409060005017, + "learning_rate": 9.571925060113267e-06, + "loss": 0.5979, + "step": 1173 + }, + { + "epoch": 0.8527909563480354, + "grad_norm": 0.42596241668640383, + "learning_rate": 9.57095807985131e-06, + "loss": 0.5915, + "step": 1174 + }, + { + "epoch": 0.8535173540962023, + "grad_norm": 0.5056126132604277, + "learning_rate": 9.569990057619414e-06, + "loss": 0.5898, + "step": 1175 + }, + { + "epoch": 0.8542437518443693, + "grad_norm": 0.43335266407709333, + "learning_rate": 9.569020993638249e-06, + "loss": 0.6022, + "step": 1176 + }, + { + "epoch": 0.8549701495925363, + "grad_norm": 0.44207808281110866, + "learning_rate": 9.568050888128716e-06, + "loss": 0.6071, + "step": 1177 + }, + { + "epoch": 0.8556965473407032, + "grad_norm": 0.4341399555907341, + "learning_rate": 9.567079741311956e-06, + "loss": 0.6043, + "step": 1178 + }, + { + "epoch": 0.8564229450888702, + "grad_norm": 0.4369089309389326, + "learning_rate": 9.566107553409341e-06, + "loss": 0.587, + "step": 1179 + }, + { + "epoch": 0.8571493428370373, + "grad_norm": 0.4599730151913197, + "learning_rate": 9.565134324642491e-06, + "loss": 0.6102, + "step": 1180 + }, + { + "epoch": 0.8578757405852042, + "grad_norm": 0.43977979280579654, + "learning_rate": 9.564160055233256e-06, + "loss": 0.5987, + "step": 1181 + }, + { + "epoch": 0.8586021383333712, + "grad_norm": 0.46783053277652853, + "learning_rate": 9.563184745403725e-06, + "loss": 0.6029, + "step": 1182 + }, + { + "epoch": 0.8593285360815381, + "grad_norm": 0.44458871164794883, + "learning_rate": 9.562208395376223e-06, + "loss": 0.593, + "step": 1183 + }, + { + "epoch": 0.8600549338297051, + "grad_norm": 0.5733471186258695, + "learning_rate": 9.561231005373315e-06, + "loss": 0.6064, + "step": 1184 + }, + { + "epoch": 0.8607813315778721, + "grad_norm": 0.45341893514784476, + "learning_rate": 9.560252575617798e-06, + "loss": 0.5947, + "step": 1185 + }, + { + "epoch": 0.8615077293260391, + "grad_norm": 0.5303361989038317, + "learning_rate": 9.559273106332716e-06, + "loss": 0.5921, + "step": 1186 + }, + { + "epoch": 0.8622341270742061, + "grad_norm": 0.4370520610409579, + "learning_rate": 9.558292597741337e-06, + "loss": 0.607, + "step": 1187 + }, + { + "epoch": 0.8629605248223731, + "grad_norm": 0.4175970592323233, + "learning_rate": 9.557311050067175e-06, + "loss": 0.6018, + "step": 1188 + }, + { + "epoch": 0.86368692257054, + "grad_norm": 0.46416643253026196, + "learning_rate": 9.556328463533976e-06, + "loss": 0.6089, + "step": 1189 + }, + { + "epoch": 0.864413320318707, + "grad_norm": 0.6168553625857448, + "learning_rate": 9.555344838365727e-06, + "loss": 0.5861, + "step": 1190 + }, + { + "epoch": 0.865139718066874, + "grad_norm": 0.48915934761576074, + "learning_rate": 9.554360174786651e-06, + "loss": 0.6088, + "step": 1191 + }, + { + "epoch": 0.865866115815041, + "grad_norm": 0.44675004959116177, + "learning_rate": 9.553374473021204e-06, + "loss": 0.5922, + "step": 1192 + }, + { + "epoch": 0.866592513563208, + "grad_norm": 0.4385531991454444, + "learning_rate": 9.552387733294081e-06, + "loss": 0.5968, + "step": 1193 + }, + { + "epoch": 
0.867318911311375, + "grad_norm": 0.472160005452995, + "learning_rate": 9.551399955830215e-06, + "loss": 0.6009, + "step": 1194 + }, + { + "epoch": 0.8680453090595419, + "grad_norm": 0.5114615003958931, + "learning_rate": 9.550411140854772e-06, + "loss": 0.5957, + "step": 1195 + }, + { + "epoch": 0.8687717068077089, + "grad_norm": 0.46197727438219466, + "learning_rate": 9.549421288593157e-06, + "loss": 0.6035, + "step": 1196 + }, + { + "epoch": 0.8694981045558758, + "grad_norm": 0.4505938549631761, + "learning_rate": 9.548430399271013e-06, + "loss": 0.6237, + "step": 1197 + }, + { + "epoch": 0.8702245023040429, + "grad_norm": 0.43654822032779134, + "learning_rate": 9.547438473114219e-06, + "loss": 0.6089, + "step": 1198 + }, + { + "epoch": 0.8709509000522099, + "grad_norm": 0.46067909068019247, + "learning_rate": 9.546445510348884e-06, + "loss": 0.61, + "step": 1199 + }, + { + "epoch": 0.8716772978003768, + "grad_norm": 0.41694081713666115, + "learning_rate": 9.545451511201365e-06, + "loss": 0.5918, + "step": 1200 + }, + { + "epoch": 0.8724036955485438, + "grad_norm": 0.46578768324984093, + "learning_rate": 9.54445647589824e-06, + "loss": 0.6005, + "step": 1201 + }, + { + "epoch": 0.8731300932967108, + "grad_norm": 0.4438616477116534, + "learning_rate": 9.543460404666338e-06, + "loss": 0.5999, + "step": 1202 + }, + { + "epoch": 0.8738564910448777, + "grad_norm": 0.48358201793997363, + "learning_rate": 9.542463297732716e-06, + "loss": 0.6064, + "step": 1203 + }, + { + "epoch": 0.8745828887930447, + "grad_norm": 0.501153042263637, + "learning_rate": 9.54146515532467e-06, + "loss": 0.6203, + "step": 1204 + }, + { + "epoch": 0.8753092865412118, + "grad_norm": 0.4521383558348136, + "learning_rate": 9.540465977669728e-06, + "loss": 0.6022, + "step": 1205 + }, + { + "epoch": 0.8760356842893787, + "grad_norm": 0.5112596720285417, + "learning_rate": 9.539465764995658e-06, + "loss": 0.5977, + "step": 1206 + }, + { + "epoch": 0.8767620820375457, + "grad_norm": 0.4861584352029897, + "learning_rate": 9.538464517530466e-06, + "loss": 0.5856, + "step": 1207 + }, + { + "epoch": 0.8774884797857126, + "grad_norm": 0.4200265080369869, + "learning_rate": 9.537462235502385e-06, + "loss": 0.5866, + "step": 1208 + }, + { + "epoch": 0.8782148775338796, + "grad_norm": 0.4261094133432591, + "learning_rate": 9.536458919139894e-06, + "loss": 0.5933, + "step": 1209 + }, + { + "epoch": 0.8789412752820466, + "grad_norm": 0.4003104970002111, + "learning_rate": 9.535454568671705e-06, + "loss": 0.5844, + "step": 1210 + }, + { + "epoch": 0.8796676730302136, + "grad_norm": 0.42295910863999364, + "learning_rate": 9.53444918432676e-06, + "loss": 0.6024, + "step": 1211 + }, + { + "epoch": 0.8803940707783806, + "grad_norm": 0.4149494814967926, + "learning_rate": 9.53344276633424e-06, + "loss": 0.5863, + "step": 1212 + }, + { + "epoch": 0.8811204685265476, + "grad_norm": 0.5110127220773379, + "learning_rate": 9.532435314923566e-06, + "loss": 0.5767, + "step": 1213 + }, + { + "epoch": 0.8818468662747145, + "grad_norm": 0.43349333837655685, + "learning_rate": 9.531426830324388e-06, + "loss": 0.5951, + "step": 1214 + }, + { + "epoch": 0.8825732640228815, + "grad_norm": 0.44997031713934815, + "learning_rate": 9.530417312766598e-06, + "loss": 0.6187, + "step": 1215 + }, + { + "epoch": 0.8832996617710485, + "grad_norm": 0.4446591721155272, + "learning_rate": 9.529406762480318e-06, + "loss": 0.5985, + "step": 1216 + }, + { + "epoch": 0.8840260595192155, + "grad_norm": 0.44120700182390005, + "learning_rate": 9.528395179695907e-06, + 
"loss": 0.5813, + "step": 1217 + }, + { + "epoch": 0.8847524572673825, + "grad_norm": 0.4295735205310877, + "learning_rate": 9.527382564643957e-06, + "loss": 0.5971, + "step": 1218 + }, + { + "epoch": 0.8854788550155495, + "grad_norm": 0.43730360118736156, + "learning_rate": 9.526368917555304e-06, + "loss": 0.6023, + "step": 1219 + }, + { + "epoch": 0.8862052527637164, + "grad_norm": 0.43250587556589976, + "learning_rate": 9.52535423866101e-06, + "loss": 0.5893, + "step": 1220 + }, + { + "epoch": 0.8869316505118834, + "grad_norm": 0.47439969227732015, + "learning_rate": 9.524338528192375e-06, + "loss": 0.5983, + "step": 1221 + }, + { + "epoch": 0.8876580482600503, + "grad_norm": 0.43225531857326166, + "learning_rate": 9.523321786380936e-06, + "loss": 0.5784, + "step": 1222 + }, + { + "epoch": 0.8883844460082174, + "grad_norm": 0.5170344250080916, + "learning_rate": 9.522304013458464e-06, + "loss": 0.6216, + "step": 1223 + }, + { + "epoch": 0.8891108437563844, + "grad_norm": 0.42919086642378623, + "learning_rate": 9.521285209656964e-06, + "loss": 0.5881, + "step": 1224 + }, + { + "epoch": 0.8898372415045513, + "grad_norm": 0.47659107028807385, + "learning_rate": 9.520265375208675e-06, + "loss": 0.5952, + "step": 1225 + }, + { + "epoch": 0.8905636392527183, + "grad_norm": 0.41359128320071464, + "learning_rate": 9.519244510346076e-06, + "loss": 0.5908, + "step": 1226 + }, + { + "epoch": 0.8912900370008853, + "grad_norm": 0.5644872809341976, + "learning_rate": 9.518222615301875e-06, + "loss": 0.5875, + "step": 1227 + }, + { + "epoch": 0.8920164347490522, + "grad_norm": 0.47490025096662025, + "learning_rate": 9.51719969030902e-06, + "loss": 0.5917, + "step": 1228 + }, + { + "epoch": 0.8927428324972193, + "grad_norm": 0.569369819329793, + "learning_rate": 9.516175735600688e-06, + "loss": 0.6024, + "step": 1229 + }, + { + "epoch": 0.8934692302453863, + "grad_norm": 0.43225621864223424, + "learning_rate": 9.515150751410298e-06, + "loss": 0.6077, + "step": 1230 + }, + { + "epoch": 0.8941956279935532, + "grad_norm": 0.708224935453519, + "learning_rate": 9.514124737971495e-06, + "loss": 0.5964, + "step": 1231 + }, + { + "epoch": 0.8949220257417202, + "grad_norm": 0.4739759856133637, + "learning_rate": 9.51309769551817e-06, + "loss": 0.5763, + "step": 1232 + }, + { + "epoch": 0.8956484234898872, + "grad_norm": 0.7728743977820823, + "learning_rate": 9.512069624284433e-06, + "loss": 0.5842, + "step": 1233 + }, + { + "epoch": 0.8963748212380541, + "grad_norm": 0.43514666056639595, + "learning_rate": 9.511040524504645e-06, + "loss": 0.5967, + "step": 1234 + }, + { + "epoch": 0.8971012189862212, + "grad_norm": 0.45557762839508126, + "learning_rate": 9.510010396413389e-06, + "loss": 0.5897, + "step": 1235 + }, + { + "epoch": 0.8978276167343882, + "grad_norm": 0.5039340185306304, + "learning_rate": 9.508979240245489e-06, + "loss": 0.5773, + "step": 1236 + }, + { + "epoch": 0.8985540144825551, + "grad_norm": 0.46341563582692274, + "learning_rate": 9.507947056236003e-06, + "loss": 0.5814, + "step": 1237 + }, + { + "epoch": 0.8992804122307221, + "grad_norm": 0.47824940112014946, + "learning_rate": 9.506913844620217e-06, + "loss": 0.6016, + "step": 1238 + }, + { + "epoch": 0.900006809978889, + "grad_norm": 0.46306899110926264, + "learning_rate": 9.505879605633664e-06, + "loss": 0.5969, + "step": 1239 + }, + { + "epoch": 0.900733207727056, + "grad_norm": 0.4785112448900572, + "learning_rate": 9.504844339512096e-06, + "loss": 0.6181, + "step": 1240 + }, + { + "epoch": 0.9014596054752231, + "grad_norm": 
0.44806415943157596, + "learning_rate": 9.503808046491511e-06, + "loss": 0.5909, + "step": 1241 + }, + { + "epoch": 0.90218600322339, + "grad_norm": 0.4039053308077116, + "learning_rate": 9.502770726808133e-06, + "loss": 0.5942, + "step": 1242 + }, + { + "epoch": 0.902912400971557, + "grad_norm": 0.4988364921150387, + "learning_rate": 9.501732380698426e-06, + "loss": 0.6076, + "step": 1243 + }, + { + "epoch": 0.903638798719724, + "grad_norm": 0.4467206176328996, + "learning_rate": 9.500693008399085e-06, + "loss": 0.6061, + "step": 1244 + }, + { + "epoch": 0.9043651964678909, + "grad_norm": 0.40186754448407863, + "learning_rate": 9.49965261014704e-06, + "loss": 0.5745, + "step": 1245 + }, + { + "epoch": 0.9050915942160579, + "grad_norm": 0.4649796467570825, + "learning_rate": 9.498611186179454e-06, + "loss": 0.5891, + "step": 1246 + }, + { + "epoch": 0.9058179919642249, + "grad_norm": 0.4076705157572678, + "learning_rate": 9.497568736733722e-06, + "loss": 0.6031, + "step": 1247 + }, + { + "epoch": 0.9065443897123919, + "grad_norm": 0.4512970825986025, + "learning_rate": 9.496525262047479e-06, + "loss": 0.589, + "step": 1248 + }, + { + "epoch": 0.9072707874605589, + "grad_norm": 0.42034195469706914, + "learning_rate": 9.495480762358585e-06, + "loss": 0.5858, + "step": 1249 + }, + { + "epoch": 0.9079971852087259, + "grad_norm": 0.42929899360092316, + "learning_rate": 9.494435237905142e-06, + "loss": 0.5743, + "step": 1250 + }, + { + "epoch": 0.9087235829568928, + "grad_norm": 0.4118235244162934, + "learning_rate": 9.493388688925481e-06, + "loss": 0.5844, + "step": 1251 + }, + { + "epoch": 0.9094499807050598, + "grad_norm": 0.41506895210123806, + "learning_rate": 9.492341115658167e-06, + "loss": 0.6049, + "step": 1252 + }, + { + "epoch": 0.9101763784532267, + "grad_norm": 0.4165328260065925, + "learning_rate": 9.491292518341998e-06, + "loss": 0.6008, + "step": 1253 + }, + { + "epoch": 0.9109027762013938, + "grad_norm": 0.38977236916049524, + "learning_rate": 9.490242897216008e-06, + "loss": 0.615, + "step": 1254 + }, + { + "epoch": 0.9116291739495608, + "grad_norm": 0.46505897512184374, + "learning_rate": 9.489192252519462e-06, + "loss": 0.5814, + "step": 1255 + }, + { + "epoch": 0.9123555716977277, + "grad_norm": 0.446928251555708, + "learning_rate": 9.488140584491859e-06, + "loss": 0.6047, + "step": 1256 + }, + { + "epoch": 0.9130819694458947, + "grad_norm": 0.43706570616826285, + "learning_rate": 9.487087893372933e-06, + "loss": 0.5741, + "step": 1257 + }, + { + "epoch": 0.9138083671940617, + "grad_norm": 0.4537167354168948, + "learning_rate": 9.486034179402645e-06, + "loss": 0.5896, + "step": 1258 + }, + { + "epoch": 0.9145347649422286, + "grad_norm": 0.41506148446047436, + "learning_rate": 9.484979442821199e-06, + "loss": 0.5949, + "step": 1259 + }, + { + "epoch": 0.9152611626903957, + "grad_norm": 0.4638563917811123, + "learning_rate": 9.483923683869025e-06, + "loss": 0.584, + "step": 1260 + }, + { + "epoch": 0.9159875604385627, + "grad_norm": 0.41081717634677034, + "learning_rate": 9.482866902786784e-06, + "loss": 0.5958, + "step": 1261 + }, + { + "epoch": 0.9167139581867296, + "grad_norm": 0.44940733112945636, + "learning_rate": 9.481809099815382e-06, + "loss": 0.5868, + "step": 1262 + }, + { + "epoch": 0.9174403559348966, + "grad_norm": 0.5181433591757858, + "learning_rate": 9.480750275195942e-06, + "loss": 0.609, + "step": 1263 + }, + { + "epoch": 0.9181667536830636, + "grad_norm": 0.4264257820314425, + "learning_rate": 9.479690429169832e-06, + "loss": 0.585, + "step": 1264 + }, + { 
+ "epoch": 0.9188931514312305, + "grad_norm": 0.4385872968678699, + "learning_rate": 9.478629561978648e-06, + "loss": 0.5789, + "step": 1265 + }, + { + "epoch": 0.9196195491793976, + "grad_norm": 0.4339437079303383, + "learning_rate": 9.477567673864217e-06, + "loss": 0.5947, + "step": 1266 + }, + { + "epoch": 0.9203459469275646, + "grad_norm": 0.4415046209883479, + "learning_rate": 9.476504765068604e-06, + "loss": 0.6006, + "step": 1267 + }, + { + "epoch": 0.9210723446757315, + "grad_norm": 0.42129146459625944, + "learning_rate": 9.475440835834103e-06, + "loss": 0.582, + "step": 1268 + }, + { + "epoch": 0.9217987424238985, + "grad_norm": 0.48232608637628704, + "learning_rate": 9.474375886403239e-06, + "loss": 0.5914, + "step": 1269 + }, + { + "epoch": 0.9225251401720654, + "grad_norm": 0.4770042456279813, + "learning_rate": 9.473309917018774e-06, + "loss": 0.6019, + "step": 1270 + }, + { + "epoch": 0.9232515379202324, + "grad_norm": 0.4536442707571275, + "learning_rate": 9.4722429279237e-06, + "loss": 0.5864, + "step": 1271 + }, + { + "epoch": 0.9239779356683995, + "grad_norm": 0.42626513154423673, + "learning_rate": 9.471174919361244e-06, + "loss": 0.5763, + "step": 1272 + }, + { + "epoch": 0.9247043334165664, + "grad_norm": 0.6205482644129555, + "learning_rate": 9.47010589157486e-06, + "loss": 0.5901, + "step": 1273 + }, + { + "epoch": 0.9254307311647334, + "grad_norm": 0.43225121195112626, + "learning_rate": 9.469035844808238e-06, + "loss": 0.5956, + "step": 1274 + }, + { + "epoch": 0.9261571289129004, + "grad_norm": 0.41283182941994373, + "learning_rate": 9.467964779305304e-06, + "loss": 0.5997, + "step": 1275 + }, + { + "epoch": 0.9268835266610673, + "grad_norm": 0.41546967158855563, + "learning_rate": 9.466892695310204e-06, + "loss": 0.584, + "step": 1276 + }, + { + "epoch": 0.9276099244092343, + "grad_norm": 0.4300218140453633, + "learning_rate": 9.465819593067332e-06, + "loss": 0.5849, + "step": 1277 + }, + { + "epoch": 0.9283363221574014, + "grad_norm": 0.4456416918954236, + "learning_rate": 9.464745472821302e-06, + "loss": 0.5973, + "step": 1278 + }, + { + "epoch": 0.9290627199055683, + "grad_norm": 0.4482037090374508, + "learning_rate": 9.463670334816967e-06, + "loss": 0.58, + "step": 1279 + }, + { + "epoch": 0.9297891176537353, + "grad_norm": 0.46007508129114516, + "learning_rate": 9.462594179299408e-06, + "loss": 0.6001, + "step": 1280 + }, + { + "epoch": 0.9305155154019022, + "grad_norm": 0.5011958549498003, + "learning_rate": 9.461517006513938e-06, + "loss": 0.5845, + "step": 1281 + }, + { + "epoch": 0.9312419131500692, + "grad_norm": 0.42431402288059444, + "learning_rate": 9.460438816706106e-06, + "loss": 0.5905, + "step": 1282 + }, + { + "epoch": 0.9319683108982362, + "grad_norm": 0.4502683304139396, + "learning_rate": 9.459359610121688e-06, + "loss": 0.5704, + "step": 1283 + }, + { + "epoch": 0.9326947086464032, + "grad_norm": 0.3796874806100935, + "learning_rate": 9.458279387006695e-06, + "loss": 0.5768, + "step": 1284 + }, + { + "epoch": 0.9334211063945702, + "grad_norm": 0.5004568592535342, + "learning_rate": 9.45719814760737e-06, + "loss": 0.5774, + "step": 1285 + }, + { + "epoch": 0.9341475041427372, + "grad_norm": 0.4340577622957361, + "learning_rate": 9.456115892170183e-06, + "loss": 0.5911, + "step": 1286 + }, + { + "epoch": 0.9348739018909041, + "grad_norm": 0.45670721572589634, + "learning_rate": 9.45503262094184e-06, + "loss": 0.599, + "step": 1287 + }, + { + "epoch": 0.9356002996390711, + "grad_norm": 0.45572780467011514, + "learning_rate": 
9.453948334169279e-06, + "loss": 0.5747, + "step": 1288 + }, + { + "epoch": 0.9363266973872381, + "grad_norm": 0.38841356192268106, + "learning_rate": 9.452863032099666e-06, + "loss": 0.5971, + "step": 1289 + }, + { + "epoch": 0.9370530951354051, + "grad_norm": 0.4298037377188982, + "learning_rate": 9.451776714980402e-06, + "loss": 0.5913, + "step": 1290 + }, + { + "epoch": 0.9377794928835721, + "grad_norm": 0.41341305861326055, + "learning_rate": 9.450689383059118e-06, + "loss": 0.5796, + "step": 1291 + }, + { + "epoch": 0.9385058906317391, + "grad_norm": 0.42660791554657346, + "learning_rate": 9.449601036583673e-06, + "loss": 0.5819, + "step": 1292 + }, + { + "epoch": 0.939232288379906, + "grad_norm": 0.5598103689052227, + "learning_rate": 9.448511675802162e-06, + "loss": 0.5838, + "step": 1293 + }, + { + "epoch": 0.939958686128073, + "grad_norm": 0.4586706687352272, + "learning_rate": 9.447421300962911e-06, + "loss": 0.5976, + "step": 1294 + }, + { + "epoch": 0.94068508387624, + "grad_norm": 0.43063693058417746, + "learning_rate": 9.446329912314475e-06, + "loss": 0.5797, + "step": 1295 + }, + { + "epoch": 0.9414114816244069, + "grad_norm": 0.4404865199677851, + "learning_rate": 9.44523751010564e-06, + "loss": 0.5937, + "step": 1296 + }, + { + "epoch": 0.942137879372574, + "grad_norm": 0.4531349959997388, + "learning_rate": 9.444144094585425e-06, + "loss": 0.5941, + "step": 1297 + }, + { + "epoch": 0.942864277120741, + "grad_norm": 0.5032016176400471, + "learning_rate": 9.443049666003077e-06, + "loss": 0.5945, + "step": 1298 + }, + { + "epoch": 0.9435906748689079, + "grad_norm": 0.44126419665834615, + "learning_rate": 9.441954224608079e-06, + "loss": 0.5949, + "step": 1299 + }, + { + "epoch": 0.9443170726170749, + "grad_norm": 0.45391630167344893, + "learning_rate": 9.440857770650139e-06, + "loss": 0.5898, + "step": 1300 + }, + { + "epoch": 0.9450434703652418, + "grad_norm": 0.3956164187382792, + "learning_rate": 9.439760304379197e-06, + "loss": 0.5874, + "step": 1301 + }, + { + "epoch": 0.9457698681134088, + "grad_norm": 0.4597343602525186, + "learning_rate": 9.438661826045429e-06, + "loss": 0.5841, + "step": 1302 + }, + { + "epoch": 0.9464962658615759, + "grad_norm": 0.4933253827016849, + "learning_rate": 9.437562335899237e-06, + "loss": 0.5949, + "step": 1303 + }, + { + "epoch": 0.9472226636097428, + "grad_norm": 0.42059079303606517, + "learning_rate": 9.436461834191254e-06, + "loss": 0.5735, + "step": 1304 + }, + { + "epoch": 0.9479490613579098, + "grad_norm": 0.6604789856347021, + "learning_rate": 9.435360321172344e-06, + "loss": 0.5951, + "step": 1305 + }, + { + "epoch": 0.9486754591060768, + "grad_norm": 0.6174395501507661, + "learning_rate": 9.434257797093602e-06, + "loss": 0.5904, + "step": 1306 + }, + { + "epoch": 0.9494018568542437, + "grad_norm": 0.5221870207891371, + "learning_rate": 9.433154262206353e-06, + "loss": 0.6028, + "step": 1307 + }, + { + "epoch": 0.9501282546024107, + "grad_norm": 0.4748141147379064, + "learning_rate": 9.432049716762151e-06, + "loss": 0.5967, + "step": 1308 + }, + { + "epoch": 0.9508546523505778, + "grad_norm": 0.4557962640119291, + "learning_rate": 9.430944161012784e-06, + "loss": 0.5936, + "step": 1309 + }, + { + "epoch": 0.9515810500987447, + "grad_norm": 0.4067491075418985, + "learning_rate": 9.42983759521027e-06, + "loss": 0.6003, + "step": 1310 + }, + { + "epoch": 0.9523074478469117, + "grad_norm": 0.44211224151004935, + "learning_rate": 9.428730019606853e-06, + "loss": 0.5775, + "step": 1311 + }, + { + "epoch": 0.9530338455950786, + 
"grad_norm": 0.407063349141801, + "learning_rate": 9.42762143445501e-06, + "loss": 0.59, + "step": 1312 + }, + { + "epoch": 0.9537602433432456, + "grad_norm": 0.4028594981568491, + "learning_rate": 9.426511840007448e-06, + "loss": 0.5801, + "step": 1313 + }, + { + "epoch": 0.9544866410914126, + "grad_norm": 0.3919749106342764, + "learning_rate": 9.425401236517104e-06, + "loss": 0.5905, + "step": 1314 + }, + { + "epoch": 0.9552130388395796, + "grad_norm": 0.4032757710830502, + "learning_rate": 9.424289624237143e-06, + "loss": 0.6031, + "step": 1315 + }, + { + "epoch": 0.9559394365877466, + "grad_norm": 0.4293862484316795, + "learning_rate": 9.423177003420966e-06, + "loss": 0.5708, + "step": 1316 + }, + { + "epoch": 0.9566658343359136, + "grad_norm": 0.45285430043737923, + "learning_rate": 9.422063374322197e-06, + "loss": 0.5883, + "step": 1317 + }, + { + "epoch": 0.9573922320840805, + "grad_norm": 0.5523709468876832, + "learning_rate": 9.420948737194693e-06, + "loss": 0.5866, + "step": 1318 + }, + { + "epoch": 0.9581186298322475, + "grad_norm": 0.4459756657537176, + "learning_rate": 9.419833092292542e-06, + "loss": 0.5641, + "step": 1319 + }, + { + "epoch": 0.9588450275804145, + "grad_norm": 0.48301638137861325, + "learning_rate": 9.418716439870056e-06, + "loss": 0.5865, + "step": 1320 + }, + { + "epoch": 0.9595714253285815, + "grad_norm": 0.4128881676008887, + "learning_rate": 9.417598780181787e-06, + "loss": 0.6071, + "step": 1321 + }, + { + "epoch": 0.9602978230767485, + "grad_norm": 0.44753948046420106, + "learning_rate": 9.416480113482505e-06, + "loss": 0.5822, + "step": 1322 + }, + { + "epoch": 0.9610242208249155, + "grad_norm": 0.4279853381773522, + "learning_rate": 9.415360440027216e-06, + "loss": 0.5804, + "step": 1323 + }, + { + "epoch": 0.9617506185730824, + "grad_norm": 0.4338581451320516, + "learning_rate": 9.414239760071158e-06, + "loss": 0.5887, + "step": 1324 + }, + { + "epoch": 0.9624770163212494, + "grad_norm": 0.42860628721189403, + "learning_rate": 9.413118073869791e-06, + "loss": 0.5882, + "step": 1325 + }, + { + "epoch": 0.9632034140694163, + "grad_norm": 0.44956892815122457, + "learning_rate": 9.411995381678811e-06, + "loss": 0.5782, + "step": 1326 + }, + { + "epoch": 0.9639298118175834, + "grad_norm": 0.4814860194215449, + "learning_rate": 9.410871683754139e-06, + "loss": 0.5702, + "step": 1327 + }, + { + "epoch": 0.9646562095657504, + "grad_norm": 0.42581017230827334, + "learning_rate": 9.409746980351927e-06, + "loss": 0.5678, + "step": 1328 + }, + { + "epoch": 0.9653826073139173, + "grad_norm": 0.44826183504581213, + "learning_rate": 9.408621271728555e-06, + "loss": 0.5591, + "step": 1329 + }, + { + "epoch": 0.9661090050620843, + "grad_norm": 0.5714153732960652, + "learning_rate": 9.407494558140636e-06, + "loss": 0.6072, + "step": 1330 + }, + { + "epoch": 0.9668354028102513, + "grad_norm": 0.6836517336640814, + "learning_rate": 9.406366839845004e-06, + "loss": 0.577, + "step": 1331 + }, + { + "epoch": 0.9675618005584182, + "grad_norm": 0.41510009176677465, + "learning_rate": 9.405238117098736e-06, + "loss": 0.582, + "step": 1332 + }, + { + "epoch": 0.9682881983065853, + "grad_norm": 0.42043939764561933, + "learning_rate": 9.40410839015912e-06, + "loss": 0.5638, + "step": 1333 + }, + { + "epoch": 0.9690145960547523, + "grad_norm": 0.47474607904645816, + "learning_rate": 9.40297765928369e-06, + "loss": 0.5808, + "step": 1334 + }, + { + "epoch": 0.9697409938029192, + "grad_norm": 0.4404458653043789, + "learning_rate": 9.401845924730197e-06, + "loss": 0.6007, + "step": 
1335 + }, + { + "epoch": 0.9704673915510862, + "grad_norm": 0.41381749150233, + "learning_rate": 9.400713186756625e-06, + "loss": 0.5756, + "step": 1336 + }, + { + "epoch": 0.9711937892992532, + "grad_norm": 0.40002352121048046, + "learning_rate": 9.399579445621187e-06, + "loss": 0.5845, + "step": 1337 + }, + { + "epoch": 0.9719201870474201, + "grad_norm": 0.3819643412059154, + "learning_rate": 9.398444701582325e-06, + "loss": 0.5871, + "step": 1338 + }, + { + "epoch": 0.9726465847955872, + "grad_norm": 0.4096279784042784, + "learning_rate": 9.397308954898708e-06, + "loss": 0.5602, + "step": 1339 + }, + { + "epoch": 0.9733729825437542, + "grad_norm": 0.41853453487079706, + "learning_rate": 9.396172205829235e-06, + "loss": 0.5776, + "step": 1340 + }, + { + "epoch": 0.9740993802919211, + "grad_norm": 0.40740299719446227, + "learning_rate": 9.395034454633032e-06, + "loss": 0.5805, + "step": 1341 + }, + { + "epoch": 0.9748257780400881, + "grad_norm": 0.5010130777863815, + "learning_rate": 9.393895701569455e-06, + "loss": 0.5824, + "step": 1342 + }, + { + "epoch": 0.975552175788255, + "grad_norm": 0.423630646161036, + "learning_rate": 9.392755946898087e-06, + "loss": 0.5933, + "step": 1343 + }, + { + "epoch": 0.976278573536422, + "grad_norm": 0.5460848532649165, + "learning_rate": 9.391615190878742e-06, + "loss": 0.5716, + "step": 1344 + }, + { + "epoch": 0.977004971284589, + "grad_norm": 0.4314740817665518, + "learning_rate": 9.39047343377146e-06, + "loss": 0.5858, + "step": 1345 + }, + { + "epoch": 0.977731369032756, + "grad_norm": 0.4172447068705145, + "learning_rate": 9.389330675836508e-06, + "loss": 0.5855, + "step": 1346 + }, + { + "epoch": 0.978457766780923, + "grad_norm": 0.40537926860470647, + "learning_rate": 9.388186917334382e-06, + "loss": 0.5806, + "step": 1347 + }, + { + "epoch": 0.97918416452909, + "grad_norm": 0.42272881390227746, + "learning_rate": 9.38704215852581e-06, + "loss": 0.5833, + "step": 1348 + }, + { + "epoch": 0.9799105622772569, + "grad_norm": 0.4071530915037966, + "learning_rate": 9.38589639967174e-06, + "loss": 0.5868, + "step": 1349 + }, + { + "epoch": 0.9806369600254239, + "grad_norm": 0.4174487525760431, + "learning_rate": 9.384749641033358e-06, + "loss": 0.592, + "step": 1350 + }, + { + "epoch": 0.9813633577735908, + "grad_norm": 0.5016160558691094, + "learning_rate": 9.38360188287207e-06, + "loss": 0.5688, + "step": 1351 + }, + { + "epoch": 0.9820897555217579, + "grad_norm": 0.3709738274098191, + "learning_rate": 9.382453125449513e-06, + "loss": 0.5797, + "step": 1352 + }, + { + "epoch": 0.9828161532699249, + "grad_norm": 0.4792601165573076, + "learning_rate": 9.381303369027552e-06, + "loss": 0.5799, + "step": 1353 + }, + { + "epoch": 0.9835425510180918, + "grad_norm": 0.3842929548512234, + "learning_rate": 9.380152613868276e-06, + "loss": 0.5867, + "step": 1354 + }, + { + "epoch": 0.9842689487662588, + "grad_norm": 0.443529556661457, + "learning_rate": 9.37900086023401e-06, + "loss": 0.573, + "step": 1355 + }, + { + "epoch": 0.9849953465144258, + "grad_norm": 0.40568726593321786, + "learning_rate": 9.377848108387295e-06, + "loss": 0.5779, + "step": 1356 + }, + { + "epoch": 0.9857217442625927, + "grad_norm": 0.4435967514139609, + "learning_rate": 9.37669435859091e-06, + "loss": 0.5798, + "step": 1357 + }, + { + "epoch": 0.9864481420107598, + "grad_norm": 0.4245008371251557, + "learning_rate": 9.375539611107856e-06, + "loss": 0.5931, + "step": 1358 + }, + { + "epoch": 0.9871745397589268, + "grad_norm": 0.4348440208427361, + "learning_rate": 
9.374383866201364e-06, + "loss": 0.5866, + "step": 1359 + }, + { + "epoch": 0.9879009375070937, + "grad_norm": 0.5474657163639375, + "learning_rate": 9.373227124134888e-06, + "loss": 0.5747, + "step": 1360 + }, + { + "epoch": 0.9886273352552607, + "grad_norm": 0.46556069978991227, + "learning_rate": 9.372069385172115e-06, + "loss": 0.5951, + "step": 1361 + }, + { + "epoch": 0.9893537330034277, + "grad_norm": 0.4871044110501798, + "learning_rate": 9.37091064957696e-06, + "loss": 0.5887, + "step": 1362 + }, + { + "epoch": 0.9900801307515946, + "grad_norm": 0.4259323411994358, + "learning_rate": 9.369750917613554e-06, + "loss": 0.5907, + "step": 1363 + }, + { + "epoch": 0.9908065284997617, + "grad_norm": 0.4100962064140313, + "learning_rate": 9.368590189546268e-06, + "loss": 0.5905, + "step": 1364 + }, + { + "epoch": 0.9915329262479287, + "grad_norm": 0.4598716942351165, + "learning_rate": 9.367428465639696e-06, + "loss": 0.5791, + "step": 1365 + }, + { + "epoch": 0.9922593239960956, + "grad_norm": 0.49479968191021007, + "learning_rate": 9.366265746158653e-06, + "loss": 0.5491, + "step": 1366 + }, + { + "epoch": 0.9929857217442626, + "grad_norm": 0.40911147247756996, + "learning_rate": 9.365102031368191e-06, + "loss": 0.5765, + "step": 1367 + }, + { + "epoch": 0.9937121194924295, + "grad_norm": 0.40613400942325284, + "learning_rate": 9.363937321533583e-06, + "loss": 0.5781, + "step": 1368 + }, + { + "epoch": 0.9944385172405965, + "grad_norm": 0.41346069340622715, + "learning_rate": 9.362771616920328e-06, + "loss": 0.594, + "step": 1369 + }, + { + "epoch": 0.9951649149887636, + "grad_norm": 0.42248849941999883, + "learning_rate": 9.361604917794155e-06, + "loss": 0.5784, + "step": 1370 + }, + { + "epoch": 0.9958913127369305, + "grad_norm": 0.4395488497003654, + "learning_rate": 9.360437224421017e-06, + "loss": 0.5663, + "step": 1371 + }, + { + "epoch": 0.9966177104850975, + "grad_norm": 0.48165754989833665, + "learning_rate": 9.359268537067094e-06, + "loss": 0.6047, + "step": 1372 + }, + { + "epoch": 0.9973441082332645, + "grad_norm": 0.8462980679724643, + "learning_rate": 9.358098855998797e-06, + "loss": 0.5906, + "step": 1373 + }, + { + "epoch": 0.9980705059814314, + "grad_norm": 0.44867266240084885, + "learning_rate": 9.356928181482757e-06, + "loss": 0.5807, + "step": 1374 + }, + { + "epoch": 0.9987969037295984, + "grad_norm": 0.3922906494420764, + "learning_rate": 9.355756513785834e-06, + "loss": 0.5738, + "step": 1375 + }, + { + "epoch": 0.9995233014777655, + "grad_norm": 0.3852398607524496, + "learning_rate": 9.354583853175117e-06, + "loss": 0.591, + "step": 1376 + }, + { + "epoch": 1.0002496992259324, + "grad_norm": 0.44617123921430374, + "learning_rate": 9.353410199917916e-06, + "loss": 0.5784, + "step": 1377 + }, + { + "epoch": 1.0009760969740993, + "grad_norm": 0.38482197267852375, + "learning_rate": 9.352235554281775e-06, + "loss": 0.5642, + "step": 1378 + }, + { + "epoch": 1.0017024947222664, + "grad_norm": 0.4127525659520533, + "learning_rate": 9.351059916534456e-06, + "loss": 0.5659, + "step": 1379 + }, + { + "epoch": 1.0024288924704334, + "grad_norm": 0.41330752590052344, + "learning_rate": 9.349883286943951e-06, + "loss": 0.5983, + "step": 1380 + }, + { + "epoch": 1.0031552902186003, + "grad_norm": 0.40680416072814213, + "learning_rate": 9.348705665778479e-06, + "loss": 0.5733, + "step": 1381 + }, + { + "epoch": 1.0038816879667674, + "grad_norm": 0.4128818362833564, + "learning_rate": 9.347527053306482e-06, + "loss": 0.5882, + "step": 1382 + }, + { + "epoch": 1.0046080857149342, 
+ "grad_norm": 0.4170411955512922, + "learning_rate": 9.346347449796634e-06, + "loss": 0.5874, + "step": 1383 + }, + { + "epoch": 1.0053344834631013, + "grad_norm": 0.45301343977096553, + "learning_rate": 9.345166855517827e-06, + "loss": 0.5915, + "step": 1384 + }, + { + "epoch": 1.0060608812112684, + "grad_norm": 0.4174938754667378, + "learning_rate": 9.343985270739184e-06, + "loss": 0.5699, + "step": 1385 + }, + { + "epoch": 1.0067872789594352, + "grad_norm": 0.6243604998156592, + "learning_rate": 9.34280269573005e-06, + "loss": 0.561, + "step": 1386 + }, + { + "epoch": 1.0075136767076023, + "grad_norm": 0.4128286014435526, + "learning_rate": 9.341619130760004e-06, + "loss": 0.5883, + "step": 1387 + }, + { + "epoch": 1.0082400744557691, + "grad_norm": 0.4168211775574965, + "learning_rate": 9.34043457609884e-06, + "loss": 0.5734, + "step": 1388 + }, + { + "epoch": 1.0089664722039362, + "grad_norm": 0.418443742472078, + "learning_rate": 9.339249032016584e-06, + "loss": 0.549, + "step": 1389 + }, + { + "epoch": 1.009692869952103, + "grad_norm": 0.412506877347843, + "learning_rate": 9.338062498783485e-06, + "loss": 0.5777, + "step": 1390 + }, + { + "epoch": 1.0104192677002701, + "grad_norm": 0.4288751680263293, + "learning_rate": 9.336874976670018e-06, + "loss": 0.5745, + "step": 1391 + }, + { + "epoch": 1.0111456654484372, + "grad_norm": 0.48767089366316047, + "learning_rate": 9.335686465946888e-06, + "loss": 0.5835, + "step": 1392 + }, + { + "epoch": 1.011872063196604, + "grad_norm": 0.400362746791716, + "learning_rate": 9.334496966885017e-06, + "loss": 0.5917, + "step": 1393 + }, + { + "epoch": 1.0125984609447711, + "grad_norm": 0.37674500749905987, + "learning_rate": 9.333306479755557e-06, + "loss": 0.5714, + "step": 1394 + }, + { + "epoch": 1.013324858692938, + "grad_norm": 0.43401884599608276, + "learning_rate": 9.332115004829885e-06, + "loss": 0.5727, + "step": 1395 + }, + { + "epoch": 1.014051256441105, + "grad_norm": 0.4355295649860661, + "learning_rate": 9.330922542379605e-06, + "loss": 0.5588, + "step": 1396 + }, + { + "epoch": 1.014777654189272, + "grad_norm": 0.45556005093229185, + "learning_rate": 9.329729092676542e-06, + "loss": 0.571, + "step": 1397 + }, + { + "epoch": 1.015504051937439, + "grad_norm": 0.43547493346898447, + "learning_rate": 9.328534655992747e-06, + "loss": 0.577, + "step": 1398 + }, + { + "epoch": 1.016230449685606, + "grad_norm": 0.39133492272444154, + "learning_rate": 9.3273392326005e-06, + "loss": 0.5845, + "step": 1399 + }, + { + "epoch": 1.016956847433773, + "grad_norm": 0.38477807853847923, + "learning_rate": 9.326142822772301e-06, + "loss": 0.5687, + "step": 1400 + }, + { + "epoch": 1.01768324518194, + "grad_norm": 0.4027839692661464, + "learning_rate": 9.324945426780879e-06, + "loss": 0.5788, + "step": 1401 + }, + { + "epoch": 1.0184096429301068, + "grad_norm": 0.4215575711137664, + "learning_rate": 9.323747044899184e-06, + "loss": 0.5787, + "step": 1402 + }, + { + "epoch": 1.019136040678274, + "grad_norm": 0.4829986330178923, + "learning_rate": 9.322547677400391e-06, + "loss": 0.5848, + "step": 1403 + }, + { + "epoch": 1.019862438426441, + "grad_norm": 0.39452309057088686, + "learning_rate": 9.321347324557904e-06, + "loss": 0.5944, + "step": 1404 + }, + { + "epoch": 1.0205888361746078, + "grad_norm": 0.4509442010918262, + "learning_rate": 9.320145986645348e-06, + "loss": 0.5865, + "step": 1405 + }, + { + "epoch": 1.021315233922775, + "grad_norm": 0.4257629876201213, + "learning_rate": 9.31894366393657e-06, + "loss": 0.5713, + "step": 1406 + }, + { + 
"epoch": 1.0220416316709418, + "grad_norm": 0.48116645828094656, + "learning_rate": 9.317740356705649e-06, + "loss": 0.6006, + "step": 1407 + }, + { + "epoch": 1.0227680294191088, + "grad_norm": 0.5090569406001516, + "learning_rate": 9.316536065226883e-06, + "loss": 0.5891, + "step": 1408 + }, + { + "epoch": 1.0234944271672757, + "grad_norm": 0.5625989892238936, + "learning_rate": 9.315330789774795e-06, + "loss": 0.5852, + "step": 1409 + }, + { + "epoch": 1.0242208249154428, + "grad_norm": 1.4737104492769104, + "learning_rate": 9.31412453062413e-06, + "loss": 0.5987, + "step": 1410 + }, + { + "epoch": 1.0249472226636098, + "grad_norm": 0.4422415845798024, + "learning_rate": 9.312917288049868e-06, + "loss": 0.5858, + "step": 1411 + }, + { + "epoch": 1.0256736204117767, + "grad_norm": 0.3877553271798556, + "learning_rate": 9.311709062327197e-06, + "loss": 0.5753, + "step": 1412 + }, + { + "epoch": 1.0264000181599438, + "grad_norm": 0.5253155430426784, + "learning_rate": 9.31049985373154e-06, + "loss": 0.5866, + "step": 1413 + }, + { + "epoch": 1.0271264159081106, + "grad_norm": 0.3967005737441752, + "learning_rate": 9.309289662538543e-06, + "loss": 0.5605, + "step": 1414 + }, + { + "epoch": 1.0278528136562777, + "grad_norm": 0.429486431143533, + "learning_rate": 9.308078489024072e-06, + "loss": 0.5609, + "step": 1415 + }, + { + "epoch": 1.0285792114044447, + "grad_norm": 0.4650393072669976, + "learning_rate": 9.306866333464223e-06, + "loss": 0.5597, + "step": 1416 + }, + { + "epoch": 1.0293056091526116, + "grad_norm": 0.41440859115144746, + "learning_rate": 9.305653196135308e-06, + "loss": 0.6004, + "step": 1417 + }, + { + "epoch": 1.0300320069007787, + "grad_norm": 0.41388563070702816, + "learning_rate": 9.30443907731387e-06, + "loss": 0.5823, + "step": 1418 + }, + { + "epoch": 1.0307584046489455, + "grad_norm": 0.43655904948926433, + "learning_rate": 9.303223977276669e-06, + "loss": 0.568, + "step": 1419 + }, + { + "epoch": 1.0314848023971126, + "grad_norm": 0.3926843050634251, + "learning_rate": 9.302007896300697e-06, + "loss": 0.5635, + "step": 1420 + }, + { + "epoch": 1.0322112001452795, + "grad_norm": 0.42957369993443373, + "learning_rate": 9.300790834663163e-06, + "loss": 0.5746, + "step": 1421 + }, + { + "epoch": 1.0329375978934465, + "grad_norm": 0.4035745493692686, + "learning_rate": 9.2995727926415e-06, + "loss": 0.5862, + "step": 1422 + }, + { + "epoch": 1.0336639956416136, + "grad_norm": 0.3920427008419327, + "learning_rate": 9.298353770513368e-06, + "loss": 0.5663, + "step": 1423 + }, + { + "epoch": 1.0343903933897804, + "grad_norm": 0.46057274244672136, + "learning_rate": 9.297133768556649e-06, + "loss": 0.5812, + "step": 1424 + }, + { + "epoch": 1.0351167911379475, + "grad_norm": 0.5253446644849149, + "learning_rate": 9.295912787049447e-06, + "loss": 0.5916, + "step": 1425 + }, + { + "epoch": 1.0358431888861144, + "grad_norm": 0.4397820455055968, + "learning_rate": 9.294690826270086e-06, + "loss": 0.5773, + "step": 1426 + }, + { + "epoch": 1.0365695866342814, + "grad_norm": 0.3794802433265421, + "learning_rate": 9.293467886497123e-06, + "loss": 0.5638, + "step": 1427 + }, + { + "epoch": 1.0372959843824485, + "grad_norm": 0.39267603669754747, + "learning_rate": 9.292243968009332e-06, + "loss": 0.5825, + "step": 1428 + }, + { + "epoch": 1.0380223821306154, + "grad_norm": 0.39438762623612705, + "learning_rate": 9.291019071085707e-06, + "loss": 0.5683, + "step": 1429 + }, + { + "epoch": 1.0387487798787824, + "grad_norm": 0.4124722400201054, + "learning_rate": 
9.289793196005472e-06, + "loss": 0.5887, + "step": 1430 + }, + { + "epoch": 1.0394751776269493, + "grad_norm": 0.3710351913452333, + "learning_rate": 9.288566343048069e-06, + "loss": 0.5617, + "step": 1431 + }, + { + "epoch": 1.0402015753751164, + "grad_norm": 0.3696840977628092, + "learning_rate": 9.287338512493165e-06, + "loss": 0.5713, + "step": 1432 + }, + { + "epoch": 1.0409279731232832, + "grad_norm": 0.4711524354304988, + "learning_rate": 9.286109704620648e-06, + "loss": 0.5703, + "step": 1433 + }, + { + "epoch": 1.0416543708714503, + "grad_norm": 0.40615010982081595, + "learning_rate": 9.284879919710631e-06, + "loss": 0.564, + "step": 1434 + }, + { + "epoch": 1.0423807686196174, + "grad_norm": 0.41109169173341603, + "learning_rate": 9.283649158043452e-06, + "loss": 0.5732, + "step": 1435 + }, + { + "epoch": 1.0431071663677842, + "grad_norm": 0.44915827468482616, + "learning_rate": 9.282417419899664e-06, + "loss": 0.592, + "step": 1436 + }, + { + "epoch": 1.0438335641159513, + "grad_norm": 0.36242059051621556, + "learning_rate": 9.281184705560049e-06, + "loss": 0.575, + "step": 1437 + }, + { + "epoch": 1.0445599618641181, + "grad_norm": 0.3951607612016366, + "learning_rate": 9.279951015305611e-06, + "loss": 0.5629, + "step": 1438 + }, + { + "epoch": 1.0452863596122852, + "grad_norm": 0.42649878376009126, + "learning_rate": 9.27871634941757e-06, + "loss": 0.5697, + "step": 1439 + }, + { + "epoch": 1.0460127573604523, + "grad_norm": 0.5203166547135417, + "learning_rate": 9.27748070817738e-06, + "loss": 0.5837, + "step": 1440 + }, + { + "epoch": 1.0467391551086191, + "grad_norm": 0.3776646098453339, + "learning_rate": 9.276244091866706e-06, + "loss": 0.5588, + "step": 1441 + }, + { + "epoch": 1.0474655528567862, + "grad_norm": 0.44599908364573615, + "learning_rate": 9.275006500767444e-06, + "loss": 0.5693, + "step": 1442 + }, + { + "epoch": 1.048191950604953, + "grad_norm": 0.38020056144781483, + "learning_rate": 9.273767935161705e-06, + "loss": 0.5741, + "step": 1443 + }, + { + "epoch": 1.0489183483531201, + "grad_norm": 0.4268767302486434, + "learning_rate": 9.272528395331826e-06, + "loss": 0.5732, + "step": 1444 + }, + { + "epoch": 1.049644746101287, + "grad_norm": 0.40829365584244665, + "learning_rate": 9.271287881560368e-06, + "loss": 0.5584, + "step": 1445 + }, + { + "epoch": 1.050371143849454, + "grad_norm": 0.39136192636878153, + "learning_rate": 9.27004639413011e-06, + "loss": 0.5784, + "step": 1446 + }, + { + "epoch": 1.0510975415976211, + "grad_norm": 0.39740978287858214, + "learning_rate": 9.268803933324053e-06, + "loss": 0.5862, + "step": 1447 + }, + { + "epoch": 1.051823939345788, + "grad_norm": 0.42772270593881806, + "learning_rate": 9.267560499425425e-06, + "loss": 0.5712, + "step": 1448 + }, + { + "epoch": 1.052550337093955, + "grad_norm": 0.4015339188340092, + "learning_rate": 9.266316092717666e-06, + "loss": 0.5804, + "step": 1449 + }, + { + "epoch": 1.053276734842122, + "grad_norm": 0.4721614746960808, + "learning_rate": 9.265070713484452e-06, + "loss": 0.5752, + "step": 1450 + }, + { + "epoch": 1.054003132590289, + "grad_norm": 0.46272828478575495, + "learning_rate": 9.263824362009667e-06, + "loss": 0.5725, + "step": 1451 + }, + { + "epoch": 1.0547295303384558, + "grad_norm": 0.43524012594081773, + "learning_rate": 9.262577038577423e-06, + "loss": 0.5745, + "step": 1452 + }, + { + "epoch": 1.055455928086623, + "grad_norm": 0.3697781664579044, + "learning_rate": 9.261328743472055e-06, + "loss": 0.5661, + "step": 1453 + }, + { + "epoch": 1.05618232583479, + 
"grad_norm": 0.44617814494960756, + "learning_rate": 9.260079476978117e-06, + "loss": 0.5852, + "step": 1454 + }, + { + "epoch": 1.0569087235829568, + "grad_norm": 0.4019452947154167, + "learning_rate": 9.25882923938038e-06, + "loss": 0.5939, + "step": 1455 + }, + { + "epoch": 1.057635121331124, + "grad_norm": 0.4191763655875106, + "learning_rate": 9.25757803096385e-06, + "loss": 0.5748, + "step": 1456 + }, + { + "epoch": 1.0583615190792908, + "grad_norm": 0.4126101858730787, + "learning_rate": 9.256325852013736e-06, + "loss": 0.5835, + "step": 1457 + }, + { + "epoch": 1.0590879168274578, + "grad_norm": 0.4339550235322551, + "learning_rate": 9.255072702815487e-06, + "loss": 0.5872, + "step": 1458 + }, + { + "epoch": 1.059814314575625, + "grad_norm": 0.388417450620657, + "learning_rate": 9.253818583654754e-06, + "loss": 0.5659, + "step": 1459 + }, + { + "epoch": 1.0605407123237918, + "grad_norm": 0.4540043570449997, + "learning_rate": 9.252563494817426e-06, + "loss": 0.5756, + "step": 1460 + }, + { + "epoch": 1.0612671100719588, + "grad_norm": 0.38753097423844807, + "learning_rate": 9.251307436589605e-06, + "loss": 0.5716, + "step": 1461 + }, + { + "epoch": 1.0619935078201257, + "grad_norm": 0.37777144670282375, + "learning_rate": 9.250050409257612e-06, + "loss": 0.5586, + "step": 1462 + }, + { + "epoch": 1.0627199055682928, + "grad_norm": 0.445355054898785, + "learning_rate": 9.248792413107994e-06, + "loss": 0.5707, + "step": 1463 + }, + { + "epoch": 1.0634463033164596, + "grad_norm": 0.43309925235084556, + "learning_rate": 9.247533448427519e-06, + "loss": 0.5857, + "step": 1464 + }, + { + "epoch": 1.0641727010646267, + "grad_norm": 0.4464319853779842, + "learning_rate": 9.246273515503169e-06, + "loss": 0.5802, + "step": 1465 + }, + { + "epoch": 1.0648990988127938, + "grad_norm": 0.3924050059125471, + "learning_rate": 9.245012614622153e-06, + "loss": 0.5706, + "step": 1466 + }, + { + "epoch": 1.0656254965609606, + "grad_norm": 0.40147856573132024, + "learning_rate": 9.2437507460719e-06, + "loss": 0.5706, + "step": 1467 + }, + { + "epoch": 1.0663518943091277, + "grad_norm": 0.4001433161155672, + "learning_rate": 9.242487910140057e-06, + "loss": 0.5548, + "step": 1468 + }, + { + "epoch": 1.0670782920572945, + "grad_norm": 0.4079874843159747, + "learning_rate": 9.241224107114495e-06, + "loss": 0.5635, + "step": 1469 + }, + { + "epoch": 1.0678046898054616, + "grad_norm": 0.4560119919834794, + "learning_rate": 9.239959337283301e-06, + "loss": 0.5723, + "step": 1470 + }, + { + "epoch": 1.0685310875536285, + "grad_norm": 0.4407800440761631, + "learning_rate": 9.238693600934785e-06, + "loss": 0.5775, + "step": 1471 + }, + { + "epoch": 1.0692574853017955, + "grad_norm": 0.4840882494667328, + "learning_rate": 9.23742689835748e-06, + "loss": 0.5754, + "step": 1472 + }, + { + "epoch": 1.0699838830499626, + "grad_norm": 0.37810093243018317, + "learning_rate": 9.236159229840133e-06, + "loss": 0.5566, + "step": 1473 + }, + { + "epoch": 1.0707102807981295, + "grad_norm": 0.3863492962548962, + "learning_rate": 9.234890595671717e-06, + "loss": 0.565, + "step": 1474 + }, + { + "epoch": 1.0714366785462965, + "grad_norm": 0.7615309489873513, + "learning_rate": 9.233620996141421e-06, + "loss": 0.5598, + "step": 1475 + }, + { + "epoch": 1.0721630762944634, + "grad_norm": 0.3976142210474501, + "learning_rate": 9.232350431538656e-06, + "loss": 0.568, + "step": 1476 + }, + { + "epoch": 1.0728894740426305, + "grad_norm": 0.3850052436582411, + "learning_rate": 9.231078902153058e-06, + "loss": 0.5587, + "step": 1477 + 
}, + { + "epoch": 1.0736158717907975, + "grad_norm": 0.7063675394401735, + "learning_rate": 9.229806408274467e-06, + "loss": 0.5527, + "step": 1478 + }, + { + "epoch": 1.0743422695389644, + "grad_norm": 0.4258570084193788, + "learning_rate": 9.228532950192964e-06, + "loss": 0.5688, + "step": 1479 + }, + { + "epoch": 1.0750686672871315, + "grad_norm": 0.41356428724432126, + "learning_rate": 9.227258528198832e-06, + "loss": 0.5764, + "step": 1480 + }, + { + "epoch": 1.0757950650352983, + "grad_norm": 0.3686752782723383, + "learning_rate": 9.225983142582584e-06, + "loss": 0.5571, + "step": 1481 + }, + { + "epoch": 1.0765214627834654, + "grad_norm": 0.38160042743844014, + "learning_rate": 9.224706793634951e-06, + "loss": 0.5604, + "step": 1482 + }, + { + "epoch": 1.0772478605316325, + "grad_norm": 0.3707774059356207, + "learning_rate": 9.223429481646881e-06, + "loss": 0.581, + "step": 1483 + }, + { + "epoch": 1.0779742582797993, + "grad_norm": 0.3695515047035739, + "learning_rate": 9.222151206909541e-06, + "loss": 0.5498, + "step": 1484 + }, + { + "epoch": 1.0787006560279664, + "grad_norm": 0.3831609762559241, + "learning_rate": 9.220871969714325e-06, + "loss": 0.5715, + "step": 1485 + }, + { + "epoch": 1.0794270537761332, + "grad_norm": 0.36103940460289247, + "learning_rate": 9.219591770352836e-06, + "loss": 0.5713, + "step": 1486 + }, + { + "epoch": 1.0801534515243003, + "grad_norm": 0.4427569819582544, + "learning_rate": 9.2183106091169e-06, + "loss": 0.5873, + "step": 1487 + }, + { + "epoch": 1.0808798492724672, + "grad_norm": 0.45995302001148347, + "learning_rate": 9.217028486298567e-06, + "loss": 0.5714, + "step": 1488 + }, + { + "epoch": 1.0816062470206342, + "grad_norm": 0.4569865672570247, + "learning_rate": 9.2157454021901e-06, + "loss": 0.5734, + "step": 1489 + }, + { + "epoch": 1.0823326447688013, + "grad_norm": 0.4061969443898093, + "learning_rate": 9.214461357083986e-06, + "loss": 0.5611, + "step": 1490 + }, + { + "epoch": 1.0830590425169682, + "grad_norm": 0.36787594074816127, + "learning_rate": 9.213176351272926e-06, + "loss": 0.5771, + "step": 1491 + }, + { + "epoch": 1.0837854402651352, + "grad_norm": 0.5034943262793765, + "learning_rate": 9.211890385049845e-06, + "loss": 0.5707, + "step": 1492 + }, + { + "epoch": 1.084511838013302, + "grad_norm": 0.38467957622459686, + "learning_rate": 9.210603458707883e-06, + "loss": 0.5697, + "step": 1493 + }, + { + "epoch": 1.0852382357614692, + "grad_norm": 0.42418759374955084, + "learning_rate": 9.209315572540402e-06, + "loss": 0.5625, + "step": 1494 + }, + { + "epoch": 1.085964633509636, + "grad_norm": 0.3998370227866447, + "learning_rate": 9.20802672684098e-06, + "loss": 0.5813, + "step": 1495 + }, + { + "epoch": 1.086691031257803, + "grad_norm": 0.42451732990267654, + "learning_rate": 9.206736921903414e-06, + "loss": 0.5759, + "step": 1496 + }, + { + "epoch": 1.0874174290059702, + "grad_norm": 0.5284676685659139, + "learning_rate": 9.205446158021725e-06, + "loss": 0.5638, + "step": 1497 + }, + { + "epoch": 1.088143826754137, + "grad_norm": 0.4213863049015563, + "learning_rate": 9.204154435490143e-06, + "loss": 0.5794, + "step": 1498 + }, + { + "epoch": 1.088870224502304, + "grad_norm": 0.4080295627251175, + "learning_rate": 9.202861754603126e-06, + "loss": 0.5751, + "step": 1499 + }, + { + "epoch": 1.089596622250471, + "grad_norm": 0.39291554245061283, + "learning_rate": 9.201568115655343e-06, + "loss": 0.556, + "step": 1500 + }, + { + "epoch": 1.090323019998638, + "grad_norm": 0.3740570835089451, + "learning_rate": 
9.200273518941688e-06, + "loss": 0.5753, + "step": 1501 + }, + { + "epoch": 1.091049417746805, + "grad_norm": 0.407499502900159, + "learning_rate": 9.198977964757266e-06, + "loss": 0.5729, + "step": 1502 + }, + { + "epoch": 1.091775815494972, + "grad_norm": 0.3719259405007259, + "learning_rate": 9.197681453397407e-06, + "loss": 0.5627, + "step": 1503 + }, + { + "epoch": 1.092502213243139, + "grad_norm": 0.4068817249799283, + "learning_rate": 9.196383985157657e-06, + "loss": 0.5682, + "step": 1504 + }, + { + "epoch": 1.0932286109913059, + "grad_norm": 0.3822730777415677, + "learning_rate": 9.195085560333777e-06, + "loss": 0.5613, + "step": 1505 + }, + { + "epoch": 1.093955008739473, + "grad_norm": 0.509101461961365, + "learning_rate": 9.193786179221751e-06, + "loss": 0.5821, + "step": 1506 + }, + { + "epoch": 1.0946814064876398, + "grad_norm": 0.36195978677680385, + "learning_rate": 9.192485842117777e-06, + "loss": 0.5572, + "step": 1507 + }, + { + "epoch": 1.0954078042358069, + "grad_norm": 0.43123149253343235, + "learning_rate": 9.191184549318275e-06, + "loss": 0.5676, + "step": 1508 + }, + { + "epoch": 1.096134201983974, + "grad_norm": 0.39497568720609605, + "learning_rate": 9.189882301119878e-06, + "loss": 0.5662, + "step": 1509 + }, + { + "epoch": 1.0968605997321408, + "grad_norm": 0.392750640761948, + "learning_rate": 9.188579097819439e-06, + "loss": 0.5673, + "step": 1510 + }, + { + "epoch": 1.0975869974803079, + "grad_norm": 0.39765453193046085, + "learning_rate": 9.18727493971403e-06, + "loss": 0.5867, + "step": 1511 + }, + { + "epoch": 1.0983133952284747, + "grad_norm": 0.4802452358360034, + "learning_rate": 9.18596982710094e-06, + "loss": 0.5591, + "step": 1512 + }, + { + "epoch": 1.0990397929766418, + "grad_norm": 0.3956359994045311, + "learning_rate": 9.184663760277674e-06, + "loss": 0.5708, + "step": 1513 + }, + { + "epoch": 1.0997661907248089, + "grad_norm": 0.4135810979273268, + "learning_rate": 9.183356739541958e-06, + "loss": 0.5706, + "step": 1514 + }, + { + "epoch": 1.1004925884729757, + "grad_norm": 0.38969981003076315, + "learning_rate": 9.182048765191729e-06, + "loss": 0.5751, + "step": 1515 + }, + { + "epoch": 1.1012189862211428, + "grad_norm": 0.4182089910086, + "learning_rate": 9.180739837525148e-06, + "loss": 0.5545, + "step": 1516 + }, + { + "epoch": 1.1019453839693096, + "grad_norm": 0.42615730462223866, + "learning_rate": 9.17942995684059e-06, + "loss": 0.5654, + "step": 1517 + }, + { + "epoch": 1.1026717817174767, + "grad_norm": 0.38455182758076384, + "learning_rate": 9.178119123436651e-06, + "loss": 0.576, + "step": 1518 + }, + { + "epoch": 1.1033981794656436, + "grad_norm": 0.4081208032550898, + "learning_rate": 9.176807337612136e-06, + "loss": 0.5673, + "step": 1519 + }, + { + "epoch": 1.1041245772138106, + "grad_norm": 0.43376128159499927, + "learning_rate": 9.175494599666078e-06, + "loss": 0.5721, + "step": 1520 + }, + { + "epoch": 1.1048509749619777, + "grad_norm": 0.40332872980030554, + "learning_rate": 9.174180909897715e-06, + "loss": 0.5687, + "step": 1521 + }, + { + "epoch": 1.1055773727101446, + "grad_norm": 0.38223586760682, + "learning_rate": 9.172866268606514e-06, + "loss": 0.5702, + "step": 1522 + }, + { + "epoch": 1.1063037704583116, + "grad_norm": 0.40924832268015005, + "learning_rate": 9.171550676092151e-06, + "loss": 0.5802, + "step": 1523 + }, + { + "epoch": 1.1070301682064785, + "grad_norm": 0.4228837477832897, + "learning_rate": 9.170234132654521e-06, + "loss": 0.5779, + "step": 1524 + }, + { + "epoch": 1.1077565659546456, + "grad_norm": 
0.43068444275154966, + "learning_rate": 9.168916638593736e-06, + "loss": 0.5733, + "step": 1525 + }, + { + "epoch": 1.1084829637028126, + "grad_norm": 0.4368955020773527, + "learning_rate": 9.167598194210124e-06, + "loss": 0.5637, + "step": 1526 + }, + { + "epoch": 1.1092093614509795, + "grad_norm": 0.46106898550934694, + "learning_rate": 9.166278799804232e-06, + "loss": 0.5621, + "step": 1527 + }, + { + "epoch": 1.1099357591991466, + "grad_norm": 0.4998543301741803, + "learning_rate": 9.164958455676818e-06, + "loss": 0.5738, + "step": 1528 + }, + { + "epoch": 1.1106621569473134, + "grad_norm": 0.3770729245790208, + "learning_rate": 9.163637162128865e-06, + "loss": 0.5618, + "step": 1529 + }, + { + "epoch": 1.1113885546954805, + "grad_norm": 0.49041521965602436, + "learning_rate": 9.162314919461566e-06, + "loss": 0.5533, + "step": 1530 + }, + { + "epoch": 1.1121149524436473, + "grad_norm": 0.3900108576009805, + "learning_rate": 9.16099172797633e-06, + "loss": 0.5529, + "step": 1531 + }, + { + "epoch": 1.1128413501918144, + "grad_norm": 0.37032433949378185, + "learning_rate": 9.159667587974786e-06, + "loss": 0.5696, + "step": 1532 + }, + { + "epoch": 1.1135677479399815, + "grad_norm": 0.42684873849663624, + "learning_rate": 9.158342499758777e-06, + "loss": 0.5434, + "step": 1533 + }, + { + "epoch": 1.1142941456881483, + "grad_norm": 0.43643707876206966, + "learning_rate": 9.157016463630363e-06, + "loss": 0.5706, + "step": 1534 + }, + { + "epoch": 1.1150205434363154, + "grad_norm": 0.4454221836123075, + "learning_rate": 9.15568947989182e-06, + "loss": 0.5708, + "step": 1535 + }, + { + "epoch": 1.1157469411844823, + "grad_norm": 0.38248652147035744, + "learning_rate": 9.154361548845639e-06, + "loss": 0.5571, + "step": 1536 + }, + { + "epoch": 1.1164733389326493, + "grad_norm": 0.39988857054600924, + "learning_rate": 9.153032670794527e-06, + "loss": 0.5705, + "step": 1537 + }, + { + "epoch": 1.1171997366808162, + "grad_norm": 0.4055927377484529, + "learning_rate": 9.15170284604141e-06, + "loss": 0.5589, + "step": 1538 + }, + { + "epoch": 1.1179261344289833, + "grad_norm": 0.6645482334421663, + "learning_rate": 9.150372074889427e-06, + "loss": 0.5665, + "step": 1539 + }, + { + "epoch": 1.1186525321771503, + "grad_norm": 0.37649582235674955, + "learning_rate": 9.14904035764193e-06, + "loss": 0.5655, + "step": 1540 + }, + { + "epoch": 1.1193789299253172, + "grad_norm": 0.456135150501283, + "learning_rate": 9.147707694602492e-06, + "loss": 0.575, + "step": 1541 + }, + { + "epoch": 1.1201053276734843, + "grad_norm": 0.4340819757352092, + "learning_rate": 9.1463740860749e-06, + "loss": 0.5541, + "step": 1542 + }, + { + "epoch": 1.120831725421651, + "grad_norm": 0.5154115608102522, + "learning_rate": 9.145039532363156e-06, + "loss": 0.5728, + "step": 1543 + }, + { + "epoch": 1.1215581231698182, + "grad_norm": 0.415248125939544, + "learning_rate": 9.143704033771476e-06, + "loss": 0.573, + "step": 1544 + }, + { + "epoch": 1.1222845209179853, + "grad_norm": 0.47571569465415514, + "learning_rate": 9.142367590604294e-06, + "loss": 0.5724, + "step": 1545 + }, + { + "epoch": 1.123010918666152, + "grad_norm": 0.4328702166226174, + "learning_rate": 9.141030203166256e-06, + "loss": 0.5881, + "step": 1546 + }, + { + "epoch": 1.1237373164143192, + "grad_norm": 0.4029536592492071, + "learning_rate": 9.139691871762229e-06, + "loss": 0.5569, + "step": 1547 + }, + { + "epoch": 1.124463714162486, + "grad_norm": 0.4467378583250728, + "learning_rate": 9.138352596697287e-06, + "loss": 0.5776, + "step": 1548 + }, + { + 
"epoch": 1.125190111910653, + "grad_norm": 0.41357552315372287, + "learning_rate": 9.137012378276729e-06, + "loss": 0.552, + "step": 1549 + }, + { + "epoch": 1.1259165096588202, + "grad_norm": 0.3961970064201352, + "learning_rate": 9.135671216806057e-06, + "loss": 0.542, + "step": 1550 + }, + { + "epoch": 1.126642907406987, + "grad_norm": 0.3799347314928259, + "learning_rate": 9.134329112591e-06, + "loss": 0.5783, + "step": 1551 + }, + { + "epoch": 1.127369305155154, + "grad_norm": 0.4409962185466233, + "learning_rate": 9.132986065937495e-06, + "loss": 0.5833, + "step": 1552 + }, + { + "epoch": 1.128095702903321, + "grad_norm": 0.4199431422304019, + "learning_rate": 9.131642077151695e-06, + "loss": 0.5544, + "step": 1553 + }, + { + "epoch": 1.128822100651488, + "grad_norm": 0.4359092717576693, + "learning_rate": 9.130297146539967e-06, + "loss": 0.5601, + "step": 1554 + }, + { + "epoch": 1.1295484983996549, + "grad_norm": 0.4386918332276292, + "learning_rate": 9.128951274408898e-06, + "loss": 0.5779, + "step": 1555 + }, + { + "epoch": 1.130274896147822, + "grad_norm": 0.43388198361205227, + "learning_rate": 9.12760446106528e-06, + "loss": 0.5601, + "step": 1556 + }, + { + "epoch": 1.1310012938959888, + "grad_norm": 0.4020892074156709, + "learning_rate": 9.126256706816129e-06, + "loss": 0.558, + "step": 1557 + }, + { + "epoch": 1.1317276916441559, + "grad_norm": 0.3612712750947298, + "learning_rate": 9.124908011968667e-06, + "loss": 0.5615, + "step": 1558 + }, + { + "epoch": 1.132454089392323, + "grad_norm": 0.4384818833438229, + "learning_rate": 9.123558376830342e-06, + "loss": 0.5683, + "step": 1559 + }, + { + "epoch": 1.1331804871404898, + "grad_norm": 0.4018494187243776, + "learning_rate": 9.122207801708802e-06, + "loss": 0.5727, + "step": 1560 + }, + { + "epoch": 1.1339068848886569, + "grad_norm": 0.40446557479401735, + "learning_rate": 9.120856286911922e-06, + "loss": 0.5686, + "step": 1561 + }, + { + "epoch": 1.1346332826368237, + "grad_norm": 0.7222820279604052, + "learning_rate": 9.119503832747782e-06, + "loss": 0.5595, + "step": 1562 + }, + { + "epoch": 1.1353596803849908, + "grad_norm": 0.4118173443641317, + "learning_rate": 9.118150439524682e-06, + "loss": 0.5591, + "step": 1563 + }, + { + "epoch": 1.1360860781331579, + "grad_norm": 0.39203921438913475, + "learning_rate": 9.116796107551134e-06, + "loss": 0.5664, + "step": 1564 + }, + { + "epoch": 1.1368124758813247, + "grad_norm": 0.44213420522319913, + "learning_rate": 9.115440837135862e-06, + "loss": 0.5497, + "step": 1565 + }, + { + "epoch": 1.1375388736294918, + "grad_norm": 0.3915843971752154, + "learning_rate": 9.114084628587806e-06, + "loss": 0.5743, + "step": 1566 + }, + { + "epoch": 1.1382652713776586, + "grad_norm": 0.37457500840606167, + "learning_rate": 9.112727482216123e-06, + "loss": 0.5645, + "step": 1567 + }, + { + "epoch": 1.1389916691258257, + "grad_norm": 0.39111251152334303, + "learning_rate": 9.111369398330177e-06, + "loss": 0.5793, + "step": 1568 + }, + { + "epoch": 1.1397180668739928, + "grad_norm": 0.5449219416546525, + "learning_rate": 9.110010377239552e-06, + "loss": 0.582, + "step": 1569 + }, + { + "epoch": 1.1404444646221596, + "grad_norm": 0.49691466965603404, + "learning_rate": 9.108650419254037e-06, + "loss": 0.5646, + "step": 1570 + }, + { + "epoch": 1.1411708623703267, + "grad_norm": 0.40646649401243895, + "learning_rate": 9.107289524683648e-06, + "loss": 0.5645, + "step": 1571 + }, + { + "epoch": 1.1418972601184936, + "grad_norm": 0.39891945566604176, + "learning_rate": 9.105927693838601e-06, + 
"loss": 0.5642, + "step": 1572 + }, + { + "epoch": 1.1426236578666606, + "grad_norm": 0.4048822149119588, + "learning_rate": 9.104564927029337e-06, + "loss": 0.5652, + "step": 1573 + }, + { + "epoch": 1.1433500556148275, + "grad_norm": 0.45448169948196127, + "learning_rate": 9.103201224566499e-06, + "loss": 0.5704, + "step": 1574 + }, + { + "epoch": 1.1440764533629946, + "grad_norm": 0.3903073163448049, + "learning_rate": 9.101836586760951e-06, + "loss": 0.5751, + "step": 1575 + }, + { + "epoch": 1.1448028511111616, + "grad_norm": 0.5097478191145266, + "learning_rate": 9.10047101392377e-06, + "loss": 0.5623, + "step": 1576 + }, + { + "epoch": 1.1455292488593285, + "grad_norm": 0.5564016875040064, + "learning_rate": 9.099104506366242e-06, + "loss": 0.5735, + "step": 1577 + }, + { + "epoch": 1.1462556466074956, + "grad_norm": 0.45792127134281246, + "learning_rate": 9.09773706439987e-06, + "loss": 0.562, + "step": 1578 + }, + { + "epoch": 1.1469820443556624, + "grad_norm": 0.40573516591270037, + "learning_rate": 9.096368688336365e-06, + "loss": 0.5712, + "step": 1579 + }, + { + "epoch": 1.1477084421038295, + "grad_norm": 0.3746216254001378, + "learning_rate": 9.094999378487659e-06, + "loss": 0.565, + "step": 1580 + }, + { + "epoch": 1.1484348398519963, + "grad_norm": 0.4132489165843512, + "learning_rate": 9.09362913516589e-06, + "loss": 0.5611, + "step": 1581 + }, + { + "epoch": 1.1491612376001634, + "grad_norm": 0.4327030952311928, + "learning_rate": 9.092257958683411e-06, + "loss": 0.572, + "step": 1582 + }, + { + "epoch": 1.1498876353483305, + "grad_norm": 0.5657636997480442, + "learning_rate": 9.090885849352788e-06, + "loss": 0.5725, + "step": 1583 + }, + { + "epoch": 1.1506140330964973, + "grad_norm": 0.43552805930495025, + "learning_rate": 9.0895128074868e-06, + "loss": 0.5722, + "step": 1584 + }, + { + "epoch": 1.1513404308446644, + "grad_norm": 0.4021861507833854, + "learning_rate": 9.088138833398435e-06, + "loss": 0.5773, + "step": 1585 + }, + { + "epoch": 1.1520668285928313, + "grad_norm": 0.4019858854361604, + "learning_rate": 9.086763927400898e-06, + "loss": 0.538, + "step": 1586 + }, + { + "epoch": 1.1527932263409983, + "grad_norm": 0.5661378384808389, + "learning_rate": 9.085388089807607e-06, + "loss": 0.5519, + "step": 1587 + }, + { + "epoch": 1.1535196240891654, + "grad_norm": 0.3744204610801191, + "learning_rate": 9.08401132093219e-06, + "loss": 0.5452, + "step": 1588 + }, + { + "epoch": 1.1542460218373323, + "grad_norm": 0.6743822486879415, + "learning_rate": 9.082633621088483e-06, + "loss": 0.5664, + "step": 1589 + }, + { + "epoch": 1.1549724195854993, + "grad_norm": 0.4620844862314671, + "learning_rate": 9.081254990590542e-06, + "loss": 0.5443, + "step": 1590 + }, + { + "epoch": 1.1556988173336662, + "grad_norm": 0.35175578314298, + "learning_rate": 9.079875429752633e-06, + "loss": 0.5412, + "step": 1591 + }, + { + "epoch": 1.1564252150818333, + "grad_norm": 0.391580626589242, + "learning_rate": 9.07849493888923e-06, + "loss": 0.5537, + "step": 1592 + }, + { + "epoch": 1.1571516128300003, + "grad_norm": 0.4529476753979372, + "learning_rate": 9.077113518315024e-06, + "loss": 0.5604, + "step": 1593 + }, + { + "epoch": 1.1578780105781672, + "grad_norm": 0.4550412254149466, + "learning_rate": 9.075731168344917e-06, + "loss": 0.5618, + "step": 1594 + }, + { + "epoch": 1.1586044083263343, + "grad_norm": 0.3809100704198691, + "learning_rate": 9.074347889294017e-06, + "loss": 0.5896, + "step": 1595 + }, + { + "epoch": 1.1593308060745011, + "grad_norm": 0.3905562748653443, + 
"learning_rate": 9.072963681477654e-06, + "loss": 0.5726, + "step": 1596 + }, + { + "epoch": 1.1600572038226682, + "grad_norm": 0.4091043456983746, + "learning_rate": 9.071578545211362e-06, + "loss": 0.5696, + "step": 1597 + }, + { + "epoch": 1.160783601570835, + "grad_norm": 0.4839681586665417, + "learning_rate": 9.070192480810888e-06, + "loss": 0.5621, + "step": 1598 + }, + { + "epoch": 1.1615099993190021, + "grad_norm": 0.40196070880566054, + "learning_rate": 9.068805488592191e-06, + "loss": 0.5779, + "step": 1599 + }, + { + "epoch": 1.162236397067169, + "grad_norm": 0.3918863091270513, + "learning_rate": 9.067417568871444e-06, + "loss": 0.5785, + "step": 1600 + }, + { + "epoch": 1.162962794815336, + "grad_norm": 0.36932354291748165, + "learning_rate": 9.06602872196503e-06, + "loss": 0.5688, + "step": 1601 + }, + { + "epoch": 1.1636891925635031, + "grad_norm": 0.511494494122197, + "learning_rate": 9.064638948189539e-06, + "loss": 0.5554, + "step": 1602 + }, + { + "epoch": 1.16441559031167, + "grad_norm": 0.37895248932231274, + "learning_rate": 9.06324824786178e-06, + "loss": 0.5575, + "step": 1603 + }, + { + "epoch": 1.165141988059837, + "grad_norm": 0.3832823773236108, + "learning_rate": 9.061856621298767e-06, + "loss": 0.5654, + "step": 1604 + }, + { + "epoch": 1.165868385808004, + "grad_norm": 0.43465330879731395, + "learning_rate": 9.060464068817728e-06, + "loss": 0.5738, + "step": 1605 + }, + { + "epoch": 1.166594783556171, + "grad_norm": 0.5551169241788557, + "learning_rate": 9.059070590736101e-06, + "loss": 0.5613, + "step": 1606 + }, + { + "epoch": 1.167321181304338, + "grad_norm": 0.5780876697750443, + "learning_rate": 9.057676187371536e-06, + "loss": 0.5462, + "step": 1607 + }, + { + "epoch": 1.168047579052505, + "grad_norm": 0.38976775310867845, + "learning_rate": 9.056280859041893e-06, + "loss": 0.5602, + "step": 1608 + }, + { + "epoch": 1.168773976800672, + "grad_norm": 0.4137511184661419, + "learning_rate": 9.054884606065243e-06, + "loss": 0.5359, + "step": 1609 + }, + { + "epoch": 1.1695003745488388, + "grad_norm": 0.4545114979291193, + "learning_rate": 9.053487428759869e-06, + "loss": 0.5566, + "step": 1610 + }, + { + "epoch": 1.170226772297006, + "grad_norm": 0.40448803287087315, + "learning_rate": 9.052089327444263e-06, + "loss": 0.5613, + "step": 1611 + }, + { + "epoch": 1.170953170045173, + "grad_norm": 0.4204853059368482, + "learning_rate": 9.050690302437128e-06, + "loss": 0.5506, + "step": 1612 + }, + { + "epoch": 1.1716795677933398, + "grad_norm": 0.41937616591275495, + "learning_rate": 9.049290354057379e-06, + "loss": 0.5584, + "step": 1613 + }, + { + "epoch": 1.172405965541507, + "grad_norm": 0.3812136298560557, + "learning_rate": 9.047889482624139e-06, + "loss": 0.5502, + "step": 1614 + }, + { + "epoch": 1.1731323632896737, + "grad_norm": 0.42863197566192074, + "learning_rate": 9.046487688456745e-06, + "loss": 0.5717, + "step": 1615 + }, + { + "epoch": 1.1738587610378408, + "grad_norm": 0.4416572927164445, + "learning_rate": 9.045084971874738e-06, + "loss": 0.5629, + "step": 1616 + }, + { + "epoch": 1.1745851587860079, + "grad_norm": 0.4318909760541322, + "learning_rate": 9.043681333197878e-06, + "loss": 0.5616, + "step": 1617 + }, + { + "epoch": 1.1753115565341747, + "grad_norm": 0.4913993162820547, + "learning_rate": 9.042276772746127e-06, + "loss": 0.5555, + "step": 1618 + }, + { + "epoch": 1.1760379542823418, + "grad_norm": 0.38866225966344226, + "learning_rate": 9.040871290839663e-06, + "loss": 0.5804, + "step": 1619 + }, + { + "epoch": 1.1767643520305087, 
+ "grad_norm": 0.4473064645504217, + "learning_rate": 9.03946488779887e-06, + "loss": 0.558, + "step": 1620 + }, + { + "epoch": 1.1774907497786757, + "grad_norm": 0.4002867177138744, + "learning_rate": 9.038057563944346e-06, + "loss": 0.5563, + "step": 1621 + }, + { + "epoch": 1.1782171475268426, + "grad_norm": 0.3619082015242835, + "learning_rate": 9.036649319596895e-06, + "loss": 0.5704, + "step": 1622 + }, + { + "epoch": 1.1789435452750097, + "grad_norm": 0.42685374009998284, + "learning_rate": 9.035240155077532e-06, + "loss": 0.572, + "step": 1623 + }, + { + "epoch": 1.1796699430231765, + "grad_norm": 0.38283761766805846, + "learning_rate": 9.033830070707485e-06, + "loss": 0.5586, + "step": 1624 + }, + { + "epoch": 1.1803963407713436, + "grad_norm": 0.5072262281997689, + "learning_rate": 9.032419066808184e-06, + "loss": 0.5596, + "step": 1625 + }, + { + "epoch": 1.1811227385195107, + "grad_norm": 0.47901385507935723, + "learning_rate": 9.03100714370128e-06, + "loss": 0.5742, + "step": 1626 + }, + { + "epoch": 1.1818491362676775, + "grad_norm": 0.4621432294903584, + "learning_rate": 9.029594301708622e-06, + "loss": 0.559, + "step": 1627 + }, + { + "epoch": 1.1825755340158446, + "grad_norm": 0.35661902505888965, + "learning_rate": 9.028180541152275e-06, + "loss": 0.5427, + "step": 1628 + }, + { + "epoch": 1.1833019317640114, + "grad_norm": 0.39979814096373295, + "learning_rate": 9.026765862354512e-06, + "loss": 0.5621, + "step": 1629 + }, + { + "epoch": 1.1840283295121785, + "grad_norm": 0.6290392164977799, + "learning_rate": 9.025350265637816e-06, + "loss": 0.5631, + "step": 1630 + }, + { + "epoch": 1.1847547272603456, + "grad_norm": 0.41675307891525165, + "learning_rate": 9.023933751324876e-06, + "loss": 0.56, + "step": 1631 + }, + { + "epoch": 1.1854811250085124, + "grad_norm": 0.40915617471251176, + "learning_rate": 9.022516319738598e-06, + "loss": 0.568, + "step": 1632 + }, + { + "epoch": 1.1862075227566795, + "grad_norm": 0.36057322747830917, + "learning_rate": 9.021097971202085e-06, + "loss": 0.548, + "step": 1633 + }, + { + "epoch": 1.1869339205048464, + "grad_norm": 0.398305816253801, + "learning_rate": 9.01967870603866e-06, + "loss": 0.5648, + "step": 1634 + }, + { + "epoch": 1.1876603182530134, + "grad_norm": 0.40047400844909886, + "learning_rate": 9.018258524571848e-06, + "loss": 0.5648, + "step": 1635 + }, + { + "epoch": 1.1883867160011805, + "grad_norm": 0.36446634860953764, + "learning_rate": 9.016837427125389e-06, + "loss": 0.5458, + "step": 1636 + }, + { + "epoch": 1.1891131137493474, + "grad_norm": 0.4437052109459863, + "learning_rate": 9.015415414023226e-06, + "loss": 0.5598, + "step": 1637 + }, + { + "epoch": 1.1898395114975144, + "grad_norm": 0.40903270592078117, + "learning_rate": 9.013992485589513e-06, + "loss": 0.5544, + "step": 1638 + }, + { + "epoch": 1.1905659092456813, + "grad_norm": 0.39745344686830697, + "learning_rate": 9.012568642148615e-06, + "loss": 0.5776, + "step": 1639 + }, + { + "epoch": 1.1912923069938484, + "grad_norm": 0.5021202842523842, + "learning_rate": 9.0111438840251e-06, + "loss": 0.5513, + "step": 1640 + }, + { + "epoch": 1.1920187047420152, + "grad_norm": 0.5116684721496529, + "learning_rate": 9.009718211543752e-06, + "loss": 0.5615, + "step": 1641 + }, + { + "epoch": 1.1927451024901823, + "grad_norm": 0.3758199164186561, + "learning_rate": 9.008291625029556e-06, + "loss": 0.5675, + "step": 1642 + }, + { + "epoch": 1.1934715002383491, + "grad_norm": 0.3805578386122031, + "learning_rate": 9.00686412480771e-06, + "loss": 0.5511, + "step": 
1643 + }, + { + "epoch": 1.1941978979865162, + "grad_norm": 0.44933161613407974, + "learning_rate": 9.005435711203619e-06, + "loss": 0.5492, + "step": 1644 + }, + { + "epoch": 1.1949242957346833, + "grad_norm": 0.43365968490721074, + "learning_rate": 9.004006384542894e-06, + "loss": 0.5541, + "step": 1645 + }, + { + "epoch": 1.1956506934828501, + "grad_norm": 0.3871348134506192, + "learning_rate": 9.002576145151359e-06, + "loss": 0.5641, + "step": 1646 + }, + { + "epoch": 1.1963770912310172, + "grad_norm": 0.4064010639567433, + "learning_rate": 9.001144993355042e-06, + "loss": 0.5753, + "step": 1647 + }, + { + "epoch": 1.197103488979184, + "grad_norm": 0.46830624397535536, + "learning_rate": 8.99971292948018e-06, + "loss": 0.5656, + "step": 1648 + }, + { + "epoch": 1.1978298867273511, + "grad_norm": 0.38093273605602024, + "learning_rate": 8.99827995385322e-06, + "loss": 0.5537, + "step": 1649 + }, + { + "epoch": 1.1985562844755182, + "grad_norm": 0.4356470351028575, + "learning_rate": 8.996846066800815e-06, + "loss": 0.5609, + "step": 1650 + }, + { + "epoch": 1.199282682223685, + "grad_norm": 0.39422074961192954, + "learning_rate": 8.995411268649823e-06, + "loss": 0.5608, + "step": 1651 + }, + { + "epoch": 1.2000090799718521, + "grad_norm": 0.375468512449991, + "learning_rate": 8.993975559727316e-06, + "loss": 0.5668, + "step": 1652 + }, + { + "epoch": 1.200735477720019, + "grad_norm": 0.40297827175793516, + "learning_rate": 8.992538940360568e-06, + "loss": 0.5538, + "step": 1653 + }, + { + "epoch": 1.201461875468186, + "grad_norm": 0.38635504994472913, + "learning_rate": 8.991101410877064e-06, + "loss": 0.5646, + "step": 1654 + }, + { + "epoch": 1.2021882732163531, + "grad_norm": 0.33581752318849784, + "learning_rate": 8.989662971604491e-06, + "loss": 0.5413, + "step": 1655 + }, + { + "epoch": 1.20291467096452, + "grad_norm": 0.3409030181703885, + "learning_rate": 8.988223622870754e-06, + "loss": 0.5442, + "step": 1656 + }, + { + "epoch": 1.203641068712687, + "grad_norm": 0.4306916478895894, + "learning_rate": 8.986783365003955e-06, + "loss": 0.5695, + "step": 1657 + }, + { + "epoch": 1.204367466460854, + "grad_norm": 0.43933251871690226, + "learning_rate": 8.985342198332407e-06, + "loss": 0.5614, + "step": 1658 + }, + { + "epoch": 1.205093864209021, + "grad_norm": 0.4168589094786127, + "learning_rate": 8.983900123184634e-06, + "loss": 0.5626, + "step": 1659 + }, + { + "epoch": 1.205820261957188, + "grad_norm": 0.39213159298507183, + "learning_rate": 8.982457139889358e-06, + "loss": 0.5508, + "step": 1660 + }, + { + "epoch": 1.206546659705355, + "grad_norm": 0.3930281890036753, + "learning_rate": 8.981013248775516e-06, + "loss": 0.5575, + "step": 1661 + }, + { + "epoch": 1.207273057453522, + "grad_norm": 0.4032943583882293, + "learning_rate": 8.979568450172248e-06, + "loss": 0.5513, + "step": 1662 + }, + { + "epoch": 1.2079994552016888, + "grad_norm": 0.35483539042175233, + "learning_rate": 8.978122744408905e-06, + "loss": 0.5521, + "step": 1663 + }, + { + "epoch": 1.208725852949856, + "grad_norm": 0.604044247562166, + "learning_rate": 8.976676131815041e-06, + "loss": 0.5669, + "step": 1664 + }, + { + "epoch": 1.2094522506980228, + "grad_norm": 0.42184739662747306, + "learning_rate": 8.975228612720415e-06, + "loss": 0.5497, + "step": 1665 + }, + { + "epoch": 1.2101786484461898, + "grad_norm": 0.4423424779217794, + "learning_rate": 8.973780187454999e-06, + "loss": 0.5508, + "step": 1666 + }, + { + "epoch": 1.2109050461943567, + "grad_norm": 0.41083410822149735, + "learning_rate": 
8.972330856348965e-06, + "loss": 0.5541, + "step": 1667 + }, + { + "epoch": 1.2116314439425238, + "grad_norm": 0.3870919292235364, + "learning_rate": 8.970880619732695e-06, + "loss": 0.5364, + "step": 1668 + }, + { + "epoch": 1.2123578416906908, + "grad_norm": 0.41285288636232365, + "learning_rate": 8.96942947793678e-06, + "loss": 0.5753, + "step": 1669 + }, + { + "epoch": 1.2130842394388577, + "grad_norm": 0.39234857911866095, + "learning_rate": 8.967977431292009e-06, + "loss": 0.5522, + "step": 1670 + }, + { + "epoch": 1.2138106371870248, + "grad_norm": 0.4284657100652839, + "learning_rate": 8.966524480129386e-06, + "loss": 0.558, + "step": 1671 + }, + { + "epoch": 1.2145370349351916, + "grad_norm": 0.42743619885583367, + "learning_rate": 8.965070624780117e-06, + "loss": 0.5732, + "step": 1672 + }, + { + "epoch": 1.2152634326833587, + "grad_norm": 0.38064277890598813, + "learning_rate": 8.963615865575613e-06, + "loss": 0.541, + "step": 1673 + }, + { + "epoch": 1.2159898304315258, + "grad_norm": 0.39160633024799835, + "learning_rate": 8.962160202847494e-06, + "loss": 0.5461, + "step": 1674 + }, + { + "epoch": 1.2167162281796926, + "grad_norm": 0.45820430910342846, + "learning_rate": 8.960703636927585e-06, + "loss": 0.5398, + "step": 1675 + }, + { + "epoch": 1.2174426259278597, + "grad_norm": 0.4594650336331551, + "learning_rate": 8.959246168147915e-06, + "loss": 0.5433, + "step": 1676 + }, + { + "epoch": 1.2181690236760265, + "grad_norm": 0.43281785549799423, + "learning_rate": 8.957787796840724e-06, + "loss": 0.564, + "step": 1677 + }, + { + "epoch": 1.2188954214241936, + "grad_norm": 0.3444644303493312, + "learning_rate": 8.95632852333845e-06, + "loss": 0.5567, + "step": 1678 + }, + { + "epoch": 1.2196218191723607, + "grad_norm": 0.3718226041891921, + "learning_rate": 8.954868347973742e-06, + "loss": 0.5462, + "step": 1679 + }, + { + "epoch": 1.2203482169205275, + "grad_norm": 0.408627736693502, + "learning_rate": 8.953407271079456e-06, + "loss": 0.5365, + "step": 1680 + }, + { + "epoch": 1.2210746146686946, + "grad_norm": 0.37139547247679977, + "learning_rate": 8.951945292988647e-06, + "loss": 0.5471, + "step": 1681 + }, + { + "epoch": 1.2218010124168615, + "grad_norm": 0.3724276111177844, + "learning_rate": 8.950482414034583e-06, + "loss": 0.5477, + "step": 1682 + }, + { + "epoch": 1.2225274101650285, + "grad_norm": 0.39946058697205866, + "learning_rate": 8.94901863455073e-06, + "loss": 0.5371, + "step": 1683 + }, + { + "epoch": 1.2232538079131954, + "grad_norm": 0.36753474211608916, + "learning_rate": 8.947553954870765e-06, + "loss": 0.5574, + "step": 1684 + }, + { + "epoch": 1.2239802056613625, + "grad_norm": 0.5392178107861165, + "learning_rate": 8.946088375328569e-06, + "loss": 0.5587, + "step": 1685 + }, + { + "epoch": 1.2247066034095295, + "grad_norm": 0.3644470759919107, + "learning_rate": 8.944621896258226e-06, + "loss": 0.5621, + "step": 1686 + }, + { + "epoch": 1.2254330011576964, + "grad_norm": 0.44219202544370106, + "learning_rate": 8.943154517994026e-06, + "loss": 0.5536, + "step": 1687 + }, + { + "epoch": 1.2261593989058635, + "grad_norm": 0.3764996895947529, + "learning_rate": 8.941686240870464e-06, + "loss": 0.5558, + "step": 1688 + }, + { + "epoch": 1.2268857966540303, + "grad_norm": 0.3462514441906337, + "learning_rate": 8.940217065222241e-06, + "loss": 0.5434, + "step": 1689 + }, + { + "epoch": 1.2276121944021974, + "grad_norm": 0.37441059075120486, + "learning_rate": 8.938746991384264e-06, + "loss": 0.5576, + "step": 1690 + }, + { + "epoch": 1.2283385921503642, + 
"grad_norm": 0.4985554959055178, + "learning_rate": 8.937276019691638e-06, + "loss": 0.5577, + "step": 1691 + }, + { + "epoch": 1.2290649898985313, + "grad_norm": 0.3864813217862178, + "learning_rate": 8.935804150479683e-06, + "loss": 0.5442, + "step": 1692 + }, + { + "epoch": 1.2297913876466984, + "grad_norm": 0.3845478893478187, + "learning_rate": 8.934331384083914e-06, + "loss": 0.5554, + "step": 1693 + }, + { + "epoch": 1.2305177853948652, + "grad_norm": 1.6701398796419848, + "learning_rate": 8.932857720840056e-06, + "loss": 0.5666, + "step": 1694 + }, + { + "epoch": 1.2312441831430323, + "grad_norm": 0.4220956274028827, + "learning_rate": 8.931383161084037e-06, + "loss": 0.5488, + "step": 1695 + }, + { + "epoch": 1.2319705808911992, + "grad_norm": 0.38464465303061424, + "learning_rate": 8.92990770515199e-06, + "loss": 0.5392, + "step": 1696 + }, + { + "epoch": 1.2326969786393662, + "grad_norm": 0.38261868231706775, + "learning_rate": 8.928431353380249e-06, + "loss": 0.551, + "step": 1697 + }, + { + "epoch": 1.2334233763875333, + "grad_norm": 0.4963784011318068, + "learning_rate": 8.926954106105358e-06, + "loss": 0.5572, + "step": 1698 + }, + { + "epoch": 1.2341497741357002, + "grad_norm": 0.4774876780938401, + "learning_rate": 8.92547596366406e-06, + "loss": 0.5671, + "step": 1699 + }, + { + "epoch": 1.2348761718838672, + "grad_norm": 0.5664978767143851, + "learning_rate": 8.923996926393306e-06, + "loss": 0.5343, + "step": 1700 + }, + { + "epoch": 1.235602569632034, + "grad_norm": 0.4431798160139461, + "learning_rate": 8.922516994630246e-06, + "loss": 0.5484, + "step": 1701 + }, + { + "epoch": 1.2363289673802011, + "grad_norm": 0.39504209873632384, + "learning_rate": 8.921036168712241e-06, + "loss": 0.5636, + "step": 1702 + }, + { + "epoch": 1.2370553651283682, + "grad_norm": 1.8434104950338672, + "learning_rate": 8.919554448976848e-06, + "loss": 0.5624, + "step": 1703 + }, + { + "epoch": 1.237781762876535, + "grad_norm": 2.226707270583763, + "learning_rate": 8.918071835761833e-06, + "loss": 0.5408, + "step": 1704 + }, + { + "epoch": 1.2385081606247021, + "grad_norm": 0.46962685875100246, + "learning_rate": 8.916588329405164e-06, + "loss": 0.5627, + "step": 1705 + }, + { + "epoch": 1.239234558372869, + "grad_norm": 0.41155769724241303, + "learning_rate": 8.915103930245015e-06, + "loss": 0.56, + "step": 1706 + }, + { + "epoch": 1.239960956121036, + "grad_norm": 0.4924690840390668, + "learning_rate": 8.913618638619757e-06, + "loss": 0.5503, + "step": 1707 + }, + { + "epoch": 1.240687353869203, + "grad_norm": 0.44712240099824757, + "learning_rate": 8.912132454867972e-06, + "loss": 0.5658, + "step": 1708 + }, + { + "epoch": 1.24141375161737, + "grad_norm": 0.5552373112222078, + "learning_rate": 8.910645379328442e-06, + "loss": 0.5483, + "step": 1709 + }, + { + "epoch": 1.2421401493655368, + "grad_norm": 0.7778708121407731, + "learning_rate": 8.90915741234015e-06, + "loss": 0.5438, + "step": 1710 + }, + { + "epoch": 1.242866547113704, + "grad_norm": 0.48332018315159286, + "learning_rate": 8.907668554242286e-06, + "loss": 0.5506, + "step": 1711 + }, + { + "epoch": 1.243592944861871, + "grad_norm": 0.4372921808238204, + "learning_rate": 8.906178805374243e-06, + "loss": 0.5508, + "step": 1712 + }, + { + "epoch": 1.2443193426100378, + "grad_norm": 0.4716974668725763, + "learning_rate": 8.904688166075614e-06, + "loss": 0.5515, + "step": 1713 + }, + { + "epoch": 1.245045740358205, + "grad_norm": 0.5165417601435339, + "learning_rate": 8.903196636686198e-06, + "loss": 0.5343, + "step": 1714 + }, + 
{ + "epoch": 1.2457721381063718, + "grad_norm": 0.47588955550056855, + "learning_rate": 8.901704217545995e-06, + "loss": 0.5535, + "step": 1715 + }, + { + "epoch": 1.2464985358545388, + "grad_norm": 0.41375809514814743, + "learning_rate": 8.900210908995207e-06, + "loss": 0.5495, + "step": 1716 + }, + { + "epoch": 1.247224933602706, + "grad_norm": 0.3667236648873448, + "learning_rate": 8.898716711374243e-06, + "loss": 0.5657, + "step": 1717 + }, + { + "epoch": 1.2479513313508728, + "grad_norm": 0.472340981602009, + "learning_rate": 8.89722162502371e-06, + "loss": 0.541, + "step": 1718 + }, + { + "epoch": 1.2486777290990398, + "grad_norm": 0.4053169462679586, + "learning_rate": 8.89572565028442e-06, + "loss": 0.5594, + "step": 1719 + }, + { + "epoch": 1.2494041268472067, + "grad_norm": 0.5282769709840852, + "learning_rate": 8.894228787497389e-06, + "loss": 0.5687, + "step": 1720 + }, + { + "epoch": 1.2501305245953738, + "grad_norm": 0.4227663685562685, + "learning_rate": 8.89273103700383e-06, + "loss": 0.5313, + "step": 1721 + }, + { + "epoch": 1.2508569223435408, + "grad_norm": 0.48655409953672724, + "learning_rate": 8.891232399145164e-06, + "loss": 0.5461, + "step": 1722 + }, + { + "epoch": 1.2515833200917077, + "grad_norm": 0.4797567850949162, + "learning_rate": 8.88973287426301e-06, + "loss": 0.5577, + "step": 1723 + }, + { + "epoch": 1.2523097178398748, + "grad_norm": 0.37881110627643433, + "learning_rate": 8.888232462699196e-06, + "loss": 0.5517, + "step": 1724 + }, + { + "epoch": 1.2530361155880416, + "grad_norm": 0.4458797743651538, + "learning_rate": 8.88673116479574e-06, + "loss": 0.5386, + "step": 1725 + }, + { + "epoch": 1.2537625133362087, + "grad_norm": 0.47035221294764884, + "learning_rate": 8.885228980894877e-06, + "loss": 0.5675, + "step": 1726 + }, + { + "epoch": 1.2544889110843758, + "grad_norm": 0.4037941136698805, + "learning_rate": 8.883725911339032e-06, + "loss": 0.5492, + "step": 1727 + }, + { + "epoch": 1.2552153088325426, + "grad_norm": 0.43376466055770363, + "learning_rate": 8.882221956470838e-06, + "loss": 0.5576, + "step": 1728 + }, + { + "epoch": 1.2559417065807095, + "grad_norm": 0.4404900043276078, + "learning_rate": 8.880717116633126e-06, + "loss": 0.5497, + "step": 1729 + }, + { + "epoch": 1.2566681043288765, + "grad_norm": 0.4159398594778771, + "learning_rate": 8.87921139216893e-06, + "loss": 0.5403, + "step": 1730 + }, + { + "epoch": 1.2573945020770436, + "grad_norm": 0.40016155319282093, + "learning_rate": 8.877704783421492e-06, + "loss": 0.5596, + "step": 1731 + }, + { + "epoch": 1.2581208998252105, + "grad_norm": 0.6201551996430268, + "learning_rate": 8.876197290734247e-06, + "loss": 0.5486, + "step": 1732 + }, + { + "epoch": 1.2588472975733775, + "grad_norm": 0.423114797642255, + "learning_rate": 8.874688914450833e-06, + "loss": 0.5619, + "step": 1733 + }, + { + "epoch": 1.2595736953215444, + "grad_norm": 0.4015212537751768, + "learning_rate": 8.873179654915093e-06, + "loss": 0.5506, + "step": 1734 + }, + { + "epoch": 1.2603000930697115, + "grad_norm": 0.4138072597511688, + "learning_rate": 8.871669512471068e-06, + "loss": 0.5414, + "step": 1735 + }, + { + "epoch": 1.2610264908178785, + "grad_norm": 0.37740103115812385, + "learning_rate": 8.870158487463003e-06, + "loss": 0.5652, + "step": 1736 + }, + { + "epoch": 1.2617528885660454, + "grad_norm": 0.4672154959611961, + "learning_rate": 8.86864658023534e-06, + "loss": 0.5509, + "step": 1737 + }, + { + "epoch": 1.2624792863142125, + "grad_norm": 0.3794877865595524, + "learning_rate": 
8.867133791132729e-06, + "loss": 0.5378, + "step": 1738 + }, + { + "epoch": 1.2632056840623793, + "grad_norm": 0.3941484564763056, + "learning_rate": 8.865620120500017e-06, + "loss": 0.5313, + "step": 1739 + }, + { + "epoch": 1.2639320818105464, + "grad_norm": 0.433625775037335, + "learning_rate": 8.864105568682245e-06, + "loss": 0.5477, + "step": 1740 + }, + { + "epoch": 1.2646584795587135, + "grad_norm": 0.4256197777407076, + "learning_rate": 8.862590136024668e-06, + "loss": 0.547, + "step": 1741 + }, + { + "epoch": 1.2653848773068803, + "grad_norm": 0.36925502674557587, + "learning_rate": 8.861073822872735e-06, + "loss": 0.5558, + "step": 1742 + }, + { + "epoch": 1.2661112750550474, + "grad_norm": 0.4404635405799064, + "learning_rate": 8.859556629572095e-06, + "loss": 0.5528, + "step": 1743 + }, + { + "epoch": 1.2668376728032142, + "grad_norm": 0.4142530403240027, + "learning_rate": 8.858038556468598e-06, + "loss": 0.5577, + "step": 1744 + }, + { + "epoch": 1.2675640705513813, + "grad_norm": 0.418909820251429, + "learning_rate": 8.856519603908295e-06, + "loss": 0.5482, + "step": 1745 + }, + { + "epoch": 1.2682904682995484, + "grad_norm": 0.47128421655077196, + "learning_rate": 8.854999772237442e-06, + "loss": 0.5437, + "step": 1746 + }, + { + "epoch": 1.2690168660477152, + "grad_norm": 0.5111805391386319, + "learning_rate": 8.853479061802489e-06, + "loss": 0.5317, + "step": 1747 + }, + { + "epoch": 1.2697432637958823, + "grad_norm": 0.3996020838695995, + "learning_rate": 8.851957472950086e-06, + "loss": 0.5593, + "step": 1748 + }, + { + "epoch": 1.2704696615440492, + "grad_norm": 0.4130100221068787, + "learning_rate": 8.85043500602709e-06, + "loss": 0.5549, + "step": 1749 + }, + { + "epoch": 1.2711960592922162, + "grad_norm": 0.3513680463359487, + "learning_rate": 8.84891166138055e-06, + "loss": 0.5423, + "step": 1750 + }, + { + "epoch": 1.271922457040383, + "grad_norm": 0.4395399002655679, + "learning_rate": 8.847387439357725e-06, + "loss": 0.5557, + "step": 1751 + }, + { + "epoch": 1.2726488547885502, + "grad_norm": 0.38446859995160715, + "learning_rate": 8.84586234030606e-06, + "loss": 0.5463, + "step": 1752 + }, + { + "epoch": 1.273375252536717, + "grad_norm": 0.3957873922173363, + "learning_rate": 8.844336364573214e-06, + "loss": 0.5602, + "step": 1753 + }, + { + "epoch": 1.274101650284884, + "grad_norm": 0.38643960679786543, + "learning_rate": 8.842809512507038e-06, + "loss": 0.5458, + "step": 1754 + }, + { + "epoch": 1.2748280480330512, + "grad_norm": 0.3706602353251092, + "learning_rate": 8.841281784455586e-06, + "loss": 0.5602, + "step": 1755 + }, + { + "epoch": 1.275554445781218, + "grad_norm": 0.3796213188989138, + "learning_rate": 8.839753180767108e-06, + "loss": 0.5635, + "step": 1756 + }, + { + "epoch": 1.276280843529385, + "grad_norm": 0.45928464006098907, + "learning_rate": 8.838223701790057e-06, + "loss": 0.5426, + "step": 1757 + }, + { + "epoch": 1.277007241277552, + "grad_norm": 0.3818371835551699, + "learning_rate": 8.836693347873084e-06, + "loss": 0.5463, + "step": 1758 + }, + { + "epoch": 1.277733639025719, + "grad_norm": 0.36166963772964084, + "learning_rate": 8.835162119365042e-06, + "loss": 0.5264, + "step": 1759 + }, + { + "epoch": 1.278460036773886, + "grad_norm": 0.385670474786461, + "learning_rate": 8.833630016614976e-06, + "loss": 0.543, + "step": 1760 + }, + { + "epoch": 1.279186434522053, + "grad_norm": 0.3990063492555785, + "learning_rate": 8.832097039972144e-06, + "loss": 0.5338, + "step": 1761 + }, + { + "epoch": 1.27991283227022, + "grad_norm": 
0.6653963038347863, + "learning_rate": 8.830563189785986e-06, + "loss": 0.5375, + "step": 1762 + }, + { + "epoch": 1.2806392300183869, + "grad_norm": 0.39286885480174005, + "learning_rate": 8.829028466406156e-06, + "loss": 0.5575, + "step": 1763 + }, + { + "epoch": 1.281365627766554, + "grad_norm": 0.39103188453512855, + "learning_rate": 8.827492870182496e-06, + "loss": 0.5467, + "step": 1764 + }, + { + "epoch": 1.282092025514721, + "grad_norm": 0.3530567148397571, + "learning_rate": 8.825956401465056e-06, + "loss": 0.5501, + "step": 1765 + }, + { + "epoch": 1.2828184232628879, + "grad_norm": 0.3962145821734583, + "learning_rate": 8.824419060604081e-06, + "loss": 0.5512, + "step": 1766 + }, + { + "epoch": 1.283544821011055, + "grad_norm": 0.4155954746083735, + "learning_rate": 8.822880847950011e-06, + "loss": 0.543, + "step": 1767 + }, + { + "epoch": 1.2842712187592218, + "grad_norm": 0.36229776590475643, + "learning_rate": 8.82134176385349e-06, + "loss": 0.5494, + "step": 1768 + }, + { + "epoch": 1.2849976165073889, + "grad_norm": 0.3892187196516395, + "learning_rate": 8.819801808665361e-06, + "loss": 0.5606, + "step": 1769 + }, + { + "epoch": 1.285724014255556, + "grad_norm": 0.6272602907160172, + "learning_rate": 8.818260982736662e-06, + "loss": 0.5466, + "step": 1770 + }, + { + "epoch": 1.2864504120037228, + "grad_norm": 0.3920063283771201, + "learning_rate": 8.81671928641863e-06, + "loss": 0.5346, + "step": 1771 + }, + { + "epoch": 1.2871768097518896, + "grad_norm": 0.34977300906231246, + "learning_rate": 8.815176720062701e-06, + "loss": 0.5343, + "step": 1772 + }, + { + "epoch": 1.2879032075000567, + "grad_norm": 0.4164422641969452, + "learning_rate": 8.813633284020512e-06, + "loss": 0.5552, + "step": 1773 + }, + { + "epoch": 1.2886296052482238, + "grad_norm": 0.389797438250589, + "learning_rate": 8.812088978643894e-06, + "loss": 0.5468, + "step": 1774 + }, + { + "epoch": 1.2893560029963906, + "grad_norm": 0.8495184137168471, + "learning_rate": 8.810543804284879e-06, + "loss": 0.5501, + "step": 1775 + }, + { + "epoch": 1.2900824007445577, + "grad_norm": 0.46604450197114805, + "learning_rate": 8.808997761295698e-06, + "loss": 0.5455, + "step": 1776 + }, + { + "epoch": 1.2908087984927246, + "grad_norm": 0.39491912481678554, + "learning_rate": 8.807450850028776e-06, + "loss": 0.5362, + "step": 1777 + }, + { + "epoch": 1.2915351962408916, + "grad_norm": 0.49234274419716917, + "learning_rate": 8.805903070836738e-06, + "loss": 0.5406, + "step": 1778 + }, + { + "epoch": 1.2922615939890587, + "grad_norm": 0.34933402697149396, + "learning_rate": 8.80435442407241e-06, + "loss": 0.5448, + "step": 1779 + }, + { + "epoch": 1.2929879917372256, + "grad_norm": 0.4826316850229999, + "learning_rate": 8.80280491008881e-06, + "loss": 0.5451, + "step": 1780 + }, + { + "epoch": 1.2937143894853926, + "grad_norm": 0.39767972884114156, + "learning_rate": 8.801254529239156e-06, + "loss": 0.5499, + "step": 1781 + }, + { + "epoch": 1.2944407872335595, + "grad_norm": 0.4612738879037992, + "learning_rate": 8.799703281876866e-06, + "loss": 0.5473, + "step": 1782 + }, + { + "epoch": 1.2951671849817266, + "grad_norm": 0.4211579062948049, + "learning_rate": 8.798151168355555e-06, + "loss": 0.5347, + "step": 1783 + }, + { + "epoch": 1.2958935827298936, + "grad_norm": 0.4422061037768507, + "learning_rate": 8.79659818902903e-06, + "loss": 0.5535, + "step": 1784 + }, + { + "epoch": 1.2966199804780605, + "grad_norm": 0.5858687372390492, + "learning_rate": 8.795044344251302e-06, + "loss": 0.547, + "step": 1785 + }, + { + 
"epoch": 1.2973463782262276, + "grad_norm": 0.39423622712419204, + "learning_rate": 8.793489634376576e-06, + "loss": 0.5372, + "step": 1786 + }, + { + "epoch": 1.2980727759743944, + "grad_norm": 0.4078027764596252, + "learning_rate": 8.791934059759256e-06, + "loss": 0.53, + "step": 1787 + }, + { + "epoch": 1.2987991737225615, + "grad_norm": 0.4120785404484633, + "learning_rate": 8.79037762075394e-06, + "loss": 0.5541, + "step": 1788 + }, + { + "epoch": 1.2995255714707286, + "grad_norm": 0.36186240974562306, + "learning_rate": 8.788820317715427e-06, + "loss": 0.5424, + "step": 1789 + }, + { + "epoch": 1.3002519692188954, + "grad_norm": 0.4251288194120092, + "learning_rate": 8.787262150998713e-06, + "loss": 0.5549, + "step": 1790 + }, + { + "epoch": 1.3009783669670625, + "grad_norm": 0.4915263914925816, + "learning_rate": 8.785703120958984e-06, + "loss": 0.5326, + "step": 1791 + }, + { + "epoch": 1.3017047647152293, + "grad_norm": 0.3983668140161739, + "learning_rate": 8.784143227951628e-06, + "loss": 0.5303, + "step": 1792 + }, + { + "epoch": 1.3024311624633964, + "grad_norm": 0.3754965619413052, + "learning_rate": 8.782582472332236e-06, + "loss": 0.5393, + "step": 1793 + }, + { + "epoch": 1.3031575602115635, + "grad_norm": 0.3773856242093586, + "learning_rate": 8.781020854456582e-06, + "loss": 0.535, + "step": 1794 + }, + { + "epoch": 1.3038839579597303, + "grad_norm": 0.4490866762349985, + "learning_rate": 8.779458374680646e-06, + "loss": 0.5308, + "step": 1795 + }, + { + "epoch": 1.3046103557078972, + "grad_norm": 0.40733809453212066, + "learning_rate": 8.777895033360603e-06, + "loss": 0.5478, + "step": 1796 + }, + { + "epoch": 1.3053367534560643, + "grad_norm": 0.42410169866054975, + "learning_rate": 8.776330830852825e-06, + "loss": 0.5534, + "step": 1797 + }, + { + "epoch": 1.3060631512042313, + "grad_norm": 0.34776512661909453, + "learning_rate": 8.774765767513876e-06, + "loss": 0.5411, + "step": 1798 + }, + { + "epoch": 1.3067895489523982, + "grad_norm": 0.3769902398807461, + "learning_rate": 8.773199843700518e-06, + "loss": 0.5309, + "step": 1799 + }, + { + "epoch": 1.3075159467005653, + "grad_norm": 0.37330832388241414, + "learning_rate": 8.771633059769712e-06, + "loss": 0.549, + "step": 1800 + }, + { + "epoch": 1.308242344448732, + "grad_norm": 0.39550029782820983, + "learning_rate": 8.770065416078615e-06, + "loss": 0.5539, + "step": 1801 + }, + { + "epoch": 1.3089687421968992, + "grad_norm": 0.3819609212474707, + "learning_rate": 8.768496912984574e-06, + "loss": 0.5346, + "step": 1802 + }, + { + "epoch": 1.3096951399450663, + "grad_norm": 0.4118791910236496, + "learning_rate": 8.76692755084514e-06, + "loss": 0.5563, + "step": 1803 + }, + { + "epoch": 1.310421537693233, + "grad_norm": 0.44571217160818744, + "learning_rate": 8.765357330018056e-06, + "loss": 0.5438, + "step": 1804 + }, + { + "epoch": 1.3111479354414002, + "grad_norm": 0.4121837514951978, + "learning_rate": 8.763786250861258e-06, + "loss": 0.5497, + "step": 1805 + }, + { + "epoch": 1.311874333189567, + "grad_norm": 0.4083163261875877, + "learning_rate": 8.762214313732881e-06, + "loss": 0.5552, + "step": 1806 + }, + { + "epoch": 1.312600730937734, + "grad_norm": 0.4181152080467721, + "learning_rate": 8.760641518991257e-06, + "loss": 0.564, + "step": 1807 + }, + { + "epoch": 1.3133271286859012, + "grad_norm": 0.38364370277204013, + "learning_rate": 8.75906786699491e-06, + "loss": 0.5378, + "step": 1808 + }, + { + "epoch": 1.314053526434068, + "grad_norm": 0.41869608928831936, + "learning_rate": 8.75749335810256e-06, + 
"loss": 0.5464, + "step": 1809 + }, + { + "epoch": 1.314779924182235, + "grad_norm": 0.42528426296113725, + "learning_rate": 8.755917992673126e-06, + "loss": 0.5463, + "step": 1810 + }, + { + "epoch": 1.315506321930402, + "grad_norm": 0.38483565529483227, + "learning_rate": 8.754341771065716e-06, + "loss": 0.5398, + "step": 1811 + }, + { + "epoch": 1.316232719678569, + "grad_norm": 0.46536447305861245, + "learning_rate": 8.75276469363964e-06, + "loss": 0.5464, + "step": 1812 + }, + { + "epoch": 1.316959117426736, + "grad_norm": 0.41489340357941334, + "learning_rate": 8.751186760754397e-06, + "loss": 0.5319, + "step": 1813 + }, + { + "epoch": 1.317685515174903, + "grad_norm": 0.4578205762462449, + "learning_rate": 8.749607972769685e-06, + "loss": 0.5439, + "step": 1814 + }, + { + "epoch": 1.3184119129230698, + "grad_norm": 0.49155744319398526, + "learning_rate": 8.748028330045395e-06, + "loss": 0.5508, + "step": 1815 + }, + { + "epoch": 1.3191383106712369, + "grad_norm": 0.37190008962866994, + "learning_rate": 8.746447832941614e-06, + "loss": 0.5534, + "step": 1816 + }, + { + "epoch": 1.319864708419404, + "grad_norm": 0.3769835428333359, + "learning_rate": 8.744866481818624e-06, + "loss": 0.5389, + "step": 1817 + }, + { + "epoch": 1.3205911061675708, + "grad_norm": 0.41417801427669604, + "learning_rate": 8.743284277036899e-06, + "loss": 0.5271, + "step": 1818 + }, + { + "epoch": 1.3213175039157379, + "grad_norm": 0.3655224383838069, + "learning_rate": 8.74170121895711e-06, + "loss": 0.5684, + "step": 1819 + }, + { + "epoch": 1.3220439016639047, + "grad_norm": 0.5029173229142005, + "learning_rate": 8.740117307940123e-06, + "loss": 0.5374, + "step": 1820 + }, + { + "epoch": 1.3227702994120718, + "grad_norm": 0.4438745767992502, + "learning_rate": 8.738532544346998e-06, + "loss": 0.544, + "step": 1821 + }, + { + "epoch": 1.3234966971602389, + "grad_norm": 0.3818937011598134, + "learning_rate": 8.736946928538988e-06, + "loss": 0.5456, + "step": 1822 + }, + { + "epoch": 1.3242230949084057, + "grad_norm": 0.4314164584511377, + "learning_rate": 8.73536046087754e-06, + "loss": 0.5336, + "step": 1823 + }, + { + "epoch": 1.3249494926565728, + "grad_norm": 0.39232944801618264, + "learning_rate": 8.733773141724298e-06, + "loss": 0.5363, + "step": 1824 + }, + { + "epoch": 1.3256758904047397, + "grad_norm": 0.505176054594776, + "learning_rate": 8.732184971441098e-06, + "loss": 0.5478, + "step": 1825 + }, + { + "epoch": 1.3264022881529067, + "grad_norm": 0.4141339957842407, + "learning_rate": 8.730595950389968e-06, + "loss": 0.5758, + "step": 1826 + }, + { + "epoch": 1.3271286859010738, + "grad_norm": 0.3771489964312615, + "learning_rate": 8.729006078933136e-06, + "loss": 0.5457, + "step": 1827 + }, + { + "epoch": 1.3278550836492407, + "grad_norm": 0.3418085527128151, + "learning_rate": 8.727415357433018e-06, + "loss": 0.541, + "step": 1828 + }, + { + "epoch": 1.3285814813974077, + "grad_norm": 0.36072786868922635, + "learning_rate": 8.725823786252226e-06, + "loss": 0.55, + "step": 1829 + }, + { + "epoch": 1.3293078791455746, + "grad_norm": 0.35881277088044755, + "learning_rate": 8.724231365753567e-06, + "loss": 0.5497, + "step": 1830 + }, + { + "epoch": 1.3300342768937417, + "grad_norm": 0.4034179783998635, + "learning_rate": 8.722638096300037e-06, + "loss": 0.5403, + "step": 1831 + }, + { + "epoch": 1.3307606746419087, + "grad_norm": 0.38808308265799407, + "learning_rate": 8.721043978254833e-06, + "loss": 0.5521, + "step": 1832 + }, + { + "epoch": 1.3314870723900756, + "grad_norm": 0.37141607365637724, 
+ "learning_rate": 8.71944901198134e-06, + "loss": 0.5288, + "step": 1833 + }, + { + "epoch": 1.3322134701382427, + "grad_norm": 0.4153027779362984, + "learning_rate": 8.717853197843134e-06, + "loss": 0.563, + "step": 1834 + }, + { + "epoch": 1.3329398678864095, + "grad_norm": 0.4063731663632702, + "learning_rate": 8.716256536203992e-06, + "loss": 0.5381, + "step": 1835 + }, + { + "epoch": 1.3336662656345766, + "grad_norm": 0.3720949582825728, + "learning_rate": 8.714659027427878e-06, + "loss": 0.5378, + "step": 1836 + }, + { + "epoch": 1.3343926633827436, + "grad_norm": 0.38950561831594427, + "learning_rate": 8.713060671878953e-06, + "loss": 0.5408, + "step": 1837 + }, + { + "epoch": 1.3351190611309105, + "grad_norm": 0.3921429083344398, + "learning_rate": 8.711461469921568e-06, + "loss": 0.5313, + "step": 1838 + }, + { + "epoch": 1.3358454588790774, + "grad_norm": 0.3842738383447121, + "learning_rate": 8.709861421920267e-06, + "loss": 0.5579, + "step": 1839 + }, + { + "epoch": 1.3365718566272444, + "grad_norm": 0.4466227404643763, + "learning_rate": 8.708260528239788e-06, + "loss": 0.5391, + "step": 1840 + }, + { + "epoch": 1.3372982543754115, + "grad_norm": 0.40000502784137687, + "learning_rate": 8.706658789245064e-06, + "loss": 0.5329, + "step": 1841 + }, + { + "epoch": 1.3380246521235784, + "grad_norm": 0.3681548592380934, + "learning_rate": 8.705056205301217e-06, + "loss": 0.543, + "step": 1842 + }, + { + "epoch": 1.3387510498717454, + "grad_norm": 0.5442305579733531, + "learning_rate": 8.703452776773563e-06, + "loss": 0.5572, + "step": 1843 + }, + { + "epoch": 1.3394774476199123, + "grad_norm": 0.41846847295976275, + "learning_rate": 8.70184850402761e-06, + "loss": 0.5336, + "step": 1844 + }, + { + "epoch": 1.3402038453680793, + "grad_norm": 0.39723343483159357, + "learning_rate": 8.700243387429061e-06, + "loss": 0.5432, + "step": 1845 + }, + { + "epoch": 1.3409302431162464, + "grad_norm": 0.38939768776621403, + "learning_rate": 8.698637427343809e-06, + "loss": 0.5229, + "step": 1846 + }, + { + "epoch": 1.3416566408644133, + "grad_norm": 0.4013658344287758, + "learning_rate": 8.697030624137937e-06, + "loss": 0.5599, + "step": 1847 + }, + { + "epoch": 1.3423830386125803, + "grad_norm": 0.38252834404139674, + "learning_rate": 8.695422978177724e-06, + "loss": 0.5364, + "step": 1848 + }, + { + "epoch": 1.3431094363607472, + "grad_norm": 0.37487413981443296, + "learning_rate": 8.693814489829643e-06, + "loss": 0.5305, + "step": 1849 + }, + { + "epoch": 1.3438358341089143, + "grad_norm": 0.5045145749139772, + "learning_rate": 8.692205159460349e-06, + "loss": 0.5324, + "step": 1850 + }, + { + "epoch": 1.3445622318570813, + "grad_norm": 0.3834848107935937, + "learning_rate": 8.690594987436705e-06, + "loss": 0.5373, + "step": 1851 + }, + { + "epoch": 1.3452886296052482, + "grad_norm": 0.402784897329814, + "learning_rate": 8.68898397412575e-06, + "loss": 0.5405, + "step": 1852 + }, + { + "epoch": 1.3460150273534153, + "grad_norm": 0.42427307386128277, + "learning_rate": 8.687372119894723e-06, + "loss": 0.5583, + "step": 1853 + }, + { + "epoch": 1.3467414251015821, + "grad_norm": 0.3911558225143398, + "learning_rate": 8.685759425111056e-06, + "loss": 0.5583, + "step": 1854 + }, + { + "epoch": 1.3474678228497492, + "grad_norm": 0.3543476050936793, + "learning_rate": 8.684145890142366e-06, + "loss": 0.5393, + "step": 1855 + }, + { + "epoch": 1.3481942205979163, + "grad_norm": 0.4137430614844222, + "learning_rate": 8.682531515356467e-06, + "loss": 0.5418, + "step": 1856 + }, + { + "epoch": 
1.3489206183460831, + "grad_norm": 0.44802159447591655, + "learning_rate": 8.680916301121365e-06, + "loss": 0.5558, + "step": 1857 + }, + { + "epoch": 1.34964701609425, + "grad_norm": 0.39855739423499525, + "learning_rate": 8.679300247805252e-06, + "loss": 0.5488, + "step": 1858 + }, + { + "epoch": 1.350373413842417, + "grad_norm": 0.35349526794579944, + "learning_rate": 8.677683355776515e-06, + "loss": 0.5348, + "step": 1859 + }, + { + "epoch": 1.3510998115905841, + "grad_norm": 0.4079156229909949, + "learning_rate": 8.676065625403733e-06, + "loss": 0.5355, + "step": 1860 + }, + { + "epoch": 1.351826209338751, + "grad_norm": 0.4101135183395125, + "learning_rate": 8.674447057055673e-06, + "loss": 0.5592, + "step": 1861 + }, + { + "epoch": 1.352552607086918, + "grad_norm": 0.4373664713387905, + "learning_rate": 8.672827651101297e-06, + "loss": 0.5451, + "step": 1862 + }, + { + "epoch": 1.353279004835085, + "grad_norm": 0.3981073036267596, + "learning_rate": 8.671207407909754e-06, + "loss": 0.5478, + "step": 1863 + }, + { + "epoch": 1.354005402583252, + "grad_norm": 0.3773769438352129, + "learning_rate": 8.669586327850386e-06, + "loss": 0.5414, + "step": 1864 + }, + { + "epoch": 1.354731800331419, + "grad_norm": 0.4121506936367948, + "learning_rate": 8.667964411292725e-06, + "loss": 0.5418, + "step": 1865 + }, + { + "epoch": 1.355458198079586, + "grad_norm": 0.3721842200340298, + "learning_rate": 8.666341658606493e-06, + "loss": 0.533, + "step": 1866 + }, + { + "epoch": 1.356184595827753, + "grad_norm": 0.4303316522403638, + "learning_rate": 8.664718070161607e-06, + "loss": 0.5486, + "step": 1867 + }, + { + "epoch": 1.3569109935759198, + "grad_norm": 0.3803968632015748, + "learning_rate": 8.663093646328166e-06, + "loss": 0.5482, + "step": 1868 + }, + { + "epoch": 1.357637391324087, + "grad_norm": 0.4155775190064203, + "learning_rate": 8.661468387476471e-06, + "loss": 0.5249, + "step": 1869 + }, + { + "epoch": 1.358363789072254, + "grad_norm": 0.41649246283951763, + "learning_rate": 8.659842293977e-06, + "loss": 0.5353, + "step": 1870 + }, + { + "epoch": 1.3590901868204208, + "grad_norm": 0.4117337284313234, + "learning_rate": 8.658215366200433e-06, + "loss": 0.5451, + "step": 1871 + }, + { + "epoch": 1.359816584568588, + "grad_norm": 0.4392355311303106, + "learning_rate": 8.656587604517635e-06, + "loss": 0.5366, + "step": 1872 + }, + { + "epoch": 1.3605429823167547, + "grad_norm": 0.39648896071408246, + "learning_rate": 8.654959009299657e-06, + "loss": 0.5404, + "step": 1873 + }, + { + "epoch": 1.3612693800649218, + "grad_norm": 0.3872675445529076, + "learning_rate": 8.653329580917752e-06, + "loss": 0.5475, + "step": 1874 + }, + { + "epoch": 1.361995777813089, + "grad_norm": 0.4218469959253971, + "learning_rate": 8.651699319743348e-06, + "loss": 0.5293, + "step": 1875 + }, + { + "epoch": 1.3627221755612557, + "grad_norm": 0.41767884109068387, + "learning_rate": 8.650068226148074e-06, + "loss": 0.5317, + "step": 1876 + }, + { + "epoch": 1.3634485733094228, + "grad_norm": 0.3859802721551015, + "learning_rate": 8.648436300503742e-06, + "loss": 0.5293, + "step": 1877 + }, + { + "epoch": 1.3641749710575897, + "grad_norm": 0.42945118279457706, + "learning_rate": 8.646803543182361e-06, + "loss": 0.5376, + "step": 1878 + }, + { + "epoch": 1.3649013688057567, + "grad_norm": 0.37671501911345834, + "learning_rate": 8.645169954556123e-06, + "loss": 0.5395, + "step": 1879 + }, + { + "epoch": 1.3656277665539238, + "grad_norm": 0.369319222907703, + "learning_rate": 8.64353553499741e-06, + "loss": 0.5338, + 
"step": 1880 + }, + { + "epoch": 1.3663541643020907, + "grad_norm": 0.34095153700697944, + "learning_rate": 8.641900284878798e-06, + "loss": 0.5418, + "step": 1881 + }, + { + "epoch": 1.3670805620502575, + "grad_norm": 0.6919819010110789, + "learning_rate": 8.640264204573049e-06, + "loss": 0.5415, + "step": 1882 + }, + { + "epoch": 1.3678069597984246, + "grad_norm": 0.3531349893202495, + "learning_rate": 8.63862729445311e-06, + "loss": 0.5551, + "step": 1883 + }, + { + "epoch": 1.3685333575465917, + "grad_norm": 0.37080974188770977, + "learning_rate": 8.63698955489213e-06, + "loss": 0.5424, + "step": 1884 + }, + { + "epoch": 1.3692597552947585, + "grad_norm": 0.40867422693303573, + "learning_rate": 8.635350986263431e-06, + "loss": 0.5451, + "step": 1885 + }, + { + "epoch": 1.3699861530429256, + "grad_norm": 0.4680403849119622, + "learning_rate": 8.633711588940538e-06, + "loss": 0.5471, + "step": 1886 + }, + { + "epoch": 1.3707125507910924, + "grad_norm": 0.3835701618473411, + "learning_rate": 8.632071363297155e-06, + "loss": 0.5423, + "step": 1887 + }, + { + "epoch": 1.3714389485392595, + "grad_norm": 0.38961022680836604, + "learning_rate": 8.63043030970718e-06, + "loss": 0.5414, + "step": 1888 + }, + { + "epoch": 1.3721653462874266, + "grad_norm": 0.36131709249510413, + "learning_rate": 8.628788428544698e-06, + "loss": 0.5316, + "step": 1889 + }, + { + "epoch": 1.3728917440355934, + "grad_norm": 0.3730237230261701, + "learning_rate": 8.627145720183986e-06, + "loss": 0.5461, + "step": 1890 + }, + { + "epoch": 1.3736181417837605, + "grad_norm": 0.44393089036469924, + "learning_rate": 8.6255021849995e-06, + "loss": 0.5606, + "step": 1891 + }, + { + "epoch": 1.3743445395319274, + "grad_norm": 0.3878934576688868, + "learning_rate": 8.623857823365896e-06, + "loss": 0.5303, + "step": 1892 + }, + { + "epoch": 1.3750709372800944, + "grad_norm": 0.36740185024361566, + "learning_rate": 8.622212635658015e-06, + "loss": 0.5318, + "step": 1893 + }, + { + "epoch": 1.3757973350282615, + "grad_norm": 0.3873406422624257, + "learning_rate": 8.620566622250878e-06, + "loss": 0.5422, + "step": 1894 + }, + { + "epoch": 1.3765237327764284, + "grad_norm": 0.43456363913952073, + "learning_rate": 8.618919783519706e-06, + "loss": 0.5392, + "step": 1895 + }, + { + "epoch": 1.3772501305245954, + "grad_norm": 0.40024030236124175, + "learning_rate": 8.617272119839903e-06, + "loss": 0.525, + "step": 1896 + }, + { + "epoch": 1.3779765282727623, + "grad_norm": 0.3761797253882858, + "learning_rate": 8.61562363158706e-06, + "loss": 0.541, + "step": 1897 + }, + { + "epoch": 1.3787029260209294, + "grad_norm": 0.3707955979046983, + "learning_rate": 8.613974319136959e-06, + "loss": 0.5317, + "step": 1898 + }, + { + "epoch": 1.3794293237690964, + "grad_norm": 0.3760358723392047, + "learning_rate": 8.612324182865565e-06, + "loss": 0.5465, + "step": 1899 + }, + { + "epoch": 1.3801557215172633, + "grad_norm": 0.4256562839209359, + "learning_rate": 8.610673223149036e-06, + "loss": 0.5414, + "step": 1900 + }, + { + "epoch": 1.3808821192654301, + "grad_norm": 0.4279063471862655, + "learning_rate": 8.609021440363712e-06, + "loss": 0.5535, + "step": 1901 + }, + { + "epoch": 1.3816085170135972, + "grad_norm": 0.3509451349635699, + "learning_rate": 8.60736883488613e-06, + "loss": 0.5357, + "step": 1902 + }, + { + "epoch": 1.3823349147617643, + "grad_norm": 0.581332340740415, + "learning_rate": 8.605715407093005e-06, + "loss": 0.5311, + "step": 1903 + }, + { + "epoch": 1.3830613125099311, + "grad_norm": 0.4407596324959687, + 
"learning_rate": 8.604061157361243e-06, + "loss": 0.5467, + "step": 1904 + }, + { + "epoch": 1.3837877102580982, + "grad_norm": 0.4659283840524074, + "learning_rate": 8.602406086067938e-06, + "loss": 0.5415, + "step": 1905 + }, + { + "epoch": 1.384514108006265, + "grad_norm": 0.3802760596789848, + "learning_rate": 8.60075019359037e-06, + "loss": 0.5515, + "step": 1906 + }, + { + "epoch": 1.3852405057544321, + "grad_norm": 0.38316526842068516, + "learning_rate": 8.599093480306009e-06, + "loss": 0.5577, + "step": 1907 + }, + { + "epoch": 1.3859669035025992, + "grad_norm": 0.49345679046274693, + "learning_rate": 8.59743594659251e-06, + "loss": 0.5567, + "step": 1908 + }, + { + "epoch": 1.386693301250766, + "grad_norm": 0.4407146899861917, + "learning_rate": 8.595777592827713e-06, + "loss": 0.5374, + "step": 1909 + }, + { + "epoch": 1.3874196989989331, + "grad_norm": 0.3686260662439781, + "learning_rate": 8.594118419389648e-06, + "loss": 0.5358, + "step": 1910 + }, + { + "epoch": 1.3881460967471, + "grad_norm": 0.43813341796941524, + "learning_rate": 8.592458426656531e-06, + "loss": 0.5292, + "step": 1911 + }, + { + "epoch": 1.388872494495267, + "grad_norm": 0.5289201011280882, + "learning_rate": 8.590797615006763e-06, + "loss": 0.5314, + "step": 1912 + }, + { + "epoch": 1.3895988922434341, + "grad_norm": 0.3545435738165874, + "learning_rate": 8.589135984818936e-06, + "loss": 0.5331, + "step": 1913 + }, + { + "epoch": 1.390325289991601, + "grad_norm": 0.3880181286798106, + "learning_rate": 8.587473536471824e-06, + "loss": 0.5292, + "step": 1914 + }, + { + "epoch": 1.391051687739768, + "grad_norm": 0.3711953855294708, + "learning_rate": 8.585810270344391e-06, + "loss": 0.5398, + "step": 1915 + }, + { + "epoch": 1.391778085487935, + "grad_norm": 0.3606452388095397, + "learning_rate": 8.584146186815785e-06, + "loss": 0.5388, + "step": 1916 + }, + { + "epoch": 1.392504483236102, + "grad_norm": 0.49144573790253854, + "learning_rate": 8.582481286265341e-06, + "loss": 0.5331, + "step": 1917 + }, + { + "epoch": 1.393230880984269, + "grad_norm": 0.44018366120876584, + "learning_rate": 8.580815569072579e-06, + "loss": 0.553, + "step": 1918 + }, + { + "epoch": 1.393957278732436, + "grad_norm": 0.4114140131639351, + "learning_rate": 8.57914903561721e-06, + "loss": 0.5438, + "step": 1919 + }, + { + "epoch": 1.394683676480603, + "grad_norm": 0.3779639707831259, + "learning_rate": 8.577481686279123e-06, + "loss": 0.5315, + "step": 1920 + }, + { + "epoch": 1.3954100742287698, + "grad_norm": 0.5961234774932549, + "learning_rate": 8.575813521438401e-06, + "loss": 0.53, + "step": 1921 + }, + { + "epoch": 1.396136471976937, + "grad_norm": 0.5384898478460642, + "learning_rate": 8.574144541475309e-06, + "loss": 0.5325, + "step": 1922 + }, + { + "epoch": 1.396862869725104, + "grad_norm": 0.40857235280824467, + "learning_rate": 8.572474746770298e-06, + "loss": 0.5478, + "step": 1923 + }, + { + "epoch": 1.3975892674732708, + "grad_norm": 0.425386958242189, + "learning_rate": 8.570804137704005e-06, + "loss": 0.5312, + "step": 1924 + }, + { + "epoch": 1.3983156652214377, + "grad_norm": 0.7508299384321971, + "learning_rate": 8.56913271465725e-06, + "loss": 0.5324, + "step": 1925 + }, + { + "epoch": 1.3990420629696048, + "grad_norm": 0.36782172271514246, + "learning_rate": 8.567460478011048e-06, + "loss": 0.5435, + "step": 1926 + }, + { + "epoch": 1.3997684607177718, + "grad_norm": 0.38067589919289535, + "learning_rate": 8.565787428146586e-06, + "loss": 0.536, + "step": 1927 + }, + { + "epoch": 1.4004948584659387, + 
"grad_norm": 0.5934857184987411, + "learning_rate": 8.564113565445247e-06, + "loss": 0.5341, + "step": 1928 + }, + { + "epoch": 1.4012212562141058, + "grad_norm": 0.3596646337745538, + "learning_rate": 8.56243889028859e-06, + "loss": 0.5165, + "step": 1929 + }, + { + "epoch": 1.4019476539622726, + "grad_norm": 0.4280239182874118, + "learning_rate": 8.560763403058375e-06, + "loss": 0.5389, + "step": 1930 + }, + { + "epoch": 1.4026740517104397, + "grad_norm": 0.41277127732364816, + "learning_rate": 8.559087104136525e-06, + "loss": 0.5437, + "step": 1931 + }, + { + "epoch": 1.4034004494586068, + "grad_norm": 0.35672219835666513, + "learning_rate": 8.557409993905165e-06, + "loss": 0.5297, + "step": 1932 + }, + { + "epoch": 1.4041268472067736, + "grad_norm": 0.3761535339499059, + "learning_rate": 8.5557320727466e-06, + "loss": 0.5419, + "step": 1933 + }, + { + "epoch": 1.4048532449549407, + "grad_norm": 0.403660946737396, + "learning_rate": 8.55405334104332e-06, + "loss": 0.5283, + "step": 1934 + }, + { + "epoch": 1.4055796427031075, + "grad_norm": 0.387444269843542, + "learning_rate": 8.552373799177997e-06, + "loss": 0.5436, + "step": 1935 + }, + { + "epoch": 1.4063060404512746, + "grad_norm": 0.3691292665136819, + "learning_rate": 8.550693447533493e-06, + "loss": 0.5567, + "step": 1936 + }, + { + "epoch": 1.4070324381994417, + "grad_norm": 0.405303329778648, + "learning_rate": 8.549012286492848e-06, + "loss": 0.5315, + "step": 1937 + }, + { + "epoch": 1.4077588359476085, + "grad_norm": 0.6503597959594926, + "learning_rate": 8.54733031643929e-06, + "loss": 0.5369, + "step": 1938 + }, + { + "epoch": 1.4084852336957756, + "grad_norm": 0.39065256110950697, + "learning_rate": 8.545647537756236e-06, + "loss": 0.5332, + "step": 1939 + }, + { + "epoch": 1.4092116314439425, + "grad_norm": 0.4609381596916133, + "learning_rate": 8.543963950827279e-06, + "loss": 0.5442, + "step": 1940 + }, + { + "epoch": 1.4099380291921095, + "grad_norm": 0.3649780548895913, + "learning_rate": 8.5422795560362e-06, + "loss": 0.548, + "step": 1941 + }, + { + "epoch": 1.4106644269402766, + "grad_norm": 0.4168463039717824, + "learning_rate": 8.540594353766966e-06, + "loss": 0.5426, + "step": 1942 + }, + { + "epoch": 1.4113908246884435, + "grad_norm": 0.3605395594921, + "learning_rate": 8.538908344403724e-06, + "loss": 0.522, + "step": 1943 + }, + { + "epoch": 1.4121172224366103, + "grad_norm": 0.43891493266382947, + "learning_rate": 8.53722152833081e-06, + "loss": 0.5485, + "step": 1944 + }, + { + "epoch": 1.4128436201847774, + "grad_norm": 0.3616022146521361, + "learning_rate": 8.535533905932739e-06, + "loss": 0.5428, + "step": 1945 + }, + { + "epoch": 1.4135700179329445, + "grad_norm": 0.4830840461126677, + "learning_rate": 8.533845477594212e-06, + "loss": 0.5359, + "step": 1946 + }, + { + "epoch": 1.4142964156811113, + "grad_norm": 0.44953993531184466, + "learning_rate": 8.532156243700114e-06, + "loss": 0.5396, + "step": 1947 + }, + { + "epoch": 1.4150228134292784, + "grad_norm": 0.34692781192390154, + "learning_rate": 8.530466204635514e-06, + "loss": 0.5394, + "step": 1948 + }, + { + "epoch": 1.4157492111774452, + "grad_norm": 0.383099285241623, + "learning_rate": 8.528775360785665e-06, + "loss": 0.5512, + "step": 1949 + }, + { + "epoch": 1.4164756089256123, + "grad_norm": 0.38018322615703837, + "learning_rate": 8.527083712535998e-06, + "loss": 0.5334, + "step": 1950 + }, + { + "epoch": 1.4172020066737794, + "grad_norm": 0.4071584173973299, + "learning_rate": 8.525391260272134e-06, + "loss": 0.5376, + "step": 1951 + }, + 
{ + "epoch": 1.4179284044219462, + "grad_norm": 0.33721951307742976, + "learning_rate": 8.523698004379878e-06, + "loss": 0.5279, + "step": 1952 + }, + { + "epoch": 1.4186548021701133, + "grad_norm": 0.8918147939342991, + "learning_rate": 8.52200394524521e-06, + "loss": 0.5394, + "step": 1953 + }, + { + "epoch": 1.4193811999182802, + "grad_norm": 0.38716601747441487, + "learning_rate": 8.520309083254301e-06, + "loss": 0.5348, + "step": 1954 + }, + { + "epoch": 1.4201075976664472, + "grad_norm": 0.4193252319693734, + "learning_rate": 8.518613418793502e-06, + "loss": 0.5232, + "step": 1955 + }, + { + "epoch": 1.4208339954146143, + "grad_norm": 0.39452248506913185, + "learning_rate": 8.516916952249346e-06, + "loss": 0.5293, + "step": 1956 + }, + { + "epoch": 1.4215603931627812, + "grad_norm": 0.47356394532545604, + "learning_rate": 8.51521968400855e-06, + "loss": 0.5335, + "step": 1957 + }, + { + "epoch": 1.4222867909109482, + "grad_norm": 0.4721127066478117, + "learning_rate": 8.513521614458015e-06, + "loss": 0.5395, + "step": 1958 + }, + { + "epoch": 1.423013188659115, + "grad_norm": 0.37657901193003795, + "learning_rate": 8.511822743984824e-06, + "loss": 0.5396, + "step": 1959 + }, + { + "epoch": 1.4237395864072822, + "grad_norm": 0.39741495285863576, + "learning_rate": 8.51012307297624e-06, + "loss": 0.5153, + "step": 1960 + }, + { + "epoch": 1.4244659841554492, + "grad_norm": 0.41305756922410347, + "learning_rate": 8.508422601819713e-06, + "loss": 0.5241, + "step": 1961 + }, + { + "epoch": 1.425192381903616, + "grad_norm": 0.37996646400149187, + "learning_rate": 8.506721330902869e-06, + "loss": 0.5295, + "step": 1962 + }, + { + "epoch": 1.4259187796517832, + "grad_norm": 0.3852696944338915, + "learning_rate": 8.505019260613523e-06, + "loss": 0.5305, + "step": 1963 + }, + { + "epoch": 1.42664517739995, + "grad_norm": 0.3958821782540526, + "learning_rate": 8.503316391339668e-06, + "loss": 0.5323, + "step": 1964 + }, + { + "epoch": 1.427371575148117, + "grad_norm": 0.37831995894612974, + "learning_rate": 8.501612723469483e-06, + "loss": 0.5181, + "step": 1965 + }, + { + "epoch": 1.4280979728962842, + "grad_norm": 0.39926135943658303, + "learning_rate": 8.499908257391324e-06, + "loss": 0.5417, + "step": 1966 + }, + { + "epoch": 1.428824370644451, + "grad_norm": 0.358842675107982, + "learning_rate": 8.49820299349373e-06, + "loss": 0.5428, + "step": 1967 + }, + { + "epoch": 1.4295507683926179, + "grad_norm": 0.40179609610989964, + "learning_rate": 8.49649693216543e-06, + "loss": 0.5369, + "step": 1968 + }, + { + "epoch": 1.430277166140785, + "grad_norm": 0.3789589574251804, + "learning_rate": 8.494790073795323e-06, + "loss": 0.5423, + "step": 1969 + }, + { + "epoch": 1.431003563888952, + "grad_norm": 0.3590287073648434, + "learning_rate": 8.493082418772494e-06, + "loss": 0.5353, + "step": 1970 + }, + { + "epoch": 1.4317299616371189, + "grad_norm": 0.6134094833899469, + "learning_rate": 8.491373967486212e-06, + "loss": 0.5349, + "step": 1971 + }, + { + "epoch": 1.432456359385286, + "grad_norm": 0.36762452749849744, + "learning_rate": 8.489664720325928e-06, + "loss": 0.5275, + "step": 1972 + }, + { + "epoch": 1.4331827571334528, + "grad_norm": 0.3692895289988314, + "learning_rate": 8.487954677681269e-06, + "loss": 0.54, + "step": 1973 + }, + { + "epoch": 1.4339091548816199, + "grad_norm": 0.3763720711023518, + "learning_rate": 8.486243839942048e-06, + "loss": 0.5268, + "step": 1974 + }, + { + "epoch": 1.434635552629787, + "grad_norm": 0.3806714456242494, + "learning_rate": 8.48453220749826e-06, + 
"loss": 0.5287, + "step": 1975 + }, + { + "epoch": 1.4353619503779538, + "grad_norm": 0.35038099734210776, + "learning_rate": 8.482819780740076e-06, + "loss": 0.5479, + "step": 1976 + }, + { + "epoch": 1.4360883481261209, + "grad_norm": 0.4317281985340517, + "learning_rate": 8.481106560057852e-06, + "loss": 0.5417, + "step": 1977 + }, + { + "epoch": 1.4368147458742877, + "grad_norm": 0.3701376689135592, + "learning_rate": 8.479392545842126e-06, + "loss": 0.5301, + "step": 1978 + }, + { + "epoch": 1.4375411436224548, + "grad_norm": 0.3774269963014525, + "learning_rate": 8.477677738483614e-06, + "loss": 0.5379, + "step": 1979 + }, + { + "epoch": 1.4382675413706218, + "grad_norm": 0.3857892291442041, + "learning_rate": 8.475962138373212e-06, + "loss": 0.5456, + "step": 1980 + }, + { + "epoch": 1.4389939391187887, + "grad_norm": 0.3887872479758726, + "learning_rate": 8.474245745902002e-06, + "loss": 0.5261, + "step": 1981 + }, + { + "epoch": 1.4397203368669558, + "grad_norm": 0.3862173859493997, + "learning_rate": 8.47252856146124e-06, + "loss": 0.5383, + "step": 1982 + }, + { + "epoch": 1.4404467346151226, + "grad_norm": 0.4132140002809266, + "learning_rate": 8.470810585442367e-06, + "loss": 0.5508, + "step": 1983 + }, + { + "epoch": 1.4411731323632897, + "grad_norm": 0.44883189440473237, + "learning_rate": 8.469091818237004e-06, + "loss": 0.5507, + "step": 1984 + }, + { + "epoch": 1.4418995301114568, + "grad_norm": 0.9548703866806727, + "learning_rate": 8.467372260236951e-06, + "loss": 0.5275, + "step": 1985 + }, + { + "epoch": 1.4426259278596236, + "grad_norm": 0.3560892250131109, + "learning_rate": 8.465651911834187e-06, + "loss": 0.5245, + "step": 1986 + }, + { + "epoch": 1.4433523256077907, + "grad_norm": 0.3750927938822984, + "learning_rate": 8.463930773420874e-06, + "loss": 0.5278, + "step": 1987 + }, + { + "epoch": 1.4440787233559575, + "grad_norm": 0.3861372560323569, + "learning_rate": 8.462208845389356e-06, + "loss": 0.5357, + "step": 1988 + }, + { + "epoch": 1.4448051211041246, + "grad_norm": 0.4953428079781934, + "learning_rate": 8.460486128132151e-06, + "loss": 0.5322, + "step": 1989 + }, + { + "epoch": 1.4455315188522915, + "grad_norm": 0.7225665784307765, + "learning_rate": 8.458762622041959e-06, + "loss": 0.5328, + "step": 1990 + }, + { + "epoch": 1.4462579166004585, + "grad_norm": 0.3635088658724107, + "learning_rate": 8.457038327511663e-06, + "loss": 0.5302, + "step": 1991 + }, + { + "epoch": 1.4469843143486254, + "grad_norm": 0.3951491124497268, + "learning_rate": 8.455313244934324e-06, + "loss": 0.5407, + "step": 1992 + }, + { + "epoch": 1.4477107120967925, + "grad_norm": 0.6229750799649701, + "learning_rate": 8.45358737470318e-06, + "loss": 0.5444, + "step": 1993 + }, + { + "epoch": 1.4484371098449595, + "grad_norm": 0.4284237318323039, + "learning_rate": 8.451860717211653e-06, + "loss": 0.5441, + "step": 1994 + }, + { + "epoch": 1.4491635075931264, + "grad_norm": 0.5036938209369309, + "learning_rate": 8.450133272853338e-06, + "loss": 0.5438, + "step": 1995 + }, + { + "epoch": 1.4498899053412935, + "grad_norm": 0.5598656231285419, + "learning_rate": 8.448405042022018e-06, + "loss": 0.5328, + "step": 1996 + }, + { + "epoch": 1.4506163030894603, + "grad_norm": 0.4352745677117185, + "learning_rate": 8.44667602511165e-06, + "loss": 0.5353, + "step": 1997 + }, + { + "epoch": 1.4513427008376274, + "grad_norm": 0.3727822813407849, + "learning_rate": 8.444946222516369e-06, + "loss": 0.5382, + "step": 1998 + }, + { + "epoch": 1.4520690985857945, + "grad_norm": 0.3673506875698348, 
+ "learning_rate": 8.443215634630493e-06, + "loss": 0.5312, + "step": 1999 + }, + { + "epoch": 1.4527954963339613, + "grad_norm": 0.42066624613930265, + "learning_rate": 8.441484261848514e-06, + "loss": 0.5572, + "step": 2000 + }, + { + "epoch": 1.4535218940821284, + "grad_norm": 0.4167581664824996, + "learning_rate": 8.43975210456511e-06, + "loss": 0.5439, + "step": 2001 + }, + { + "epoch": 1.4542482918302952, + "grad_norm": 0.4384370655418264, + "learning_rate": 8.438019163175132e-06, + "loss": 0.5454, + "step": 2002 + }, + { + "epoch": 1.4549746895784623, + "grad_norm": 0.41078736364713037, + "learning_rate": 8.436285438073612e-06, + "loss": 0.5278, + "step": 2003 + }, + { + "epoch": 1.4557010873266294, + "grad_norm": 0.5068341665843389, + "learning_rate": 8.43455092965576e-06, + "loss": 0.532, + "step": 2004 + }, + { + "epoch": 1.4564274850747962, + "grad_norm": 0.3361925311738195, + "learning_rate": 8.432815638316964e-06, + "loss": 0.5221, + "step": 2005 + }, + { + "epoch": 1.4571538828229633, + "grad_norm": 0.3907476605116234, + "learning_rate": 8.431079564452794e-06, + "loss": 0.5259, + "step": 2006 + }, + { + "epoch": 1.4578802805711302, + "grad_norm": 0.39243927084717206, + "learning_rate": 8.429342708458991e-06, + "loss": 0.5211, + "step": 2007 + }, + { + "epoch": 1.4586066783192972, + "grad_norm": 0.4033731808083159, + "learning_rate": 8.427605070731482e-06, + "loss": 0.5342, + "step": 2008 + }, + { + "epoch": 1.4593330760674643, + "grad_norm": 0.38186916648202734, + "learning_rate": 8.42586665166637e-06, + "loss": 0.5202, + "step": 2009 + }, + { + "epoch": 1.4600594738156312, + "grad_norm": 0.3752026451214445, + "learning_rate": 8.424127451659933e-06, + "loss": 0.5299, + "step": 2010 + }, + { + "epoch": 1.460785871563798, + "grad_norm": 0.40256241040537144, + "learning_rate": 8.422387471108631e-06, + "loss": 0.536, + "step": 2011 + }, + { + "epoch": 1.461512269311965, + "grad_norm": 0.38801726970593625, + "learning_rate": 8.420646710409099e-06, + "loss": 0.5298, + "step": 2012 + }, + { + "epoch": 1.4622386670601322, + "grad_norm": 0.3491406745354709, + "learning_rate": 8.418905169958152e-06, + "loss": 0.5358, + "step": 2013 + }, + { + "epoch": 1.462965064808299, + "grad_norm": 0.3743679163458471, + "learning_rate": 8.41716285015278e-06, + "loss": 0.5434, + "step": 2014 + }, + { + "epoch": 1.463691462556466, + "grad_norm": 0.37363032051278366, + "learning_rate": 8.415419751390155e-06, + "loss": 0.5306, + "step": 2015 + }, + { + "epoch": 1.464417860304633, + "grad_norm": 0.3793987257982108, + "learning_rate": 8.413675874067622e-06, + "loss": 0.5419, + "step": 2016 + }, + { + "epoch": 1.4651442580528, + "grad_norm": 0.4060396681017493, + "learning_rate": 8.411931218582706e-06, + "loss": 0.5306, + "step": 2017 + }, + { + "epoch": 1.465870655800967, + "grad_norm": 0.7718495729878065, + "learning_rate": 8.410185785333111e-06, + "loss": 0.5528, + "step": 2018 + }, + { + "epoch": 1.466597053549134, + "grad_norm": 0.3809931135117274, + "learning_rate": 8.408439574716712e-06, + "loss": 0.5151, + "step": 2019 + }, + { + "epoch": 1.467323451297301, + "grad_norm": 0.3464947862227488, + "learning_rate": 8.406692587131569e-06, + "loss": 0.5268, + "step": 2020 + }, + { + "epoch": 1.4680498490454679, + "grad_norm": 0.37560537023157703, + "learning_rate": 8.404944822975914e-06, + "loss": 0.5582, + "step": 2021 + }, + { + "epoch": 1.468776246793635, + "grad_norm": 0.3527908030684601, + "learning_rate": 8.403196282648156e-06, + "loss": 0.5464, + "step": 2022 + }, + { + "epoch": 1.469502644541802, 
+ "grad_norm": 0.3744135464359761, + "learning_rate": 8.401446966546885e-06, + "loss": 0.5411, + "step": 2023 + }, + { + "epoch": 1.4702290422899689, + "grad_norm": 0.3898440069807425, + "learning_rate": 8.399696875070864e-06, + "loss": 0.5505, + "step": 2024 + }, + { + "epoch": 1.470955440038136, + "grad_norm": 0.44165838423074744, + "learning_rate": 8.397946008619035e-06, + "loss": 0.5151, + "step": 2025 + }, + { + "epoch": 1.4716818377863028, + "grad_norm": 0.4152275308276423, + "learning_rate": 8.396194367590515e-06, + "loss": 0.5274, + "step": 2026 + }, + { + "epoch": 1.4724082355344699, + "grad_norm": 0.3324160779039609, + "learning_rate": 8.394441952384597e-06, + "loss": 0.5406, + "step": 2027 + }, + { + "epoch": 1.473134633282637, + "grad_norm": 0.41106911809184316, + "learning_rate": 8.392688763400755e-06, + "loss": 0.532, + "step": 2028 + }, + { + "epoch": 1.4738610310308038, + "grad_norm": 0.3924212014605282, + "learning_rate": 8.390934801038632e-06, + "loss": 0.5233, + "step": 2029 + }, + { + "epoch": 1.4745874287789709, + "grad_norm": 0.414921189006647, + "learning_rate": 8.389180065698055e-06, + "loss": 0.5319, + "step": 2030 + }, + { + "epoch": 1.4753138265271377, + "grad_norm": 0.8311008021451181, + "learning_rate": 8.387424557779022e-06, + "loss": 0.551, + "step": 2031 + }, + { + "epoch": 1.4760402242753048, + "grad_norm": 0.361198159956959, + "learning_rate": 8.385668277681709e-06, + "loss": 0.5131, + "step": 2032 + }, + { + "epoch": 1.4767666220234719, + "grad_norm": 0.38745901241674185, + "learning_rate": 8.383911225806468e-06, + "loss": 0.5474, + "step": 2033 + }, + { + "epoch": 1.4774930197716387, + "grad_norm": 0.37053434565256693, + "learning_rate": 8.382153402553825e-06, + "loss": 0.5409, + "step": 2034 + }, + { + "epoch": 1.4782194175198056, + "grad_norm": 0.3666466711492958, + "learning_rate": 8.380394808324484e-06, + "loss": 0.5218, + "step": 2035 + }, + { + "epoch": 1.4789458152679726, + "grad_norm": 0.43161781550679396, + "learning_rate": 8.378635443519327e-06, + "loss": 0.5225, + "step": 2036 + }, + { + "epoch": 1.4796722130161397, + "grad_norm": 0.47557759375567127, + "learning_rate": 8.376875308539406e-06, + "loss": 0.5555, + "step": 2037 + }, + { + "epoch": 1.4803986107643066, + "grad_norm": 0.4422804846214722, + "learning_rate": 8.375114403785953e-06, + "loss": 0.5229, + "step": 2038 + }, + { + "epoch": 1.4811250085124736, + "grad_norm": 0.35316778327064985, + "learning_rate": 8.373352729660373e-06, + "loss": 0.532, + "step": 2039 + }, + { + "epoch": 1.4818514062606405, + "grad_norm": 0.43635553639514263, + "learning_rate": 8.371590286564247e-06, + "loss": 0.5267, + "step": 2040 + }, + { + "epoch": 1.4825778040088076, + "grad_norm": 0.3647953718925828, + "learning_rate": 8.369827074899333e-06, + "loss": 0.5172, + "step": 2041 + }, + { + "epoch": 1.4833042017569746, + "grad_norm": 0.43758131127371425, + "learning_rate": 8.368063095067564e-06, + "loss": 0.5299, + "step": 2042 + }, + { + "epoch": 1.4840305995051415, + "grad_norm": 0.3481965420729561, + "learning_rate": 8.366298347471043e-06, + "loss": 0.5529, + "step": 2043 + }, + { + "epoch": 1.4847569972533086, + "grad_norm": 0.3809271562611856, + "learning_rate": 8.364532832512055e-06, + "loss": 0.5298, + "step": 2044 + }, + { + "epoch": 1.4854833950014754, + "grad_norm": 0.39900438510168235, + "learning_rate": 8.362766550593055e-06, + "loss": 0.553, + "step": 2045 + }, + { + "epoch": 1.4862097927496425, + "grad_norm": 0.4024051219002026, + "learning_rate": 8.360999502116678e-06, + "loss": 0.5251, + 
"step": 2046 + }, + { + "epoch": 1.4869361904978096, + "grad_norm": 0.5664636582845303, + "learning_rate": 8.359231687485724e-06, + "loss": 0.5268, + "step": 2047 + }, + { + "epoch": 1.4876625882459764, + "grad_norm": 0.321380891619596, + "learning_rate": 8.357463107103182e-06, + "loss": 0.5262, + "step": 2048 + }, + { + "epoch": 1.4883889859941435, + "grad_norm": 0.3781206568900052, + "learning_rate": 8.355693761372203e-06, + "loss": 0.5265, + "step": 2049 + }, + { + "epoch": 1.4891153837423103, + "grad_norm": 0.4901907426957913, + "learning_rate": 8.353923650696119e-06, + "loss": 0.5352, + "step": 2050 + }, + { + "epoch": 1.4898417814904774, + "grad_norm": 0.4383754725203851, + "learning_rate": 8.352152775478431e-06, + "loss": 0.5351, + "step": 2051 + }, + { + "epoch": 1.4905681792386445, + "grad_norm": 0.3793071731555198, + "learning_rate": 8.350381136122823e-06, + "loss": 0.533, + "step": 2052 + }, + { + "epoch": 1.4912945769868113, + "grad_norm": 0.3936571281921193, + "learning_rate": 8.348608733033143e-06, + "loss": 0.5423, + "step": 2053 + }, + { + "epoch": 1.4920209747349782, + "grad_norm": 0.5586778199848161, + "learning_rate": 8.34683556661342e-06, + "loss": 0.542, + "step": 2054 + }, + { + "epoch": 1.4927473724831453, + "grad_norm": 0.4083828436352312, + "learning_rate": 8.345061637267858e-06, + "loss": 0.5437, + "step": 2055 + }, + { + "epoch": 1.4934737702313123, + "grad_norm": 0.39828639694777723, + "learning_rate": 8.343286945400827e-06, + "loss": 0.5404, + "step": 2056 + }, + { + "epoch": 1.4942001679794792, + "grad_norm": 0.4341716027611792, + "learning_rate": 8.341511491416877e-06, + "loss": 0.5419, + "step": 2057 + }, + { + "epoch": 1.4949265657276463, + "grad_norm": 0.3771833000560631, + "learning_rate": 8.339735275720735e-06, + "loss": 0.5272, + "step": 2058 + }, + { + "epoch": 1.4956529634758131, + "grad_norm": 0.4369176038701713, + "learning_rate": 8.337958298717293e-06, + "loss": 0.5344, + "step": 2059 + }, + { + "epoch": 1.4963793612239802, + "grad_norm": 0.3844043904597473, + "learning_rate": 8.336180560811619e-06, + "loss": 0.5323, + "step": 2060 + }, + { + "epoch": 1.4971057589721473, + "grad_norm": 0.3560037681982256, + "learning_rate": 8.334402062408962e-06, + "loss": 0.53, + "step": 2061 + }, + { + "epoch": 1.4978321567203141, + "grad_norm": 0.3683510079298262, + "learning_rate": 8.332622803914734e-06, + "loss": 0.5232, + "step": 2062 + }, + { + "epoch": 1.4985585544684812, + "grad_norm": 0.37159086861045465, + "learning_rate": 8.330842785734525e-06, + "loss": 0.5415, + "step": 2063 + }, + { + "epoch": 1.499284952216648, + "grad_norm": 0.6276047408660594, + "learning_rate": 8.3290620082741e-06, + "loss": 0.5279, + "step": 2064 + }, + { + "epoch": 1.500011349964815, + "grad_norm": 0.372171797078306, + "learning_rate": 8.327280471939392e-06, + "loss": 0.534, + "step": 2065 + }, + { + "epoch": 1.5007377477129822, + "grad_norm": 0.36192295787407436, + "learning_rate": 8.325498177136514e-06, + "loss": 0.5276, + "step": 2066 + }, + { + "epoch": 1.501464145461149, + "grad_norm": 0.4151167746324928, + "learning_rate": 8.323715124271745e-06, + "loss": 0.5465, + "step": 2067 + }, + { + "epoch": 1.5021905432093159, + "grad_norm": 0.5345148655537466, + "learning_rate": 8.32193131375154e-06, + "loss": 0.5248, + "step": 2068 + }, + { + "epoch": 1.502916940957483, + "grad_norm": 0.4010625713543608, + "learning_rate": 8.320146745982528e-06, + "loss": 0.5394, + "step": 2069 + }, + { + "epoch": 1.50364333870565, + "grad_norm": 0.4242426717100806, + "learning_rate": 
8.318361421371507e-06, + "loss": 0.5432, + "step": 2070 + }, + { + "epoch": 1.504369736453817, + "grad_norm": 0.3769937648385446, + "learning_rate": 8.31657534032545e-06, + "loss": 0.5404, + "step": 2071 + }, + { + "epoch": 1.505096134201984, + "grad_norm": 0.4002762206548443, + "learning_rate": 8.314788503251506e-06, + "loss": 0.5268, + "step": 2072 + }, + { + "epoch": 1.5058225319501508, + "grad_norm": 0.49513683689221805, + "learning_rate": 8.313000910556986e-06, + "loss": 0.5325, + "step": 2073 + }, + { + "epoch": 1.5065489296983179, + "grad_norm": 0.37293996421107484, + "learning_rate": 8.311212562649383e-06, + "loss": 0.5347, + "step": 2074 + }, + { + "epoch": 1.507275327446485, + "grad_norm": 0.3735325900469624, + "learning_rate": 8.30942345993636e-06, + "loss": 0.541, + "step": 2075 + }, + { + "epoch": 1.508001725194652, + "grad_norm": 0.513815208479049, + "learning_rate": 8.307633602825746e-06, + "loss": 0.5335, + "step": 2076 + }, + { + "epoch": 1.5087281229428189, + "grad_norm": 0.42651791780626036, + "learning_rate": 8.305842991725552e-06, + "loss": 0.5359, + "step": 2077 + }, + { + "epoch": 1.5094545206909857, + "grad_norm": 0.7149161389443535, + "learning_rate": 8.304051627043952e-06, + "loss": 0.5337, + "step": 2078 + }, + { + "epoch": 1.5101809184391528, + "grad_norm": 0.5309869942657326, + "learning_rate": 8.302259509189299e-06, + "loss": 0.5139, + "step": 2079 + }, + { + "epoch": 1.5109073161873199, + "grad_norm": 0.44461112251803003, + "learning_rate": 8.30046663857011e-06, + "loss": 0.5289, + "step": 2080 + }, + { + "epoch": 1.511633713935487, + "grad_norm": 0.3676877067349289, + "learning_rate": 8.298673015595083e-06, + "loss": 0.5325, + "step": 2081 + }, + { + "epoch": 1.5123601116836538, + "grad_norm": 0.5061828033162785, + "learning_rate": 8.296878640673077e-06, + "loss": 0.5153, + "step": 2082 + }, + { + "epoch": 1.5130865094318207, + "grad_norm": 0.414329324752058, + "learning_rate": 8.29508351421313e-06, + "loss": 0.5184, + "step": 2083 + }, + { + "epoch": 1.5138129071799877, + "grad_norm": 0.44121955910229965, + "learning_rate": 8.293287636624447e-06, + "loss": 0.5379, + "step": 2084 + }, + { + "epoch": 1.5145393049281548, + "grad_norm": 0.4154035008830126, + "learning_rate": 8.291491008316409e-06, + "loss": 0.5287, + "step": 2085 + }, + { + "epoch": 1.5152657026763217, + "grad_norm": 0.3225024595739773, + "learning_rate": 8.289693629698564e-06, + "loss": 0.5381, + "step": 2086 + }, + { + "epoch": 1.5159921004244887, + "grad_norm": 0.5334881633732634, + "learning_rate": 8.287895501180632e-06, + "loss": 0.5442, + "step": 2087 + }, + { + "epoch": 1.5167184981726556, + "grad_norm": 0.44290513871966686, + "learning_rate": 8.286096623172506e-06, + "loss": 0.5247, + "step": 2088 + }, + { + "epoch": 1.5174448959208227, + "grad_norm": 0.4355594276107615, + "learning_rate": 8.284296996084244e-06, + "loss": 0.5331, + "step": 2089 + }, + { + "epoch": 1.5181712936689897, + "grad_norm": 0.47966811639145174, + "learning_rate": 8.282496620326085e-06, + "loss": 0.5411, + "step": 2090 + }, + { + "epoch": 1.5188976914171566, + "grad_norm": 0.43302335492933264, + "learning_rate": 8.280695496308428e-06, + "loss": 0.5407, + "step": 2091 + }, + { + "epoch": 1.5196240891653234, + "grad_norm": 0.3857374366033102, + "learning_rate": 8.278893624441849e-06, + "loss": 0.5361, + "step": 2092 + }, + { + "epoch": 1.5203504869134905, + "grad_norm": 0.375465296561444, + "learning_rate": 8.27709100513709e-06, + "loss": 0.5375, + "step": 2093 + }, + { + "epoch": 1.5210768846616576, + "grad_norm": 
0.3682581338047892, + "learning_rate": 8.275287638805069e-06, + "loss": 0.5327, + "step": 2094 + }, + { + "epoch": 1.5218032824098247, + "grad_norm": 0.3535435043848988, + "learning_rate": 8.273483525856871e-06, + "loss": 0.5423, + "step": 2095 + }, + { + "epoch": 1.5225296801579915, + "grad_norm": 0.36136545072253023, + "learning_rate": 8.27167866670375e-06, + "loss": 0.5315, + "step": 2096 + }, + { + "epoch": 1.5232560779061584, + "grad_norm": 0.4133601831129233, + "learning_rate": 8.269873061757133e-06, + "loss": 0.5307, + "step": 2097 + }, + { + "epoch": 1.5239824756543254, + "grad_norm": 0.5370894632116522, + "learning_rate": 8.268066711428614e-06, + "loss": 0.543, + "step": 2098 + }, + { + "epoch": 1.5247088734024925, + "grad_norm": 0.3613559837813718, + "learning_rate": 8.266259616129959e-06, + "loss": 0.5283, + "step": 2099 + }, + { + "epoch": 1.5254352711506596, + "grad_norm": 0.35068415153350707, + "learning_rate": 8.264451776273104e-06, + "loss": 0.5246, + "step": 2100 + }, + { + "epoch": 1.5261616688988264, + "grad_norm": 0.4693794087880394, + "learning_rate": 8.262643192270152e-06, + "loss": 0.5271, + "step": 2101 + }, + { + "epoch": 1.5268880666469933, + "grad_norm": 0.36685476737449635, + "learning_rate": 8.26083386453338e-06, + "loss": 0.5402, + "step": 2102 + }, + { + "epoch": 1.5276144643951604, + "grad_norm": 0.4240709701742305, + "learning_rate": 8.259023793475233e-06, + "loss": 0.531, + "step": 2103 + }, + { + "epoch": 1.5283408621433274, + "grad_norm": 0.3992723948801737, + "learning_rate": 8.257212979508321e-06, + "loss": 0.5403, + "step": 2104 + }, + { + "epoch": 1.5290672598914945, + "grad_norm": 0.39217915177008994, + "learning_rate": 8.25540142304543e-06, + "loss": 0.5176, + "step": 2105 + }, + { + "epoch": 1.5297936576396614, + "grad_norm": 0.3691939997812454, + "learning_rate": 8.253589124499513e-06, + "loss": 0.5234, + "step": 2106 + }, + { + "epoch": 1.5305200553878282, + "grad_norm": 0.34426922828140977, + "learning_rate": 8.251776084283687e-06, + "loss": 0.5255, + "step": 2107 + }, + { + "epoch": 1.5312464531359953, + "grad_norm": 0.4228685867855801, + "learning_rate": 8.249962302811246e-06, + "loss": 0.5355, + "step": 2108 + }, + { + "epoch": 1.5319728508841624, + "grad_norm": 0.6702335615461068, + "learning_rate": 8.248147780495648e-06, + "loss": 0.53, + "step": 2109 + }, + { + "epoch": 1.5326992486323292, + "grad_norm": 0.3620865144375418, + "learning_rate": 8.246332517750524e-06, + "loss": 0.5256, + "step": 2110 + }, + { + "epoch": 1.533425646380496, + "grad_norm": 0.3508763292761052, + "learning_rate": 8.244516514989667e-06, + "loss": 0.5156, + "step": 2111 + }, + { + "epoch": 1.5341520441286631, + "grad_norm": 0.42627555830890596, + "learning_rate": 8.242699772627044e-06, + "loss": 0.5355, + "step": 2112 + }, + { + "epoch": 1.5348784418768302, + "grad_norm": 0.3684308599352157, + "learning_rate": 8.240882291076794e-06, + "loss": 0.5182, + "step": 2113 + }, + { + "epoch": 1.5356048396249973, + "grad_norm": 0.43590036510920216, + "learning_rate": 8.239064070753213e-06, + "loss": 0.5165, + "step": 2114 + }, + { + "epoch": 1.5363312373731641, + "grad_norm": 0.3774981089091263, + "learning_rate": 8.237245112070775e-06, + "loss": 0.5433, + "step": 2115 + }, + { + "epoch": 1.537057635121331, + "grad_norm": 0.5796489601067345, + "learning_rate": 8.235425415444123e-06, + "loss": 0.5255, + "step": 2116 + }, + { + "epoch": 1.537784032869498, + "grad_norm": 0.4040368368012006, + "learning_rate": 8.233604981288059e-06, + "loss": 0.5207, + "step": 2117 + }, + { + 
"epoch": 1.5385104306176651, + "grad_norm": 0.4050182472662472, + "learning_rate": 8.231783810017562e-06, + "loss": 0.5281, + "step": 2118 + }, + { + "epoch": 1.5392368283658322, + "grad_norm": 0.43834797443504225, + "learning_rate": 8.229961902047777e-06, + "loss": 0.5269, + "step": 2119 + }, + { + "epoch": 1.539963226113999, + "grad_norm": 0.3867432482382013, + "learning_rate": 8.228139257794012e-06, + "loss": 0.5305, + "step": 2120 + }, + { + "epoch": 1.540689623862166, + "grad_norm": 0.3334578775651982, + "learning_rate": 8.22631587767175e-06, + "loss": 0.53, + "step": 2121 + }, + { + "epoch": 1.541416021610333, + "grad_norm": 0.3670398743223365, + "learning_rate": 8.22449176209664e-06, + "loss": 0.5321, + "step": 2122 + }, + { + "epoch": 1.5421424193585, + "grad_norm": 0.4172991422978223, + "learning_rate": 8.222666911484494e-06, + "loss": 0.5422, + "step": 2123 + }, + { + "epoch": 1.5428688171066671, + "grad_norm": 0.394926083100875, + "learning_rate": 8.220841326251297e-06, + "loss": 0.5323, + "step": 2124 + }, + { + "epoch": 1.543595214854834, + "grad_norm": 0.3701822483507409, + "learning_rate": 8.219015006813197e-06, + "loss": 0.5333, + "step": 2125 + }, + { + "epoch": 1.5443216126030008, + "grad_norm": 0.3830765074130067, + "learning_rate": 8.217187953586512e-06, + "loss": 0.533, + "step": 2126 + }, + { + "epoch": 1.545048010351168, + "grad_norm": 0.3495916719722912, + "learning_rate": 8.215360166987728e-06, + "loss": 0.529, + "step": 2127 + }, + { + "epoch": 1.545774408099335, + "grad_norm": 0.38420382108991424, + "learning_rate": 8.213531647433494e-06, + "loss": 0.5231, + "step": 2128 + }, + { + "epoch": 1.5465008058475018, + "grad_norm": 0.3664634654071994, + "learning_rate": 8.211702395340633e-06, + "loss": 0.5232, + "step": 2129 + }, + { + "epoch": 1.547227203595669, + "grad_norm": 0.39373304275204196, + "learning_rate": 8.209872411126127e-06, + "loss": 0.5404, + "step": 2130 + }, + { + "epoch": 1.5479536013438358, + "grad_norm": 0.5226087869025603, + "learning_rate": 8.208041695207134e-06, + "loss": 0.5271, + "step": 2131 + }, + { + "epoch": 1.5486799990920028, + "grad_norm": 0.37897080800573535, + "learning_rate": 8.20621024800097e-06, + "loss": 0.5257, + "step": 2132 + }, + { + "epoch": 1.54940639684017, + "grad_norm": 0.3607192702673823, + "learning_rate": 8.204378069925121e-06, + "loss": 0.5278, + "step": 2133 + }, + { + "epoch": 1.5501327945883367, + "grad_norm": 0.4491567190506299, + "learning_rate": 8.202545161397242e-06, + "loss": 0.5212, + "step": 2134 + }, + { + "epoch": 1.5508591923365036, + "grad_norm": 0.4648783169043336, + "learning_rate": 8.200711522835153e-06, + "loss": 0.5255, + "step": 2135 + }, + { + "epoch": 1.5515855900846707, + "grad_norm": 0.40825688941196847, + "learning_rate": 8.198877154656838e-06, + "loss": 0.5187, + "step": 2136 + }, + { + "epoch": 1.5523119878328377, + "grad_norm": 0.5055858786813612, + "learning_rate": 8.19704205728045e-06, + "loss": 0.5214, + "step": 2137 + }, + { + "epoch": 1.5530383855810048, + "grad_norm": 0.452686470398235, + "learning_rate": 8.195206231124309e-06, + "loss": 0.5272, + "step": 2138 + }, + { + "epoch": 1.5537647833291717, + "grad_norm": 0.37942785376928945, + "learning_rate": 8.193369676606896e-06, + "loss": 0.5348, + "step": 2139 + }, + { + "epoch": 1.5544911810773385, + "grad_norm": 0.4065149027015626, + "learning_rate": 8.191532394146865e-06, + "loss": 0.5292, + "step": 2140 + }, + { + "epoch": 1.5552175788255056, + "grad_norm": 0.5083598648566944, + "learning_rate": 8.189694384163032e-06, + "loss": 
0.5176, + "step": 2141 + }, + { + "epoch": 1.5559439765736727, + "grad_norm": 0.3722742765668192, + "learning_rate": 8.187855647074376e-06, + "loss": 0.5262, + "step": 2142 + }, + { + "epoch": 1.5566703743218397, + "grad_norm": 0.40226452791696343, + "learning_rate": 8.186016183300052e-06, + "loss": 0.5231, + "step": 2143 + }, + { + "epoch": 1.5573967720700066, + "grad_norm": 0.43419760435625404, + "learning_rate": 8.184175993259367e-06, + "loss": 0.5258, + "step": 2144 + }, + { + "epoch": 1.5581231698181734, + "grad_norm": 0.4013434470364981, + "learning_rate": 8.182335077371803e-06, + "loss": 0.5259, + "step": 2145 + }, + { + "epoch": 1.5588495675663405, + "grad_norm": 0.5035945890529423, + "learning_rate": 8.180493436057008e-06, + "loss": 0.5227, + "step": 2146 + }, + { + "epoch": 1.5595759653145076, + "grad_norm": 0.37150966714124883, + "learning_rate": 8.178651069734787e-06, + "loss": 0.5336, + "step": 2147 + }, + { + "epoch": 1.5603023630626747, + "grad_norm": 0.4429586606794134, + "learning_rate": 8.17680797882512e-06, + "loss": 0.5453, + "step": 2148 + }, + { + "epoch": 1.5610287608108415, + "grad_norm": 0.37270998378690173, + "learning_rate": 8.174964163748142e-06, + "loss": 0.5269, + "step": 2149 + }, + { + "epoch": 1.5617551585590084, + "grad_norm": 0.4122813391997847, + "learning_rate": 8.173119624924164e-06, + "loss": 0.5204, + "step": 2150 + }, + { + "epoch": 1.5624815563071754, + "grad_norm": 0.37256623087813406, + "learning_rate": 8.171274362773657e-06, + "loss": 0.5284, + "step": 2151 + }, + { + "epoch": 1.5632079540553425, + "grad_norm": 0.366519633839237, + "learning_rate": 8.169428377717253e-06, + "loss": 0.5292, + "step": 2152 + }, + { + "epoch": 1.5639343518035094, + "grad_norm": 0.39571030872429297, + "learning_rate": 8.167581670175752e-06, + "loss": 0.5244, + "step": 2153 + }, + { + "epoch": 1.5646607495516764, + "grad_norm": 0.49398026531249895, + "learning_rate": 8.165734240570124e-06, + "loss": 0.5162, + "step": 2154 + }, + { + "epoch": 1.5653871472998433, + "grad_norm": 0.3702564028006646, + "learning_rate": 8.163886089321493e-06, + "loss": 0.5471, + "step": 2155 + }, + { + "epoch": 1.5661135450480104, + "grad_norm": 0.3958082047337379, + "learning_rate": 8.162037216851158e-06, + "loss": 0.5361, + "step": 2156 + }, + { + "epoch": 1.5668399427961774, + "grad_norm": 0.38027394888118066, + "learning_rate": 8.160187623580575e-06, + "loss": 0.5101, + "step": 2157 + }, + { + "epoch": 1.5675663405443443, + "grad_norm": 0.6188493033488481, + "learning_rate": 8.158337309931365e-06, + "loss": 0.5206, + "step": 2158 + }, + { + "epoch": 1.5682927382925111, + "grad_norm": 0.485014339706956, + "learning_rate": 8.15648627632532e-06, + "loss": 0.522, + "step": 2159 + }, + { + "epoch": 1.5690191360406782, + "grad_norm": 0.3372921872528581, + "learning_rate": 8.154634523184389e-06, + "loss": 0.5204, + "step": 2160 + }, + { + "epoch": 1.5697455337888453, + "grad_norm": 0.3775246242830134, + "learning_rate": 8.152782050930685e-06, + "loss": 0.5139, + "step": 2161 + }, + { + "epoch": 1.5704719315370124, + "grad_norm": 0.3441463438997507, + "learning_rate": 8.150928859986488e-06, + "loss": 0.5341, + "step": 2162 + }, + { + "epoch": 1.5711983292851792, + "grad_norm": 0.3958596216052891, + "learning_rate": 8.149074950774244e-06, + "loss": 0.5325, + "step": 2163 + }, + { + "epoch": 1.571924727033346, + "grad_norm": 0.3555124159077814, + "learning_rate": 8.14722032371656e-06, + "loss": 0.531, + "step": 2164 + }, + { + "epoch": 1.5726511247815131, + "grad_norm": 0.4197910562498298, + 
"learning_rate": 8.145364979236201e-06, + "loss": 0.5183, + "step": 2165 + }, + { + "epoch": 1.5733775225296802, + "grad_norm": 0.38377856071557065, + "learning_rate": 8.143508917756105e-06, + "loss": 0.539, + "step": 2166 + }, + { + "epoch": 1.5741039202778473, + "grad_norm": 0.3498504206619673, + "learning_rate": 8.14165213969937e-06, + "loss": 0.5318, + "step": 2167 + }, + { + "epoch": 1.5748303180260141, + "grad_norm": 0.35939784890857573, + "learning_rate": 8.139794645489252e-06, + "loss": 0.5244, + "step": 2168 + }, + { + "epoch": 1.575556715774181, + "grad_norm": 0.5191106846293712, + "learning_rate": 8.13793643554918e-06, + "loss": 0.5236, + "step": 2169 + }, + { + "epoch": 1.576283113522348, + "grad_norm": 0.3754443761805671, + "learning_rate": 8.13607751030274e-06, + "loss": 0.5218, + "step": 2170 + }, + { + "epoch": 1.5770095112705151, + "grad_norm": 0.3765275515814297, + "learning_rate": 8.13421787017368e-06, + "loss": 0.525, + "step": 2171 + }, + { + "epoch": 1.577735909018682, + "grad_norm": 0.3619776638617869, + "learning_rate": 8.132357515585913e-06, + "loss": 0.5387, + "step": 2172 + }, + { + "epoch": 1.578462306766849, + "grad_norm": 0.43674111006208605, + "learning_rate": 8.130496446963518e-06, + "loss": 0.528, + "step": 2173 + }, + { + "epoch": 1.579188704515016, + "grad_norm": 0.45352096537017633, + "learning_rate": 8.128634664730734e-06, + "loss": 0.5223, + "step": 2174 + }, + { + "epoch": 1.579915102263183, + "grad_norm": 0.3638288881531629, + "learning_rate": 8.126772169311959e-06, + "loss": 0.5272, + "step": 2175 + }, + { + "epoch": 1.58064150001135, + "grad_norm": 0.3678937439443033, + "learning_rate": 8.124908961131759e-06, + "loss": 0.5327, + "step": 2176 + }, + { + "epoch": 1.581367897759517, + "grad_norm": 0.42819149568379095, + "learning_rate": 8.123045040614859e-06, + "loss": 0.5273, + "step": 2177 + }, + { + "epoch": 1.5820942955076838, + "grad_norm": 0.451401867343068, + "learning_rate": 8.121180408186151e-06, + "loss": 0.52, + "step": 2178 + }, + { + "epoch": 1.5828206932558508, + "grad_norm": 0.35404518347187564, + "learning_rate": 8.119315064270683e-06, + "loss": 0.5418, + "step": 2179 + }, + { + "epoch": 1.583547091004018, + "grad_norm": 0.3874862248814074, + "learning_rate": 8.117449009293668e-06, + "loss": 0.515, + "step": 2180 + }, + { + "epoch": 1.584273488752185, + "grad_norm": 0.35996595384110947, + "learning_rate": 8.115582243680484e-06, + "loss": 0.5231, + "step": 2181 + }, + { + "epoch": 1.5849998865003518, + "grad_norm": 0.49032221582780916, + "learning_rate": 8.113714767856668e-06, + "loss": 0.533, + "step": 2182 + }, + { + "epoch": 1.5857262842485187, + "grad_norm": 0.3789203895555697, + "learning_rate": 8.111846582247917e-06, + "loss": 0.5204, + "step": 2183 + }, + { + "epoch": 1.5864526819966858, + "grad_norm": 0.44070282653119874, + "learning_rate": 8.109977687280095e-06, + "loss": 0.5182, + "step": 2184 + }, + { + "epoch": 1.5871790797448528, + "grad_norm": 0.3734368116904144, + "learning_rate": 8.108108083379224e-06, + "loss": 0.5239, + "step": 2185 + }, + { + "epoch": 1.58790547749302, + "grad_norm": 0.7794357692303779, + "learning_rate": 8.106237770971486e-06, + "loss": 0.5348, + "step": 2186 + }, + { + "epoch": 1.5886318752411868, + "grad_norm": 0.40048407359634847, + "learning_rate": 8.10436675048323e-06, + "loss": 0.5329, + "step": 2187 + }, + { + "epoch": 1.5893582729893536, + "grad_norm": 0.41996079281176507, + "learning_rate": 8.102495022340962e-06, + "loss": 0.5348, + "step": 2188 + }, + { + "epoch": 1.5900846707375207, + 
"grad_norm": 0.4141606229227644, + "learning_rate": 8.100622586971349e-06, + "loss": 0.5231, + "step": 2189 + }, + { + "epoch": 1.5908110684856878, + "grad_norm": 0.39613241249403197, + "learning_rate": 8.098749444801226e-06, + "loss": 0.5154, + "step": 2190 + }, + { + "epoch": 1.5915374662338548, + "grad_norm": 0.40699509976783327, + "learning_rate": 8.096875596257578e-06, + "loss": 0.5361, + "step": 2191 + }, + { + "epoch": 1.5922638639820217, + "grad_norm": 0.39720514745810015, + "learning_rate": 8.095001041767561e-06, + "loss": 0.5384, + "step": 2192 + }, + { + "epoch": 1.5929902617301885, + "grad_norm": 0.379944597113303, + "learning_rate": 8.093125781758485e-06, + "loss": 0.5191, + "step": 2193 + }, + { + "epoch": 1.5937166594783556, + "grad_norm": 0.40124115305103786, + "learning_rate": 8.091249816657826e-06, + "loss": 0.5319, + "step": 2194 + }, + { + "epoch": 1.5944430572265227, + "grad_norm": 0.36199001054735935, + "learning_rate": 8.089373146893216e-06, + "loss": 0.53, + "step": 2195 + }, + { + "epoch": 1.5951694549746895, + "grad_norm": 0.3839271041750099, + "learning_rate": 8.087495772892455e-06, + "loss": 0.5209, + "step": 2196 + }, + { + "epoch": 1.5958958527228566, + "grad_norm": 0.3633683466203577, + "learning_rate": 8.085617695083493e-06, + "loss": 0.5127, + "step": 2197 + }, + { + "epoch": 1.5966222504710235, + "grad_norm": 0.4480574212500137, + "learning_rate": 8.08373891389445e-06, + "loss": 0.5171, + "step": 2198 + }, + { + "epoch": 1.5973486482191905, + "grad_norm": 0.4441680664452495, + "learning_rate": 8.0818594297536e-06, + "loss": 0.5235, + "step": 2199 + }, + { + "epoch": 1.5980750459673576, + "grad_norm": 0.3848708364131823, + "learning_rate": 8.07997924308938e-06, + "loss": 0.5272, + "step": 2200 + }, + { + "epoch": 1.5988014437155245, + "grad_norm": 0.46643645595697314, + "learning_rate": 8.078098354330386e-06, + "loss": 0.5416, + "step": 2201 + }, + { + "epoch": 1.5995278414636913, + "grad_norm": 0.42482180204664727, + "learning_rate": 8.076216763905379e-06, + "loss": 0.5199, + "step": 2202 + }, + { + "epoch": 1.6002542392118584, + "grad_norm": 0.38351767411017756, + "learning_rate": 8.074334472243273e-06, + "loss": 0.5385, + "step": 2203 + }, + { + "epoch": 1.6009806369600255, + "grad_norm": 0.44335436503062614, + "learning_rate": 8.072451479773143e-06, + "loss": 0.5221, + "step": 2204 + }, + { + "epoch": 1.6017070347081925, + "grad_norm": 0.33046482661721827, + "learning_rate": 8.070567786924228e-06, + "loss": 0.5243, + "step": 2205 + }, + { + "epoch": 1.6024334324563594, + "grad_norm": 0.36748965271151585, + "learning_rate": 8.068683394125923e-06, + "loss": 0.531, + "step": 2206 + }, + { + "epoch": 1.6031598302045262, + "grad_norm": 0.3833859585923725, + "learning_rate": 8.066798301807782e-06, + "loss": 0.5254, + "step": 2207 + }, + { + "epoch": 1.6038862279526933, + "grad_norm": 0.3393603194185105, + "learning_rate": 8.064912510399524e-06, + "loss": 0.522, + "step": 2208 + }, + { + "epoch": 1.6046126257008604, + "grad_norm": 0.41474782851520825, + "learning_rate": 8.06302602033102e-06, + "loss": 0.5344, + "step": 2209 + }, + { + "epoch": 1.6053390234490275, + "grad_norm": 0.4576990738112103, + "learning_rate": 8.061138832032304e-06, + "loss": 0.5024, + "step": 2210 + }, + { + "epoch": 1.6060654211971943, + "grad_norm": 0.349589634316685, + "learning_rate": 8.05925094593357e-06, + "loss": 0.5231, + "step": 2211 + }, + { + "epoch": 1.6067918189453612, + "grad_norm": 0.3592881325763187, + "learning_rate": 8.05736236246517e-06, + "loss": 0.5243, + "step": 
2212 + }, + { + "epoch": 1.6075182166935282, + "grad_norm": 0.342386535794566, + "learning_rate": 8.055473082057615e-06, + "loss": 0.5243, + "step": 2213 + }, + { + "epoch": 1.6082446144416953, + "grad_norm": 0.34698494413806036, + "learning_rate": 8.053583105141573e-06, + "loss": 0.5267, + "step": 2214 + }, + { + "epoch": 1.6089710121898622, + "grad_norm": 0.3444169193095043, + "learning_rate": 8.051692432147876e-06, + "loss": 0.5124, + "step": 2215 + }, + { + "epoch": 1.6096974099380292, + "grad_norm": 0.3523998685772173, + "learning_rate": 8.049801063507505e-06, + "loss": 0.5279, + "step": 2216 + }, + { + "epoch": 1.610423807686196, + "grad_norm": 0.3600581649454914, + "learning_rate": 8.047908999651613e-06, + "loss": 0.5243, + "step": 2217 + }, + { + "epoch": 1.6111502054343632, + "grad_norm": 0.3770696606265363, + "learning_rate": 8.0460162410115e-06, + "loss": 0.5323, + "step": 2218 + }, + { + "epoch": 1.6118766031825302, + "grad_norm": 0.3627755782168038, + "learning_rate": 8.04412278801863e-06, + "loss": 0.5322, + "step": 2219 + }, + { + "epoch": 1.612603000930697, + "grad_norm": 0.39535887557139776, + "learning_rate": 8.042228641104622e-06, + "loss": 0.5211, + "step": 2220 + }, + { + "epoch": 1.613329398678864, + "grad_norm": 0.5402785851323881, + "learning_rate": 8.040333800701258e-06, + "loss": 0.533, + "step": 2221 + }, + { + "epoch": 1.614055796427031, + "grad_norm": 0.3958450342645416, + "learning_rate": 8.038438267240473e-06, + "loss": 0.5338, + "step": 2222 + }, + { + "epoch": 1.614782194175198, + "grad_norm": 0.3819125676640373, + "learning_rate": 8.036542041154363e-06, + "loss": 0.5186, + "step": 2223 + }, + { + "epoch": 1.6155085919233652, + "grad_norm": 0.5474682629830405, + "learning_rate": 8.034645122875183e-06, + "loss": 0.5155, + "step": 2224 + }, + { + "epoch": 1.616234989671532, + "grad_norm": 0.36761066631477124, + "learning_rate": 8.032747512835338e-06, + "loss": 0.5128, + "step": 2225 + }, + { + "epoch": 1.6169613874196989, + "grad_norm": 0.3553974845209146, + "learning_rate": 8.030849211467401e-06, + "loss": 0.5339, + "step": 2226 + }, + { + "epoch": 1.617687785167866, + "grad_norm": 0.4054275992193891, + "learning_rate": 8.0289502192041e-06, + "loss": 0.5192, + "step": 2227 + }, + { + "epoch": 1.618414182916033, + "grad_norm": 0.5004251882280716, + "learning_rate": 8.027050536478315e-06, + "loss": 0.5182, + "step": 2228 + }, + { + "epoch": 1.6191405806642, + "grad_norm": 0.505586412419713, + "learning_rate": 8.025150163723087e-06, + "loss": 0.5328, + "step": 2229 + }, + { + "epoch": 1.619866978412367, + "grad_norm": 0.33396295149729055, + "learning_rate": 8.023249101371616e-06, + "loss": 0.5086, + "step": 2230 + }, + { + "epoch": 1.6205933761605338, + "grad_norm": 0.3865239183606997, + "learning_rate": 8.021347349857258e-06, + "loss": 0.5302, + "step": 2231 + }, + { + "epoch": 1.6213197739087009, + "grad_norm": 0.5417414484743928, + "learning_rate": 8.019444909613524e-06, + "loss": 0.5416, + "step": 2232 + }, + { + "epoch": 1.622046171656868, + "grad_norm": 0.381620451485267, + "learning_rate": 8.017541781074083e-06, + "loss": 0.5274, + "step": 2233 + }, + { + "epoch": 1.622772569405035, + "grad_norm": 0.4633290472028216, + "learning_rate": 8.015637964672764e-06, + "loss": 0.524, + "step": 2234 + }, + { + "epoch": 1.6234989671532019, + "grad_norm": 0.3435908519465226, + "learning_rate": 8.013733460843546e-06, + "loss": 0.5147, + "step": 2235 + }, + { + "epoch": 1.6242253649013687, + "grad_norm": 0.3724847066613284, + "learning_rate": 8.011828270020575e-06, + 
"loss": 0.5137, + "step": 2236 + }, + { + "epoch": 1.6249517626495358, + "grad_norm": 0.41822288543551656, + "learning_rate": 8.009922392638143e-06, + "loss": 0.5292, + "step": 2237 + }, + { + "epoch": 1.6256781603977029, + "grad_norm": 0.36389885402184613, + "learning_rate": 8.008015829130704e-06, + "loss": 0.5559, + "step": 2238 + }, + { + "epoch": 1.6264045581458697, + "grad_norm": 0.49820298992503304, + "learning_rate": 8.006108579932869e-06, + "loss": 0.5115, + "step": 2239 + }, + { + "epoch": 1.6271309558940368, + "grad_norm": 0.3858200969364259, + "learning_rate": 8.004200645479403e-06, + "loss": 0.5297, + "step": 2240 + }, + { + "epoch": 1.6278573536422036, + "grad_norm": 0.36454219334100507, + "learning_rate": 8.002292026205229e-06, + "loss": 0.5304, + "step": 2241 + }, + { + "epoch": 1.6285837513903707, + "grad_norm": 0.4297533946554223, + "learning_rate": 8.000382722545423e-06, + "loss": 0.5203, + "step": 2242 + }, + { + "epoch": 1.6293101491385378, + "grad_norm": 0.4333838989231198, + "learning_rate": 7.99847273493522e-06, + "loss": 0.5165, + "step": 2243 + }, + { + "epoch": 1.6300365468867046, + "grad_norm": 0.3353522195529111, + "learning_rate": 7.996562063810009e-06, + "loss": 0.521, + "step": 2244 + }, + { + "epoch": 1.6307629446348715, + "grad_norm": 0.3506370164538408, + "learning_rate": 7.994650709605338e-06, + "loss": 0.5357, + "step": 2245 + }, + { + "epoch": 1.6314893423830386, + "grad_norm": 0.3793671010753837, + "learning_rate": 7.992738672756909e-06, + "loss": 0.5138, + "step": 2246 + }, + { + "epoch": 1.6322157401312056, + "grad_norm": 0.5060055389925995, + "learning_rate": 7.990825953700577e-06, + "loss": 0.5169, + "step": 2247 + }, + { + "epoch": 1.6329421378793727, + "grad_norm": 0.3501754362435583, + "learning_rate": 7.988912552872354e-06, + "loss": 0.5225, + "step": 2248 + }, + { + "epoch": 1.6336685356275396, + "grad_norm": 0.42524904353834486, + "learning_rate": 7.98699847070841e-06, + "loss": 0.5216, + "step": 2249 + }, + { + "epoch": 1.6343949333757064, + "grad_norm": 0.38999245002865135, + "learning_rate": 7.985083707645071e-06, + "loss": 0.5222, + "step": 2250 + }, + { + "epoch": 1.6351213311238735, + "grad_norm": 0.4078484544716897, + "learning_rate": 7.983168264118811e-06, + "loss": 0.5276, + "step": 2251 + }, + { + "epoch": 1.6358477288720406, + "grad_norm": 0.36978836915791335, + "learning_rate": 7.981252140566264e-06, + "loss": 0.5311, + "step": 2252 + }, + { + "epoch": 1.6365741266202076, + "grad_norm": 0.36068856400219423, + "learning_rate": 7.979335337424222e-06, + "loss": 0.5069, + "step": 2253 + }, + { + "epoch": 1.6373005243683745, + "grad_norm": 0.4777656356045701, + "learning_rate": 7.977417855129626e-06, + "loss": 0.5218, + "step": 2254 + }, + { + "epoch": 1.6380269221165413, + "grad_norm": 0.40092908368657715, + "learning_rate": 7.975499694119576e-06, + "loss": 0.5226, + "step": 2255 + }, + { + "epoch": 1.6387533198647084, + "grad_norm": 0.4112514216593581, + "learning_rate": 7.973580854831323e-06, + "loss": 0.5216, + "step": 2256 + }, + { + "epoch": 1.6394797176128755, + "grad_norm": 0.557912888011925, + "learning_rate": 7.971661337702278e-06, + "loss": 0.5367, + "step": 2257 + }, + { + "epoch": 1.6402061153610423, + "grad_norm": 0.36306247064629654, + "learning_rate": 7.969741143170003e-06, + "loss": 0.5131, + "step": 2258 + }, + { + "epoch": 1.6409325131092094, + "grad_norm": 0.4098049589502914, + "learning_rate": 7.967820271672211e-06, + "loss": 0.5294, + "step": 2259 + }, + { + "epoch": 1.6416589108573763, + "grad_norm": 
0.42971189819772837, + "learning_rate": 7.965898723646777e-06, + "loss": 0.5162, + "step": 2260 + }, + { + "epoch": 1.6423853086055433, + "grad_norm": 0.43974465607779384, + "learning_rate": 7.963976499531724e-06, + "loss": 0.5198, + "step": 2261 + }, + { + "epoch": 1.6431117063537104, + "grad_norm": 0.3782390833088789, + "learning_rate": 7.962053599765234e-06, + "loss": 0.5199, + "step": 2262 + }, + { + "epoch": 1.6438381041018773, + "grad_norm": 0.3642711346135813, + "learning_rate": 7.960130024785636e-06, + "loss": 0.5045, + "step": 2263 + }, + { + "epoch": 1.644564501850044, + "grad_norm": 0.4462608703081043, + "learning_rate": 7.958205775031423e-06, + "loss": 0.5165, + "step": 2264 + }, + { + "epoch": 1.6452908995982112, + "grad_norm": 0.3937762431924642, + "learning_rate": 7.95628085094123e-06, + "loss": 0.5187, + "step": 2265 + }, + { + "epoch": 1.6460172973463782, + "grad_norm": 0.3763979930994422, + "learning_rate": 7.954355252953859e-06, + "loss": 0.5141, + "step": 2266 + }, + { + "epoch": 1.6467436950945453, + "grad_norm": 0.7372133545800805, + "learning_rate": 7.952428981508254e-06, + "loss": 0.532, + "step": 2267 + }, + { + "epoch": 1.6474700928427122, + "grad_norm": 0.3950337311935571, + "learning_rate": 7.950502037043515e-06, + "loss": 0.5261, + "step": 2268 + }, + { + "epoch": 1.648196490590879, + "grad_norm": 0.48949465520274943, + "learning_rate": 7.948574419998903e-06, + "loss": 0.5136, + "step": 2269 + }, + { + "epoch": 1.648922888339046, + "grad_norm": 0.4338054113514785, + "learning_rate": 7.946646130813822e-06, + "loss": 0.5222, + "step": 2270 + }, + { + "epoch": 1.6496492860872132, + "grad_norm": 0.4816646656555247, + "learning_rate": 7.944717169927838e-06, + "loss": 0.5307, + "step": 2271 + }, + { + "epoch": 1.6503756838353802, + "grad_norm": 0.44931497631662126, + "learning_rate": 7.942787537780663e-06, + "loss": 0.518, + "step": 2272 + }, + { + "epoch": 1.651102081583547, + "grad_norm": 0.5414627251303079, + "learning_rate": 7.940857234812164e-06, + "loss": 0.5002, + "step": 2273 + }, + { + "epoch": 1.651828479331714, + "grad_norm": 0.39261828520787145, + "learning_rate": 7.938926261462366e-06, + "loss": 0.5141, + "step": 2274 + }, + { + "epoch": 1.652554877079881, + "grad_norm": 0.4903351786500484, + "learning_rate": 7.93699461817144e-06, + "loss": 0.5296, + "step": 2275 + }, + { + "epoch": 1.653281274828048, + "grad_norm": 0.47308665326073535, + "learning_rate": 7.935062305379715e-06, + "loss": 0.5141, + "step": 2276 + }, + { + "epoch": 1.6540076725762152, + "grad_norm": 0.4171785888876544, + "learning_rate": 7.933129323527668e-06, + "loss": 0.5163, + "step": 2277 + }, + { + "epoch": 1.654734070324382, + "grad_norm": 0.3629005440782251, + "learning_rate": 7.931195673055932e-06, + "loss": 0.5203, + "step": 2278 + }, + { + "epoch": 1.6554604680725489, + "grad_norm": 0.45630944853496325, + "learning_rate": 7.929261354405289e-06, + "loss": 0.5286, + "step": 2279 + }, + { + "epoch": 1.656186865820716, + "grad_norm": 0.39043909477075656, + "learning_rate": 7.927326368016677e-06, + "loss": 0.5315, + "step": 2280 + }, + { + "epoch": 1.656913263568883, + "grad_norm": 0.45459187248014943, + "learning_rate": 7.925390714331185e-06, + "loss": 0.5211, + "step": 2281 + }, + { + "epoch": 1.6576396613170499, + "grad_norm": 0.4232053463313131, + "learning_rate": 7.923454393790053e-06, + "loss": 0.5393, + "step": 2282 + }, + { + "epoch": 1.658366059065217, + "grad_norm": 0.44803108722347296, + "learning_rate": 7.921517406834675e-06, + "loss": 0.528, + "step": 2283 + }, + { + 
"epoch": 1.6590924568133838, + "grad_norm": 0.514049972418491, + "learning_rate": 7.919579753906595e-06, + "loss": 0.5276, + "step": 2284 + }, + { + "epoch": 1.6598188545615509, + "grad_norm": 0.36688233278611193, + "learning_rate": 7.917641435447508e-06, + "loss": 0.5106, + "step": 2285 + }, + { + "epoch": 1.660545252309718, + "grad_norm": 0.37359403943964403, + "learning_rate": 7.915702451899264e-06, + "loss": 0.5164, + "step": 2286 + }, + { + "epoch": 1.6612716500578848, + "grad_norm": 0.41369210429929865, + "learning_rate": 7.913762803703861e-06, + "loss": 0.521, + "step": 2287 + }, + { + "epoch": 1.6619980478060516, + "grad_norm": 0.4009306112880066, + "learning_rate": 7.911822491303453e-06, + "loss": 0.5239, + "step": 2288 + }, + { + "epoch": 1.6627244455542187, + "grad_norm": 0.7311945919702445, + "learning_rate": 7.90988151514034e-06, + "loss": 0.5308, + "step": 2289 + }, + { + "epoch": 1.6634508433023858, + "grad_norm": 0.43626534803107847, + "learning_rate": 7.907939875656978e-06, + "loss": 0.5288, + "step": 2290 + }, + { + "epoch": 1.6641772410505529, + "grad_norm": 0.35224352677829485, + "learning_rate": 7.90599757329597e-06, + "loss": 0.5175, + "step": 2291 + }, + { + "epoch": 1.6649036387987197, + "grad_norm": 0.5331153980860087, + "learning_rate": 7.904054608500075e-06, + "loss": 0.5116, + "step": 2292 + }, + { + "epoch": 1.6656300365468866, + "grad_norm": 0.6710273266272868, + "learning_rate": 7.902110981712198e-06, + "loss": 0.5107, + "step": 2293 + }, + { + "epoch": 1.6663564342950536, + "grad_norm": 0.3526683617151688, + "learning_rate": 7.9001666933754e-06, + "loss": 0.5233, + "step": 2294 + }, + { + "epoch": 1.6670828320432207, + "grad_norm": 0.36532635665255275, + "learning_rate": 7.898221743932887e-06, + "loss": 0.5169, + "step": 2295 + }, + { + "epoch": 1.6678092297913878, + "grad_norm": 0.3585418169457838, + "learning_rate": 7.896276133828023e-06, + "loss": 0.5082, + "step": 2296 + }, + { + "epoch": 1.6685356275395546, + "grad_norm": 0.3544722750485833, + "learning_rate": 7.894329863504314e-06, + "loss": 0.5072, + "step": 2297 + }, + { + "epoch": 1.6692620252877215, + "grad_norm": 0.3546616658809727, + "learning_rate": 7.89238293340542e-06, + "loss": 0.5185, + "step": 2298 + }, + { + "epoch": 1.6699884230358886, + "grad_norm": 0.3625346473206743, + "learning_rate": 7.890435343975158e-06, + "loss": 0.5204, + "step": 2299 + }, + { + "epoch": 1.6707148207840556, + "grad_norm": 1.0160843919774494, + "learning_rate": 7.888487095657484e-06, + "loss": 0.5225, + "step": 2300 + }, + { + "epoch": 1.6714412185322225, + "grad_norm": 0.4678669118863533, + "learning_rate": 7.886538188896511e-06, + "loss": 0.5134, + "step": 2301 + }, + { + "epoch": 1.6721676162803896, + "grad_norm": 0.4058252989421966, + "learning_rate": 7.884588624136505e-06, + "loss": 0.537, + "step": 2302 + }, + { + "epoch": 1.6728940140285564, + "grad_norm": 0.5993490154767858, + "learning_rate": 7.882638401821873e-06, + "loss": 0.5204, + "step": 2303 + }, + { + "epoch": 1.6736204117767235, + "grad_norm": 0.4303193883714196, + "learning_rate": 7.880687522397177e-06, + "loss": 0.5125, + "step": 2304 + }, + { + "epoch": 1.6743468095248906, + "grad_norm": 0.35404110515752385, + "learning_rate": 7.878735986307133e-06, + "loss": 0.5295, + "step": 2305 + }, + { + "epoch": 1.6750732072730574, + "grad_norm": 0.39844908796784556, + "learning_rate": 7.876783793996597e-06, + "loss": 0.5211, + "step": 2306 + }, + { + "epoch": 1.6757996050212243, + "grad_norm": 0.5182957330420442, + "learning_rate": 7.874830945910581e-06, 
+ "loss": 0.5266, + "step": 2307 + }, + { + "epoch": 1.6765260027693913, + "grad_norm": 0.5028901218850625, + "learning_rate": 7.872877442494248e-06, + "loss": 0.5172, + "step": 2308 + }, + { + "epoch": 1.6772524005175584, + "grad_norm": 0.3663308757833073, + "learning_rate": 7.870923284192904e-06, + "loss": 0.525, + "step": 2309 + }, + { + "epoch": 1.6779787982657255, + "grad_norm": 0.3306688367658942, + "learning_rate": 7.868968471452012e-06, + "loss": 0.5097, + "step": 2310 + }, + { + "epoch": 1.6787051960138923, + "grad_norm": 0.38291734663388316, + "learning_rate": 7.867013004717177e-06, + "loss": 0.5036, + "step": 2311 + }, + { + "epoch": 1.6794315937620592, + "grad_norm": 0.37384892481056603, + "learning_rate": 7.865056884434156e-06, + "loss": 0.5048, + "step": 2312 + }, + { + "epoch": 1.6801579915102263, + "grad_norm": 0.40607943931890755, + "learning_rate": 7.863100111048858e-06, + "loss": 0.541, + "step": 2313 + }, + { + "epoch": 1.6808843892583933, + "grad_norm": 0.3674587638957362, + "learning_rate": 7.861142685007335e-06, + "loss": 0.5415, + "step": 2314 + }, + { + "epoch": 1.6816107870065604, + "grad_norm": 0.3971339138125036, + "learning_rate": 7.859184606755793e-06, + "loss": 0.5155, + "step": 2315 + }, + { + "epoch": 1.6823371847547273, + "grad_norm": 0.4463837403698997, + "learning_rate": 7.857225876740585e-06, + "loss": 0.5256, + "step": 2316 + }, + { + "epoch": 1.6830635825028941, + "grad_norm": 0.39645405562986596, + "learning_rate": 7.855266495408208e-06, + "loss": 0.5257, + "step": 2317 + }, + { + "epoch": 1.6837899802510612, + "grad_norm": 0.3647194703278608, + "learning_rate": 7.853306463205317e-06, + "loss": 0.5159, + "step": 2318 + }, + { + "epoch": 1.6845163779992283, + "grad_norm": 0.3589570080440384, + "learning_rate": 7.851345780578705e-06, + "loss": 0.5033, + "step": 2319 + }, + { + "epoch": 1.6852427757473953, + "grad_norm": 0.38129920749398744, + "learning_rate": 7.849384447975322e-06, + "loss": 0.5071, + "step": 2320 + }, + { + "epoch": 1.6859691734955622, + "grad_norm": 0.42195602445792807, + "learning_rate": 7.84742246584226e-06, + "loss": 0.518, + "step": 2321 + }, + { + "epoch": 1.686695571243729, + "grad_norm": 0.3687914244924957, + "learning_rate": 7.845459834626763e-06, + "loss": 0.5051, + "step": 2322 + }, + { + "epoch": 1.6874219689918961, + "grad_norm": 0.3685612246975215, + "learning_rate": 7.843496554776222e-06, + "loss": 0.5182, + "step": 2323 + }, + { + "epoch": 1.6881483667400632, + "grad_norm": 0.3708898771698278, + "learning_rate": 7.841532626738174e-06, + "loss": 0.5192, + "step": 2324 + }, + { + "epoch": 1.68887476448823, + "grad_norm": 0.3794685872043508, + "learning_rate": 7.839568050960302e-06, + "loss": 0.5147, + "step": 2325 + }, + { + "epoch": 1.6896011622363971, + "grad_norm": 0.4042012250094266, + "learning_rate": 7.837602827890445e-06, + "loss": 0.5179, + "step": 2326 + }, + { + "epoch": 1.690327559984564, + "grad_norm": 0.46794334682587546, + "learning_rate": 7.83563695797658e-06, + "loss": 0.5236, + "step": 2327 + }, + { + "epoch": 1.691053957732731, + "grad_norm": 0.401175275572764, + "learning_rate": 7.83367044166684e-06, + "loss": 0.5271, + "step": 2328 + }, + { + "epoch": 1.6917803554808981, + "grad_norm": 0.34682830425628947, + "learning_rate": 7.831703279409498e-06, + "loss": 0.5151, + "step": 2329 + }, + { + "epoch": 1.692506753229065, + "grad_norm": 0.43242771033611604, + "learning_rate": 7.829735471652978e-06, + "loss": 0.5253, + "step": 2330 + }, + { + "epoch": 1.6932331509772318, + "grad_norm": 0.3938986346700491, 
+ "learning_rate": 7.827767018845847e-06, + "loss": 0.5106, + "step": 2331 + }, + { + "epoch": 1.693959548725399, + "grad_norm": 0.37655455018986816, + "learning_rate": 7.825797921436829e-06, + "loss": 0.5207, + "step": 2332 + }, + { + "epoch": 1.694685946473566, + "grad_norm": 0.42286674943570335, + "learning_rate": 7.823828179874784e-06, + "loss": 0.5227, + "step": 2333 + }, + { + "epoch": 1.695412344221733, + "grad_norm": 2.153158814997957, + "learning_rate": 7.821857794608724e-06, + "loss": 0.5097, + "step": 2334 + }, + { + "epoch": 1.6961387419698999, + "grad_norm": 0.3888254323572561, + "learning_rate": 7.819886766087808e-06, + "loss": 0.5073, + "step": 2335 + }, + { + "epoch": 1.6968651397180667, + "grad_norm": 0.4613541494505823, + "learning_rate": 7.81791509476134e-06, + "loss": 0.5133, + "step": 2336 + }, + { + "epoch": 1.6975915374662338, + "grad_norm": 0.39905679540451405, + "learning_rate": 7.815942781078772e-06, + "loss": 0.4997, + "step": 2337 + }, + { + "epoch": 1.6983179352144009, + "grad_norm": 0.42740584656532915, + "learning_rate": 7.813969825489698e-06, + "loss": 0.5125, + "step": 2338 + }, + { + "epoch": 1.699044332962568, + "grad_norm": 0.39208560533952436, + "learning_rate": 7.811996228443866e-06, + "loss": 0.5203, + "step": 2339 + }, + { + "epoch": 1.6997707307107348, + "grad_norm": 0.4894039824183243, + "learning_rate": 7.810021990391163e-06, + "loss": 0.5213, + "step": 2340 + }, + { + "epoch": 1.7004971284589017, + "grad_norm": 0.5137340338976691, + "learning_rate": 7.808047111781628e-06, + "loss": 0.5201, + "step": 2341 + }, + { + "epoch": 1.7012235262070687, + "grad_norm": 0.5554929278674202, + "learning_rate": 7.806071593065441e-06, + "loss": 0.5193, + "step": 2342 + }, + { + "epoch": 1.7019499239552358, + "grad_norm": 0.6216775677657115, + "learning_rate": 7.804095434692933e-06, + "loss": 0.5321, + "step": 2343 + }, + { + "epoch": 1.7026763217034029, + "grad_norm": 0.5283921114968795, + "learning_rate": 7.802118637114575e-06, + "loss": 0.5177, + "step": 2344 + }, + { + "epoch": 1.7034027194515697, + "grad_norm": 0.5158106759029534, + "learning_rate": 7.800141200780987e-06, + "loss": 0.5214, + "step": 2345 + }, + { + "epoch": 1.7041291171997366, + "grad_norm": 0.3910353597249199, + "learning_rate": 7.798163126142935e-06, + "loss": 0.5214, + "step": 2346 + }, + { + "epoch": 1.7048555149479037, + "grad_norm": 0.357667881957513, + "learning_rate": 7.796184413651331e-06, + "loss": 0.5209, + "step": 2347 + }, + { + "epoch": 1.7055819126960707, + "grad_norm": 0.3663707168426187, + "learning_rate": 7.79420506375723e-06, + "loss": 0.5095, + "step": 2348 + }, + { + "epoch": 1.7063083104442376, + "grad_norm": 0.34462530618496784, + "learning_rate": 7.792225076911833e-06, + "loss": 0.5144, + "step": 2349 + }, + { + "epoch": 1.7070347081924044, + "grad_norm": 0.43033742655038754, + "learning_rate": 7.790244453566486e-06, + "loss": 0.5156, + "step": 2350 + }, + { + "epoch": 1.7077611059405715, + "grad_norm": 0.39115934719776607, + "learning_rate": 7.788263194172684e-06, + "loss": 0.5296, + "step": 2351 + }, + { + "epoch": 1.7084875036887386, + "grad_norm": 0.3751449069052403, + "learning_rate": 7.78628129918206e-06, + "loss": 0.4988, + "step": 2352 + }, + { + "epoch": 1.7092139014369057, + "grad_norm": 2.2865318787607047, + "learning_rate": 7.784298769046399e-06, + "loss": 0.5238, + "step": 2353 + }, + { + "epoch": 1.7099402991850725, + "grad_norm": 0.3308255892497287, + "learning_rate": 7.782315604217623e-06, + "loss": 0.5084, + "step": 2354 + }, + { + "epoch": 
1.7106666969332394, + "grad_norm": 0.42055271857264337, + "learning_rate": 7.780331805147809e-06, + "loss": 0.5109, + "step": 2355 + }, + { + "epoch": 1.7113930946814064, + "grad_norm": 0.3484879952270989, + "learning_rate": 7.778347372289168e-06, + "loss": 0.5209, + "step": 2356 + }, + { + "epoch": 1.7121194924295735, + "grad_norm": 0.36894370842792445, + "learning_rate": 7.776362306094063e-06, + "loss": 0.5071, + "step": 2357 + }, + { + "epoch": 1.7128458901777406, + "grad_norm": 0.4033215008387943, + "learning_rate": 7.774376607014995e-06, + "loss": 0.5123, + "step": 2358 + }, + { + "epoch": 1.7135722879259074, + "grad_norm": 0.34254343652123953, + "learning_rate": 7.772390275504618e-06, + "loss": 0.5221, + "step": 2359 + }, + { + "epoch": 1.7142986856740743, + "grad_norm": 0.6806500999901334, + "learning_rate": 7.77040331201572e-06, + "loss": 0.528, + "step": 2360 + }, + { + "epoch": 1.7150250834222414, + "grad_norm": 0.3957327094391318, + "learning_rate": 7.768415717001241e-06, + "loss": 0.5061, + "step": 2361 + }, + { + "epoch": 1.7157514811704084, + "grad_norm": 0.8887630735489043, + "learning_rate": 7.766427490914261e-06, + "loss": 0.5009, + "step": 2362 + }, + { + "epoch": 1.7164778789185755, + "grad_norm": 0.41414804407176514, + "learning_rate": 7.764438634208007e-06, + "loss": 0.5091, + "step": 2363 + }, + { + "epoch": 1.7172042766667424, + "grad_norm": 0.3997412655268966, + "learning_rate": 7.762449147335843e-06, + "loss": 0.5262, + "step": 2364 + }, + { + "epoch": 1.7179306744149092, + "grad_norm": 0.41272868045457767, + "learning_rate": 7.760459030751285e-06, + "loss": 0.5122, + "step": 2365 + }, + { + "epoch": 1.7186570721630763, + "grad_norm": 0.35094999229763957, + "learning_rate": 7.758468284907988e-06, + "loss": 0.5122, + "step": 2366 + }, + { + "epoch": 1.7193834699112434, + "grad_norm": 0.45851342079117346, + "learning_rate": 7.756476910259749e-06, + "loss": 0.5184, + "step": 2367 + }, + { + "epoch": 1.7201098676594102, + "grad_norm": 0.4626631942589889, + "learning_rate": 7.754484907260513e-06, + "loss": 0.523, + "step": 2368 + }, + { + "epoch": 1.7208362654075773, + "grad_norm": 0.3868549743980831, + "learning_rate": 7.752492276364368e-06, + "loss": 0.5176, + "step": 2369 + }, + { + "epoch": 1.7215626631557441, + "grad_norm": 0.3810781296920432, + "learning_rate": 7.750499018025537e-06, + "loss": 0.5064, + "step": 2370 + }, + { + "epoch": 1.7222890609039112, + "grad_norm": 0.5430177338034557, + "learning_rate": 7.748505132698396e-06, + "loss": 0.5297, + "step": 2371 + }, + { + "epoch": 1.7230154586520783, + "grad_norm": 0.3297386109193866, + "learning_rate": 7.74651062083746e-06, + "loss": 0.5241, + "step": 2372 + }, + { + "epoch": 1.7237418564002451, + "grad_norm": 0.3606346057085869, + "learning_rate": 7.744515482897386e-06, + "loss": 0.539, + "step": 2373 + }, + { + "epoch": 1.724468254148412, + "grad_norm": 0.34521028847501406, + "learning_rate": 7.742519719332972e-06, + "loss": 0.5191, + "step": 2374 + }, + { + "epoch": 1.725194651896579, + "grad_norm": 0.3755384059531963, + "learning_rate": 7.740523330599166e-06, + "loss": 0.5433, + "step": 2375 + }, + { + "epoch": 1.7259210496447461, + "grad_norm": 0.4504796185974387, + "learning_rate": 7.73852631715105e-06, + "loss": 0.506, + "step": 2376 + }, + { + "epoch": 1.7266474473929132, + "grad_norm": 0.5107682186808763, + "learning_rate": 7.736528679443853e-06, + "loss": 0.5193, + "step": 2377 + }, + { + "epoch": 1.72737384514108, + "grad_norm": 0.4141904772610753, + "learning_rate": 7.734530417932947e-06, + "loss": 
0.5169, + "step": 2378 + }, + { + "epoch": 1.728100242889247, + "grad_norm": 0.36227736569282754, + "learning_rate": 7.73253153307384e-06, + "loss": 0.5149, + "step": 2379 + }, + { + "epoch": 1.728826640637414, + "grad_norm": 0.4409459557738853, + "learning_rate": 7.73053202532219e-06, + "loss": 0.5239, + "step": 2380 + }, + { + "epoch": 1.729553038385581, + "grad_norm": 0.33578102943722477, + "learning_rate": 7.728531895133792e-06, + "loss": 0.5109, + "step": 2381 + }, + { + "epoch": 1.7302794361337481, + "grad_norm": 0.4250354978387207, + "learning_rate": 7.726531142964587e-06, + "loss": 0.5155, + "step": 2382 + }, + { + "epoch": 1.731005833881915, + "grad_norm": 0.4302601157882407, + "learning_rate": 7.724529769270652e-06, + "loss": 0.5141, + "step": 2383 + }, + { + "epoch": 1.7317322316300818, + "grad_norm": 0.43407033758643465, + "learning_rate": 7.722527774508212e-06, + "loss": 0.5511, + "step": 2384 + }, + { + "epoch": 1.732458629378249, + "grad_norm": 0.40151669451134714, + "learning_rate": 7.720525159133627e-06, + "loss": 0.5219, + "step": 2385 + }, + { + "epoch": 1.733185027126416, + "grad_norm": 0.35884136542676387, + "learning_rate": 7.718521923603404e-06, + "loss": 0.5093, + "step": 2386 + }, + { + "epoch": 1.733911424874583, + "grad_norm": 0.3816066124029688, + "learning_rate": 7.71651806837419e-06, + "loss": 0.5049, + "step": 2387 + }, + { + "epoch": 1.73463782262275, + "grad_norm": 0.3676995514155556, + "learning_rate": 7.714513593902771e-06, + "loss": 0.521, + "step": 2388 + }, + { + "epoch": 1.7353642203709168, + "grad_norm": 0.4667926009034362, + "learning_rate": 7.712508500646077e-06, + "loss": 0.5317, + "step": 2389 + }, + { + "epoch": 1.7360906181190838, + "grad_norm": 0.4129366004482653, + "learning_rate": 7.710502789061178e-06, + "loss": 0.5099, + "step": 2390 + }, + { + "epoch": 1.736817015867251, + "grad_norm": 0.3592680433090348, + "learning_rate": 7.708496459605283e-06, + "loss": 0.5212, + "step": 2391 + }, + { + "epoch": 1.7375434136154178, + "grad_norm": 0.34614013237021196, + "learning_rate": 7.706489512735745e-06, + "loss": 0.5303, + "step": 2392 + }, + { + "epoch": 1.7382698113635848, + "grad_norm": 0.3555021535589424, + "learning_rate": 7.704481948910057e-06, + "loss": 0.5277, + "step": 2393 + }, + { + "epoch": 1.7389962091117517, + "grad_norm": 0.37639501186215735, + "learning_rate": 7.702473768585847e-06, + "loss": 0.525, + "step": 2394 + }, + { + "epoch": 1.7397226068599188, + "grad_norm": 0.37361879877081183, + "learning_rate": 7.700464972220895e-06, + "loss": 0.5289, + "step": 2395 + }, + { + "epoch": 1.7404490046080858, + "grad_norm": 0.39992477600936793, + "learning_rate": 7.698455560273112e-06, + "loss": 0.5312, + "step": 2396 + }, + { + "epoch": 1.7411754023562527, + "grad_norm": 0.37406681284872073, + "learning_rate": 7.696445533200553e-06, + "loss": 0.5307, + "step": 2397 + }, + { + "epoch": 1.7419018001044195, + "grad_norm": 0.42683497458023184, + "learning_rate": 7.694434891461413e-06, + "loss": 0.5001, + "step": 2398 + }, + { + "epoch": 1.7426281978525866, + "grad_norm": 0.41158580911618886, + "learning_rate": 7.692423635514025e-06, + "loss": 0.5332, + "step": 2399 + }, + { + "epoch": 1.7433545956007537, + "grad_norm": 0.36702630887820314, + "learning_rate": 7.690411765816864e-06, + "loss": 0.5278, + "step": 2400 + }, + { + "epoch": 1.7440809933489207, + "grad_norm": 0.37401723327421216, + "learning_rate": 7.688399282828544e-06, + "loss": 0.5189, + "step": 2401 + }, + { + "epoch": 1.7448073910970876, + "grad_norm": 0.4612737879621514, + 
"learning_rate": 7.686386187007822e-06, + "loss": 0.5219, + "step": 2402 + }, + { + "epoch": 1.7455337888452545, + "grad_norm": 0.36477017291251335, + "learning_rate": 7.68437247881359e-06, + "loss": 0.5103, + "step": 2403 + }, + { + "epoch": 1.7462601865934215, + "grad_norm": 0.3617003411712773, + "learning_rate": 7.682358158704882e-06, + "loss": 0.5025, + "step": 2404 + }, + { + "epoch": 1.7469865843415886, + "grad_norm": 0.37483282727802664, + "learning_rate": 7.68034322714087e-06, + "loss": 0.5273, + "step": 2405 + }, + { + "epoch": 1.7477129820897557, + "grad_norm": 0.3495224935981692, + "learning_rate": 7.678327684580867e-06, + "loss": 0.5176, + "step": 2406 + }, + { + "epoch": 1.7484393798379225, + "grad_norm": 0.3385413622434483, + "learning_rate": 7.676311531484324e-06, + "loss": 0.5274, + "step": 2407 + }, + { + "epoch": 1.7491657775860894, + "grad_norm": 0.3414632763254147, + "learning_rate": 7.674294768310836e-06, + "loss": 0.5324, + "step": 2408 + }, + { + "epoch": 1.7498921753342565, + "grad_norm": 0.3305838987379325, + "learning_rate": 7.672277395520127e-06, + "loss": 0.5109, + "step": 2409 + }, + { + "epoch": 1.7506185730824235, + "grad_norm": 0.38555130400427856, + "learning_rate": 7.670259413572073e-06, + "loss": 0.5024, + "step": 2410 + }, + { + "epoch": 1.7513449708305904, + "grad_norm": 0.3199334271176942, + "learning_rate": 7.668240822926674e-06, + "loss": 0.508, + "step": 2411 + }, + { + "epoch": 1.7520713685787574, + "grad_norm": 0.3747470270663056, + "learning_rate": 7.66622162404408e-06, + "loss": 0.5014, + "step": 2412 + }, + { + "epoch": 1.7527977663269243, + "grad_norm": 0.3682403245152217, + "learning_rate": 7.66420181738458e-06, + "loss": 0.506, + "step": 2413 + }, + { + "epoch": 1.7535241640750914, + "grad_norm": 0.38757289195608097, + "learning_rate": 7.662181403408593e-06, + "loss": 0.522, + "step": 2414 + }, + { + "epoch": 1.7542505618232584, + "grad_norm": 0.3894174612444933, + "learning_rate": 7.660160382576683e-06, + "loss": 0.5237, + "step": 2415 + }, + { + "epoch": 1.7549769595714253, + "grad_norm": 0.3553310739437953, + "learning_rate": 7.658138755349552e-06, + "loss": 0.5176, + "step": 2416 + }, + { + "epoch": 1.7557033573195922, + "grad_norm": 0.412183295644057, + "learning_rate": 7.656116522188034e-06, + "loss": 0.5145, + "step": 2417 + }, + { + "epoch": 1.7564297550677592, + "grad_norm": 0.4269731415373766, + "learning_rate": 7.654093683553111e-06, + "loss": 0.505, + "step": 2418 + }, + { + "epoch": 1.7571561528159263, + "grad_norm": 0.40445962828489695, + "learning_rate": 7.652070239905896e-06, + "loss": 0.5385, + "step": 2419 + }, + { + "epoch": 1.7578825505640934, + "grad_norm": 0.35438541605346585, + "learning_rate": 7.650046191707641e-06, + "loss": 0.5234, + "step": 2420 + }, + { + "epoch": 1.7586089483122602, + "grad_norm": 0.36150585874514246, + "learning_rate": 7.648021539419737e-06, + "loss": 0.5071, + "step": 2421 + }, + { + "epoch": 1.759335346060427, + "grad_norm": 0.43918572767685066, + "learning_rate": 7.645996283503713e-06, + "loss": 0.509, + "step": 2422 + }, + { + "epoch": 1.7600617438085941, + "grad_norm": 0.41897424823585466, + "learning_rate": 7.643970424421235e-06, + "loss": 0.5289, + "step": 2423 + }, + { + "epoch": 1.7607881415567612, + "grad_norm": 0.3721778531489851, + "learning_rate": 7.641943962634106e-06, + "loss": 0.5076, + "step": 2424 + }, + { + "epoch": 1.7615145393049283, + "grad_norm": 0.4098431311637956, + "learning_rate": 7.639916898604267e-06, + "loss": 0.5307, + "step": 2425 + }, + { + "epoch": 
1.7622409370530951, + "grad_norm": 0.3773059345853294, + "learning_rate": 7.637889232793795e-06, + "loss": 0.5181, + "step": 2426 + }, + { + "epoch": 1.762967334801262, + "grad_norm": 0.35280772424239176, + "learning_rate": 7.63586096566491e-06, + "loss": 0.5145, + "step": 2427 + }, + { + "epoch": 1.763693732549429, + "grad_norm": 0.3423930857113253, + "learning_rate": 7.633832097679959e-06, + "loss": 0.53, + "step": 2428 + }, + { + "epoch": 1.7644201302975961, + "grad_norm": 0.41118629604911827, + "learning_rate": 7.631802629301433e-06, + "loss": 0.5146, + "step": 2429 + }, + { + "epoch": 1.7651465280457632, + "grad_norm": 0.3759577573061447, + "learning_rate": 7.62977256099196e-06, + "loss": 0.5162, + "step": 2430 + }, + { + "epoch": 1.76587292579393, + "grad_norm": 0.33315814587839315, + "learning_rate": 7.6277418932143e-06, + "loss": 0.5309, + "step": 2431 + }, + { + "epoch": 1.766599323542097, + "grad_norm": 0.37737468694841797, + "learning_rate": 7.625710626431354e-06, + "loss": 0.5203, + "step": 2432 + }, + { + "epoch": 1.767325721290264, + "grad_norm": 0.38578068172952584, + "learning_rate": 7.623678761106159e-06, + "loss": 0.5209, + "step": 2433 + }, + { + "epoch": 1.768052119038431, + "grad_norm": 0.3471234798763853, + "learning_rate": 7.621646297701886e-06, + "loss": 0.5175, + "step": 2434 + }, + { + "epoch": 1.768778516786598, + "grad_norm": 0.33769756380371013, + "learning_rate": 7.619613236681845e-06, + "loss": 0.5155, + "step": 2435 + }, + { + "epoch": 1.769504914534765, + "grad_norm": 0.3484262277106607, + "learning_rate": 7.617579578509481e-06, + "loss": 0.5151, + "step": 2436 + }, + { + "epoch": 1.7702313122829318, + "grad_norm": 0.3472024381402114, + "learning_rate": 7.6155453236483745e-06, + "loss": 0.5287, + "step": 2437 + }, + { + "epoch": 1.770957710031099, + "grad_norm": 0.4865050975301496, + "learning_rate": 7.613510472562245e-06, + "loss": 0.5106, + "step": 2438 + }, + { + "epoch": 1.771684107779266, + "grad_norm": 0.43499596061429086, + "learning_rate": 7.611475025714945e-06, + "loss": 0.5126, + "step": 2439 + }, + { + "epoch": 1.7724105055274328, + "grad_norm": 0.35984637513168016, + "learning_rate": 7.609438983570461e-06, + "loss": 0.5126, + "step": 2440 + }, + { + "epoch": 1.7731369032755997, + "grad_norm": 0.3777402269226685, + "learning_rate": 7.6074023465929206e-06, + "loss": 0.5063, + "step": 2441 + }, + { + "epoch": 1.7738633010237668, + "grad_norm": 0.3368264229441657, + "learning_rate": 7.605365115246581e-06, + "loss": 0.5253, + "step": 2442 + }, + { + "epoch": 1.7745896987719338, + "grad_norm": 0.36427617602043305, + "learning_rate": 7.603327289995843e-06, + "loss": 0.5067, + "step": 2443 + }, + { + "epoch": 1.775316096520101, + "grad_norm": 0.36829048675097775, + "learning_rate": 7.601288871305235e-06, + "loss": 0.4993, + "step": 2444 + }, + { + "epoch": 1.7760424942682678, + "grad_norm": 0.3274746744967745, + "learning_rate": 7.5992498596394215e-06, + "loss": 0.5023, + "step": 2445 + }, + { + "epoch": 1.7767688920164346, + "grad_norm": 0.3690904489859233, + "learning_rate": 7.597210255463206e-06, + "loss": 0.5089, + "step": 2446 + }, + { + "epoch": 1.7774952897646017, + "grad_norm": 0.39823221371440226, + "learning_rate": 7.595170059241527e-06, + "loss": 0.5216, + "step": 2447 + }, + { + "epoch": 1.7782216875127688, + "grad_norm": 0.4064508140087145, + "learning_rate": 7.593129271439454e-06, + "loss": 0.5179, + "step": 2448 + }, + { + "epoch": 1.7789480852609358, + "grad_norm": 0.3365068405492714, + "learning_rate": 7.591087892522193e-06, + "loss": 
0.5069, + "step": 2449 + }, + { + "epoch": 1.7796744830091027, + "grad_norm": 0.368825203113945, + "learning_rate": 7.589045922955085e-06, + "loss": 0.509, + "step": 2450 + }, + { + "epoch": 1.7804008807572695, + "grad_norm": 0.3702408504105955, + "learning_rate": 7.587003363203609e-06, + "loss": 0.5239, + "step": 2451 + }, + { + "epoch": 1.7811272785054366, + "grad_norm": 0.54516242684235, + "learning_rate": 7.5849602137333745e-06, + "loss": 0.5017, + "step": 2452 + }, + { + "epoch": 1.7818536762536037, + "grad_norm": 0.3789259608047946, + "learning_rate": 7.582916475010125e-06, + "loss": 0.5291, + "step": 2453 + }, + { + "epoch": 1.7825800740017705, + "grad_norm": 0.3929344951040863, + "learning_rate": 7.580872147499738e-06, + "loss": 0.5099, + "step": 2454 + }, + { + "epoch": 1.7833064717499376, + "grad_norm": 0.4521458967644181, + "learning_rate": 7.578827231668231e-06, + "loss": 0.5137, + "step": 2455 + }, + { + "epoch": 1.7840328694981045, + "grad_norm": 0.3396557213768358, + "learning_rate": 7.5767817279817505e-06, + "loss": 0.5165, + "step": 2456 + }, + { + "epoch": 1.7847592672462715, + "grad_norm": 0.3591207918432544, + "learning_rate": 7.574735636906576e-06, + "loss": 0.5031, + "step": 2457 + }, + { + "epoch": 1.7854856649944386, + "grad_norm": 0.3280120304606946, + "learning_rate": 7.572688958909124e-06, + "loss": 0.5184, + "step": 2458 + }, + { + "epoch": 1.7862120627426055, + "grad_norm": 0.3910837966006328, + "learning_rate": 7.570641694455945e-06, + "loss": 0.5042, + "step": 2459 + }, + { + "epoch": 1.7869384604907723, + "grad_norm": 0.4156980995760029, + "learning_rate": 7.5685938440137185e-06, + "loss": 0.5105, + "step": 2460 + }, + { + "epoch": 1.7876648582389394, + "grad_norm": 0.37191043977151966, + "learning_rate": 7.566545408049264e-06, + "loss": 0.5194, + "step": 2461 + }, + { + "epoch": 1.7883912559871065, + "grad_norm": 0.41929543282454695, + "learning_rate": 7.564496387029532e-06, + "loss": 0.4994, + "step": 2462 + }, + { + "epoch": 1.7891176537352735, + "grad_norm": 0.5535216928962473, + "learning_rate": 7.562446781421604e-06, + "loss": 0.5113, + "step": 2463 + }, + { + "epoch": 1.7898440514834404, + "grad_norm": 0.4440265170284457, + "learning_rate": 7.560396591692696e-06, + "loss": 0.5071, + "step": 2464 + }, + { + "epoch": 1.7905704492316072, + "grad_norm": 0.31691196067500516, + "learning_rate": 7.558345818310159e-06, + "loss": 0.5133, + "step": 2465 + }, + { + "epoch": 1.7912968469797743, + "grad_norm": 0.3976398324340962, + "learning_rate": 7.556294461741476e-06, + "loss": 0.5091, + "step": 2466 + }, + { + "epoch": 1.7920232447279414, + "grad_norm": 0.47629872897047004, + "learning_rate": 7.554242522454262e-06, + "loss": 0.5077, + "step": 2467 + }, + { + "epoch": 1.7927496424761085, + "grad_norm": 0.3915447918725823, + "learning_rate": 7.552190000916267e-06, + "loss": 0.5142, + "step": 2468 + }, + { + "epoch": 1.7934760402242753, + "grad_norm": 0.34586851694938503, + "learning_rate": 7.55013689759537e-06, + "loss": 0.5197, + "step": 2469 + }, + { + "epoch": 1.7942024379724422, + "grad_norm": 0.39615903534776675, + "learning_rate": 7.548083212959588e-06, + "loss": 0.5102, + "step": 2470 + }, + { + "epoch": 1.7949288357206092, + "grad_norm": 0.3824072375603738, + "learning_rate": 7.5460289474770645e-06, + "loss": 0.5051, + "step": 2471 + }, + { + "epoch": 1.7956552334687763, + "grad_norm": 0.3984984887068014, + "learning_rate": 7.5439741016160785e-06, + "loss": 0.5264, + "step": 2472 + }, + { + "epoch": 1.7963816312169434, + "grad_norm": 0.3854979324219856, 
+ "learning_rate": 7.541918675845044e-06, + "loss": 0.5112, + "step": 2473 + }, + { + "epoch": 1.7971080289651102, + "grad_norm": 0.4373313917377871, + "learning_rate": 7.539862670632501e-06, + "loss": 0.502, + "step": 2474 + }, + { + "epoch": 1.797834426713277, + "grad_norm": 0.392190615648265, + "learning_rate": 7.537806086447124e-06, + "loss": 0.5299, + "step": 2475 + }, + { + "epoch": 1.7985608244614442, + "grad_norm": 0.33446120395684537, + "learning_rate": 7.5357489237577246e-06, + "loss": 0.4955, + "step": 2476 + }, + { + "epoch": 1.7992872222096112, + "grad_norm": 0.3924100537430557, + "learning_rate": 7.53369118303324e-06, + "loss": 0.5157, + "step": 2477 + }, + { + "epoch": 1.800013619957778, + "grad_norm": 0.3947888140852665, + "learning_rate": 7.531632864742742e-06, + "loss": 0.5108, + "step": 2478 + }, + { + "epoch": 1.8007400177059452, + "grad_norm": 0.41413216803259373, + "learning_rate": 7.52957396935543e-06, + "loss": 0.5258, + "step": 2479 + }, + { + "epoch": 1.801466415454112, + "grad_norm": 0.3430398095552677, + "learning_rate": 7.527514497340642e-06, + "loss": 0.5216, + "step": 2480 + }, + { + "epoch": 1.802192813202279, + "grad_norm": 0.3803404160910686, + "learning_rate": 7.525454449167844e-06, + "loss": 0.5112, + "step": 2481 + }, + { + "epoch": 1.8029192109504462, + "grad_norm": 0.362708206016911, + "learning_rate": 7.52339382530663e-06, + "loss": 0.5031, + "step": 2482 + }, + { + "epoch": 1.803645608698613, + "grad_norm": 0.3874709070824654, + "learning_rate": 7.52133262622673e-06, + "loss": 0.5206, + "step": 2483 + }, + { + "epoch": 1.8043720064467799, + "grad_norm": 0.35370761771305603, + "learning_rate": 7.519270852398002e-06, + "loss": 0.5128, + "step": 2484 + }, + { + "epoch": 1.805098404194947, + "grad_norm": 0.3378750964273353, + "learning_rate": 7.517208504290438e-06, + "loss": 0.5139, + "step": 2485 + }, + { + "epoch": 1.805824801943114, + "grad_norm": 0.47739310413310937, + "learning_rate": 7.5151455823741605e-06, + "loss": 0.5325, + "step": 2486 + }, + { + "epoch": 1.806551199691281, + "grad_norm": 1.0220584924467446, + "learning_rate": 7.513082087119419e-06, + "loss": 0.5134, + "step": 2487 + }, + { + "epoch": 1.807277597439448, + "grad_norm": 0.34733949051067253, + "learning_rate": 7.511018018996597e-06, + "loss": 0.5258, + "step": 2488 + }, + { + "epoch": 1.8080039951876148, + "grad_norm": 0.43738246407528614, + "learning_rate": 7.508953378476207e-06, + "loss": 0.5223, + "step": 2489 + }, + { + "epoch": 1.8087303929357819, + "grad_norm": 0.3884686553974104, + "learning_rate": 7.506888166028893e-06, + "loss": 0.524, + "step": 2490 + }, + { + "epoch": 1.809456790683949, + "grad_norm": 0.3664437825331931, + "learning_rate": 7.504822382125432e-06, + "loss": 0.5057, + "step": 2491 + }, + { + "epoch": 1.810183188432116, + "grad_norm": 0.4718259906344069, + "learning_rate": 7.502756027236725e-06, + "loss": 0.5269, + "step": 2492 + }, + { + "epoch": 1.8109095861802829, + "grad_norm": 0.3578874000507274, + "learning_rate": 7.500689101833809e-06, + "loss": 0.4972, + "step": 2493 + }, + { + "epoch": 1.8116359839284497, + "grad_norm": 0.4239146637573705, + "learning_rate": 7.498621606387848e-06, + "loss": 0.5037, + "step": 2494 + }, + { + "epoch": 1.8123623816766168, + "grad_norm": 0.4527212658614285, + "learning_rate": 7.496553541370136e-06, + "loss": 0.5138, + "step": 2495 + }, + { + "epoch": 1.8130887794247839, + "grad_norm": 0.38027801188865673, + "learning_rate": 7.4944849072520994e-06, + "loss": 0.5002, + "step": 2496 + }, + { + "epoch": 
1.8138151771729507, + "grad_norm": 0.4033291849795312, + "learning_rate": 7.492415704505291e-06, + "loss": 0.5226, + "step": 2497 + }, + { + "epoch": 1.8145415749211178, + "grad_norm": 0.3807979142898502, + "learning_rate": 7.490345933601395e-06, + "loss": 0.5204, + "step": 2498 + }, + { + "epoch": 1.8152679726692846, + "grad_norm": 0.36024441295502735, + "learning_rate": 7.488275595012222e-06, + "loss": 0.5123, + "step": 2499 + }, + { + "epoch": 1.8159943704174517, + "grad_norm": 0.34085085330306947, + "learning_rate": 7.486204689209719e-06, + "loss": 0.5172, + "step": 2500 + }, + { + "epoch": 1.8167207681656188, + "grad_norm": 0.384830670861468, + "learning_rate": 7.484133216665956e-06, + "loss": 0.4996, + "step": 2501 + }, + { + "epoch": 1.8174471659137856, + "grad_norm": 0.32600758570920274, + "learning_rate": 7.482061177853135e-06, + "loss": 0.5017, + "step": 2502 + }, + { + "epoch": 1.8181735636619525, + "grad_norm": 0.451413015847686, + "learning_rate": 7.479988573243586e-06, + "loss": 0.4986, + "step": 2503 + }, + { + "epoch": 1.8188999614101196, + "grad_norm": 0.36176694758470074, + "learning_rate": 7.477915403309768e-06, + "loss": 0.5233, + "step": 2504 + }, + { + "epoch": 1.8196263591582866, + "grad_norm": 0.4201765718524903, + "learning_rate": 7.475841668524268e-06, + "loss": 0.5158, + "step": 2505 + }, + { + "epoch": 1.8203527569064537, + "grad_norm": 0.349747935999797, + "learning_rate": 7.473767369359805e-06, + "loss": 0.5084, + "step": 2506 + }, + { + "epoch": 1.8210791546546206, + "grad_norm": 0.37715608223832664, + "learning_rate": 7.471692506289225e-06, + "loss": 0.5133, + "step": 2507 + }, + { + "epoch": 1.8218055524027874, + "grad_norm": 0.3781868222530327, + "learning_rate": 7.4696170797855005e-06, + "loss": 0.5018, + "step": 2508 + }, + { + "epoch": 1.8225319501509545, + "grad_norm": 0.3251394185869421, + "learning_rate": 7.467541090321735e-06, + "loss": 0.5001, + "step": 2509 + }, + { + "epoch": 1.8232583478991216, + "grad_norm": 0.3696746423083357, + "learning_rate": 7.465464538371159e-06, + "loss": 0.5067, + "step": 2510 + }, + { + "epoch": 1.8239847456472886, + "grad_norm": 0.359042070785887, + "learning_rate": 7.463387424407131e-06, + "loss": 0.5146, + "step": 2511 + }, + { + "epoch": 1.8247111433954555, + "grad_norm": 0.467326002019918, + "learning_rate": 7.461309748903138e-06, + "loss": 0.5228, + "step": 2512 + }, + { + "epoch": 1.8254375411436223, + "grad_norm": 0.43379456229117774, + "learning_rate": 7.459231512332799e-06, + "loss": 0.5165, + "step": 2513 + }, + { + "epoch": 1.8261639388917894, + "grad_norm": 0.39953052494671776, + "learning_rate": 7.4571527151698505e-06, + "loss": 0.5054, + "step": 2514 + }, + { + "epoch": 1.8268903366399565, + "grad_norm": 0.4293974894737732, + "learning_rate": 7.45507335788817e-06, + "loss": 0.5028, + "step": 2515 + }, + { + "epoch": 1.8276167343881236, + "grad_norm": 0.3691821182232711, + "learning_rate": 7.452993440961751e-06, + "loss": 0.5014, + "step": 2516 + }, + { + "epoch": 1.8283431321362904, + "grad_norm": 0.3315988208836827, + "learning_rate": 7.450912964864724e-06, + "loss": 0.5079, + "step": 2517 + }, + { + "epoch": 1.8290695298844573, + "grad_norm": 0.38090799522614277, + "learning_rate": 7.44883193007134e-06, + "loss": 0.5174, + "step": 2518 + }, + { + "epoch": 1.8297959276326243, + "grad_norm": 0.37861845120198107, + "learning_rate": 7.4467503370559806e-06, + "loss": 0.5203, + "step": 2519 + }, + { + "epoch": 1.8305223253807914, + "grad_norm": 0.3700730085426923, + "learning_rate": 7.444668186293153e-06, + 
"loss": 0.5108, + "step": 2520 + }, + { + "epoch": 1.8312487231289583, + "grad_norm": 0.541136540489763, + "learning_rate": 7.4425854782574935e-06, + "loss": 0.5103, + "step": 2521 + }, + { + "epoch": 1.8319751208771253, + "grad_norm": 0.3980367064113061, + "learning_rate": 7.440502213423766e-06, + "loss": 0.5072, + "step": 2522 + }, + { + "epoch": 1.8327015186252922, + "grad_norm": 0.3341368904221697, + "learning_rate": 7.43841839226686e-06, + "loss": 0.5087, + "step": 2523 + }, + { + "epoch": 1.8334279163734593, + "grad_norm": 0.6917196441784205, + "learning_rate": 7.436334015261787e-06, + "loss": 0.522, + "step": 2524 + }, + { + "epoch": 1.8341543141216263, + "grad_norm": 0.3987386461839978, + "learning_rate": 7.434249082883694e-06, + "loss": 0.5026, + "step": 2525 + }, + { + "epoch": 1.8348807118697932, + "grad_norm": 0.39046462611388316, + "learning_rate": 7.432163595607851e-06, + "loss": 0.5129, + "step": 2526 + }, + { + "epoch": 1.83560710961796, + "grad_norm": 0.3474126740060321, + "learning_rate": 7.430077553909651e-06, + "loss": 0.5111, + "step": 2527 + }, + { + "epoch": 1.836333507366127, + "grad_norm": 0.4391027195312627, + "learning_rate": 7.427990958264619e-06, + "loss": 0.5074, + "step": 2528 + }, + { + "epoch": 1.8370599051142942, + "grad_norm": 0.3674104817530332, + "learning_rate": 7.425903809148401e-06, + "loss": 0.5142, + "step": 2529 + }, + { + "epoch": 1.8377863028624613, + "grad_norm": 0.4269078954287343, + "learning_rate": 7.423816107036774e-06, + "loss": 0.5234, + "step": 2530 + }, + { + "epoch": 1.838512700610628, + "grad_norm": 0.3256909453235737, + "learning_rate": 7.421727852405639e-06, + "loss": 0.5067, + "step": 2531 + }, + { + "epoch": 1.839239098358795, + "grad_norm": 0.35553526649015293, + "learning_rate": 7.419639045731022e-06, + "loss": 0.5097, + "step": 2532 + }, + { + "epoch": 1.839965496106962, + "grad_norm": 0.370387646589311, + "learning_rate": 7.417549687489074e-06, + "loss": 0.5112, + "step": 2533 + }, + { + "epoch": 1.840691893855129, + "grad_norm": 0.4062361187580708, + "learning_rate": 7.415459778156075e-06, + "loss": 0.5266, + "step": 2534 + }, + { + "epoch": 1.8414182916032962, + "grad_norm": 0.38909458471907526, + "learning_rate": 7.413369318208431e-06, + "loss": 0.52, + "step": 2535 + }, + { + "epoch": 1.842144689351463, + "grad_norm": 0.3450664136417991, + "learning_rate": 7.411278308122669e-06, + "loss": 0.5115, + "step": 2536 + }, + { + "epoch": 1.8428710870996299, + "grad_norm": 0.3481338815009868, + "learning_rate": 7.409186748375443e-06, + "loss": 0.4961, + "step": 2537 + }, + { + "epoch": 1.843597484847797, + "grad_norm": 0.3794933531601418, + "learning_rate": 7.4070946394435364e-06, + "loss": 0.5213, + "step": 2538 + }, + { + "epoch": 1.844323882595964, + "grad_norm": 0.34680653454325094, + "learning_rate": 7.405001981803851e-06, + "loss": 0.4881, + "step": 2539 + }, + { + "epoch": 1.845050280344131, + "grad_norm": 0.38012119673509104, + "learning_rate": 7.402908775933419e-06, + "loss": 0.5223, + "step": 2540 + }, + { + "epoch": 1.845776678092298, + "grad_norm": 0.391126869682353, + "learning_rate": 7.4008150223093974e-06, + "loss": 0.5082, + "step": 2541 + }, + { + "epoch": 1.8465030758404648, + "grad_norm": 0.3950671051221416, + "learning_rate": 7.398720721409065e-06, + "loss": 0.5038, + "step": 2542 + }, + { + "epoch": 1.8472294735886319, + "grad_norm": 0.7518357674558912, + "learning_rate": 7.396625873709826e-06, + "loss": 0.524, + "step": 2543 + }, + { + "epoch": 1.847955871336799, + "grad_norm": 0.440563224216234, + 
"learning_rate": 7.394530479689211e-06, + "loss": 0.5075, + "step": 2544 + }, + { + "epoch": 1.8486822690849658, + "grad_norm": 0.37825420804775123, + "learning_rate": 7.392434539824874e-06, + "loss": 0.5067, + "step": 2545 + }, + { + "epoch": 1.8494086668331327, + "grad_norm": 0.4329732625380953, + "learning_rate": 7.390338054594595e-06, + "loss": 0.5144, + "step": 2546 + }, + { + "epoch": 1.8501350645812997, + "grad_norm": 0.37240168722065525, + "learning_rate": 7.388241024476276e-06, + "loss": 0.5066, + "step": 2547 + }, + { + "epoch": 1.8508614623294668, + "grad_norm": 0.39353747779853976, + "learning_rate": 7.386143449947945e-06, + "loss": 0.4925, + "step": 2548 + }, + { + "epoch": 1.8515878600776339, + "grad_norm": 0.3359778131925848, + "learning_rate": 7.3840453314877505e-06, + "loss": 0.505, + "step": 2549 + }, + { + "epoch": 1.8523142578258007, + "grad_norm": 0.4354222874826661, + "learning_rate": 7.381946669573971e-06, + "loss": 0.5012, + "step": 2550 + }, + { + "epoch": 1.8530406555739676, + "grad_norm": 0.36701161656457704, + "learning_rate": 7.3798474646850034e-06, + "loss": 0.5013, + "step": 2551 + }, + { + "epoch": 1.8537670533221347, + "grad_norm": 0.33308540835740597, + "learning_rate": 7.377747717299373e-06, + "loss": 0.5023, + "step": 2552 + }, + { + "epoch": 1.8544934510703017, + "grad_norm": 0.3908552653407151, + "learning_rate": 7.375647427895725e-06, + "loss": 0.5154, + "step": 2553 + }, + { + "epoch": 1.8552198488184688, + "grad_norm": 0.3887867886469977, + "learning_rate": 7.373546596952829e-06, + "loss": 0.5071, + "step": 2554 + }, + { + "epoch": 1.8559462465666356, + "grad_norm": 0.4647679307155514, + "learning_rate": 7.371445224949581e-06, + "loss": 0.5118, + "step": 2555 + }, + { + "epoch": 1.8566726443148025, + "grad_norm": 0.44954855385912307, + "learning_rate": 7.369343312364994e-06, + "loss": 0.5154, + "step": 2556 + }, + { + "epoch": 1.8573990420629696, + "grad_norm": 0.38040083522847523, + "learning_rate": 7.367240859678212e-06, + "loss": 0.5199, + "step": 2557 + }, + { + "epoch": 1.8581254398111366, + "grad_norm": 0.4046416379097866, + "learning_rate": 7.365137867368497e-06, + "loss": 0.5123, + "step": 2558 + }, + { + "epoch": 1.8588518375593037, + "grad_norm": 0.3872560720187017, + "learning_rate": 7.363034335915234e-06, + "loss": 0.5152, + "step": 2559 + }, + { + "epoch": 1.8595782353074706, + "grad_norm": 0.36183881336332757, + "learning_rate": 7.360930265797934e-06, + "loss": 0.505, + "step": 2560 + }, + { + "epoch": 1.8603046330556374, + "grad_norm": 0.4649911594672085, + "learning_rate": 7.358825657496228e-06, + "loss": 0.5054, + "step": 2561 + }, + { + "epoch": 1.8610310308038045, + "grad_norm": 0.3458025268789646, + "learning_rate": 7.356720511489873e-06, + "loss": 0.4952, + "step": 2562 + }, + { + "epoch": 1.8617574285519716, + "grad_norm": 0.45459208605712137, + "learning_rate": 7.354614828258741e-06, + "loss": 0.529, + "step": 2563 + }, + { + "epoch": 1.8624838263001384, + "grad_norm": 0.33773416586056954, + "learning_rate": 7.352508608282838e-06, + "loss": 0.5128, + "step": 2564 + }, + { + "epoch": 1.8632102240483055, + "grad_norm": 0.3808978498416382, + "learning_rate": 7.3504018520422825e-06, + "loss": 0.4986, + "step": 2565 + }, + { + "epoch": 1.8639366217964723, + "grad_norm": 0.37701174485289957, + "learning_rate": 7.3482945600173195e-06, + "loss": 0.512, + "step": 2566 + }, + { + "epoch": 1.8646630195446394, + "grad_norm": 0.38208771506041556, + "learning_rate": 7.346186732688314e-06, + "loss": 0.5125, + "step": 2567 + }, + { + "epoch": 
1.8653894172928065, + "grad_norm": 0.3494274116064339, + "learning_rate": 7.344078370535757e-06, + "loss": 0.517, + "step": 2568 + }, + { + "epoch": 1.8661158150409733, + "grad_norm": 0.3780497325193841, + "learning_rate": 7.341969474040257e-06, + "loss": 0.4967, + "step": 2569 + }, + { + "epoch": 1.8668422127891402, + "grad_norm": 0.4486675122297359, + "learning_rate": 7.339860043682548e-06, + "loss": 0.5147, + "step": 2570 + }, + { + "epoch": 1.8675686105373073, + "grad_norm": 0.4138925018957802, + "learning_rate": 7.337750079943483e-06, + "loss": 0.4959, + "step": 2571 + }, + { + "epoch": 1.8682950082854743, + "grad_norm": 0.44735718137758135, + "learning_rate": 7.335639583304037e-06, + "loss": 0.5104, + "step": 2572 + }, + { + "epoch": 1.8690214060336414, + "grad_norm": 0.3372381434600118, + "learning_rate": 7.333528554245307e-06, + "loss": 0.5245, + "step": 2573 + }, + { + "epoch": 1.8697478037818083, + "grad_norm": 0.4449369599629364, + "learning_rate": 7.331416993248511e-06, + "loss": 0.5205, + "step": 2574 + }, + { + "epoch": 1.8704742015299751, + "grad_norm": 0.36217733835115423, + "learning_rate": 7.329304900794991e-06, + "loss": 0.499, + "step": 2575 + }, + { + "epoch": 1.8712005992781422, + "grad_norm": 0.3573225113196394, + "learning_rate": 7.327192277366206e-06, + "loss": 0.5075, + "step": 2576 + }, + { + "epoch": 1.8719269970263093, + "grad_norm": 0.3530999968916237, + "learning_rate": 7.325079123443737e-06, + "loss": 0.5111, + "step": 2577 + }, + { + "epoch": 1.8726533947744763, + "grad_norm": 0.381411425903793, + "learning_rate": 7.322965439509287e-06, + "loss": 0.5145, + "step": 2578 + }, + { + "epoch": 1.8733797925226432, + "grad_norm": 0.43926138695135614, + "learning_rate": 7.3208512260446805e-06, + "loss": 0.5094, + "step": 2579 + }, + { + "epoch": 1.87410619027081, + "grad_norm": 0.48230232367488945, + "learning_rate": 7.318736483531861e-06, + "loss": 0.4854, + "step": 2580 + }, + { + "epoch": 1.8748325880189771, + "grad_norm": 0.41277915585386155, + "learning_rate": 7.316621212452895e-06, + "loss": 0.4943, + "step": 2581 + }, + { + "epoch": 1.8755589857671442, + "grad_norm": 0.3723827383864031, + "learning_rate": 7.314505413289964e-06, + "loss": 0.5323, + "step": 2582 + }, + { + "epoch": 1.8762853835153113, + "grad_norm": 0.3684546635367408, + "learning_rate": 7.3123890865253765e-06, + "loss": 0.5198, + "step": 2583 + }, + { + "epoch": 1.8770117812634781, + "grad_norm": 0.4422550701185108, + "learning_rate": 7.310272232641559e-06, + "loss": 0.4991, + "step": 2584 + }, + { + "epoch": 1.877738179011645, + "grad_norm": 0.35634373073083137, + "learning_rate": 7.308154852121057e-06, + "loss": 0.5054, + "step": 2585 + }, + { + "epoch": 1.878464576759812, + "grad_norm": 0.3807604321924993, + "learning_rate": 7.306036945446535e-06, + "loss": 0.5041, + "step": 2586 + }, + { + "epoch": 1.8791909745079791, + "grad_norm": 0.43867415636051965, + "learning_rate": 7.30391851310078e-06, + "loss": 0.4928, + "step": 2587 + }, + { + "epoch": 1.879917372256146, + "grad_norm": 0.563585109354283, + "learning_rate": 7.301799555566701e-06, + "loss": 0.5268, + "step": 2588 + }, + { + "epoch": 1.8806437700043128, + "grad_norm": 0.45682805335053467, + "learning_rate": 7.2996800733273196e-06, + "loss": 0.5193, + "step": 2589 + }, + { + "epoch": 1.88137016775248, + "grad_norm": 0.4653425530708123, + "learning_rate": 7.297560066865782e-06, + "loss": 0.5199, + "step": 2590 + }, + { + "epoch": 1.882096565500647, + "grad_norm": 0.46759940311509823, + "learning_rate": 7.2954395366653545e-06, + 
"loss": 0.4955, + "step": 2591 + }, + { + "epoch": 1.882822963248814, + "grad_norm": 0.4788522864372003, + "learning_rate": 7.293318483209419e-06, + "loss": 0.52, + "step": 2592 + }, + { + "epoch": 1.883549360996981, + "grad_norm": 0.3746450129791861, + "learning_rate": 7.29119690698148e-06, + "loss": 0.5057, + "step": 2593 + }, + { + "epoch": 1.8842757587451477, + "grad_norm": 0.4550524021827555, + "learning_rate": 7.289074808465162e-06, + "loss": 0.507, + "step": 2594 + }, + { + "epoch": 1.8850021564933148, + "grad_norm": 0.36358004633730967, + "learning_rate": 7.286952188144204e-06, + "loss": 0.509, + "step": 2595 + }, + { + "epoch": 1.885728554241482, + "grad_norm": 0.3871569601109488, + "learning_rate": 7.284829046502467e-06, + "loss": 0.5017, + "step": 2596 + }, + { + "epoch": 1.886454951989649, + "grad_norm": 0.3555485582102289, + "learning_rate": 7.282705384023933e-06, + "loss": 0.5198, + "step": 2597 + }, + { + "epoch": 1.8871813497378158, + "grad_norm": 0.3741738055108657, + "learning_rate": 7.280581201192696e-06, + "loss": 0.501, + "step": 2598 + }, + { + "epoch": 1.8879077474859827, + "grad_norm": 0.3219865204438388, + "learning_rate": 7.278456498492975e-06, + "loss": 0.5235, + "step": 2599 + }, + { + "epoch": 1.8886341452341497, + "grad_norm": 0.40093648121874215, + "learning_rate": 7.2763312764091055e-06, + "loss": 0.4906, + "step": 2600 + }, + { + "epoch": 1.8893605429823168, + "grad_norm": 0.3942335836495865, + "learning_rate": 7.274205535425543e-06, + "loss": 0.5226, + "step": 2601 + }, + { + "epoch": 1.8900869407304839, + "grad_norm": 0.3743931565246306, + "learning_rate": 7.272079276026855e-06, + "loss": 0.5086, + "step": 2602 + }, + { + "epoch": 1.8908133384786507, + "grad_norm": 0.3921609659047992, + "learning_rate": 7.269952498697734e-06, + "loss": 0.4963, + "step": 2603 + }, + { + "epoch": 1.8915397362268176, + "grad_norm": 0.4079084279428097, + "learning_rate": 7.26782520392299e-06, + "loss": 0.4983, + "step": 2604 + }, + { + "epoch": 1.8922661339749847, + "grad_norm": 0.4573292211751414, + "learning_rate": 7.265697392187547e-06, + "loss": 0.5138, + "step": 2605 + }, + { + "epoch": 1.8929925317231517, + "grad_norm": 0.39025123088916785, + "learning_rate": 7.263569063976449e-06, + "loss": 0.5126, + "step": 2606 + }, + { + "epoch": 1.8937189294713186, + "grad_norm": 0.527907757792281, + "learning_rate": 7.261440219774858e-06, + "loss": 0.5029, + "step": 2607 + }, + { + "epoch": 1.8944453272194857, + "grad_norm": 0.3581702843772708, + "learning_rate": 7.2593108600680524e-06, + "loss": 0.5176, + "step": 2608 + }, + { + "epoch": 1.8951717249676525, + "grad_norm": 0.39048666471502463, + "learning_rate": 7.257180985341432e-06, + "loss": 0.5152, + "step": 2609 + }, + { + "epoch": 1.8958981227158196, + "grad_norm": 0.42110401723381574, + "learning_rate": 7.25505059608051e-06, + "loss": 0.5083, + "step": 2610 + }, + { + "epoch": 1.8966245204639867, + "grad_norm": 0.4571340129019425, + "learning_rate": 7.252919692770916e-06, + "loss": 0.5125, + "step": 2611 + }, + { + "epoch": 1.8973509182121535, + "grad_norm": 0.36420915076613447, + "learning_rate": 7.250788275898401e-06, + "loss": 0.5118, + "step": 2612 + }, + { + "epoch": 1.8980773159603204, + "grad_norm": 0.37701735727524127, + "learning_rate": 7.248656345948828e-06, + "loss": 0.5109, + "step": 2613 + }, + { + "epoch": 1.8988037137084874, + "grad_norm": 0.4371017222186262, + "learning_rate": 7.2465239034081835e-06, + "loss": 0.4916, + "step": 2614 + }, + { + "epoch": 1.8995301114566545, + "grad_norm": 0.3654714865890906, + 
"learning_rate": 7.244390948762566e-06, + "loss": 0.5172, + "step": 2615 + }, + { + "epoch": 1.9002565092048216, + "grad_norm": 0.47754877856771066, + "learning_rate": 7.242257482498191e-06, + "loss": 0.5287, + "step": 2616 + }, + { + "epoch": 1.9009829069529884, + "grad_norm": 0.36891648606279626, + "learning_rate": 7.2401235051013885e-06, + "loss": 0.5142, + "step": 2617 + }, + { + "epoch": 1.9017093047011553, + "grad_norm": 0.4223943980005817, + "learning_rate": 7.237989017058614e-06, + "loss": 0.4889, + "step": 2618 + }, + { + "epoch": 1.9024357024493224, + "grad_norm": 0.35807149966277446, + "learning_rate": 7.235854018856429e-06, + "loss": 0.509, + "step": 2619 + }, + { + "epoch": 1.9031621001974894, + "grad_norm": 0.4103889588551492, + "learning_rate": 7.23371851098152e-06, + "loss": 0.5074, + "step": 2620 + }, + { + "epoch": 1.9038884979456565, + "grad_norm": 0.47569057216314453, + "learning_rate": 7.23158249392068e-06, + "loss": 0.5167, + "step": 2621 + }, + { + "epoch": 1.9046148956938234, + "grad_norm": 0.7088526151457004, + "learning_rate": 7.2294459681608275e-06, + "loss": 0.5204, + "step": 2622 + }, + { + "epoch": 1.9053412934419902, + "grad_norm": 0.4343984003997984, + "learning_rate": 7.22730893418899e-06, + "loss": 0.5124, + "step": 2623 + }, + { + "epoch": 1.9060676911901573, + "grad_norm": 0.3905184752532141, + "learning_rate": 7.225171392492316e-06, + "loss": 0.5247, + "step": 2624 + }, + { + "epoch": 1.9067940889383244, + "grad_norm": 0.35839885716842484, + "learning_rate": 7.223033343558068e-06, + "loss": 0.52, + "step": 2625 + }, + { + "epoch": 1.9075204866864914, + "grad_norm": 0.40567822205512094, + "learning_rate": 7.220894787873621e-06, + "loss": 0.5013, + "step": 2626 + }, + { + "epoch": 1.9082468844346583, + "grad_norm": 0.49459869136202017, + "learning_rate": 7.218755725926471e-06, + "loss": 0.5093, + "step": 2627 + }, + { + "epoch": 1.9089732821828251, + "grad_norm": 0.3794644738069637, + "learning_rate": 7.216616158204223e-06, + "loss": 0.4999, + "step": 2628 + }, + { + "epoch": 1.9096996799309922, + "grad_norm": 0.4855621032827998, + "learning_rate": 7.214476085194605e-06, + "loss": 0.5087, + "step": 2629 + }, + { + "epoch": 1.9104260776791593, + "grad_norm": 0.36297925771622874, + "learning_rate": 7.212335507385453e-06, + "loss": 0.5063, + "step": 2630 + }, + { + "epoch": 1.9111524754273261, + "grad_norm": 0.3944990999511598, + "learning_rate": 7.210194425264723e-06, + "loss": 0.5068, + "step": 2631 + }, + { + "epoch": 1.9118788731754932, + "grad_norm": 0.4122143059291724, + "learning_rate": 7.208052839320481e-06, + "loss": 0.5162, + "step": 2632 + }, + { + "epoch": 1.91260527092366, + "grad_norm": 0.41692514963900895, + "learning_rate": 7.205910750040914e-06, + "loss": 0.4828, + "step": 2633 + }, + { + "epoch": 1.9133316686718271, + "grad_norm": 0.3663441755784621, + "learning_rate": 7.203768157914321e-06, + "loss": 0.494, + "step": 2634 + }, + { + "epoch": 1.9140580664199942, + "grad_norm": 0.4070037156583176, + "learning_rate": 7.201625063429113e-06, + "loss": 0.5121, + "step": 2635 + }, + { + "epoch": 1.914784464168161, + "grad_norm": 0.4380005820938112, + "learning_rate": 7.199481467073819e-06, + "loss": 0.5157, + "step": 2636 + }, + { + "epoch": 1.915510861916328, + "grad_norm": 0.39134139767667614, + "learning_rate": 7.197337369337081e-06, + "loss": 0.5115, + "step": 2637 + }, + { + "epoch": 1.916237259664495, + "grad_norm": 0.3780955623983413, + "learning_rate": 7.1951927707076545e-06, + "loss": 0.5138, + "step": 2638 + }, + { + "epoch": 
1.916963657412662, + "grad_norm": 0.37420155415019496, + "learning_rate": 7.193047671674411e-06, + "loss": 0.5109, + "step": 2639 + }, + { + "epoch": 1.9176900551608291, + "grad_norm": 0.3975633927338851, + "learning_rate": 7.190902072726336e-06, + "loss": 0.5175, + "step": 2640 + }, + { + "epoch": 1.918416452908996, + "grad_norm": 0.41788467337021007, + "learning_rate": 7.188755974352528e-06, + "loss": 0.5034, + "step": 2641 + }, + { + "epoch": 1.9191428506571628, + "grad_norm": 0.36881038549681205, + "learning_rate": 7.186609377042199e-06, + "loss": 0.4932, + "step": 2642 + }, + { + "epoch": 1.91986924840533, + "grad_norm": 0.4923105516525616, + "learning_rate": 7.1844622812846745e-06, + "loss": 0.5074, + "step": 2643 + }, + { + "epoch": 1.920595646153497, + "grad_norm": 0.4675468904655994, + "learning_rate": 7.182314687569395e-06, + "loss": 0.4971, + "step": 2644 + }, + { + "epoch": 1.921322043901664, + "grad_norm": 0.33391764389643824, + "learning_rate": 7.180166596385915e-06, + "loss": 0.5141, + "step": 2645 + }, + { + "epoch": 1.922048441649831, + "grad_norm": 0.37264094779898216, + "learning_rate": 7.1780180082238994e-06, + "loss": 0.5082, + "step": 2646 + }, + { + "epoch": 1.9227748393979978, + "grad_norm": 0.33950669946292117, + "learning_rate": 7.175868923573129e-06, + "loss": 0.519, + "step": 2647 + }, + { + "epoch": 1.9235012371461648, + "grad_norm": 0.3907280044051392, + "learning_rate": 7.1737193429234985e-06, + "loss": 0.5033, + "step": 2648 + }, + { + "epoch": 1.924227634894332, + "grad_norm": 0.3981209785053024, + "learning_rate": 7.171569266765012e-06, + "loss": 0.5113, + "step": 2649 + }, + { + "epoch": 1.9249540326424988, + "grad_norm": 0.3552940618648031, + "learning_rate": 7.169418695587791e-06, + "loss": 0.5262, + "step": 2650 + }, + { + "epoch": 1.9256804303906658, + "grad_norm": 0.45671174402732945, + "learning_rate": 7.167267629882066e-06, + "loss": 0.5087, + "step": 2651 + }, + { + "epoch": 1.9264068281388327, + "grad_norm": 0.36036782434775116, + "learning_rate": 7.165116070138183e-06, + "loss": 0.5093, + "step": 2652 + }, + { + "epoch": 1.9271332258869998, + "grad_norm": 0.36770341112039956, + "learning_rate": 7.162964016846597e-06, + "loss": 0.5015, + "step": 2653 + }, + { + "epoch": 1.9278596236351668, + "grad_norm": 0.4171887047487715, + "learning_rate": 7.160811470497881e-06, + "loss": 0.4875, + "step": 2654 + }, + { + "epoch": 1.9285860213833337, + "grad_norm": 0.7588013368151402, + "learning_rate": 7.158658431582718e-06, + "loss": 0.4812, + "step": 2655 + }, + { + "epoch": 1.9293124191315005, + "grad_norm": 0.4838546627793747, + "learning_rate": 7.156504900591899e-06, + "loss": 0.5061, + "step": 2656 + }, + { + "epoch": 1.9300388168796676, + "grad_norm": 0.3893073278129443, + "learning_rate": 7.154350878016332e-06, + "loss": 0.4997, + "step": 2657 + }, + { + "epoch": 1.9307652146278347, + "grad_norm": 0.35598578094946026, + "learning_rate": 7.152196364347037e-06, + "loss": 0.4818, + "step": 2658 + }, + { + "epoch": 1.9314916123760018, + "grad_norm": 0.7028633945623345, + "learning_rate": 7.1500413600751465e-06, + "loss": 0.5064, + "step": 2659 + }, + { + "epoch": 1.9322180101241686, + "grad_norm": 0.5021000366038473, + "learning_rate": 7.147885865691899e-06, + "loss": 0.4953, + "step": 2660 + }, + { + "epoch": 1.9329444078723355, + "grad_norm": 0.4521851195168597, + "learning_rate": 7.145729881688651e-06, + "loss": 0.508, + "step": 2661 + }, + { + "epoch": 1.9336708056205025, + "grad_norm": 0.32339465479764373, + "learning_rate": 7.143573408556867e-06, + 
"loss": 0.4954, + "step": 2662 + }, + { + "epoch": 1.9343972033686696, + "grad_norm": 0.401677364787166, + "learning_rate": 7.141416446788128e-06, + "loss": 0.4913, + "step": 2663 + }, + { + "epoch": 1.9351236011168367, + "grad_norm": 0.41714933742992666, + "learning_rate": 7.139258996874122e-06, + "loss": 0.5232, + "step": 2664 + }, + { + "epoch": 1.9358499988650035, + "grad_norm": 0.4588630688440973, + "learning_rate": 7.137101059306647e-06, + "loss": 0.5167, + "step": 2665 + }, + { + "epoch": 1.9365763966131704, + "grad_norm": 0.5128126005709326, + "learning_rate": 7.134942634577615e-06, + "loss": 0.5073, + "step": 2666 + }, + { + "epoch": 1.9373027943613375, + "grad_norm": 0.3220816956148639, + "learning_rate": 7.1327837231790484e-06, + "loss": 0.4918, + "step": 2667 + }, + { + "epoch": 1.9380291921095045, + "grad_norm": 0.3864453720966872, + "learning_rate": 7.1306243256030815e-06, + "loss": 0.5051, + "step": 2668 + }, + { + "epoch": 1.9387555898576716, + "grad_norm": 0.3417311969733972, + "learning_rate": 7.128464442341958e-06, + "loss": 0.4992, + "step": 2669 + }, + { + "epoch": 1.9394819876058385, + "grad_norm": 0.3666997641598726, + "learning_rate": 7.126304073888033e-06, + "loss": 0.4915, + "step": 2670 + }, + { + "epoch": 1.9402083853540053, + "grad_norm": 0.32912302124120496, + "learning_rate": 7.124143220733772e-06, + "loss": 0.5132, + "step": 2671 + }, + { + "epoch": 1.9409347831021724, + "grad_norm": 0.5894784767394627, + "learning_rate": 7.121981883371748e-06, + "loss": 0.5106, + "step": 2672 + }, + { + "epoch": 1.9416611808503395, + "grad_norm": 0.36134194753601506, + "learning_rate": 7.1198200622946516e-06, + "loss": 0.5322, + "step": 2673 + }, + { + "epoch": 1.9423875785985063, + "grad_norm": 0.5233500856029822, + "learning_rate": 7.117657757995277e-06, + "loss": 0.5006, + "step": 2674 + }, + { + "epoch": 1.9431139763466734, + "grad_norm": 0.45441916828563017, + "learning_rate": 7.115494970966532e-06, + "loss": 0.5015, + "step": 2675 + }, + { + "epoch": 1.9438403740948402, + "grad_norm": 0.42922725575066273, + "learning_rate": 7.113331701701433e-06, + "loss": 0.5118, + "step": 2676 + }, + { + "epoch": 1.9445667718430073, + "grad_norm": 0.37288209324065047, + "learning_rate": 7.111167950693106e-06, + "loss": 0.5133, + "step": 2677 + }, + { + "epoch": 1.9452931695911744, + "grad_norm": 0.5428767194900757, + "learning_rate": 7.109003718434788e-06, + "loss": 0.5109, + "step": 2678 + }, + { + "epoch": 1.9460195673393412, + "grad_norm": 0.48476276527501894, + "learning_rate": 7.106839005419825e-06, + "loss": 0.5195, + "step": 2679 + }, + { + "epoch": 1.946745965087508, + "grad_norm": 0.42779549179479115, + "learning_rate": 7.104673812141676e-06, + "loss": 0.5278, + "step": 2680 + }, + { + "epoch": 1.9474723628356752, + "grad_norm": 0.4172577512072656, + "learning_rate": 7.102508139093902e-06, + "loss": 0.5225, + "step": 2681 + }, + { + "epoch": 1.9481987605838422, + "grad_norm": 0.6445236653546356, + "learning_rate": 7.100341986770178e-06, + "loss": 0.4956, + "step": 2682 + }, + { + "epoch": 1.9489251583320093, + "grad_norm": 0.36704041066233434, + "learning_rate": 7.09817535566429e-06, + "loss": 0.5056, + "step": 2683 + }, + { + "epoch": 1.9496515560801762, + "grad_norm": 0.42608468891834045, + "learning_rate": 7.0960082462701315e-06, + "loss": 0.4976, + "step": 2684 + }, + { + "epoch": 1.950377953828343, + "grad_norm": 0.3725547949696026, + "learning_rate": 7.093840659081702e-06, + "loss": 0.5209, + "step": 2685 + }, + { + "epoch": 1.95110435157651, + "grad_norm": 
0.347960151425129, + "learning_rate": 7.091672594593114e-06, + "loss": 0.4956, + "step": 2686 + }, + { + "epoch": 1.9518307493246772, + "grad_norm": 0.3919601507999021, + "learning_rate": 7.089504053298587e-06, + "loss": 0.499, + "step": 2687 + }, + { + "epoch": 1.9525571470728442, + "grad_norm": 0.43156269797326463, + "learning_rate": 7.0873350356924495e-06, + "loss": 0.5023, + "step": 2688 + }, + { + "epoch": 1.953283544821011, + "grad_norm": 0.34776606928147463, + "learning_rate": 7.085165542269139e-06, + "loss": 0.4955, + "step": 2689 + }, + { + "epoch": 1.954009942569178, + "grad_norm": 0.7017977923984215, + "learning_rate": 7.082995573523202e-06, + "loss": 0.4999, + "step": 2690 + }, + { + "epoch": 1.954736340317345, + "grad_norm": 0.4815505166236814, + "learning_rate": 7.080825129949289e-06, + "loss": 0.5146, + "step": 2691 + }, + { + "epoch": 1.955462738065512, + "grad_norm": 0.43130480748007916, + "learning_rate": 7.0786542120421645e-06, + "loss": 0.5215, + "step": 2692 + }, + { + "epoch": 1.956189135813679, + "grad_norm": 0.38558580874476744, + "learning_rate": 7.0764828202966986e-06, + "loss": 0.5011, + "step": 2693 + }, + { + "epoch": 1.956915533561846, + "grad_norm": 0.4036358786260398, + "learning_rate": 7.074310955207869e-06, + "loss": 0.5061, + "step": 2694 + }, + { + "epoch": 1.9576419313100129, + "grad_norm": 0.33067727483602, + "learning_rate": 7.072138617270765e-06, + "loss": 0.4895, + "step": 2695 + }, + { + "epoch": 1.95836832905818, + "grad_norm": 0.5605426386928977, + "learning_rate": 7.069965806980574e-06, + "loss": 0.5062, + "step": 2696 + }, + { + "epoch": 1.959094726806347, + "grad_norm": 0.35218801878956085, + "learning_rate": 7.067792524832604e-06, + "loss": 0.5043, + "step": 2697 + }, + { + "epoch": 1.9598211245545138, + "grad_norm": 0.3887738969745063, + "learning_rate": 7.0656187713222615e-06, + "loss": 0.5165, + "step": 2698 + }, + { + "epoch": 1.9605475223026807, + "grad_norm": 0.3868917511130432, + "learning_rate": 7.063444546945063e-06, + "loss": 0.5117, + "step": 2699 + }, + { + "epoch": 1.9612739200508478, + "grad_norm": 0.3402687842463978, + "learning_rate": 7.061269852196633e-06, + "loss": 0.4885, + "step": 2700 + }, + { + "epoch": 1.9620003177990148, + "grad_norm": 0.36804469030096626, + "learning_rate": 7.059094687572701e-06, + "loss": 0.509, + "step": 2701 + }, + { + "epoch": 1.962726715547182, + "grad_norm": 0.39457259575266557, + "learning_rate": 7.056919053569109e-06, + "loss": 0.4954, + "step": 2702 + }, + { + "epoch": 1.9634531132953488, + "grad_norm": 0.3204056385239754, + "learning_rate": 7.0547429506817986e-06, + "loss": 0.512, + "step": 2703 + }, + { + "epoch": 1.9641795110435156, + "grad_norm": 0.3629638300335219, + "learning_rate": 7.052566379406824e-06, + "loss": 0.4915, + "step": 2704 + }, + { + "epoch": 1.9649059087916827, + "grad_norm": 0.4589761089027009, + "learning_rate": 7.050389340240345e-06, + "loss": 0.4938, + "step": 2705 + }, + { + "epoch": 1.9656323065398498, + "grad_norm": 0.5304412242646968, + "learning_rate": 7.048211833678624e-06, + "loss": 0.5124, + "step": 2706 + }, + { + "epoch": 1.9663587042880168, + "grad_norm": 0.36321087701659993, + "learning_rate": 7.046033860218036e-06, + "loss": 0.4953, + "step": 2707 + }, + { + "epoch": 1.9670851020361837, + "grad_norm": 0.4130645277458678, + "learning_rate": 7.04385542035506e-06, + "loss": 0.4892, + "step": 2708 + }, + { + "epoch": 1.9678114997843505, + "grad_norm": 0.35499258033837994, + "learning_rate": 7.041676514586279e-06, + "loss": 0.4795, + "step": 2709 + }, + { + 
"epoch": 1.9685378975325176, + "grad_norm": 0.3760642593459764, + "learning_rate": 7.039497143408384e-06, + "loss": 0.5112, + "step": 2710 + }, + { + "epoch": 1.9692642952806847, + "grad_norm": 0.39847541288606975, + "learning_rate": 7.037317307318172e-06, + "loss": 0.5131, + "step": 2711 + }, + { + "epoch": 1.9699906930288518, + "grad_norm": 0.3325102212417593, + "learning_rate": 7.035137006812548e-06, + "loss": 0.5095, + "step": 2712 + }, + { + "epoch": 1.9707170907770186, + "grad_norm": 0.41951767852434335, + "learning_rate": 7.03295624238852e-06, + "loss": 0.5029, + "step": 2713 + }, + { + "epoch": 1.9714434885251855, + "grad_norm": 0.3882568800021323, + "learning_rate": 7.030775014543204e-06, + "loss": 0.5161, + "step": 2714 + }, + { + "epoch": 1.9721698862733525, + "grad_norm": 0.3246175477769834, + "learning_rate": 7.028593323773819e-06, + "loss": 0.5173, + "step": 2715 + }, + { + "epoch": 1.9728962840215196, + "grad_norm": 0.4320032858507335, + "learning_rate": 7.026411170577691e-06, + "loss": 0.4991, + "step": 2716 + }, + { + "epoch": 1.9736226817696865, + "grad_norm": 0.3582516058922341, + "learning_rate": 7.024228555452253e-06, + "loss": 0.4979, + "step": 2717 + }, + { + "epoch": 1.9743490795178535, + "grad_norm": 0.3865649105180352, + "learning_rate": 7.022045478895038e-06, + "loss": 0.5052, + "step": 2718 + }, + { + "epoch": 1.9750754772660204, + "grad_norm": 0.3865623049303795, + "learning_rate": 7.019861941403693e-06, + "loss": 0.5102, + "step": 2719 + }, + { + "epoch": 1.9758018750141875, + "grad_norm": 0.3744366218022015, + "learning_rate": 7.017677943475962e-06, + "loss": 0.5037, + "step": 2720 + }, + { + "epoch": 1.9765282727623545, + "grad_norm": 0.4049135491697213, + "learning_rate": 7.015493485609697e-06, + "loss": 0.5103, + "step": 2721 + }, + { + "epoch": 1.9772546705105214, + "grad_norm": 0.41669353889570854, + "learning_rate": 7.013308568302855e-06, + "loss": 0.496, + "step": 2722 + }, + { + "epoch": 1.9779810682586882, + "grad_norm": 1.55599304714908, + "learning_rate": 7.011123192053497e-06, + "loss": 0.4885, + "step": 2723 + }, + { + "epoch": 1.9787074660068553, + "grad_norm": 0.3766075391392106, + "learning_rate": 7.00893735735979e-06, + "loss": 0.5091, + "step": 2724 + }, + { + "epoch": 1.9794338637550224, + "grad_norm": 0.39432289032043133, + "learning_rate": 7.006751064720003e-06, + "loss": 0.5043, + "step": 2725 + }, + { + "epoch": 1.9801602615031895, + "grad_norm": 0.42252648320613373, + "learning_rate": 7.004564314632514e-06, + "loss": 0.4971, + "step": 2726 + }, + { + "epoch": 1.9808866592513563, + "grad_norm": 0.5220318083615987, + "learning_rate": 7.002377107595799e-06, + "loss": 0.5131, + "step": 2727 + }, + { + "epoch": 1.9816130569995232, + "grad_norm": 0.439124427096527, + "learning_rate": 7.000189444108443e-06, + "loss": 0.4987, + "step": 2728 + }, + { + "epoch": 1.9823394547476902, + "grad_norm": 0.6087596878574801, + "learning_rate": 6.998001324669135e-06, + "loss": 0.5007, + "step": 2729 + }, + { + "epoch": 1.9830658524958573, + "grad_norm": 0.4689995890970593, + "learning_rate": 6.995812749776663e-06, + "loss": 0.4928, + "step": 2730 + }, + { + "epoch": 1.9837922502440244, + "grad_norm": 0.4303674417050366, + "learning_rate": 6.993623719929924e-06, + "loss": 0.4972, + "step": 2731 + }, + { + "epoch": 1.9845186479921912, + "grad_norm": 1.7806687410524744, + "learning_rate": 6.991434235627918e-06, + "loss": 0.5007, + "step": 2732 + }, + { + "epoch": 1.985245045740358, + "grad_norm": 0.45617373447370785, + "learning_rate": 6.989244297369746e-06, 
+ "loss": 0.5038, + "step": 2733 + }, + { + "epoch": 1.9859714434885252, + "grad_norm": 0.39328968595405445, + "learning_rate": 6.9870539056546145e-06, + "loss": 0.5095, + "step": 2734 + }, + { + "epoch": 1.9866978412366922, + "grad_norm": 0.43161678194415737, + "learning_rate": 6.984863060981835e-06, + "loss": 0.5066, + "step": 2735 + }, + { + "epoch": 1.987424238984859, + "grad_norm": 0.3628627971741911, + "learning_rate": 6.982671763850814e-06, + "loss": 0.4919, + "step": 2736 + }, + { + "epoch": 1.9881506367330262, + "grad_norm": 0.5086084610925337, + "learning_rate": 6.980480014761074e-06, + "loss": 0.51, + "step": 2737 + }, + { + "epoch": 1.988877034481193, + "grad_norm": 0.3982110835234876, + "learning_rate": 6.9782878142122315e-06, + "loss": 0.5163, + "step": 2738 + }, + { + "epoch": 1.98960343222936, + "grad_norm": 0.3966667256287206, + "learning_rate": 6.976095162704006e-06, + "loss": 0.4951, + "step": 2739 + }, + { + "epoch": 1.9903298299775272, + "grad_norm": 0.5304184543224457, + "learning_rate": 6.973902060736226e-06, + "loss": 0.5118, + "step": 2740 + }, + { + "epoch": 1.991056227725694, + "grad_norm": 0.3981730445998376, + "learning_rate": 6.971708508808815e-06, + "loss": 0.5081, + "step": 2741 + }, + { + "epoch": 1.9917826254738609, + "grad_norm": 0.5015706091927886, + "learning_rate": 6.969514507421805e-06, + "loss": 0.5093, + "step": 2742 + }, + { + "epoch": 1.992509023222028, + "grad_norm": 0.37103098758948716, + "learning_rate": 6.967320057075329e-06, + "loss": 0.4965, + "step": 2743 + }, + { + "epoch": 1.993235420970195, + "grad_norm": 0.5781372172121526, + "learning_rate": 6.965125158269619e-06, + "loss": 0.5011, + "step": 2744 + }, + { + "epoch": 1.993961818718362, + "grad_norm": 0.4157387952447532, + "learning_rate": 6.962929811505013e-06, + "loss": 0.5058, + "step": 2745 + }, + { + "epoch": 1.994688216466529, + "grad_norm": 0.4072625761207944, + "learning_rate": 6.9607340172819495e-06, + "loss": 0.5087, + "step": 2746 + }, + { + "epoch": 1.9954146142146958, + "grad_norm": 0.4575323537660217, + "learning_rate": 6.958537776100971e-06, + "loss": 0.4713, + "step": 2747 + }, + { + "epoch": 1.9961410119628629, + "grad_norm": 0.4236304013107574, + "learning_rate": 6.9563410884627195e-06, + "loss": 0.4951, + "step": 2748 + }, + { + "epoch": 1.99686740971103, + "grad_norm": 0.3395318180298695, + "learning_rate": 6.9541439548679394e-06, + "loss": 0.5169, + "step": 2749 + }, + { + "epoch": 1.997593807459197, + "grad_norm": 0.40012977930638305, + "learning_rate": 6.9519463758174745e-06, + "loss": 0.4898, + "step": 2750 + }, + { + "epoch": 1.9983202052073639, + "grad_norm": 0.6044468828057461, + "learning_rate": 6.949748351812277e-06, + "loss": 0.4968, + "step": 2751 + }, + { + "epoch": 1.9990466029555307, + "grad_norm": 0.3290317646035132, + "learning_rate": 6.947549883353393e-06, + "loss": 0.4896, + "step": 2752 + }, + { + "epoch": 1.9997730007036978, + "grad_norm": 0.4191772501374024, + "learning_rate": 6.9453509709419754e-06, + "loss": 0.5034, + "step": 2753 + }, + { + "epoch": 2.000499398451865, + "grad_norm": 0.4383592732805349, + "learning_rate": 6.943151615079273e-06, + "loss": 0.4997, + "step": 2754 + }, + { + "epoch": 2.001225796200032, + "grad_norm": 0.4310380742661681, + "learning_rate": 6.9409518162666416e-06, + "loss": 0.5013, + "step": 2755 + }, + { + "epoch": 2.0019521939481986, + "grad_norm": 0.6634042092614508, + "learning_rate": 6.938751575005531e-06, + "loss": 0.5085, + "step": 2756 + }, + { + "epoch": 2.0026785916963656, + "grad_norm": 0.43186399669748915, 
+ "learning_rate": 6.9365508917975e-06, + "loss": 0.5018, + "step": 2757 + }, + { + "epoch": 2.0034049894445327, + "grad_norm": 0.34231458896554495, + "learning_rate": 6.934349767144203e-06, + "loss": 0.4969, + "step": 2758 + }, + { + "epoch": 2.0041313871927, + "grad_norm": 0.3852057248556303, + "learning_rate": 6.932148201547395e-06, + "loss": 0.5099, + "step": 2759 + }, + { + "epoch": 2.004857784940867, + "grad_norm": 0.5008104603895283, + "learning_rate": 6.929946195508933e-06, + "loss": 0.5165, + "step": 2760 + }, + { + "epoch": 2.0055841826890335, + "grad_norm": 0.3317056532033946, + "learning_rate": 6.9277437495307745e-06, + "loss": 0.5002, + "step": 2761 + }, + { + "epoch": 2.0063105804372006, + "grad_norm": 0.5412828602249329, + "learning_rate": 6.925540864114975e-06, + "loss": 0.4962, + "step": 2762 + }, + { + "epoch": 2.0070369781853676, + "grad_norm": 0.3430388566201603, + "learning_rate": 6.923337539763693e-06, + "loss": 0.4937, + "step": 2763 + }, + { + "epoch": 2.0077633759335347, + "grad_norm": 0.3647393285999755, + "learning_rate": 6.921133776979186e-06, + "loss": 0.4975, + "step": 2764 + }, + { + "epoch": 2.008489773681702, + "grad_norm": 0.40829618718724847, + "learning_rate": 6.918929576263811e-06, + "loss": 0.4984, + "step": 2765 + }, + { + "epoch": 2.0092161714298684, + "grad_norm": 0.40555403400281487, + "learning_rate": 6.916724938120026e-06, + "loss": 0.5066, + "step": 2766 + }, + { + "epoch": 2.0099425691780355, + "grad_norm": 0.34064395872559766, + "learning_rate": 6.914519863050388e-06, + "loss": 0.5029, + "step": 2767 + }, + { + "epoch": 2.0106689669262026, + "grad_norm": 0.38967717514091643, + "learning_rate": 6.912314351557552e-06, + "loss": 0.5132, + "step": 2768 + }, + { + "epoch": 2.0113953646743696, + "grad_norm": 0.3360598328932662, + "learning_rate": 6.910108404144276e-06, + "loss": 0.4864, + "step": 2769 + }, + { + "epoch": 2.0121217624225367, + "grad_norm": 0.36912864206432744, + "learning_rate": 6.907902021313414e-06, + "loss": 0.4959, + "step": 2770 + }, + { + "epoch": 2.0128481601707033, + "grad_norm": 0.35828330181110757, + "learning_rate": 6.905695203567919e-06, + "loss": 0.4931, + "step": 2771 + }, + { + "epoch": 2.0135745579188704, + "grad_norm": 0.3555368715816114, + "learning_rate": 6.90348795141085e-06, + "loss": 0.5029, + "step": 2772 + }, + { + "epoch": 2.0143009556670375, + "grad_norm": 0.33659315686379043, + "learning_rate": 6.901280265345355e-06, + "loss": 0.4989, + "step": 2773 + }, + { + "epoch": 2.0150273534152046, + "grad_norm": 0.37550584227442796, + "learning_rate": 6.8990721458746875e-06, + "loss": 0.5201, + "step": 2774 + }, + { + "epoch": 2.015753751163371, + "grad_norm": 0.3783837186018702, + "learning_rate": 6.896863593502198e-06, + "loss": 0.5071, + "step": 2775 + }, + { + "epoch": 2.0164801489115383, + "grad_norm": 0.3258935438932034, + "learning_rate": 6.894654608731335e-06, + "loss": 0.5182, + "step": 2776 + }, + { + "epoch": 2.0172065466597053, + "grad_norm": 0.39315885025393604, + "learning_rate": 6.8924451920656475e-06, + "loss": 0.4923, + "step": 2777 + }, + { + "epoch": 2.0179329444078724, + "grad_norm": 0.3632809798562676, + "learning_rate": 6.890235344008781e-06, + "loss": 0.4815, + "step": 2778 + }, + { + "epoch": 2.0186593421560395, + "grad_norm": 0.36527444592888336, + "learning_rate": 6.88802506506448e-06, + "loss": 0.4956, + "step": 2779 + }, + { + "epoch": 2.019385739904206, + "grad_norm": 0.3295945817112341, + "learning_rate": 6.8858143557365865e-06, + "loss": 0.5091, + "step": 2780 + }, + { + "epoch": 
2.020112137652373, + "grad_norm": 0.38931789216868123, + "learning_rate": 6.883603216529043e-06, + "loss": 0.5059, + "step": 2781 + }, + { + "epoch": 2.0208385354005403, + "grad_norm": 0.3801269142307661, + "learning_rate": 6.881391647945887e-06, + "loss": 0.4952, + "step": 2782 + }, + { + "epoch": 2.0215649331487073, + "grad_norm": 0.3763092423880549, + "learning_rate": 6.879179650491257e-06, + "loss": 0.4924, + "step": 2783 + }, + { + "epoch": 2.0222913308968744, + "grad_norm": 0.36364396731621496, + "learning_rate": 6.8769672246693865e-06, + "loss": 0.487, + "step": 2784 + }, + { + "epoch": 2.023017728645041, + "grad_norm": 0.48214570723774847, + "learning_rate": 6.8747543709846064e-06, + "loss": 0.4799, + "step": 2785 + }, + { + "epoch": 2.023744126393208, + "grad_norm": 0.3373087430212078, + "learning_rate": 6.872541089941347e-06, + "loss": 0.5032, + "step": 2786 + }, + { + "epoch": 2.024470524141375, + "grad_norm": 0.4246028661887048, + "learning_rate": 6.870327382044138e-06, + "loss": 0.514, + "step": 2787 + }, + { + "epoch": 2.0251969218895423, + "grad_norm": 0.3637838866454337, + "learning_rate": 6.868113247797601e-06, + "loss": 0.4958, + "step": 2788 + }, + { + "epoch": 2.0259233196377093, + "grad_norm": 0.3624459628075546, + "learning_rate": 6.865898687706458e-06, + "loss": 0.5183, + "step": 2789 + }, + { + "epoch": 2.026649717385876, + "grad_norm": 0.3497999660081715, + "learning_rate": 6.8636837022755275e-06, + "loss": 0.4969, + "step": 2790 + }, + { + "epoch": 2.027376115134043, + "grad_norm": 0.30925439334485977, + "learning_rate": 6.8614682920097265e-06, + "loss": 0.4905, + "step": 2791 + }, + { + "epoch": 2.02810251288221, + "grad_norm": 0.4779644594533042, + "learning_rate": 6.859252457414067e-06, + "loss": 0.4983, + "step": 2792 + }, + { + "epoch": 2.028828910630377, + "grad_norm": 0.4156555631857716, + "learning_rate": 6.857036198993658e-06, + "loss": 0.5111, + "step": 2793 + }, + { + "epoch": 2.029555308378544, + "grad_norm": 0.4231134051622082, + "learning_rate": 6.8548195172537045e-06, + "loss": 0.5015, + "step": 2794 + }, + { + "epoch": 2.030281706126711, + "grad_norm": 0.35883752557089954, + "learning_rate": 6.8526024126995096e-06, + "loss": 0.4901, + "step": 2795 + }, + { + "epoch": 2.031008103874878, + "grad_norm": 0.5176921934711561, + "learning_rate": 6.850384885836472e-06, + "loss": 0.4762, + "step": 2796 + }, + { + "epoch": 2.031734501623045, + "grad_norm": 0.47898011289447745, + "learning_rate": 6.8481669371700865e-06, + "loss": 0.5091, + "step": 2797 + }, + { + "epoch": 2.032460899371212, + "grad_norm": 0.38298304746184586, + "learning_rate": 6.845948567205945e-06, + "loss": 0.4839, + "step": 2798 + }, + { + "epoch": 2.0331872971193787, + "grad_norm": 0.49166223619894195, + "learning_rate": 6.843729776449734e-06, + "loss": 0.4936, + "step": 2799 + }, + { + "epoch": 2.033913694867546, + "grad_norm": 0.36732722633298054, + "learning_rate": 6.841510565407235e-06, + "loss": 0.5045, + "step": 2800 + }, + { + "epoch": 2.034640092615713, + "grad_norm": 0.5769337316874685, + "learning_rate": 6.839290934584332e-06, + "loss": 0.5023, + "step": 2801 + }, + { + "epoch": 2.03536649036388, + "grad_norm": 0.34290008976429126, + "learning_rate": 6.837070884486994e-06, + "loss": 0.504, + "step": 2802 + }, + { + "epoch": 2.036092888112047, + "grad_norm": 0.36473122595802115, + "learning_rate": 6.8348504156212925e-06, + "loss": 0.5153, + "step": 2803 + }, + { + "epoch": 2.0368192858602137, + "grad_norm": 0.4304895612429296, + "learning_rate": 6.832629528493395e-06, + "loss": 
0.4997, + "step": 2804 + }, + { + "epoch": 2.0375456836083807, + "grad_norm": 0.4387726012724692, + "learning_rate": 6.830408223609559e-06, + "loss": 0.5105, + "step": 2805 + }, + { + "epoch": 2.038272081356548, + "grad_norm": 0.9160945864401482, + "learning_rate": 6.828186501476145e-06, + "loss": 0.4922, + "step": 2806 + }, + { + "epoch": 2.038998479104715, + "grad_norm": 0.36725867499766507, + "learning_rate": 6.8259643625996016e-06, + "loss": 0.5068, + "step": 2807 + }, + { + "epoch": 2.039724876852882, + "grad_norm": 0.36862502375795786, + "learning_rate": 6.8237418074864766e-06, + "loss": 0.5122, + "step": 2808 + }, + { + "epoch": 2.0404512746010486, + "grad_norm": 0.33923787542671696, + "learning_rate": 6.8215188366434104e-06, + "loss": 0.5058, + "step": 2809 + }, + { + "epoch": 2.0411776723492157, + "grad_norm": 0.44498404380627354, + "learning_rate": 6.819295450577138e-06, + "loss": 0.4926, + "step": 2810 + }, + { + "epoch": 2.0419040700973827, + "grad_norm": 0.33216610091469256, + "learning_rate": 6.817071649794491e-06, + "loss": 0.4973, + "step": 2811 + }, + { + "epoch": 2.04263046784555, + "grad_norm": 0.3335571997556061, + "learning_rate": 6.8148474348023954e-06, + "loss": 0.4831, + "step": 2812 + }, + { + "epoch": 2.043356865593717, + "grad_norm": 0.360857049482772, + "learning_rate": 6.812622806107869e-06, + "loss": 0.5019, + "step": 2813 + }, + { + "epoch": 2.0440832633418835, + "grad_norm": 0.4074737961264872, + "learning_rate": 6.810397764218027e-06, + "loss": 0.5036, + "step": 2814 + }, + { + "epoch": 2.0448096610900506, + "grad_norm": 0.334689546487221, + "learning_rate": 6.808172309640078e-06, + "loss": 0.5003, + "step": 2815 + }, + { + "epoch": 2.0455360588382177, + "grad_norm": 0.37500374203543435, + "learning_rate": 6.805946442881322e-06, + "loss": 0.5213, + "step": 2816 + }, + { + "epoch": 2.0462624565863847, + "grad_norm": 0.4206563647325095, + "learning_rate": 6.803720164449155e-06, + "loss": 0.4977, + "step": 2817 + }, + { + "epoch": 2.0469888543345514, + "grad_norm": 0.370548084274352, + "learning_rate": 6.801493474851069e-06, + "loss": 0.5225, + "step": 2818 + }, + { + "epoch": 2.0477152520827184, + "grad_norm": 0.354149466875892, + "learning_rate": 6.799266374594646e-06, + "loss": 0.4976, + "step": 2819 + }, + { + "epoch": 2.0484416498308855, + "grad_norm": 0.3426096343449712, + "learning_rate": 6.797038864187564e-06, + "loss": 0.5028, + "step": 2820 + }, + { + "epoch": 2.0491680475790526, + "grad_norm": 0.3460308554783054, + "learning_rate": 6.794810944137595e-06, + "loss": 0.5117, + "step": 2821 + }, + { + "epoch": 2.0498944453272196, + "grad_norm": 0.3678978614040748, + "learning_rate": 6.792582614952602e-06, + "loss": 0.4935, + "step": 2822 + }, + { + "epoch": 2.0506208430753863, + "grad_norm": 0.3933345821668368, + "learning_rate": 6.790353877140542e-06, + "loss": 0.5033, + "step": 2823 + }, + { + "epoch": 2.0513472408235534, + "grad_norm": 0.3994192333516289, + "learning_rate": 6.788124731209466e-06, + "loss": 0.4953, + "step": 2824 + }, + { + "epoch": 2.0520736385717204, + "grad_norm": 0.36607993612051454, + "learning_rate": 6.785895177667516e-06, + "loss": 0.4963, + "step": 2825 + }, + { + "epoch": 2.0528000363198875, + "grad_norm": 0.36687689120176453, + "learning_rate": 6.78366521702293e-06, + "loss": 0.5003, + "step": 2826 + }, + { + "epoch": 2.0535264340680546, + "grad_norm": 0.42363155301268685, + "learning_rate": 6.781434849784039e-06, + "loss": 0.51, + "step": 2827 + }, + { + "epoch": 2.054252831816221, + "grad_norm": 0.3378823422646072, + 
"learning_rate": 6.779204076459264e-06, + "loss": 0.4923, + "step": 2828 + }, + { + "epoch": 2.0549792295643883, + "grad_norm": 0.41958106038730947, + "learning_rate": 6.776972897557117e-06, + "loss": 0.4943, + "step": 2829 + }, + { + "epoch": 2.0557056273125554, + "grad_norm": 0.3637963294712784, + "learning_rate": 6.774741313586206e-06, + "loss": 0.5042, + "step": 2830 + }, + { + "epoch": 2.0564320250607224, + "grad_norm": 0.35444308760596194, + "learning_rate": 6.772509325055233e-06, + "loss": 0.497, + "step": 2831 + }, + { + "epoch": 2.0571584228088895, + "grad_norm": 0.30668522235604445, + "learning_rate": 6.770276932472987e-06, + "loss": 0.5104, + "step": 2832 + }, + { + "epoch": 2.057884820557056, + "grad_norm": 0.5739622435217333, + "learning_rate": 6.768044136348353e-06, + "loss": 0.4828, + "step": 2833 + }, + { + "epoch": 2.058611218305223, + "grad_norm": 0.4356973061131128, + "learning_rate": 6.765810937190307e-06, + "loss": 0.5088, + "step": 2834 + }, + { + "epoch": 2.0593376160533903, + "grad_norm": 0.39513686409727394, + "learning_rate": 6.763577335507913e-06, + "loss": 0.4884, + "step": 2835 + }, + { + "epoch": 2.0600640138015573, + "grad_norm": 0.4080599817351704, + "learning_rate": 6.761343331810334e-06, + "loss": 0.4983, + "step": 2836 + }, + { + "epoch": 2.060790411549724, + "grad_norm": 0.4641360221646442, + "learning_rate": 6.759108926606821e-06, + "loss": 0.5074, + "step": 2837 + }, + { + "epoch": 2.061516809297891, + "grad_norm": 0.48283517031623097, + "learning_rate": 6.7568741204067145e-06, + "loss": 0.4941, + "step": 2838 + }, + { + "epoch": 2.062243207046058, + "grad_norm": 0.37677270766893173, + "learning_rate": 6.754638913719449e-06, + "loss": 0.495, + "step": 2839 + }, + { + "epoch": 2.062969604794225, + "grad_norm": 0.3897940565546679, + "learning_rate": 6.752403307054549e-06, + "loss": 0.4832, + "step": 2840 + }, + { + "epoch": 2.0636960025423923, + "grad_norm": 0.3706329128849379, + "learning_rate": 6.750167300921635e-06, + "loss": 0.4951, + "step": 2841 + }, + { + "epoch": 2.064422400290559, + "grad_norm": 0.3677737911519454, + "learning_rate": 6.747930895830409e-06, + "loss": 0.4971, + "step": 2842 + }, + { + "epoch": 2.065148798038726, + "grad_norm": 0.4806697321398137, + "learning_rate": 6.745694092290671e-06, + "loss": 0.5163, + "step": 2843 + }, + { + "epoch": 2.065875195786893, + "grad_norm": 0.3258014555766835, + "learning_rate": 6.74345689081231e-06, + "loss": 0.4869, + "step": 2844 + }, + { + "epoch": 2.06660159353506, + "grad_norm": 0.35430162341459304, + "learning_rate": 6.741219291905308e-06, + "loss": 0.5137, + "step": 2845 + }, + { + "epoch": 2.067327991283227, + "grad_norm": 0.3735414728864755, + "learning_rate": 6.738981296079734e-06, + "loss": 0.5199, + "step": 2846 + }, + { + "epoch": 2.068054389031394, + "grad_norm": 0.4005930008235301, + "learning_rate": 6.7367429038457485e-06, + "loss": 0.5184, + "step": 2847 + }, + { + "epoch": 2.068780786779561, + "grad_norm": 0.46490959194042725, + "learning_rate": 6.7345041157136035e-06, + "loss": 0.5035, + "step": 2848 + }, + { + "epoch": 2.069507184527728, + "grad_norm": 0.336522204114012, + "learning_rate": 6.7322649321936395e-06, + "loss": 0.4975, + "step": 2849 + }, + { + "epoch": 2.070233582275895, + "grad_norm": 0.3625662873299568, + "learning_rate": 6.7300253537962905e-06, + "loss": 0.5191, + "step": 2850 + }, + { + "epoch": 2.070959980024062, + "grad_norm": 0.33860160399452294, + "learning_rate": 6.727785381032076e-06, + "loss": 0.5, + "step": 2851 + }, + { + "epoch": 2.0716863777722287, + 
"grad_norm": 0.363185881586708, + "learning_rate": 6.725545014411608e-06, + "loss": 0.4773, + "step": 2852 + }, + { + "epoch": 2.072412775520396, + "grad_norm": 0.32039147732115986, + "learning_rate": 6.723304254445589e-06, + "loss": 0.5072, + "step": 2853 + }, + { + "epoch": 2.073139173268563, + "grad_norm": 0.37782007575870213, + "learning_rate": 6.72106310164481e-06, + "loss": 0.4984, + "step": 2854 + }, + { + "epoch": 2.07386557101673, + "grad_norm": 0.33014550752157024, + "learning_rate": 6.718821556520151e-06, + "loss": 0.4903, + "step": 2855 + }, + { + "epoch": 2.074591968764897, + "grad_norm": 0.4828649833387322, + "learning_rate": 6.716579619582581e-06, + "loss": 0.502, + "step": 2856 + }, + { + "epoch": 2.0753183665130637, + "grad_norm": 0.40367185786911486, + "learning_rate": 6.71433729134316e-06, + "loss": 0.506, + "step": 2857 + }, + { + "epoch": 2.0760447642612307, + "grad_norm": 0.3460938036307214, + "learning_rate": 6.712094572313038e-06, + "loss": 0.496, + "step": 2858 + }, + { + "epoch": 2.076771162009398, + "grad_norm": 0.3581135934064617, + "learning_rate": 6.70985146300345e-06, + "loss": 0.4855, + "step": 2859 + }, + { + "epoch": 2.077497559757565, + "grad_norm": 0.3518723354378461, + "learning_rate": 6.707607963925725e-06, + "loss": 0.5071, + "step": 2860 + }, + { + "epoch": 2.0782239575057315, + "grad_norm": 0.41437491459633446, + "learning_rate": 6.7053640755912786e-06, + "loss": 0.4892, + "step": 2861 + }, + { + "epoch": 2.0789503552538986, + "grad_norm": 0.40016810758491533, + "learning_rate": 6.703119798511612e-06, + "loss": 0.5168, + "step": 2862 + }, + { + "epoch": 2.0796767530020657, + "grad_norm": 0.4107704547854394, + "learning_rate": 6.700875133198321e-06, + "loss": 0.5094, + "step": 2863 + }, + { + "epoch": 2.0804031507502327, + "grad_norm": 0.3885733785737631, + "learning_rate": 6.698630080163086e-06, + "loss": 0.5, + "step": 2864 + }, + { + "epoch": 2.0811295484984, + "grad_norm": 0.5226559711677936, + "learning_rate": 6.696384639917677e-06, + "loss": 0.5024, + "step": 2865 + }, + { + "epoch": 2.0818559462465664, + "grad_norm": 0.3797436835719642, + "learning_rate": 6.6941388129739504e-06, + "loss": 0.4978, + "step": 2866 + }, + { + "epoch": 2.0825823439947335, + "grad_norm": 0.3646627564573875, + "learning_rate": 6.691892599843856e-06, + "loss": 0.5022, + "step": 2867 + }, + { + "epoch": 2.0833087417429006, + "grad_norm": 0.36932403717003065, + "learning_rate": 6.689646001039422e-06, + "loss": 0.4874, + "step": 2868 + }, + { + "epoch": 2.0840351394910677, + "grad_norm": 0.3689439163666255, + "learning_rate": 6.687399017072775e-06, + "loss": 0.4938, + "step": 2869 + }, + { + "epoch": 2.0847615372392347, + "grad_norm": 0.3554966085123132, + "learning_rate": 6.685151648456124e-06, + "loss": 0.4857, + "step": 2870 + }, + { + "epoch": 2.0854879349874014, + "grad_norm": 0.6683235515857348, + "learning_rate": 6.682903895701767e-06, + "loss": 0.4834, + "step": 2871 + }, + { + "epoch": 2.0862143327355684, + "grad_norm": 0.37256500006151105, + "learning_rate": 6.680655759322086e-06, + "loss": 0.5091, + "step": 2872 + }, + { + "epoch": 2.0869407304837355, + "grad_norm": 0.3597428273584394, + "learning_rate": 6.678407239829558e-06, + "loss": 0.5078, + "step": 2873 + }, + { + "epoch": 2.0876671282319026, + "grad_norm": 0.4260727973510339, + "learning_rate": 6.676158337736738e-06, + "loss": 0.5046, + "step": 2874 + }, + { + "epoch": 2.0883935259800697, + "grad_norm": 0.41474895800784195, + "learning_rate": 6.673909053556278e-06, + "loss": 0.489, + "step": 2875 + }, + 
{ + "epoch": 2.0891199237282363, + "grad_norm": 0.40947917474288614, + "learning_rate": 6.67165938780091e-06, + "loss": 0.5022, + "step": 2876 + }, + { + "epoch": 2.0898463214764034, + "grad_norm": 0.39584157189048297, + "learning_rate": 6.669409340983455e-06, + "loss": 0.4945, + "step": 2877 + }, + { + "epoch": 2.0905727192245704, + "grad_norm": 0.3825280975330202, + "learning_rate": 6.66715891361682e-06, + "loss": 0.492, + "step": 2878 + }, + { + "epoch": 2.0912991169727375, + "grad_norm": 0.37703028844024256, + "learning_rate": 6.664908106214001e-06, + "loss": 0.4941, + "step": 2879 + }, + { + "epoch": 2.0920255147209046, + "grad_norm": 0.364638637636816, + "learning_rate": 6.66265691928808e-06, + "loss": 0.5158, + "step": 2880 + }, + { + "epoch": 2.092751912469071, + "grad_norm": 0.33601793792720597, + "learning_rate": 6.660405353352226e-06, + "loss": 0.5131, + "step": 2881 + }, + { + "epoch": 2.0934783102172383, + "grad_norm": 0.592614366219023, + "learning_rate": 6.658153408919689e-06, + "loss": 0.4827, + "step": 2882 + }, + { + "epoch": 2.0942047079654054, + "grad_norm": 0.3619686249243115, + "learning_rate": 6.6559010865038135e-06, + "loss": 0.4959, + "step": 2883 + }, + { + "epoch": 2.0949311057135724, + "grad_norm": 0.35243439705723756, + "learning_rate": 6.653648386618025e-06, + "loss": 0.4988, + "step": 2884 + }, + { + "epoch": 2.095657503461739, + "grad_norm": 0.3651217663787908, + "learning_rate": 6.651395309775837e-06, + "loss": 0.496, + "step": 2885 + }, + { + "epoch": 2.096383901209906, + "grad_norm": 0.3842440109628493, + "learning_rate": 6.649141856490846e-06, + "loss": 0.4979, + "step": 2886 + }, + { + "epoch": 2.097110298958073, + "grad_norm": 0.4581904771466546, + "learning_rate": 6.646888027276739e-06, + "loss": 0.5091, + "step": 2887 + }, + { + "epoch": 2.0978366967062403, + "grad_norm": 0.5193582269979015, + "learning_rate": 6.644633822647285e-06, + "loss": 0.5116, + "step": 2888 + }, + { + "epoch": 2.0985630944544074, + "grad_norm": 0.3359619709738642, + "learning_rate": 6.6423792431163395e-06, + "loss": 0.4923, + "step": 2889 + }, + { + "epoch": 2.099289492202574, + "grad_norm": 0.3597705367645605, + "learning_rate": 6.640124289197845e-06, + "loss": 0.4991, + "step": 2890 + }, + { + "epoch": 2.100015889950741, + "grad_norm": 0.407415043370364, + "learning_rate": 6.637868961405829e-06, + "loss": 0.487, + "step": 2891 + }, + { + "epoch": 2.100742287698908, + "grad_norm": 0.37796274922777096, + "learning_rate": 6.635613260254401e-06, + "loss": 0.4914, + "step": 2892 + }, + { + "epoch": 2.101468685447075, + "grad_norm": 0.31201147100352206, + "learning_rate": 6.633357186257759e-06, + "loss": 0.5067, + "step": 2893 + }, + { + "epoch": 2.1021950831952423, + "grad_norm": 0.3265575415617553, + "learning_rate": 6.6311007399301855e-06, + "loss": 0.4925, + "step": 2894 + }, + { + "epoch": 2.102921480943409, + "grad_norm": 0.38724063508599177, + "learning_rate": 6.628843921786045e-06, + "loss": 0.4985, + "step": 2895 + }, + { + "epoch": 2.103647878691576, + "grad_norm": 0.6051025075106355, + "learning_rate": 6.626586732339794e-06, + "loss": 0.5002, + "step": 2896 + }, + { + "epoch": 2.104374276439743, + "grad_norm": 0.38902900482916525, + "learning_rate": 6.624329172105964e-06, + "loss": 0.5102, + "step": 2897 + }, + { + "epoch": 2.10510067418791, + "grad_norm": 0.3551678176261896, + "learning_rate": 6.622071241599175e-06, + "loss": 0.5054, + "step": 2898 + }, + { + "epoch": 2.105827071936077, + "grad_norm": 0.3393281263547164, + "learning_rate": 6.619812941334136e-06, + 
"loss": 0.5078, + "step": 2899 + }, + { + "epoch": 2.106553469684244, + "grad_norm": 1.5832168634036479, + "learning_rate": 6.617554271825636e-06, + "loss": 0.4975, + "step": 2900 + }, + { + "epoch": 2.107279867432411, + "grad_norm": 0.3944324081013014, + "learning_rate": 6.615295233588546e-06, + "loss": 0.5181, + "step": 2901 + }, + { + "epoch": 2.108006265180578, + "grad_norm": 0.3668693933031106, + "learning_rate": 6.613035827137827e-06, + "loss": 0.5025, + "step": 2902 + }, + { + "epoch": 2.108732662928745, + "grad_norm": 0.35678377216491264, + "learning_rate": 6.610776052988519e-06, + "loss": 0.4916, + "step": 2903 + }, + { + "epoch": 2.1094590606769117, + "grad_norm": 0.3724762793133281, + "learning_rate": 6.608515911655744e-06, + "loss": 0.4914, + "step": 2904 + }, + { + "epoch": 2.1101854584250788, + "grad_norm": 0.29830214021032636, + "learning_rate": 6.606255403654717e-06, + "loss": 0.4919, + "step": 2905 + }, + { + "epoch": 2.110911856173246, + "grad_norm": 0.4018527850111275, + "learning_rate": 6.603994529500728e-06, + "loss": 0.5077, + "step": 2906 + }, + { + "epoch": 2.111638253921413, + "grad_norm": 0.4001560282776422, + "learning_rate": 6.601733289709154e-06, + "loss": 0.4812, + "step": 2907 + }, + { + "epoch": 2.11236465166958, + "grad_norm": 0.4123126231877299, + "learning_rate": 6.599471684795452e-06, + "loss": 0.4916, + "step": 2908 + }, + { + "epoch": 2.1130910494177466, + "grad_norm": 0.31888312898973853, + "learning_rate": 6.597209715275168e-06, + "loss": 0.4819, + "step": 2909 + }, + { + "epoch": 2.1138174471659137, + "grad_norm": 0.37376059043712934, + "learning_rate": 6.594947381663926e-06, + "loss": 0.4869, + "step": 2910 + }, + { + "epoch": 2.1145438449140808, + "grad_norm": 0.3313260091877316, + "learning_rate": 6.592684684477435e-06, + "loss": 0.5016, + "step": 2911 + }, + { + "epoch": 2.115270242662248, + "grad_norm": 0.3587303872117973, + "learning_rate": 6.590421624231487e-06, + "loss": 0.4815, + "step": 2912 + }, + { + "epoch": 2.115996640410415, + "grad_norm": 0.37481813006623715, + "learning_rate": 6.588158201441956e-06, + "loss": 0.4888, + "step": 2913 + }, + { + "epoch": 2.1167230381585815, + "grad_norm": 0.32510590971086595, + "learning_rate": 6.5858944166247994e-06, + "loss": 0.5049, + "step": 2914 + }, + { + "epoch": 2.1174494359067486, + "grad_norm": 0.4162742377307629, + "learning_rate": 6.583630270296057e-06, + "loss": 0.4937, + "step": 2915 + }, + { + "epoch": 2.1181758336549157, + "grad_norm": 0.4144903712969387, + "learning_rate": 6.581365762971853e-06, + "loss": 0.5029, + "step": 2916 + }, + { + "epoch": 2.1189022314030828, + "grad_norm": 0.3926017570855505, + "learning_rate": 6.579100895168389e-06, + "loss": 0.5063, + "step": 2917 + }, + { + "epoch": 2.11962862915125, + "grad_norm": 0.37089262361926517, + "learning_rate": 6.576835667401953e-06, + "loss": 0.494, + "step": 2918 + }, + { + "epoch": 2.1203550268994165, + "grad_norm": 0.414632796429618, + "learning_rate": 6.574570080188911e-06, + "loss": 0.4996, + "step": 2919 + }, + { + "epoch": 2.1210814246475835, + "grad_norm": 0.3823046012390911, + "learning_rate": 6.5723041340457175e-06, + "loss": 0.506, + "step": 2920 + }, + { + "epoch": 2.1218078223957506, + "grad_norm": 0.34052878156073463, + "learning_rate": 6.570037829488902e-06, + "loss": 0.4983, + "step": 2921 + }, + { + "epoch": 2.1225342201439177, + "grad_norm": 0.3739161848070694, + "learning_rate": 6.56777116703508e-06, + "loss": 0.4977, + "step": 2922 + }, + { + "epoch": 2.1232606178920843, + "grad_norm": 0.3433004278765078, + 
"learning_rate": 6.565504147200945e-06, + "loss": 0.5015, + "step": 2923 + }, + { + "epoch": 2.1239870156402514, + "grad_norm": 0.3546076096060906, + "learning_rate": 6.563236770503276e-06, + "loss": 0.4928, + "step": 2924 + }, + { + "epoch": 2.1247134133884185, + "grad_norm": 0.3522187399759698, + "learning_rate": 6.560969037458933e-06, + "loss": 0.5002, + "step": 2925 + }, + { + "epoch": 2.1254398111365855, + "grad_norm": 0.43583768519910443, + "learning_rate": 6.558700948584852e-06, + "loss": 0.4951, + "step": 2926 + }, + { + "epoch": 2.1261662088847526, + "grad_norm": 0.39854393558885376, + "learning_rate": 6.556432504398056e-06, + "loss": 0.5109, + "step": 2927 + }, + { + "epoch": 2.1268926066329192, + "grad_norm": 0.4879786970248183, + "learning_rate": 6.554163705415646e-06, + "loss": 0.4779, + "step": 2928 + }, + { + "epoch": 2.1276190043810863, + "grad_norm": 0.3796776631929523, + "learning_rate": 6.551894552154806e-06, + "loss": 0.5165, + "step": 2929 + }, + { + "epoch": 2.1283454021292534, + "grad_norm": 0.34666624815506336, + "learning_rate": 6.5496250451327996e-06, + "loss": 0.4924, + "step": 2930 + }, + { + "epoch": 2.1290717998774205, + "grad_norm": 0.41599288142856355, + "learning_rate": 6.547355184866968e-06, + "loss": 0.5063, + "step": 2931 + }, + { + "epoch": 2.1297981976255875, + "grad_norm": 0.3738382621240727, + "learning_rate": 6.545084971874738e-06, + "loss": 0.4959, + "step": 2932 + }, + { + "epoch": 2.130524595373754, + "grad_norm": 0.5175550892040155, + "learning_rate": 6.542814406673613e-06, + "loss": 0.5029, + "step": 2933 + }, + { + "epoch": 2.1312509931219212, + "grad_norm": 0.4203991259923369, + "learning_rate": 6.540543489781183e-06, + "loss": 0.5143, + "step": 2934 + }, + { + "epoch": 2.1319773908700883, + "grad_norm": 0.4217207006128095, + "learning_rate": 6.538272221715107e-06, + "loss": 0.5133, + "step": 2935 + }, + { + "epoch": 2.1327037886182554, + "grad_norm": 0.42445433984426745, + "learning_rate": 6.536000602993134e-06, + "loss": 0.5033, + "step": 2936 + }, + { + "epoch": 2.1334301863664225, + "grad_norm": 0.3376368181992335, + "learning_rate": 6.533728634133089e-06, + "loss": 0.5009, + "step": 2937 + }, + { + "epoch": 2.134156584114589, + "grad_norm": 0.5977879995318446, + "learning_rate": 6.531456315652878e-06, + "loss": 0.5002, + "step": 2938 + }, + { + "epoch": 2.134882981862756, + "grad_norm": 0.4754900561503473, + "learning_rate": 6.529183648070484e-06, + "loss": 0.5076, + "step": 2939 + }, + { + "epoch": 2.1356093796109232, + "grad_norm": 0.3782456325532489, + "learning_rate": 6.526910631903973e-06, + "loss": 0.5007, + "step": 2940 + }, + { + "epoch": 2.1363357773590903, + "grad_norm": 0.37297735671201765, + "learning_rate": 6.5246372676714895e-06, + "loss": 0.5082, + "step": 2941 + }, + { + "epoch": 2.137062175107257, + "grad_norm": 0.3580637523533214, + "learning_rate": 6.522363555891255e-06, + "loss": 0.5074, + "step": 2942 + }, + { + "epoch": 2.137788572855424, + "grad_norm": 0.46572760943088304, + "learning_rate": 6.520089497081574e-06, + "loss": 0.4881, + "step": 2943 + }, + { + "epoch": 2.138514970603591, + "grad_norm": 0.4387344136375126, + "learning_rate": 6.5178150917608265e-06, + "loss": 0.4914, + "step": 2944 + }, + { + "epoch": 2.139241368351758, + "grad_norm": 0.3956585802771223, + "learning_rate": 6.515540340447475e-06, + "loss": 0.5024, + "step": 2945 + }, + { + "epoch": 2.1399677660999252, + "grad_norm": 0.3692454181128716, + "learning_rate": 6.513265243660057e-06, + "loss": 0.4931, + "step": 2946 + }, + { + "epoch": 
2.1406941638480923, + "grad_norm": 0.3357775537505742, + "learning_rate": 6.5109898019171924e-06, + "loss": 0.5035, + "step": 2947 + }, + { + "epoch": 2.141420561596259, + "grad_norm": 0.3150057320366553, + "learning_rate": 6.508714015737577e-06, + "loss": 0.5124, + "step": 2948 + }, + { + "epoch": 2.142146959344426, + "grad_norm": 0.3200772493002218, + "learning_rate": 6.506437885639988e-06, + "loss": 0.5045, + "step": 2949 + }, + { + "epoch": 2.142873357092593, + "grad_norm": 0.5384146504997512, + "learning_rate": 6.504161412143277e-06, + "loss": 0.5008, + "step": 2950 + }, + { + "epoch": 2.14359975484076, + "grad_norm": 0.34588162160118074, + "learning_rate": 6.5018845957663764e-06, + "loss": 0.5021, + "step": 2951 + }, + { + "epoch": 2.144326152588927, + "grad_norm": 0.38778620028106675, + "learning_rate": 6.499607437028298e-06, + "loss": 0.4979, + "step": 2952 + }, + { + "epoch": 2.145052550337094, + "grad_norm": 0.7015904343487487, + "learning_rate": 6.497329936448129e-06, + "loss": 0.4771, + "step": 2953 + }, + { + "epoch": 2.145778948085261, + "grad_norm": 0.32149055233561774, + "learning_rate": 6.495052094545036e-06, + "loss": 0.4944, + "step": 2954 + }, + { + "epoch": 2.146505345833428, + "grad_norm": 0.45082281024800525, + "learning_rate": 6.492773911838263e-06, + "loss": 0.4984, + "step": 2955 + }, + { + "epoch": 2.147231743581595, + "grad_norm": 0.4953862513462056, + "learning_rate": 6.490495388847131e-06, + "loss": 0.4928, + "step": 2956 + }, + { + "epoch": 2.1479581413297617, + "grad_norm": 0.3827423100083045, + "learning_rate": 6.488216526091042e-06, + "loss": 0.4853, + "step": 2957 + }, + { + "epoch": 2.148684539077929, + "grad_norm": 0.3300300050257459, + "learning_rate": 6.485937324089468e-06, + "loss": 0.4915, + "step": 2958 + }, + { + "epoch": 2.149410936826096, + "grad_norm": 0.3433551945936361, + "learning_rate": 6.483657783361968e-06, + "loss": 0.4973, + "step": 2959 + }, + { + "epoch": 2.150137334574263, + "grad_norm": 0.3816368521232514, + "learning_rate": 6.481377904428171e-06, + "loss": 0.4866, + "step": 2960 + }, + { + "epoch": 2.15086373232243, + "grad_norm": 0.4339166761827746, + "learning_rate": 6.479097687807785e-06, + "loss": 0.5095, + "step": 2961 + }, + { + "epoch": 2.1515901300705966, + "grad_norm": 0.34956120380265815, + "learning_rate": 6.476817134020596e-06, + "loss": 0.4938, + "step": 2962 + }, + { + "epoch": 2.1523165278187637, + "grad_norm": 0.4460892120279158, + "learning_rate": 6.474536243586466e-06, + "loss": 0.505, + "step": 2963 + }, + { + "epoch": 2.1530429255669308, + "grad_norm": 0.3513400937290252, + "learning_rate": 6.472255017025334e-06, + "loss": 0.4939, + "step": 2964 + }, + { + "epoch": 2.153769323315098, + "grad_norm": 0.36376467899952253, + "learning_rate": 6.469973454857217e-06, + "loss": 0.4992, + "step": 2965 + }, + { + "epoch": 2.154495721063265, + "grad_norm": 0.36673117211425776, + "learning_rate": 6.4676915576022045e-06, + "loss": 0.5045, + "step": 2966 + }, + { + "epoch": 2.1552221188114316, + "grad_norm": 0.3762124077379351, + "learning_rate": 6.465409325780467e-06, + "loss": 0.5015, + "step": 2967 + }, + { + "epoch": 2.1559485165595986, + "grad_norm": 0.35115455230934106, + "learning_rate": 6.463126759912248e-06, + "loss": 0.5098, + "step": 2968 + }, + { + "epoch": 2.1566749143077657, + "grad_norm": 0.35298907038203053, + "learning_rate": 6.4608438605178695e-06, + "loss": 0.485, + "step": 2969 + }, + { + "epoch": 2.1574013120559328, + "grad_norm": 0.3472801797067342, + "learning_rate": 6.458560628117727e-06, + "loss": 
0.5066, + "step": 2970 + }, + { + "epoch": 2.1581277098040994, + "grad_norm": 0.36895758673806917, + "learning_rate": 6.456277063232296e-06, + "loss": 0.5113, + "step": 2971 + }, + { + "epoch": 2.1588541075522665, + "grad_norm": 0.6875475378019651, + "learning_rate": 6.453993166382122e-06, + "loss": 0.5131, + "step": 2972 + }, + { + "epoch": 2.1595805053004336, + "grad_norm": 0.3214418264033814, + "learning_rate": 6.451708938087831e-06, + "loss": 0.4929, + "step": 2973 + }, + { + "epoch": 2.1603069030486006, + "grad_norm": 0.34283067423716496, + "learning_rate": 6.449424378870123e-06, + "loss": 0.4839, + "step": 2974 + }, + { + "epoch": 2.1610333007967677, + "grad_norm": 0.30824117611470303, + "learning_rate": 6.4471394892497714e-06, + "loss": 0.4937, + "step": 2975 + }, + { + "epoch": 2.1617596985449343, + "grad_norm": 0.36744675557677187, + "learning_rate": 6.44485426974763e-06, + "loss": 0.484, + "step": 2976 + }, + { + "epoch": 2.1624860962931014, + "grad_norm": 0.36198374511775755, + "learning_rate": 6.442568720884621e-06, + "loss": 0.51, + "step": 2977 + }, + { + "epoch": 2.1632124940412685, + "grad_norm": 0.3395204712387428, + "learning_rate": 6.440282843181748e-06, + "loss": 0.5048, + "step": 2978 + }, + { + "epoch": 2.1639388917894355, + "grad_norm": 0.3586355283964876, + "learning_rate": 6.437996637160086e-06, + "loss": 0.4987, + "step": 2979 + }, + { + "epoch": 2.1646652895376026, + "grad_norm": 0.6935755788835902, + "learning_rate": 6.435710103340787e-06, + "loss": 0.5091, + "step": 2980 + }, + { + "epoch": 2.1653916872857693, + "grad_norm": 0.4025325375633823, + "learning_rate": 6.433423242245074e-06, + "loss": 0.4987, + "step": 2981 + }, + { + "epoch": 2.1661180850339363, + "grad_norm": 0.3298638497588873, + "learning_rate": 6.431136054394247e-06, + "loss": 0.5024, + "step": 2982 + }, + { + "epoch": 2.1668444827821034, + "grad_norm": 0.4156589971885642, + "learning_rate": 6.4288485403096825e-06, + "loss": 0.507, + "step": 2983 + }, + { + "epoch": 2.1675708805302705, + "grad_norm": 0.30768317468521594, + "learning_rate": 6.426560700512828e-06, + "loss": 0.4919, + "step": 2984 + }, + { + "epoch": 2.1682972782784375, + "grad_norm": 0.3163321608467066, + "learning_rate": 6.4242725355252075e-06, + "loss": 0.5029, + "step": 2985 + }, + { + "epoch": 2.169023676026604, + "grad_norm": 0.3998333975752197, + "learning_rate": 6.421984045868418e-06, + "loss": 0.5134, + "step": 2986 + }, + { + "epoch": 2.1697500737747712, + "grad_norm": 0.3610186546760319, + "learning_rate": 6.419695232064131e-06, + "loss": 0.4949, + "step": 2987 + }, + { + "epoch": 2.1704764715229383, + "grad_norm": 0.33557243978359264, + "learning_rate": 6.41740609463409e-06, + "loss": 0.4966, + "step": 2988 + }, + { + "epoch": 2.1712028692711054, + "grad_norm": 0.30979741916167625, + "learning_rate": 6.415116634100116e-06, + "loss": 0.4861, + "step": 2989 + }, + { + "epoch": 2.171929267019272, + "grad_norm": 0.36967468828561945, + "learning_rate": 6.412826850984099e-06, + "loss": 0.5139, + "step": 2990 + }, + { + "epoch": 2.172655664767439, + "grad_norm": 0.510837030995537, + "learning_rate": 6.410536745808009e-06, + "loss": 0.5146, + "step": 2991 + }, + { + "epoch": 2.173382062515606, + "grad_norm": 0.3706820249330839, + "learning_rate": 6.408246319093882e-06, + "loss": 0.5086, + "step": 2992 + }, + { + "epoch": 2.1741084602637732, + "grad_norm": 0.38337344616742824, + "learning_rate": 6.405955571363832e-06, + "loss": 0.5003, + "step": 2993 + }, + { + "epoch": 2.1748348580119403, + "grad_norm": 0.33710792016006064, + 
"learning_rate": 6.403664503140046e-06, + "loss": 0.4857, + "step": 2994 + }, + { + "epoch": 2.175561255760107, + "grad_norm": 0.3325991143206928, + "learning_rate": 6.401373114944781e-06, + "loss": 0.504, + "step": 2995 + }, + { + "epoch": 2.176287653508274, + "grad_norm": 0.3747987822017251, + "learning_rate": 6.399081407300372e-06, + "loss": 0.4865, + "step": 2996 + }, + { + "epoch": 2.177014051256441, + "grad_norm": 0.8042188756656197, + "learning_rate": 6.396789380729218e-06, + "loss": 0.5041, + "step": 2997 + }, + { + "epoch": 2.177740449004608, + "grad_norm": 0.32635741396114964, + "learning_rate": 6.394497035753804e-06, + "loss": 0.4877, + "step": 2998 + }, + { + "epoch": 2.1784668467527752, + "grad_norm": 0.5737248436672333, + "learning_rate": 6.392204372896676e-06, + "loss": 0.4965, + "step": 2999 + }, + { + "epoch": 2.179193244500942, + "grad_norm": 0.3958470679136034, + "learning_rate": 6.3899113926804565e-06, + "loss": 0.4748, + "step": 3000 + }, + { + "epoch": 2.179919642249109, + "grad_norm": 0.4448283621367889, + "learning_rate": 6.387618095627841e-06, + "loss": 0.4839, + "step": 3001 + }, + { + "epoch": 2.180646039997276, + "grad_norm": 0.5670955549609107, + "learning_rate": 6.385324482261597e-06, + "loss": 0.4843, + "step": 3002 + }, + { + "epoch": 2.181372437745443, + "grad_norm": 0.3817161520952691, + "learning_rate": 6.383030553104562e-06, + "loss": 0.4961, + "step": 3003 + }, + { + "epoch": 2.18209883549361, + "grad_norm": 0.3568277844701838, + "learning_rate": 6.380736308679649e-06, + "loss": 0.4869, + "step": 3004 + }, + { + "epoch": 2.182825233241777, + "grad_norm": 0.35591197709871963, + "learning_rate": 6.378441749509841e-06, + "loss": 0.4872, + "step": 3005 + }, + { + "epoch": 2.183551630989944, + "grad_norm": 0.45168722406676537, + "learning_rate": 6.376146876118193e-06, + "loss": 0.4842, + "step": 3006 + }, + { + "epoch": 2.184278028738111, + "grad_norm": 0.36618818991265534, + "learning_rate": 6.37385168902783e-06, + "loss": 0.5104, + "step": 3007 + }, + { + "epoch": 2.185004426486278, + "grad_norm": 0.4164922266504299, + "learning_rate": 6.37155618876195e-06, + "loss": 0.4961, + "step": 3008 + }, + { + "epoch": 2.1857308242344446, + "grad_norm": 0.33660435418460627, + "learning_rate": 6.369260375843825e-06, + "loss": 0.494, + "step": 3009 + }, + { + "epoch": 2.1864572219826117, + "grad_norm": 0.3661615173054258, + "learning_rate": 6.366964250796794e-06, + "loss": 0.5024, + "step": 3010 + }, + { + "epoch": 2.187183619730779, + "grad_norm": 0.3621135555183418, + "learning_rate": 6.364667814144267e-06, + "loss": 0.494, + "step": 3011 + }, + { + "epoch": 2.187910017478946, + "grad_norm": 0.39242459700593657, + "learning_rate": 6.362371066409727e-06, + "loss": 0.4949, + "step": 3012 + }, + { + "epoch": 2.188636415227113, + "grad_norm": 0.35575069031596895, + "learning_rate": 6.360074008116732e-06, + "loss": 0.5088, + "step": 3013 + }, + { + "epoch": 2.1893628129752796, + "grad_norm": 0.36556209399790124, + "learning_rate": 6.357776639788904e-06, + "loss": 0.5023, + "step": 3014 + }, + { + "epoch": 2.1900892107234466, + "grad_norm": 0.34006098095149784, + "learning_rate": 6.355478961949935e-06, + "loss": 0.5036, + "step": 3015 + }, + { + "epoch": 2.1908156084716137, + "grad_norm": 0.3649755041154573, + "learning_rate": 6.353180975123595e-06, + "loss": 0.5236, + "step": 3016 + }, + { + "epoch": 2.191542006219781, + "grad_norm": 0.5093498310975254, + "learning_rate": 6.3508826798337184e-06, + "loss": 0.4942, + "step": 3017 + }, + { + "epoch": 2.192268403967948, + 
"grad_norm": 0.3997941447173825, + "learning_rate": 6.348584076604213e-06, + "loss": 0.4917, + "step": 3018 + }, + { + "epoch": 2.1929948017161145, + "grad_norm": 0.3734740659212164, + "learning_rate": 6.346285165959053e-06, + "loss": 0.4966, + "step": 3019 + }, + { + "epoch": 2.1937211994642816, + "grad_norm": 0.5631358961409713, + "learning_rate": 6.3439859484222874e-06, + "loss": 0.4836, + "step": 3020 + }, + { + "epoch": 2.1944475972124486, + "grad_norm": 0.3562949020025143, + "learning_rate": 6.341686424518032e-06, + "loss": 0.4878, + "step": 3021 + }, + { + "epoch": 2.1951739949606157, + "grad_norm": 0.44992840420045366, + "learning_rate": 6.339386594770472e-06, + "loss": 0.5014, + "step": 3022 + }, + { + "epoch": 2.195900392708783, + "grad_norm": 0.34442915629672294, + "learning_rate": 6.337086459703867e-06, + "loss": 0.4986, + "step": 3023 + }, + { + "epoch": 2.1966267904569494, + "grad_norm": 0.39024905830114803, + "learning_rate": 6.33478601984254e-06, + "loss": 0.5024, + "step": 3024 + }, + { + "epoch": 2.1973531882051165, + "grad_norm": 0.3998500751389736, + "learning_rate": 6.332485275710889e-06, + "loss": 0.4949, + "step": 3025 + }, + { + "epoch": 2.1980795859532836, + "grad_norm": 0.3429294829411977, + "learning_rate": 6.330184227833376e-06, + "loss": 0.4807, + "step": 3026 + }, + { + "epoch": 2.1988059837014506, + "grad_norm": 0.690099298899355, + "learning_rate": 6.327882876734537e-06, + "loss": 0.4909, + "step": 3027 + }, + { + "epoch": 2.1995323814496177, + "grad_norm": 0.35195712782936517, + "learning_rate": 6.325581222938972e-06, + "loss": 0.48, + "step": 3028 + }, + { + "epoch": 2.2002587791977843, + "grad_norm": 0.328965359456782, + "learning_rate": 6.323279266971357e-06, + "loss": 0.4956, + "step": 3029 + }, + { + "epoch": 2.2009851769459514, + "grad_norm": 0.4625848240891544, + "learning_rate": 6.3209770093564315e-06, + "loss": 0.5042, + "step": 3030 + }, + { + "epoch": 2.2017115746941185, + "grad_norm": 0.4053912419568821, + "learning_rate": 6.318674450619002e-06, + "loss": 0.494, + "step": 3031 + }, + { + "epoch": 2.2024379724422856, + "grad_norm": 0.3512279925789144, + "learning_rate": 6.316371591283953e-06, + "loss": 0.4903, + "step": 3032 + }, + { + "epoch": 2.2031643701904526, + "grad_norm": 0.3636383991578603, + "learning_rate": 6.3140684318762265e-06, + "loss": 0.4762, + "step": 3033 + }, + { + "epoch": 2.2038907679386193, + "grad_norm": 0.3852602096351379, + "learning_rate": 6.31176497292084e-06, + "loss": 0.4977, + "step": 3034 + }, + { + "epoch": 2.2046171656867863, + "grad_norm": 0.3947804255450504, + "learning_rate": 6.309461214942877e-06, + "loss": 0.4976, + "step": 3035 + }, + { + "epoch": 2.2053435634349534, + "grad_norm": 0.3790065839004977, + "learning_rate": 6.307157158467487e-06, + "loss": 0.4863, + "step": 3036 + }, + { + "epoch": 2.2060699611831205, + "grad_norm": 0.41911269447716715, + "learning_rate": 6.304852804019892e-06, + "loss": 0.4962, + "step": 3037 + }, + { + "epoch": 2.206796358931287, + "grad_norm": 0.35734766993699474, + "learning_rate": 6.30254815212538e-06, + "loss": 0.5068, + "step": 3038 + }, + { + "epoch": 2.207522756679454, + "grad_norm": 0.6636967409043618, + "learning_rate": 6.3002432033093055e-06, + "loss": 0.4963, + "step": 3039 + }, + { + "epoch": 2.2082491544276213, + "grad_norm": 0.32487300685748505, + "learning_rate": 6.297937958097094e-06, + "loss": 0.4934, + "step": 3040 + }, + { + "epoch": 2.2089755521757883, + "grad_norm": 0.31860100445543343, + "learning_rate": 6.29563241701423e-06, + "loss": 0.5128, + "step": 
3041 + }, + { + "epoch": 2.2097019499239554, + "grad_norm": 0.644224408832893, + "learning_rate": 6.293326580586278e-06, + "loss": 0.495, + "step": 3042 + }, + { + "epoch": 2.210428347672122, + "grad_norm": 0.3863384310540497, + "learning_rate": 6.29102044933886e-06, + "loss": 0.5091, + "step": 3043 + }, + { + "epoch": 2.211154745420289, + "grad_norm": 0.3246376834758291, + "learning_rate": 6.2887140237976714e-06, + "loss": 0.5011, + "step": 3044 + }, + { + "epoch": 2.211881143168456, + "grad_norm": 0.34220816049628433, + "learning_rate": 6.286407304488471e-06, + "loss": 0.4955, + "step": 3045 + }, + { + "epoch": 2.2126075409166233, + "grad_norm": 0.32757096583048706, + "learning_rate": 6.284100291937083e-06, + "loss": 0.5024, + "step": 3046 + }, + { + "epoch": 2.2133339386647903, + "grad_norm": 0.37712871027281575, + "learning_rate": 6.281792986669406e-06, + "loss": 0.4955, + "step": 3047 + }, + { + "epoch": 2.214060336412957, + "grad_norm": 0.4039253546929864, + "learning_rate": 6.279485389211396e-06, + "loss": 0.4844, + "step": 3048 + }, + { + "epoch": 2.214786734161124, + "grad_norm": 0.3442024462559989, + "learning_rate": 6.277177500089082e-06, + "loss": 0.5045, + "step": 3049 + }, + { + "epoch": 2.215513131909291, + "grad_norm": 0.36987896779171475, + "learning_rate": 6.274869319828558e-06, + "loss": 0.4955, + "step": 3050 + }, + { + "epoch": 2.216239529657458, + "grad_norm": 0.33664971222500834, + "learning_rate": 6.272560848955982e-06, + "loss": 0.4976, + "step": 3051 + }, + { + "epoch": 2.2169659274056253, + "grad_norm": 0.38221122818636305, + "learning_rate": 6.270252087997581e-06, + "loss": 0.4949, + "step": 3052 + }, + { + "epoch": 2.217692325153792, + "grad_norm": 0.3254009518810701, + "learning_rate": 6.267943037479647e-06, + "loss": 0.504, + "step": 3053 + }, + { + "epoch": 2.218418722901959, + "grad_norm": 0.3533296250544195, + "learning_rate": 6.265633697928539e-06, + "loss": 0.495, + "step": 3054 + }, + { + "epoch": 2.219145120650126, + "grad_norm": 0.4377959364901721, + "learning_rate": 6.26332406987068e-06, + "loss": 0.4848, + "step": 3055 + }, + { + "epoch": 2.219871518398293, + "grad_norm": 0.44974371152679193, + "learning_rate": 6.261014153832559e-06, + "loss": 0.5034, + "step": 3056 + }, + { + "epoch": 2.2205979161464597, + "grad_norm": 0.4524681508883102, + "learning_rate": 6.258703950340734e-06, + "loss": 0.5038, + "step": 3057 + }, + { + "epoch": 2.221324313894627, + "grad_norm": 0.45769261482601487, + "learning_rate": 6.256393459921824e-06, + "loss": 0.5083, + "step": 3058 + }, + { + "epoch": 2.222050711642794, + "grad_norm": 0.37051014434450674, + "learning_rate": 6.254082683102517e-06, + "loss": 0.4945, + "step": 3059 + }, + { + "epoch": 2.222777109390961, + "grad_norm": 0.3394404946299637, + "learning_rate": 6.251771620409563e-06, + "loss": 0.4961, + "step": 3060 + }, + { + "epoch": 2.223503507139128, + "grad_norm": 0.33392609888135194, + "learning_rate": 6.24946027236978e-06, + "loss": 0.4922, + "step": 3061 + }, + { + "epoch": 2.2242299048872947, + "grad_norm": 0.39803836210224014, + "learning_rate": 6.24714863951005e-06, + "loss": 0.5061, + "step": 3062 + }, + { + "epoch": 2.2249563026354617, + "grad_norm": 0.5053484367967763, + "learning_rate": 6.244836722357319e-06, + "loss": 0.4951, + "step": 3063 + }, + { + "epoch": 2.225682700383629, + "grad_norm": 0.3402645327719718, + "learning_rate": 6.2425245214386e-06, + "loss": 0.488, + "step": 3064 + }, + { + "epoch": 2.226409098131796, + "grad_norm": 0.363552088042168, + "learning_rate": 6.240212037280967e-06, 
+ "loss": 0.487, + "step": 3065 + }, + { + "epoch": 2.227135495879963, + "grad_norm": 0.5809668434482768, + "learning_rate": 6.237899270411564e-06, + "loss": 0.4819, + "step": 3066 + }, + { + "epoch": 2.2278618936281296, + "grad_norm": 0.36027206956518293, + "learning_rate": 6.2355862213575935e-06, + "loss": 0.5039, + "step": 3067 + }, + { + "epoch": 2.2285882913762967, + "grad_norm": 0.3955705798934775, + "learning_rate": 6.233272890646327e-06, + "loss": 0.4914, + "step": 3068 + }, + { + "epoch": 2.2293146891244637, + "grad_norm": 0.36772804787972874, + "learning_rate": 6.230959278805098e-06, + "loss": 0.494, + "step": 3069 + }, + { + "epoch": 2.230041086872631, + "grad_norm": 0.3811689830122236, + "learning_rate": 6.228645386361304e-06, + "loss": 0.5019, + "step": 3070 + }, + { + "epoch": 2.230767484620798, + "grad_norm": 0.3383061085045838, + "learning_rate": 6.226331213842406e-06, + "loss": 0.4885, + "step": 3071 + }, + { + "epoch": 2.2314938823689645, + "grad_norm": 0.36887931278841335, + "learning_rate": 6.224016761775933e-06, + "loss": 0.493, + "step": 3072 + }, + { + "epoch": 2.2322202801171316, + "grad_norm": 0.47523629811023393, + "learning_rate": 6.2217020306894705e-06, + "loss": 0.4951, + "step": 3073 + }, + { + "epoch": 2.2329466778652987, + "grad_norm": 0.3919649192352115, + "learning_rate": 6.2193870211106745e-06, + "loss": 0.4818, + "step": 3074 + }, + { + "epoch": 2.2336730756134657, + "grad_norm": 0.33973722288683916, + "learning_rate": 6.217071733567261e-06, + "loss": 0.4911, + "step": 3075 + }, + { + "epoch": 2.2343994733616324, + "grad_norm": 0.39854275169930475, + "learning_rate": 6.214756168587009e-06, + "loss": 0.5012, + "step": 3076 + }, + { + "epoch": 2.2351258711097994, + "grad_norm": 0.44043656008659554, + "learning_rate": 6.212440326697762e-06, + "loss": 0.5017, + "step": 3077 + }, + { + "epoch": 2.2358522688579665, + "grad_norm": 0.3722796430104987, + "learning_rate": 6.210124208427427e-06, + "loss": 0.4996, + "step": 3078 + }, + { + "epoch": 2.2365786666061336, + "grad_norm": 0.3412876211388963, + "learning_rate": 6.207807814303973e-06, + "loss": 0.5017, + "step": 3079 + }, + { + "epoch": 2.2373050643543007, + "grad_norm": 0.32142078513383754, + "learning_rate": 6.205491144855432e-06, + "loss": 0.4852, + "step": 3080 + }, + { + "epoch": 2.2380314621024673, + "grad_norm": 0.4026214624360047, + "learning_rate": 6.203174200609899e-06, + "loss": 0.4937, + "step": 3081 + }, + { + "epoch": 2.2387578598506344, + "grad_norm": 0.35889235187183816, + "learning_rate": 6.200856982095532e-06, + "loss": 0.495, + "step": 3082 + }, + { + "epoch": 2.2394842575988014, + "grad_norm": 0.3788649730714019, + "learning_rate": 6.19853948984055e-06, + "loss": 0.4982, + "step": 3083 + }, + { + "epoch": 2.2402106553469685, + "grad_norm": 0.36923423941185074, + "learning_rate": 6.196221724373237e-06, + "loss": 0.4852, + "step": 3084 + }, + { + "epoch": 2.2409370530951356, + "grad_norm": 0.3499125392928531, + "learning_rate": 6.193903686221937e-06, + "loss": 0.4993, + "step": 3085 + }, + { + "epoch": 2.241663450843302, + "grad_norm": 0.41064362066578464, + "learning_rate": 6.191585375915056e-06, + "loss": 0.4944, + "step": 3086 + }, + { + "epoch": 2.2423898485914693, + "grad_norm": 0.38299816267520065, + "learning_rate": 6.189266793981064e-06, + "loss": 0.4726, + "step": 3087 + }, + { + "epoch": 2.2431162463396364, + "grad_norm": 0.3238281351522023, + "learning_rate": 6.186947940948494e-06, + "loss": 0.4907, + "step": 3088 + }, + { + "epoch": 2.2438426440878034, + "grad_norm": 
0.38073181834522724, + "learning_rate": 6.184628817345936e-06, + "loss": 0.5025, + "step": 3089 + }, + { + "epoch": 2.2445690418359705, + "grad_norm": 0.31508364422589685, + "learning_rate": 6.182309423702044e-06, + "loss": 0.4906, + "step": 3090 + }, + { + "epoch": 2.245295439584137, + "grad_norm": 0.47437382669171824, + "learning_rate": 6.179989760545535e-06, + "loss": 0.4859, + "step": 3091 + }, + { + "epoch": 2.246021837332304, + "grad_norm": 0.3875849029709213, + "learning_rate": 6.177669828405186e-06, + "loss": 0.5086, + "step": 3092 + }, + { + "epoch": 2.2467482350804713, + "grad_norm": 0.3891730372212856, + "learning_rate": 6.175349627809839e-06, + "loss": 0.5044, + "step": 3093 + }, + { + "epoch": 2.2474746328286384, + "grad_norm": 0.43368525656394613, + "learning_rate": 6.173029159288388e-06, + "loss": 0.5012, + "step": 3094 + }, + { + "epoch": 2.248201030576805, + "grad_norm": 0.6038511756961967, + "learning_rate": 6.1707084233697974e-06, + "loss": 0.4894, + "step": 3095 + }, + { + "epoch": 2.248927428324972, + "grad_norm": 0.3066072645157146, + "learning_rate": 6.1683874205830905e-06, + "loss": 0.498, + "step": 3096 + }, + { + "epoch": 2.249653826073139, + "grad_norm": 0.3188905488105647, + "learning_rate": 6.166066151457347e-06, + "loss": 0.4794, + "step": 3097 + }, + { + "epoch": 2.250380223821306, + "grad_norm": 0.43514160901236826, + "learning_rate": 6.163744616521712e-06, + "loss": 0.4904, + "step": 3098 + }, + { + "epoch": 2.2511066215694733, + "grad_norm": 0.36486654258921625, + "learning_rate": 6.161422816305389e-06, + "loss": 0.4897, + "step": 3099 + }, + { + "epoch": 2.2518330193176403, + "grad_norm": 0.3480152596351159, + "learning_rate": 6.1591007513376425e-06, + "loss": 0.4867, + "step": 3100 + }, + { + "epoch": 2.252559417065807, + "grad_norm": 0.6364902568308674, + "learning_rate": 6.156778422147797e-06, + "loss": 0.5002, + "step": 3101 + }, + { + "epoch": 2.253285814813974, + "grad_norm": 0.3973215322038487, + "learning_rate": 6.154455829265239e-06, + "loss": 0.4754, + "step": 3102 + }, + { + "epoch": 2.254012212562141, + "grad_norm": 0.4008644063048734, + "learning_rate": 6.152132973219414e-06, + "loss": 0.5, + "step": 3103 + }, + { + "epoch": 2.254738610310308, + "grad_norm": 0.770372781416335, + "learning_rate": 6.1498098545398255e-06, + "loss": 0.4892, + "step": 3104 + }, + { + "epoch": 2.255465008058475, + "grad_norm": 0.3680821695745185, + "learning_rate": 6.147486473756039e-06, + "loss": 0.4938, + "step": 3105 + }, + { + "epoch": 2.256191405806642, + "grad_norm": 0.4193679580025291, + "learning_rate": 6.14516283139768e-06, + "loss": 0.4937, + "step": 3106 + }, + { + "epoch": 2.256917803554809, + "grad_norm": 0.35600056467080976, + "learning_rate": 6.142838927994432e-06, + "loss": 0.491, + "step": 3107 + }, + { + "epoch": 2.257644201302976, + "grad_norm": 0.3740725509927286, + "learning_rate": 6.1405147640760395e-06, + "loss": 0.4931, + "step": 3108 + }, + { + "epoch": 2.258370599051143, + "grad_norm": 0.34855260413863015, + "learning_rate": 6.138190340172308e-06, + "loss": 0.5016, + "step": 3109 + }, + { + "epoch": 2.2590969967993098, + "grad_norm": 0.5207683008045049, + "learning_rate": 6.135865656813095e-06, + "loss": 0.5182, + "step": 3110 + }, + { + "epoch": 2.259823394547477, + "grad_norm": 0.40710281188709585, + "learning_rate": 6.1335407145283265e-06, + "loss": 0.4872, + "step": 3111 + }, + { + "epoch": 2.260549792295644, + "grad_norm": 0.38524703767888147, + "learning_rate": 6.131215513847982e-06, + "loss": 0.4888, + "step": 3112 + }, + { + 
"epoch": 2.261276190043811, + "grad_norm": 0.3611442300937129, + "learning_rate": 6.128890055302103e-06, + "loss": 0.4971, + "step": 3113 + }, + { + "epoch": 2.2620025877919776, + "grad_norm": 0.3726264631189811, + "learning_rate": 6.126564339420784e-06, + "loss": 0.4982, + "step": 3114 + }, + { + "epoch": 2.2627289855401447, + "grad_norm": 0.35003659748054033, + "learning_rate": 6.124238366734185e-06, + "loss": 0.4857, + "step": 3115 + }, + { + "epoch": 2.2634553832883118, + "grad_norm": 0.44795422664526696, + "learning_rate": 6.121912137772521e-06, + "loss": 0.4933, + "step": 3116 + }, + { + "epoch": 2.264181781036479, + "grad_norm": 0.38836552057562496, + "learning_rate": 6.119585653066067e-06, + "loss": 0.4868, + "step": 3117 + }, + { + "epoch": 2.264908178784646, + "grad_norm": 0.3874958318464041, + "learning_rate": 6.117258913145153e-06, + "loss": 0.4896, + "step": 3118 + }, + { + "epoch": 2.265634576532813, + "grad_norm": 0.4184151509447441, + "learning_rate": 6.114931918540172e-06, + "loss": 0.4927, + "step": 3119 + }, + { + "epoch": 2.2663609742809796, + "grad_norm": 0.5489700776940928, + "learning_rate": 6.112604669781572e-06, + "loss": 0.4811, + "step": 3120 + }, + { + "epoch": 2.2670873720291467, + "grad_norm": 0.37173040857538353, + "learning_rate": 6.110277167399861e-06, + "loss": 0.4978, + "step": 3121 + }, + { + "epoch": 2.2678137697773137, + "grad_norm": 0.3215371108487471, + "learning_rate": 6.107949411925599e-06, + "loss": 0.4808, + "step": 3122 + }, + { + "epoch": 2.268540167525481, + "grad_norm": 0.4337388157888445, + "learning_rate": 6.105621403889411e-06, + "loss": 0.501, + "step": 3123 + }, + { + "epoch": 2.2692665652736475, + "grad_norm": 0.33974619025502883, + "learning_rate": 6.103293143821978e-06, + "loss": 0.4955, + "step": 3124 + }, + { + "epoch": 2.2699929630218145, + "grad_norm": 0.4190678472874704, + "learning_rate": 6.100964632254033e-06, + "loss": 0.489, + "step": 3125 + }, + { + "epoch": 2.2707193607699816, + "grad_norm": 0.4983709975295048, + "learning_rate": 6.098635869716375e-06, + "loss": 0.4962, + "step": 3126 + }, + { + "epoch": 2.2714457585181487, + "grad_norm": 0.4360393265527959, + "learning_rate": 6.0963068567398535e-06, + "loss": 0.49, + "step": 3127 + }, + { + "epoch": 2.2721721562663157, + "grad_norm": 0.347103768192224, + "learning_rate": 6.093977593855376e-06, + "loss": 0.4925, + "step": 3128 + }, + { + "epoch": 2.2728985540144824, + "grad_norm": 0.4549595372358685, + "learning_rate": 6.0916480815939095e-06, + "loss": 0.4792, + "step": 3129 + }, + { + "epoch": 2.2736249517626494, + "grad_norm": 0.42519053661063105, + "learning_rate": 6.089318320486477e-06, + "loss": 0.5004, + "step": 3130 + }, + { + "epoch": 2.2743513495108165, + "grad_norm": 0.34401001903742934, + "learning_rate": 6.086988311064157e-06, + "loss": 0.4918, + "step": 3131 + }, + { + "epoch": 2.2750777472589836, + "grad_norm": 0.3675422068036801, + "learning_rate": 6.084658053858086e-06, + "loss": 0.4946, + "step": 3132 + }, + { + "epoch": 2.2758041450071507, + "grad_norm": 0.6373015019480114, + "learning_rate": 6.082327549399456e-06, + "loss": 0.4968, + "step": 3133 + }, + { + "epoch": 2.2765305427553173, + "grad_norm": 0.9060836261092253, + "learning_rate": 6.079996798219516e-06, + "loss": 0.5049, + "step": 3134 + }, + { + "epoch": 2.2772569405034844, + "grad_norm": 0.4086724137891712, + "learning_rate": 6.077665800849568e-06, + "loss": 0.4936, + "step": 3135 + }, + { + "epoch": 2.2779833382516514, + "grad_norm": 0.42824112282521215, + "learning_rate": 
6.075334557820977e-06, + "loss": 0.5069, + "step": 3136 + }, + { + "epoch": 2.2787097359998185, + "grad_norm": 0.4370864620437589, + "learning_rate": 6.073003069665161e-06, + "loss": 0.4845, + "step": 3137 + }, + { + "epoch": 2.2794361337479856, + "grad_norm": 0.34589256703550214, + "learning_rate": 6.070671336913588e-06, + "loss": 0.4836, + "step": 3138 + }, + { + "epoch": 2.2801625314961522, + "grad_norm": 0.44081977847163717, + "learning_rate": 6.068339360097791e-06, + "loss": 0.499, + "step": 3139 + }, + { + "epoch": 2.2808889292443193, + "grad_norm": 0.3693827842084408, + "learning_rate": 6.066007139749351e-06, + "loss": 0.4819, + "step": 3140 + }, + { + "epoch": 2.2816153269924864, + "grad_norm": 0.4114801998919762, + "learning_rate": 6.063674676399911e-06, + "loss": 0.4971, + "step": 3141 + }, + { + "epoch": 2.2823417247406534, + "grad_norm": 0.5174468598265398, + "learning_rate": 6.061341970581165e-06, + "loss": 0.5008, + "step": 3142 + }, + { + "epoch": 2.28306812248882, + "grad_norm": 0.7341098293975072, + "learning_rate": 6.0590090228248625e-06, + "loss": 0.4929, + "step": 3143 + }, + { + "epoch": 2.283794520236987, + "grad_norm": 0.3505216954061479, + "learning_rate": 6.056675833662811e-06, + "loss": 0.5036, + "step": 3144 + }, + { + "epoch": 2.284520917985154, + "grad_norm": 0.3540500347732656, + "learning_rate": 6.0543424036268675e-06, + "loss": 0.4915, + "step": 3145 + }, + { + "epoch": 2.2852473157333213, + "grad_norm": 0.44413442338960246, + "learning_rate": 6.052008733248954e-06, + "loss": 0.5071, + "step": 3146 + }, + { + "epoch": 2.2859737134814884, + "grad_norm": 0.3900031444781187, + "learning_rate": 6.049674823061036e-06, + "loss": 0.5175, + "step": 3147 + }, + { + "epoch": 2.286700111229655, + "grad_norm": 0.32819673521829124, + "learning_rate": 6.047340673595139e-06, + "loss": 0.4848, + "step": 3148 + }, + { + "epoch": 2.287426508977822, + "grad_norm": 0.37858795931699807, + "learning_rate": 6.045006285383342e-06, + "loss": 0.4929, + "step": 3149 + }, + { + "epoch": 2.288152906725989, + "grad_norm": 0.3666616444696444, + "learning_rate": 6.042671658957783e-06, + "loss": 0.5038, + "step": 3150 + }, + { + "epoch": 2.288879304474156, + "grad_norm": 0.3633303205358319, + "learning_rate": 6.040336794850646e-06, + "loss": 0.4951, + "step": 3151 + }, + { + "epoch": 2.2896057022223233, + "grad_norm": 1.1115070192327328, + "learning_rate": 6.038001693594176e-06, + "loss": 0.4958, + "step": 3152 + }, + { + "epoch": 2.29033209997049, + "grad_norm": 0.326420594941503, + "learning_rate": 6.03566635572067e-06, + "loss": 0.4869, + "step": 3153 + }, + { + "epoch": 2.291058497718657, + "grad_norm": 0.3963449158303792, + "learning_rate": 6.033330781762476e-06, + "loss": 0.4788, + "step": 3154 + }, + { + "epoch": 2.291784895466824, + "grad_norm": 0.5105806805590508, + "learning_rate": 6.0309949722519986e-06, + "loss": 0.5032, + "step": 3155 + }, + { + "epoch": 2.292511293214991, + "grad_norm": 0.4155693537256211, + "learning_rate": 6.028658927721698e-06, + "loss": 0.4902, + "step": 3156 + }, + { + "epoch": 2.293237690963158, + "grad_norm": 0.4138282695922685, + "learning_rate": 6.0263226487040845e-06, + "loss": 0.4809, + "step": 3157 + }, + { + "epoch": 2.293964088711325, + "grad_norm": 0.37462980188091954, + "learning_rate": 6.023986135731724e-06, + "loss": 0.4942, + "step": 3158 + }, + { + "epoch": 2.294690486459492, + "grad_norm": 0.4289202950128778, + "learning_rate": 6.021649389337234e-06, + "loss": 0.498, + "step": 3159 + }, + { + "epoch": 2.295416884207659, + "grad_norm": 
0.5635091191246396, + "learning_rate": 6.019312410053286e-06, + "loss": 0.489, + "step": 3160 + }, + { + "epoch": 2.296143281955826, + "grad_norm": 0.4607311853871575, + "learning_rate": 6.016975198412606e-06, + "loss": 0.4946, + "step": 3161 + }, + { + "epoch": 2.2968696797039927, + "grad_norm": 0.6935083323300786, + "learning_rate": 6.014637754947969e-06, + "loss": 0.4811, + "step": 3162 + }, + { + "epoch": 2.2975960774521598, + "grad_norm": 0.41357546263443523, + "learning_rate": 6.012300080192208e-06, + "loss": 0.5075, + "step": 3163 + }, + { + "epoch": 2.298322475200327, + "grad_norm": 0.3542243619660625, + "learning_rate": 6.0099621746782035e-06, + "loss": 0.4971, + "step": 3164 + }, + { + "epoch": 2.299048872948494, + "grad_norm": 0.44134295561503295, + "learning_rate": 6.007624038938895e-06, + "loss": 0.4888, + "step": 3165 + }, + { + "epoch": 2.299775270696661, + "grad_norm": 0.39417830167581025, + "learning_rate": 6.0052856735072685e-06, + "loss": 0.4984, + "step": 3166 + }, + { + "epoch": 2.300501668444828, + "grad_norm": 0.33506843695028593, + "learning_rate": 6.002947078916365e-06, + "loss": 0.4846, + "step": 3167 + }, + { + "epoch": 2.3012280661929947, + "grad_norm": 0.394726807159148, + "learning_rate": 6.000608255699277e-06, + "loss": 0.5015, + "step": 3168 + }, + { + "epoch": 2.3019544639411618, + "grad_norm": 0.35592888824186264, + "learning_rate": 5.99826920438915e-06, + "loss": 0.4928, + "step": 3169 + }, + { + "epoch": 2.302680861689329, + "grad_norm": 0.3741228112311038, + "learning_rate": 5.995929925519181e-06, + "loss": 0.4748, + "step": 3170 + }, + { + "epoch": 2.303407259437496, + "grad_norm": 0.4118521684904315, + "learning_rate": 5.993590419622619e-06, + "loss": 0.4916, + "step": 3171 + }, + { + "epoch": 2.3041336571856625, + "grad_norm": 0.4161129936032844, + "learning_rate": 5.991250687232764e-06, + "loss": 0.4841, + "step": 3172 + }, + { + "epoch": 2.3048600549338296, + "grad_norm": 0.42952160504666864, + "learning_rate": 5.98891072888297e-06, + "loss": 0.4884, + "step": 3173 + }, + { + "epoch": 2.3055864526819967, + "grad_norm": 0.39432337272336604, + "learning_rate": 5.986570545106638e-06, + "loss": 0.4956, + "step": 3174 + }, + { + "epoch": 2.3063128504301638, + "grad_norm": 0.3996450987725706, + "learning_rate": 5.984230136437226e-06, + "loss": 0.4968, + "step": 3175 + }, + { + "epoch": 2.307039248178331, + "grad_norm": 0.3391473825126834, + "learning_rate": 5.981889503408238e-06, + "loss": 0.4883, + "step": 3176 + }, + { + "epoch": 2.3077656459264975, + "grad_norm": 0.3267696126670061, + "learning_rate": 5.9795486465532325e-06, + "loss": 0.4978, + "step": 3177 + }, + { + "epoch": 2.3084920436746645, + "grad_norm": 0.3922857284244964, + "learning_rate": 5.977207566405818e-06, + "loss": 0.4833, + "step": 3178 + }, + { + "epoch": 2.3092184414228316, + "grad_norm": 0.42615109149541347, + "learning_rate": 5.974866263499655e-06, + "loss": 0.4861, + "step": 3179 + }, + { + "epoch": 2.3099448391709987, + "grad_norm": 0.39246348259809566, + "learning_rate": 5.972524738368452e-06, + "loss": 0.499, + "step": 3180 + }, + { + "epoch": 2.3106712369191653, + "grad_norm": 0.3811353423363143, + "learning_rate": 5.970182991545972e-06, + "loss": 0.4878, + "step": 3181 + }, + { + "epoch": 2.3113976346673324, + "grad_norm": 0.3538069167085555, + "learning_rate": 5.967841023566025e-06, + "loss": 0.4999, + "step": 3182 + }, + { + "epoch": 2.3121240324154995, + "grad_norm": 0.451875857319256, + "learning_rate": 5.965498834962472e-06, + "loss": 0.4811, + "step": 3183 + }, + { + 
"epoch": 2.3128504301636665, + "grad_norm": 0.5972179413766493, + "learning_rate": 5.963156426269228e-06, + "loss": 0.4903, + "step": 3184 + }, + { + "epoch": 2.3135768279118336, + "grad_norm": 0.41997194324350673, + "learning_rate": 5.960813798020252e-06, + "loss": 0.4753, + "step": 3185 + }, + { + "epoch": 2.3143032256600007, + "grad_norm": 0.6097372572334416, + "learning_rate": 5.9584709507495594e-06, + "loss": 0.5037, + "step": 3186 + }, + { + "epoch": 2.3150296234081673, + "grad_norm": 0.5162731967709743, + "learning_rate": 5.956127884991211e-06, + "loss": 0.4946, + "step": 3187 + }, + { + "epoch": 2.3157560211563344, + "grad_norm": 0.41135839595878126, + "learning_rate": 5.9537846012793184e-06, + "loss": 0.497, + "step": 3188 + }, + { + "epoch": 2.3164824189045015, + "grad_norm": 0.5225429394800519, + "learning_rate": 5.9514411001480435e-06, + "loss": 0.5023, + "step": 3189 + }, + { + "epoch": 2.3172088166526685, + "grad_norm": 0.36058624696826763, + "learning_rate": 5.949097382131599e-06, + "loss": 0.4968, + "step": 3190 + }, + { + "epoch": 2.317935214400835, + "grad_norm": 0.4896109590425641, + "learning_rate": 5.946753447764245e-06, + "loss": 0.4831, + "step": 3191 + }, + { + "epoch": 2.3186616121490022, + "grad_norm": 0.37739851530243934, + "learning_rate": 5.9444092975802925e-06, + "loss": 0.5023, + "step": 3192 + }, + { + "epoch": 2.3193880098971693, + "grad_norm": 0.3958899188666041, + "learning_rate": 5.942064932114099e-06, + "loss": 0.49, + "step": 3193 + }, + { + "epoch": 2.3201144076453364, + "grad_norm": 0.5308468797720521, + "learning_rate": 5.939720351900072e-06, + "loss": 0.497, + "step": 3194 + }, + { + "epoch": 2.3208408053935035, + "grad_norm": 0.3649823573866849, + "learning_rate": 5.9373755574726735e-06, + "loss": 0.4998, + "step": 3195 + }, + { + "epoch": 2.32156720314167, + "grad_norm": 0.3904577934507057, + "learning_rate": 5.935030549366405e-06, + "loss": 0.485, + "step": 3196 + }, + { + "epoch": 2.322293600889837, + "grad_norm": 0.34420467341162336, + "learning_rate": 5.932685328115823e-06, + "loss": 0.4921, + "step": 3197 + }, + { + "epoch": 2.3230199986380042, + "grad_norm": 0.3332275063766719, + "learning_rate": 5.930339894255533e-06, + "loss": 0.495, + "step": 3198 + }, + { + "epoch": 2.3237463963861713, + "grad_norm": 0.38028556409357467, + "learning_rate": 5.927994248320183e-06, + "loss": 0.4818, + "step": 3199 + }, + { + "epoch": 2.324472794134338, + "grad_norm": 0.42536308376316817, + "learning_rate": 5.925648390844476e-06, + "loss": 0.4848, + "step": 3200 + }, + { + "epoch": 2.1526435235773724, + "grad_norm": 0.36400339398627185, + "learning_rate": 6.446353231581457e-06, + "loss": 0.4911, + "step": 3201 + }, + { + "epoch": 2.1533159620072286, + "grad_norm": 0.37954296733354875, + "learning_rate": 6.4442456764159814e-06, + "loss": 0.4976, + "step": 3202 + }, + { + "epoch": 2.153988400437085, + "grad_norm": 0.45941645823821337, + "learning_rate": 6.44213784125384e-06, + "loss": 0.4858, + "step": 3203 + }, + { + "epoch": 2.154660838866941, + "grad_norm": 0.40495850868238187, + "learning_rate": 6.4400297265036795e-06, + "loss": 0.4881, + "step": 3204 + }, + { + "epoch": 2.1553332772967977, + "grad_norm": 0.3544217596686645, + "learning_rate": 6.437921332574203e-06, + "loss": 0.4921, + "step": 3205 + }, + { + "epoch": 2.156005715726654, + "grad_norm": 0.35855590079318694, + "learning_rate": 6.435812659874163e-06, + "loss": 0.4878, + "step": 3206 + }, + { + "epoch": 2.15667815415651, + "grad_norm": 0.4257310659468136, + "learning_rate": 
6.4337037088123734e-06, + "loss": 0.4784, + "step": 3207 + }, + { + "epoch": 2.1573505925863663, + "grad_norm": 0.359797194091946, + "learning_rate": 6.431594479797693e-06, + "loss": 0.477, + "step": 3208 + }, + { + "epoch": 2.1580230310162225, + "grad_norm": 0.5311983101781117, + "learning_rate": 6.429484973239041e-06, + "loss": 0.4895, + "step": 3209 + }, + { + "epoch": 2.1586954694460787, + "grad_norm": 0.3738818622798392, + "learning_rate": 6.4273751895453894e-06, + "loss": 0.4743, + "step": 3210 + }, + { + "epoch": 2.159367907875935, + "grad_norm": 0.3760099340466295, + "learning_rate": 6.425265129125762e-06, + "loss": 0.4886, + "step": 3211 + }, + { + "epoch": 2.1600403463057916, + "grad_norm": 0.39293130543136123, + "learning_rate": 6.423154792389237e-06, + "loss": 0.4735, + "step": 3212 + }, + { + "epoch": 2.1607127847356478, + "grad_norm": 0.3819149221012835, + "learning_rate": 6.421044179744946e-06, + "loss": 0.4881, + "step": 3213 + }, + { + "epoch": 2.161385223165504, + "grad_norm": 0.3526992849837529, + "learning_rate": 6.418933291602079e-06, + "loss": 0.4905, + "step": 3214 + }, + { + "epoch": 2.16205766159536, + "grad_norm": 0.39242237563550325, + "learning_rate": 6.4168221283698696e-06, + "loss": 0.478, + "step": 3215 + }, + { + "epoch": 2.1627301000252164, + "grad_norm": 0.5052242268359664, + "learning_rate": 6.414710690457613e-06, + "loss": 0.4813, + "step": 3216 + }, + { + "epoch": 2.1634025384550726, + "grad_norm": 0.3974416216109638, + "learning_rate": 6.4125989782746535e-06, + "loss": 0.4613, + "step": 3217 + }, + { + "epoch": 2.164074976884929, + "grad_norm": 0.3953775362044758, + "learning_rate": 6.4104869922303905e-06, + "loss": 0.474, + "step": 3218 + }, + { + "epoch": 2.1647474153147854, + "grad_norm": 0.9970467066795999, + "learning_rate": 6.408374732734275e-06, + "loss": 0.4636, + "step": 3219 + }, + { + "epoch": 2.1654198537446416, + "grad_norm": 0.3449898830543444, + "learning_rate": 6.406262200195812e-06, + "loss": 0.4636, + "step": 3220 + }, + { + "epoch": 2.166092292174498, + "grad_norm": 0.3320632740312974, + "learning_rate": 6.4041493950245605e-06, + "loss": 0.4874, + "step": 3221 + }, + { + "epoch": 2.166764730604354, + "grad_norm": 0.3599970105192158, + "learning_rate": 6.40203631763013e-06, + "loss": 0.452, + "step": 3222 + }, + { + "epoch": 2.1674371690342102, + "grad_norm": 0.49644697700894996, + "learning_rate": 6.399922968422186e-06, + "loss": 0.4668, + "step": 3223 + }, + { + "epoch": 2.1681096074640664, + "grad_norm": 0.42659876148794784, + "learning_rate": 6.397809347810441e-06, + "loss": 0.4592, + "step": 3224 + }, + { + "epoch": 2.1687820458939226, + "grad_norm": 0.39713710109686995, + "learning_rate": 6.395695456204667e-06, + "loss": 0.4839, + "step": 3225 + }, + { + "epoch": 2.1694544843237793, + "grad_norm": 0.44749879329318243, + "learning_rate": 6.3935812940146835e-06, + "loss": 0.4735, + "step": 3226 + }, + { + "epoch": 2.1701269227536355, + "grad_norm": 0.4321525163181309, + "learning_rate": 6.391466861650365e-06, + "loss": 0.4775, + "step": 3227 + }, + { + "epoch": 2.1707993611834917, + "grad_norm": 0.3855482525351591, + "learning_rate": 6.389352159521636e-06, + "loss": 0.461, + "step": 3228 + }, + { + "epoch": 2.171471799613348, + "grad_norm": 0.44598796268875185, + "learning_rate": 6.3872371880384776e-06, + "loss": 0.4644, + "step": 3229 + }, + { + "epoch": 2.172144238043204, + "grad_norm": 0.3776752834861266, + "learning_rate": 6.3851219476109184e-06, + "loss": 0.487, + "step": 3230 + }, + { + "epoch": 2.1728166764730603, + 
"grad_norm": 0.3441671467053756, + "learning_rate": 6.383006438649042e-06, + "loss": 0.4682, + "step": 3231 + }, + { + "epoch": 2.1734891149029165, + "grad_norm": 0.38288668022048156, + "learning_rate": 6.380890661562984e-06, + "loss": 0.4876, + "step": 3232 + }, + { + "epoch": 2.174161553332773, + "grad_norm": 0.4303974490863329, + "learning_rate": 6.3787746167629286e-06, + "loss": 0.4628, + "step": 3233 + }, + { + "epoch": 2.1748339917626294, + "grad_norm": 0.44345318856219873, + "learning_rate": 6.376658304659118e-06, + "loss": 0.4638, + "step": 3234 + }, + { + "epoch": 2.1755064301924856, + "grad_norm": 0.4281777031529893, + "learning_rate": 6.37454172566184e-06, + "loss": 0.4844, + "step": 3235 + }, + { + "epoch": 2.1761788686223418, + "grad_norm": 0.3941670465285334, + "learning_rate": 6.372424880181437e-06, + "loss": 0.4864, + "step": 3236 + }, + { + "epoch": 2.176851307052198, + "grad_norm": 0.4568583459931907, + "learning_rate": 6.3703077686283045e-06, + "loss": 0.4735, + "step": 3237 + }, + { + "epoch": 2.177523745482054, + "grad_norm": 0.5070676742381048, + "learning_rate": 6.368190391412887e-06, + "loss": 0.4655, + "step": 3238 + }, + { + "epoch": 2.1781961839119104, + "grad_norm": 0.34743038619990396, + "learning_rate": 6.3660727489456815e-06, + "loss": 0.4652, + "step": 3239 + }, + { + "epoch": 2.178868622341767, + "grad_norm": 0.39638695582927597, + "learning_rate": 6.3639548416372385e-06, + "loss": 0.4702, + "step": 3240 + }, + { + "epoch": 2.1795410607716232, + "grad_norm": 0.40020178726324046, + "learning_rate": 6.3618366698981545e-06, + "loss": 0.4537, + "step": 3241 + }, + { + "epoch": 2.1802134992014794, + "grad_norm": 0.37629287034991654, + "learning_rate": 6.359718234139082e-06, + "loss": 0.4581, + "step": 3242 + }, + { + "epoch": 2.1808859376313356, + "grad_norm": 0.4216261561384884, + "learning_rate": 6.357599534770724e-06, + "loss": 0.4898, + "step": 3243 + }, + { + "epoch": 2.181558376061192, + "grad_norm": 0.33402757128286764, + "learning_rate": 6.355480572203834e-06, + "loss": 0.4574, + "step": 3244 + }, + { + "epoch": 2.182230814491048, + "grad_norm": 0.365971405847702, + "learning_rate": 6.353361346849214e-06, + "loss": 0.4571, + "step": 3245 + }, + { + "epoch": 2.1829032529209043, + "grad_norm": 0.3427010018820965, + "learning_rate": 6.35124185911772e-06, + "loss": 0.48, + "step": 3246 + }, + { + "epoch": 2.183575691350761, + "grad_norm": 0.36662548856695765, + "learning_rate": 6.349122109420262e-06, + "loss": 0.4668, + "step": 3247 + }, + { + "epoch": 2.184248129780617, + "grad_norm": 0.3478504185173162, + "learning_rate": 6.3470020981677915e-06, + "loss": 0.4723, + "step": 3248 + }, + { + "epoch": 2.1849205682104733, + "grad_norm": 0.31574374838479446, + "learning_rate": 6.3448818257713205e-06, + "loss": 0.4635, + "step": 3249 + }, + { + "epoch": 2.1855930066403295, + "grad_norm": 0.43950264307355086, + "learning_rate": 6.3427612926419056e-06, + "loss": 0.4791, + "step": 3250 + }, + { + "epoch": 2.1862654450701857, + "grad_norm": 0.3558974965207327, + "learning_rate": 6.340640499190656e-06, + "loss": 0.4614, + "step": 3251 + }, + { + "epoch": 2.186937883500042, + "grad_norm": 0.6481493882892034, + "learning_rate": 6.338519445828728e-06, + "loss": 0.4551, + "step": 3252 + }, + { + "epoch": 2.187610321929898, + "grad_norm": 0.39982021103181276, + "learning_rate": 6.336398132967337e-06, + "loss": 0.4529, + "step": 3253 + }, + { + "epoch": 2.1882827603597548, + "grad_norm": 0.3361374722833279, + "learning_rate": 6.334276561017738e-06, + "loss": 0.475, + 
"step": 3254 + }, + { + "epoch": 2.188955198789611, + "grad_norm": 0.3661649872244156, + "learning_rate": 6.3321547303912445e-06, + "loss": 0.4701, + "step": 3255 + }, + { + "epoch": 2.189627637219467, + "grad_norm": 0.38467033502442166, + "learning_rate": 6.330032641499216e-06, + "loss": 0.4654, + "step": 3256 + }, + { + "epoch": 2.1903000756493234, + "grad_norm": 0.33990424949533915, + "learning_rate": 6.3279102947530626e-06, + "loss": 0.4636, + "step": 3257 + }, + { + "epoch": 2.1909725140791796, + "grad_norm": 0.5088859601290591, + "learning_rate": 6.325787690564246e-06, + "loss": 0.4741, + "step": 3258 + }, + { + "epoch": 2.191644952509036, + "grad_norm": 0.4521884277447263, + "learning_rate": 6.3236648293442735e-06, + "loss": 0.4617, + "step": 3259 + }, + { + "epoch": 2.192317390938892, + "grad_norm": 0.3272684919643106, + "learning_rate": 6.321541711504708e-06, + "loss": 0.4675, + "step": 3260 + }, + { + "epoch": 2.1929898293687486, + "grad_norm": 0.3178252049326452, + "learning_rate": 6.319418337457158e-06, + "loss": 0.4601, + "step": 3261 + }, + { + "epoch": 2.193662267798605, + "grad_norm": 0.343212213649888, + "learning_rate": 6.317294707613287e-06, + "loss": 0.4823, + "step": 3262 + }, + { + "epoch": 2.194334706228461, + "grad_norm": 0.36111254446942886, + "learning_rate": 6.315170822384797e-06, + "loss": 0.4666, + "step": 3263 + }, + { + "epoch": 2.1950071446583173, + "grad_norm": 0.3247937151282206, + "learning_rate": 6.313046682183452e-06, + "loss": 0.4479, + "step": 3264 + }, + { + "epoch": 2.1956795830881735, + "grad_norm": 0.32402685278862575, + "learning_rate": 6.310922287421062e-06, + "loss": 0.4686, + "step": 3265 + }, + { + "epoch": 2.1963520215180297, + "grad_norm": 0.3085536264866017, + "learning_rate": 6.308797638509478e-06, + "loss": 0.4792, + "step": 3266 + }, + { + "epoch": 2.197024459947886, + "grad_norm": 0.3181533538129706, + "learning_rate": 6.3066727358606126e-06, + "loss": 0.4721, + "step": 3267 + }, + { + "epoch": 2.1976968983777425, + "grad_norm": 0.3312265223041896, + "learning_rate": 6.304547579886419e-06, + "loss": 0.4547, + "step": 3268 + }, + { + "epoch": 2.1983693368075987, + "grad_norm": 0.328345337965804, + "learning_rate": 6.302422170998903e-06, + "loss": 0.4789, + "step": 3269 + }, + { + "epoch": 2.199041775237455, + "grad_norm": 0.39821230084937137, + "learning_rate": 6.300296509610117e-06, + "loss": 0.4763, + "step": 3270 + }, + { + "epoch": 2.199714213667311, + "grad_norm": 0.45659507362333, + "learning_rate": 6.298170596132168e-06, + "loss": 0.4832, + "step": 3271 + }, + { + "epoch": 2.2003866520971673, + "grad_norm": 0.3582687048929397, + "learning_rate": 6.296044430977205e-06, + "loss": 0.4933, + "step": 3272 + }, + { + "epoch": 2.2010590905270235, + "grad_norm": 0.3347341994137685, + "learning_rate": 6.293918014557429e-06, + "loss": 0.4704, + "step": 3273 + }, + { + "epoch": 2.2017315289568797, + "grad_norm": 0.3215277495026712, + "learning_rate": 6.2917913472850915e-06, + "loss": 0.4629, + "step": 3274 + }, + { + "epoch": 2.2024039673867364, + "grad_norm": 0.3515898713741847, + "learning_rate": 6.289664429572486e-06, + "loss": 0.4675, + "step": 3275 + }, + { + "epoch": 2.2030764058165926, + "grad_norm": 0.40158213129557646, + "learning_rate": 6.287537261831965e-06, + "loss": 0.4712, + "step": 3276 + }, + { + "epoch": 2.203748844246449, + "grad_norm": 0.36658677429253866, + "learning_rate": 6.285409844475918e-06, + "loss": 0.4657, + "step": 3277 + }, + { + "epoch": 2.204421282676305, + "grad_norm": 0.40253283718196536, + "learning_rate": 
6.283282177916794e-06, + "loss": 0.4712, + "step": 3278 + }, + { + "epoch": 2.205093721106161, + "grad_norm": 0.35863478731973064, + "learning_rate": 6.28115426256708e-06, + "loss": 0.4657, + "step": 3279 + }, + { + "epoch": 2.2057661595360174, + "grad_norm": 0.3305351152359045, + "learning_rate": 6.279026098839321e-06, + "loss": 0.4557, + "step": 3280 + }, + { + "epoch": 2.2064385979658736, + "grad_norm": 0.3318879526526031, + "learning_rate": 6.2768976871460985e-06, + "loss": 0.4601, + "step": 3281 + }, + { + "epoch": 2.20711103639573, + "grad_norm": 0.35760047494368197, + "learning_rate": 6.274769027900056e-06, + "loss": 0.478, + "step": 3282 + }, + { + "epoch": 2.2077834748255865, + "grad_norm": 0.34678732685220764, + "learning_rate": 6.272640121513872e-06, + "loss": 0.471, + "step": 3283 + }, + { + "epoch": 2.2084559132554427, + "grad_norm": 0.32565899960537, + "learning_rate": 6.270510968400283e-06, + "loss": 0.4814, + "step": 3284 + }, + { + "epoch": 2.209128351685299, + "grad_norm": 0.3208575439821756, + "learning_rate": 6.268381568972065e-06, + "loss": 0.4651, + "step": 3285 + }, + { + "epoch": 2.209800790115155, + "grad_norm": 0.4131626036588382, + "learning_rate": 6.266251923642049e-06, + "loss": 0.4898, + "step": 3286 + }, + { + "epoch": 2.2104732285450113, + "grad_norm": 0.3806906259045866, + "learning_rate": 6.264122032823111e-06, + "loss": 0.4727, + "step": 3287 + }, + { + "epoch": 2.2111456669748675, + "grad_norm": 0.7017980432113152, + "learning_rate": 6.26199189692817e-06, + "loss": 0.4762, + "step": 3288 + }, + { + "epoch": 2.2118181054047237, + "grad_norm": 0.3308519344551586, + "learning_rate": 6.259861516370201e-06, + "loss": 0.4555, + "step": 3289 + }, + { + "epoch": 2.2124905438345803, + "grad_norm": 0.3946936336332011, + "learning_rate": 6.2577308915622196e-06, + "loss": 0.4517, + "step": 3290 + }, + { + "epoch": 2.2131629822644365, + "grad_norm": 0.41976183541831025, + "learning_rate": 6.255600022917292e-06, + "loss": 0.4712, + "step": 3291 + }, + { + "epoch": 2.2138354206942927, + "grad_norm": 0.3441105184459733, + "learning_rate": 6.253468910848529e-06, + "loss": 0.4409, + "step": 3292 + }, + { + "epoch": 2.214507859124149, + "grad_norm": 0.440199416455677, + "learning_rate": 6.251337555769093e-06, + "loss": 0.4681, + "step": 3293 + }, + { + "epoch": 2.215180297554005, + "grad_norm": 0.45599533834389516, + "learning_rate": 6.2492059580921886e-06, + "loss": 0.4509, + "step": 3294 + }, + { + "epoch": 2.2158527359838613, + "grad_norm": 0.6261088005455595, + "learning_rate": 6.2470741182310725e-06, + "loss": 0.4504, + "step": 3295 + }, + { + "epoch": 2.2165251744137175, + "grad_norm": 0.32376267307506235, + "learning_rate": 6.244942036599042e-06, + "loss": 0.4565, + "step": 3296 + }, + { + "epoch": 2.217197612843574, + "grad_norm": 0.4449321605914357, + "learning_rate": 6.2428097136094476e-06, + "loss": 0.4837, + "step": 3297 + }, + { + "epoch": 2.2178700512734304, + "grad_norm": 0.4372822395750341, + "learning_rate": 6.240677149675684e-06, + "loss": 0.4713, + "step": 3298 + }, + { + "epoch": 2.2185424897032866, + "grad_norm": 0.7497825332115169, + "learning_rate": 6.2385443452111915e-06, + "loss": 0.4561, + "step": 3299 + }, + { + "epoch": 2.219214928133143, + "grad_norm": 0.3561248007825388, + "learning_rate": 6.236411300629458e-06, + "loss": 0.4714, + "step": 3300 + }, + { + "epoch": 2.219887366562999, + "grad_norm": 0.355500848926921, + "learning_rate": 6.234278016344018e-06, + "loss": 0.4625, + "step": 3301 + }, + { + "epoch": 2.220559804992855, + "grad_norm": 
0.30501535700416, + "learning_rate": 6.232144492768451e-06, + "loss": 0.4719, + "step": 3302 + }, + { + "epoch": 2.2212322434227114, + "grad_norm": 0.42608821946659853, + "learning_rate": 6.230010730316388e-06, + "loss": 0.48, + "step": 3303 + }, + { + "epoch": 2.221904681852568, + "grad_norm": 0.45846152556150416, + "learning_rate": 6.227876729401501e-06, + "loss": 0.4938, + "step": 3304 + }, + { + "epoch": 2.2225771202824243, + "grad_norm": 0.4631201581405841, + "learning_rate": 6.225742490437507e-06, + "loss": 0.4493, + "step": 3305 + }, + { + "epoch": 2.2232495587122805, + "grad_norm": 0.3319109235846109, + "learning_rate": 6.223608013838177e-06, + "loss": 0.4509, + "step": 3306 + }, + { + "epoch": 2.2239219971421367, + "grad_norm": 0.4319123676573331, + "learning_rate": 6.221473300017319e-06, + "loss": 0.4613, + "step": 3307 + }, + { + "epoch": 2.224594435571993, + "grad_norm": 0.5502446668762675, + "learning_rate": 6.219338349388792e-06, + "loss": 0.4665, + "step": 3308 + }, + { + "epoch": 2.225266874001849, + "grad_norm": 0.3383954086793452, + "learning_rate": 6.217203162366502e-06, + "loss": 0.4773, + "step": 3309 + }, + { + "epoch": 2.2259393124317053, + "grad_norm": 0.3253169022175458, + "learning_rate": 6.215067739364397e-06, + "loss": 0.4457, + "step": 3310 + }, + { + "epoch": 2.226611750861562, + "grad_norm": 0.38283019608837743, + "learning_rate": 6.212932080796473e-06, + "loss": 0.4683, + "step": 3311 + }, + { + "epoch": 2.227284189291418, + "grad_norm": 0.3627285206977748, + "learning_rate": 6.2107961870767706e-06, + "loss": 0.4643, + "step": 3312 + }, + { + "epoch": 2.2279566277212743, + "grad_norm": 0.37400335315235345, + "learning_rate": 6.20866005861938e-06, + "loss": 0.4774, + "step": 3313 + }, + { + "epoch": 2.2286290661511305, + "grad_norm": 0.4257019090989514, + "learning_rate": 6.206523695838428e-06, + "loss": 0.4554, + "step": 3314 + }, + { + "epoch": 2.2293015045809867, + "grad_norm": 0.4793611486308495, + "learning_rate": 6.204387099148097e-06, + "loss": 0.4524, + "step": 3315 + }, + { + "epoch": 2.229973943010843, + "grad_norm": 0.38225534048231125, + "learning_rate": 6.2022502689626075e-06, + "loss": 0.4689, + "step": 3316 + }, + { + "epoch": 2.230646381440699, + "grad_norm": 0.32799250015071885, + "learning_rate": 6.200113205696228e-06, + "loss": 0.4682, + "step": 3317 + }, + { + "epoch": 2.231318819870556, + "grad_norm": 0.47480492674234276, + "learning_rate": 6.197975909763273e-06, + "loss": 0.4702, + "step": 3318 + }, + { + "epoch": 2.231991258300412, + "grad_norm": 0.3494408626924014, + "learning_rate": 6.195838381578101e-06, + "loss": 0.4562, + "step": 3319 + }, + { + "epoch": 2.232663696730268, + "grad_norm": 0.37782094497467594, + "learning_rate": 6.1937006215551176e-06, + "loss": 0.4707, + "step": 3320 + }, + { + "epoch": 2.2333361351601244, + "grad_norm": 0.37275458351555435, + "learning_rate": 6.191562630108767e-06, + "loss": 0.4572, + "step": 3321 + }, + { + "epoch": 2.2340085735899806, + "grad_norm": 0.3533869889329662, + "learning_rate": 6.189424407653548e-06, + "loss": 0.4544, + "step": 3322 + }, + { + "epoch": 2.234681012019837, + "grad_norm": 0.30999442867104227, + "learning_rate": 6.187285954603994e-06, + "loss": 0.472, + "step": 3323 + }, + { + "epoch": 2.235353450449693, + "grad_norm": 0.33558462774882347, + "learning_rate": 6.185147271374692e-06, + "loss": 0.4644, + "step": 3324 + }, + { + "epoch": 2.2360258888795497, + "grad_norm": 0.328979183614441, + "learning_rate": 6.183008358380266e-06, + "loss": 0.4646, + "step": 3325 + }, + { + 
"epoch": 2.236698327309406, + "grad_norm": 0.33931223367496843, + "learning_rate": 6.1808692160353904e-06, + "loss": 0.4729, + "step": 3326 + }, + { + "epoch": 2.237370765739262, + "grad_norm": 0.36880270279008426, + "learning_rate": 6.178729844754782e-06, + "loss": 0.4598, + "step": 3327 + }, + { + "epoch": 2.2380432041691183, + "grad_norm": 0.3645994812221974, + "learning_rate": 6.176590244953201e-06, + "loss": 0.4583, + "step": 3328 + }, + { + "epoch": 2.2387156425989745, + "grad_norm": 0.4262042189460906, + "learning_rate": 6.174450417045453e-06, + "loss": 0.4535, + "step": 3329 + }, + { + "epoch": 2.2393880810288307, + "grad_norm": 0.3778285311948193, + "learning_rate": 6.172310361446387e-06, + "loss": 0.451, + "step": 3330 + }, + { + "epoch": 2.240060519458687, + "grad_norm": 0.31217943304610896, + "learning_rate": 6.170170078570898e-06, + "loss": 0.4764, + "step": 3331 + }, + { + "epoch": 2.240732957888543, + "grad_norm": 0.43079020556899783, + "learning_rate": 6.168029568833923e-06, + "loss": 0.4607, + "step": 3332 + }, + { + "epoch": 2.2414053963183997, + "grad_norm": 0.3395066927764587, + "learning_rate": 6.165888832650444e-06, + "loss": 0.47, + "step": 3333 + }, + { + "epoch": 2.242077834748256, + "grad_norm": 0.36339912533861557, + "learning_rate": 6.163747870435486e-06, + "loss": 0.4749, + "step": 3334 + }, + { + "epoch": 2.242750273178112, + "grad_norm": 0.32776696036895525, + "learning_rate": 6.16160668260412e-06, + "loss": 0.4597, + "step": 3335 + }, + { + "epoch": 2.2434227116079684, + "grad_norm": 0.34177905036225076, + "learning_rate": 6.159465269571456e-06, + "loss": 0.4552, + "step": 3336 + }, + { + "epoch": 2.2440951500378246, + "grad_norm": 0.3385665299076455, + "learning_rate": 6.157323631752655e-06, + "loss": 0.4684, + "step": 3337 + }, + { + "epoch": 2.2447675884676808, + "grad_norm": 0.3009145657215783, + "learning_rate": 6.155181769562915e-06, + "loss": 0.4478, + "step": 3338 + }, + { + "epoch": 2.245440026897537, + "grad_norm": 0.3583058788615868, + "learning_rate": 6.15303968341748e-06, + "loss": 0.4798, + "step": 3339 + }, + { + "epoch": 2.2461124653273936, + "grad_norm": 0.38765323534294566, + "learning_rate": 6.150897373731637e-06, + "loss": 0.4591, + "step": 3340 + }, + { + "epoch": 2.24678490375725, + "grad_norm": 0.32351350562173625, + "learning_rate": 6.148754840920718e-06, + "loss": 0.4665, + "step": 3341 + }, + { + "epoch": 2.247457342187106, + "grad_norm": 0.3610813084026684, + "learning_rate": 6.146612085400096e-06, + "loss": 0.4749, + "step": 3342 + }, + { + "epoch": 2.2481297806169622, + "grad_norm": 0.346156630363326, + "learning_rate": 6.144469107585189e-06, + "loss": 0.4721, + "step": 3343 + }, + { + "epoch": 2.2488022190468184, + "grad_norm": 0.5827995420298924, + "learning_rate": 6.142325907891458e-06, + "loss": 0.4634, + "step": 3344 + }, + { + "epoch": 2.2494746574766746, + "grad_norm": 0.3453503009767493, + "learning_rate": 6.140182486734403e-06, + "loss": 0.4577, + "step": 3345 + }, + { + "epoch": 2.250147095906531, + "grad_norm": 0.39578599271575676, + "learning_rate": 6.138038844529574e-06, + "loss": 0.4555, + "step": 3346 + }, + { + "epoch": 2.2508195343363875, + "grad_norm": 0.3610888497577412, + "learning_rate": 6.1358949816925565e-06, + "loss": 0.4697, + "step": 3347 + }, + { + "epoch": 2.2514919727662437, + "grad_norm": 0.3602710216475961, + "learning_rate": 6.133750898638985e-06, + "loss": 0.4692, + "step": 3348 + }, + { + "epoch": 2.2521644111961, + "grad_norm": 0.36150798187522293, + "learning_rate": 6.131606595784531e-06, + 
"loss": 0.4515, + "step": 3349 + }, + { + "epoch": 2.252836849625956, + "grad_norm": 0.32433360019611446, + "learning_rate": 6.129462073544916e-06, + "loss": 0.4559, + "step": 3350 + }, + { + "epoch": 2.2535092880558123, + "grad_norm": 0.33643243164387887, + "learning_rate": 6.1273173323358955e-06, + "loss": 0.4524, + "step": 3351 + }, + { + "epoch": 2.2541817264856685, + "grad_norm": 0.3717075416643779, + "learning_rate": 6.125172372573275e-06, + "loss": 0.4674, + "step": 3352 + }, + { + "epoch": 2.2548541649155247, + "grad_norm": 0.4768402626852065, + "learning_rate": 6.123027194672897e-06, + "loss": 0.4533, + "step": 3353 + }, + { + "epoch": 2.2555266033453814, + "grad_norm": 0.4102465952984789, + "learning_rate": 6.120881799050648e-06, + "loss": 0.448, + "step": 3354 + }, + { + "epoch": 2.2561990417752376, + "grad_norm": 0.34922910900027687, + "learning_rate": 6.118736186122459e-06, + "loss": 0.4596, + "step": 3355 + }, + { + "epoch": 2.2568714802050938, + "grad_norm": 0.3605208529073994, + "learning_rate": 6.116590356304299e-06, + "loss": 0.4621, + "step": 3356 + }, + { + "epoch": 2.25754391863495, + "grad_norm": 0.4704991433860005, + "learning_rate": 6.114444310012181e-06, + "loss": 0.4639, + "step": 3357 + }, + { + "epoch": 2.258216357064806, + "grad_norm": 0.3261559815874674, + "learning_rate": 6.112298047662162e-06, + "loss": 0.4575, + "step": 3358 + }, + { + "epoch": 2.2588887954946624, + "grad_norm": 0.41003180273468104, + "learning_rate": 6.110151569670338e-06, + "loss": 0.4613, + "step": 3359 + }, + { + "epoch": 2.2595612339245186, + "grad_norm": 0.32332479510461204, + "learning_rate": 6.108004876452847e-06, + "loss": 0.4455, + "step": 3360 + }, + { + "epoch": 2.2602336723543752, + "grad_norm": 0.3221377433290264, + "learning_rate": 6.1058579684258715e-06, + "loss": 0.458, + "step": 3361 + }, + { + "epoch": 2.2609061107842314, + "grad_norm": 0.31199433964100104, + "learning_rate": 6.103710846005631e-06, + "loss": 0.4612, + "step": 3362 + }, + { + "epoch": 2.2615785492140876, + "grad_norm": 0.36232928587137264, + "learning_rate": 6.101563509608391e-06, + "loss": 0.4578, + "step": 3363 + }, + { + "epoch": 2.262250987643944, + "grad_norm": 0.4503992437491697, + "learning_rate": 6.099415959650459e-06, + "loss": 0.4629, + "step": 3364 + }, + { + "epoch": 2.2629234260738, + "grad_norm": 0.3428325411929094, + "learning_rate": 6.097268196548176e-06, + "loss": 0.461, + "step": 3365 + }, + { + "epoch": 2.2635958645036562, + "grad_norm": 0.3724685334484763, + "learning_rate": 6.095120220717934e-06, + "loss": 0.4531, + "step": 3366 + }, + { + "epoch": 2.2642683029335124, + "grad_norm": 0.3771343863955035, + "learning_rate": 6.092972032576161e-06, + "loss": 0.4652, + "step": 3367 + }, + { + "epoch": 2.264940741363369, + "grad_norm": 0.49327174181998573, + "learning_rate": 6.09082363253933e-06, + "loss": 0.4527, + "step": 3368 + }, + { + "epoch": 2.2656131797932253, + "grad_norm": 0.3807007587697114, + "learning_rate": 6.088675021023948e-06, + "loss": 0.4629, + "step": 3369 + }, + { + "epoch": 2.2662856182230815, + "grad_norm": 0.3282162633067282, + "learning_rate": 6.08652619844657e-06, + "loss": 0.4729, + "step": 3370 + }, + { + "epoch": 2.2669580566529377, + "grad_norm": 0.3615931019391284, + "learning_rate": 6.0843771652237884e-06, + "loss": 0.478, + "step": 3371 + }, + { + "epoch": 2.267630495082794, + "grad_norm": 0.7113957644212862, + "learning_rate": 6.082227921772239e-06, + "loss": 0.4586, + "step": 3372 + }, + { + "epoch": 2.26830293351265, + "grad_norm": 0.3374916536736294, + 
"learning_rate": 6.080078468508595e-06, + "loss": 0.4684, + "step": 3373 + }, + { + "epoch": 2.2689753719425063, + "grad_norm": 0.33119172696213184, + "learning_rate": 6.077928805849571e-06, + "loss": 0.4681, + "step": 3374 + }, + { + "epoch": 2.269647810372363, + "grad_norm": 0.3473668156048032, + "learning_rate": 6.075778934211926e-06, + "loss": 0.4488, + "step": 3375 + }, + { + "epoch": 2.270320248802219, + "grad_norm": 0.37623319413269796, + "learning_rate": 6.073628854012457e-06, + "loss": 0.4698, + "step": 3376 + }, + { + "epoch": 2.2709926872320754, + "grad_norm": 0.38767667636713227, + "learning_rate": 6.0714785656679984e-06, + "loss": 0.456, + "step": 3377 + }, + { + "epoch": 2.2716651256619316, + "grad_norm": 0.329376270538414, + "learning_rate": 6.069328069595431e-06, + "loss": 0.4644, + "step": 3378 + }, + { + "epoch": 2.2723375640917878, + "grad_norm": 0.3312790537268965, + "learning_rate": 6.06717736621167e-06, + "loss": 0.4416, + "step": 3379 + }, + { + "epoch": 2.273010002521644, + "grad_norm": 0.3267748945523019, + "learning_rate": 6.065026455933673e-06, + "loss": 0.4578, + "step": 3380 + }, + { + "epoch": 2.2736824409515, + "grad_norm": 0.4400018699156518, + "learning_rate": 6.06287533917844e-06, + "loss": 0.4839, + "step": 3381 + }, + { + "epoch": 2.274354879381357, + "grad_norm": 0.33881760895521307, + "learning_rate": 6.060724016363006e-06, + "loss": 0.4499, + "step": 3382 + }, + { + "epoch": 2.275027317811213, + "grad_norm": 0.3591199272872496, + "learning_rate": 6.058572487904454e-06, + "loss": 0.4621, + "step": 3383 + }, + { + "epoch": 2.2756997562410692, + "grad_norm": 0.43687174063020634, + "learning_rate": 6.056420754219898e-06, + "loss": 0.4558, + "step": 3384 + }, + { + "epoch": 2.2763721946709254, + "grad_norm": 0.6033053826657983, + "learning_rate": 6.054268815726498e-06, + "loss": 0.4604, + "step": 3385 + }, + { + "epoch": 2.2770446331007816, + "grad_norm": 0.3598084090904599, + "learning_rate": 6.052116672841451e-06, + "loss": 0.4625, + "step": 3386 + }, + { + "epoch": 2.277717071530638, + "grad_norm": 0.4066941227589285, + "learning_rate": 6.049964325981994e-06, + "loss": 0.4718, + "step": 3387 + }, + { + "epoch": 2.278389509960494, + "grad_norm": 0.37321132978250743, + "learning_rate": 6.047811775565403e-06, + "loss": 0.4598, + "step": 3388 + }, + { + "epoch": 2.2790619483903507, + "grad_norm": 0.36319732850319875, + "learning_rate": 6.045659022008993e-06, + "loss": 0.4669, + "step": 3389 + }, + { + "epoch": 2.279734386820207, + "grad_norm": 0.42909122433100383, + "learning_rate": 6.043506065730121e-06, + "loss": 0.4616, + "step": 3390 + }, + { + "epoch": 2.280406825250063, + "grad_norm": 0.3331074209108524, + "learning_rate": 6.041352907146182e-06, + "loss": 0.4412, + "step": 3391 + }, + { + "epoch": 2.2810792636799193, + "grad_norm": 0.3376033994060931, + "learning_rate": 6.0391995466746105e-06, + "loss": 0.4671, + "step": 3392 + }, + { + "epoch": 2.2817517021097755, + "grad_norm": 0.3632299799785127, + "learning_rate": 6.037045984732877e-06, + "loss": 0.4602, + "step": 3393 + }, + { + "epoch": 2.2824241405396317, + "grad_norm": 0.3367098095856448, + "learning_rate": 6.034892221738498e-06, + "loss": 0.4681, + "step": 3394 + }, + { + "epoch": 2.283096578969488, + "grad_norm": 0.3349169481214063, + "learning_rate": 6.032738258109019e-06, + "loss": 0.4693, + "step": 3395 + }, + { + "epoch": 2.2837690173993446, + "grad_norm": 0.43376510365880344, + "learning_rate": 6.030584094262034e-06, + "loss": 0.4651, + "step": 3396 + }, + { + "epoch": 
2.2844414558292008, + "grad_norm": 0.3622779425031837, + "learning_rate": 6.028429730615172e-06, + "loss": 0.4505, + "step": 3397 + }, + { + "epoch": 2.285113894259057, + "grad_norm": 0.8038609794552343, + "learning_rate": 6.026275167586099e-06, + "loss": 0.455, + "step": 3398 + }, + { + "epoch": 2.285786332688913, + "grad_norm": 0.31065295877150795, + "learning_rate": 6.024120405592521e-06, + "loss": 0.4607, + "step": 3399 + }, + { + "epoch": 2.2864587711187694, + "grad_norm": 0.3118730137489182, + "learning_rate": 6.021965445052182e-06, + "loss": 0.4613, + "step": 3400 + }, + { + "epoch": 2.2871312095486256, + "grad_norm": 0.31927555976031363, + "learning_rate": 6.019810286382871e-06, + "loss": 0.4642, + "step": 3401 + }, + { + "epoch": 2.287803647978482, + "grad_norm": 0.3290263416853223, + "learning_rate": 6.017654930002404e-06, + "loss": 0.4467, + "step": 3402 + }, + { + "epoch": 2.2884760864083384, + "grad_norm": 0.41316779038138807, + "learning_rate": 6.015499376328642e-06, + "loss": 0.467, + "step": 3403 + }, + { + "epoch": 2.2891485248381946, + "grad_norm": 0.31784747734763796, + "learning_rate": 6.0133436257794845e-06, + "loss": 0.4578, + "step": 3404 + }, + { + "epoch": 2.289820963268051, + "grad_norm": 0.3090216314364643, + "learning_rate": 6.011187678772868e-06, + "loss": 0.463, + "step": 3405 + }, + { + "epoch": 2.290493401697907, + "grad_norm": 0.8887177266878555, + "learning_rate": 6.009031535726766e-06, + "loss": 0.4587, + "step": 3406 + }, + { + "epoch": 2.2911658401277633, + "grad_norm": 0.32367775021751516, + "learning_rate": 6.006875197059191e-06, + "loss": 0.4501, + "step": 3407 + }, + { + "epoch": 2.2918382785576195, + "grad_norm": 0.40011093762620686, + "learning_rate": 6.004718663188196e-06, + "loss": 0.4481, + "step": 3408 + }, + { + "epoch": 2.2925107169874757, + "grad_norm": 0.33581182782298286, + "learning_rate": 6.0025619345318665e-06, + "loss": 0.4477, + "step": 3409 + }, + { + "epoch": 2.2931831554173323, + "grad_norm": 0.35930830236800326, + "learning_rate": 6.000405011508331e-06, + "loss": 0.4642, + "step": 3410 + }, + { + "epoch": 2.2938555938471885, + "grad_norm": 0.38802853702385304, + "learning_rate": 5.99824789453575e-06, + "loss": 0.4627, + "step": 3411 + }, + { + "epoch": 2.2945280322770447, + "grad_norm": 0.3984723628092719, + "learning_rate": 5.996090584032329e-06, + "loss": 0.4616, + "step": 3412 + }, + { + "epoch": 2.295200470706901, + "grad_norm": 0.3867437885588343, + "learning_rate": 5.993933080416303e-06, + "loss": 0.461, + "step": 3413 + }, + { + "epoch": 2.295872909136757, + "grad_norm": 0.39906088363451553, + "learning_rate": 5.9917753841059516e-06, + "loss": 0.4793, + "step": 3414 + }, + { + "epoch": 2.2965453475666133, + "grad_norm": 0.3425550182660963, + "learning_rate": 5.989617495519586e-06, + "loss": 0.457, + "step": 3415 + }, + { + "epoch": 2.2972177859964695, + "grad_norm": 0.33384643461177643, + "learning_rate": 5.987459415075559e-06, + "loss": 0.4755, + "step": 3416 + }, + { + "epoch": 2.297890224426326, + "grad_norm": 0.38703558717300707, + "learning_rate": 5.985301143192258e-06, + "loss": 0.4556, + "step": 3417 + }, + { + "epoch": 2.2985626628561824, + "grad_norm": 0.4168459939748986, + "learning_rate": 5.983142680288109e-06, + "loss": 0.4587, + "step": 3418 + }, + { + "epoch": 2.2992351012860386, + "grad_norm": 0.3324296279399844, + "learning_rate": 5.980984026781572e-06, + "loss": 0.4543, + "step": 3419 + }, + { + "epoch": 2.299907539715895, + "grad_norm": 0.3950144902415733, + "learning_rate": 5.978825183091148e-06, + 
"loss": 0.4772, + "step": 3420 + }, + { + "epoch": 2.300579978145751, + "grad_norm": 0.4183703742331098, + "learning_rate": 5.976666149635375e-06, + "loss": 0.455, + "step": 3421 + }, + { + "epoch": 2.301252416575607, + "grad_norm": 0.3388616076348071, + "learning_rate": 5.974506926832822e-06, + "loss": 0.4559, + "step": 3422 + }, + { + "epoch": 2.3019248550054634, + "grad_norm": 0.343276092749269, + "learning_rate": 5.972347515102102e-06, + "loss": 0.4581, + "step": 3423 + }, + { + "epoch": 2.30259729343532, + "grad_norm": 0.3380498032264536, + "learning_rate": 5.97018791486186e-06, + "loss": 0.4748, + "step": 3424 + }, + { + "epoch": 2.3032697318651763, + "grad_norm": 0.35002944027653216, + "learning_rate": 5.96802812653078e-06, + "loss": 0.4524, + "step": 3425 + }, + { + "epoch": 2.3039421702950325, + "grad_norm": 0.34512713571252573, + "learning_rate": 5.9658681505275785e-06, + "loss": 0.457, + "step": 3426 + }, + { + "epoch": 2.3046146087248887, + "grad_norm": 0.380751421190636, + "learning_rate": 5.963707987271014e-06, + "loss": 0.4629, + "step": 3427 + }, + { + "epoch": 2.305287047154745, + "grad_norm": 0.44507606560359714, + "learning_rate": 5.961547637179875e-06, + "loss": 0.4526, + "step": 3428 + }, + { + "epoch": 2.305959485584601, + "grad_norm": 0.4277887184641815, + "learning_rate": 5.959387100672994e-06, + "loss": 0.4551, + "step": 3429 + }, + { + "epoch": 2.3066319240144573, + "grad_norm": 0.2969333291788095, + "learning_rate": 5.957226378169233e-06, + "loss": 0.4567, + "step": 3430 + }, + { + "epoch": 2.307304362444314, + "grad_norm": 0.3232796193905546, + "learning_rate": 5.955065470087491e-06, + "loss": 0.4579, + "step": 3431 + }, + { + "epoch": 2.30797680087417, + "grad_norm": 0.4424788971209814, + "learning_rate": 5.9529043768467085e-06, + "loss": 0.4559, + "step": 3432 + }, + { + "epoch": 2.3086492393040263, + "grad_norm": 0.4020983687858228, + "learning_rate": 5.950743098865854e-06, + "loss": 0.4594, + "step": 3433 + }, + { + "epoch": 2.3093216777338825, + "grad_norm": 0.33850722955430224, + "learning_rate": 5.948581636563939e-06, + "loss": 0.4516, + "step": 3434 + }, + { + "epoch": 2.3099941161637387, + "grad_norm": 0.34400717979285295, + "learning_rate": 5.946419990360004e-06, + "loss": 0.4634, + "step": 3435 + }, + { + "epoch": 2.310666554593595, + "grad_norm": 0.3447090933374568, + "learning_rate": 5.94425816067313e-06, + "loss": 0.4685, + "step": 3436 + }, + { + "epoch": 2.311338993023451, + "grad_norm": 0.3135329848238476, + "learning_rate": 5.9420961479224315e-06, + "loss": 0.4433, + "step": 3437 + }, + { + "epoch": 2.312011431453308, + "grad_norm": 0.38012311880815325, + "learning_rate": 5.9399339525270595e-06, + "loss": 0.4714, + "step": 3438 + }, + { + "epoch": 2.312683869883164, + "grad_norm": 0.3890841552249759, + "learning_rate": 5.9377715749062e-06, + "loss": 0.4611, + "step": 3439 + }, + { + "epoch": 2.31335630831302, + "grad_norm": 0.3188847168667241, + "learning_rate": 5.935609015479075e-06, + "loss": 0.4671, + "step": 3440 + }, + { + "epoch": 2.3140287467428764, + "grad_norm": 0.49850677405706556, + "learning_rate": 5.9334462746649375e-06, + "loss": 0.4444, + "step": 3441 + }, + { + "epoch": 2.3147011851727326, + "grad_norm": 0.35339646675873393, + "learning_rate": 5.9312833528830835e-06, + "loss": 0.4606, + "step": 3442 + }, + { + "epoch": 2.315373623602589, + "grad_norm": 0.3545069415592496, + "learning_rate": 5.92912025055284e-06, + "loss": 0.4423, + "step": 3443 + }, + { + "epoch": 2.316046062032445, + "grad_norm": 0.3661937143726214, + 
"learning_rate": 5.926956968093565e-06, + "loss": 0.46, + "step": 3444 + }, + { + "epoch": 2.3167185004623017, + "grad_norm": 0.3305650067264564, + "learning_rate": 5.924793505924658e-06, + "loss": 0.46, + "step": 3445 + }, + { + "epoch": 2.317390938892158, + "grad_norm": 0.36544706722770126, + "learning_rate": 5.9226298644655485e-06, + "loss": 0.4495, + "step": 3446 + }, + { + "epoch": 2.318063377322014, + "grad_norm": 0.3967132330665242, + "learning_rate": 5.920466044135704e-06, + "loss": 0.4619, + "step": 3447 + }, + { + "epoch": 2.3187358157518703, + "grad_norm": 0.47266167596193176, + "learning_rate": 5.918302045354626e-06, + "loss": 0.4483, + "step": 3448 + }, + { + "epoch": 2.3194082541817265, + "grad_norm": 0.3989441553994751, + "learning_rate": 5.91613786854185e-06, + "loss": 0.4614, + "step": 3449 + }, + { + "epoch": 2.3200806926115827, + "grad_norm": 0.519376934857127, + "learning_rate": 5.9139735141169455e-06, + "loss": 0.4338, + "step": 3450 + }, + { + "epoch": 2.320753131041439, + "grad_norm": 0.416453104062567, + "learning_rate": 5.911808982499519e-06, + "loss": 0.4563, + "step": 3451 + }, + { + "epoch": 2.3214255694712955, + "grad_norm": 0.36849376223410835, + "learning_rate": 5.909644274109204e-06, + "loss": 0.4497, + "step": 3452 + }, + { + "epoch": 2.3220980079011517, + "grad_norm": 0.33518302342042033, + "learning_rate": 5.907479389365678e-06, + "loss": 0.4594, + "step": 3453 + }, + { + "epoch": 2.322770446331008, + "grad_norm": 0.38748584452757723, + "learning_rate": 5.905314328688649e-06, + "loss": 0.4528, + "step": 3454 + }, + { + "epoch": 2.323442884760864, + "grad_norm": 0.44889342511131064, + "learning_rate": 5.903149092497856e-06, + "loss": 0.4646, + "step": 3455 + }, + { + "epoch": 2.3241153231907203, + "grad_norm": 0.39168359932896285, + "learning_rate": 5.900983681213075e-06, + "loss": 0.4451, + "step": 3456 + }, + { + "epoch": 2.3247877616205765, + "grad_norm": 0.33063191767706884, + "learning_rate": 5.898818095254116e-06, + "loss": 0.4552, + "step": 3457 + }, + { + "epoch": 2.3254602000504327, + "grad_norm": 0.4200611590444878, + "learning_rate": 5.896652335040825e-06, + "loss": 0.4658, + "step": 3458 + }, + { + "epoch": 2.3261326384802894, + "grad_norm": 0.3506150399617044, + "learning_rate": 5.894486400993072e-06, + "loss": 0.4623, + "step": 3459 + }, + { + "epoch": 2.3268050769101456, + "grad_norm": 0.4364839190318039, + "learning_rate": 5.8923202935307755e-06, + "loss": 0.4401, + "step": 3460 + }, + { + "epoch": 2.327477515340002, + "grad_norm": 0.3506604314068755, + "learning_rate": 5.890154013073875e-06, + "loss": 0.4564, + "step": 3461 + }, + { + "epoch": 2.328149953769858, + "grad_norm": 0.3195057224021962, + "learning_rate": 5.887987560042348e-06, + "loss": 0.4447, + "step": 3462 + }, + { + "epoch": 2.328822392199714, + "grad_norm": 0.4081013352389473, + "learning_rate": 5.88582093485621e-06, + "loss": 0.4575, + "step": 3463 + }, + { + "epoch": 2.3294948306295704, + "grad_norm": 0.3509868634510859, + "learning_rate": 5.883654137935501e-06, + "loss": 0.4437, + "step": 3464 + }, + { + "epoch": 2.3301672690594266, + "grad_norm": 0.3227618446614624, + "learning_rate": 5.881487169700305e-06, + "loss": 0.4718, + "step": 3465 + }, + { + "epoch": 2.330839707489283, + "grad_norm": 0.7520884280657425, + "learning_rate": 5.879320030570728e-06, + "loss": 0.4716, + "step": 3466 + }, + { + "epoch": 2.3315121459191395, + "grad_norm": 0.3747673865418648, + "learning_rate": 5.877152720966917e-06, + "loss": 0.4656, + "step": 3467 + }, + { + "epoch": 
2.3321845843489957, + "grad_norm": 0.36206969058631283, + "learning_rate": 5.8749852413090465e-06, + "loss": 0.4562, + "step": 3468 + }, + { + "epoch": 2.332857022778852, + "grad_norm": 0.4123862983586637, + "learning_rate": 5.872817592017331e-06, + "loss": 0.4469, + "step": 3469 + }, + { + "epoch": 2.333529461208708, + "grad_norm": 0.33733669496642177, + "learning_rate": 5.870649773512011e-06, + "loss": 0.4565, + "step": 3470 + }, + { + "epoch": 2.3342018996385643, + "grad_norm": 0.42636616448320297, + "learning_rate": 5.868481786213364e-06, + "loss": 0.4468, + "step": 3471 + }, + { + "epoch": 2.3348743380684205, + "grad_norm": 0.384115666486269, + "learning_rate": 5.866313630541698e-06, + "loss": 0.4437, + "step": 3472 + }, + { + "epoch": 2.3355467764982767, + "grad_norm": 0.41716083426210254, + "learning_rate": 5.864145306917355e-06, + "loss": 0.4298, + "step": 3473 + }, + { + "epoch": 2.3362192149281333, + "grad_norm": 0.3513270477111295, + "learning_rate": 5.861976815760708e-06, + "loss": 0.4543, + "step": 3474 + }, + { + "epoch": 2.3368916533579895, + "grad_norm": 0.3658591809502371, + "learning_rate": 5.859808157492164e-06, + "loss": 0.4586, + "step": 3475 + }, + { + "epoch": 2.3375640917878457, + "grad_norm": 0.3776366683658066, + "learning_rate": 5.857639332532163e-06, + "loss": 0.4575, + "step": 3476 + }, + { + "epoch": 2.338236530217702, + "grad_norm": 0.36436135486161964, + "learning_rate": 5.855470341301176e-06, + "loss": 0.4386, + "step": 3477 + }, + { + "epoch": 2.338908968647558, + "grad_norm": 0.3240074843175133, + "learning_rate": 5.853301184219706e-06, + "loss": 0.4498, + "step": 3478 + }, + { + "epoch": 2.3395814070774144, + "grad_norm": 0.4090670479530033, + "learning_rate": 5.851131861708288e-06, + "loss": 0.473, + "step": 3479 + }, + { + "epoch": 2.3402538455072706, + "grad_norm": 0.388472246416491, + "learning_rate": 5.84896237418749e-06, + "loss": 0.4737, + "step": 3480 + }, + { + "epoch": 2.3409262839371268, + "grad_norm": 0.3360497237320795, + "learning_rate": 5.846792722077911e-06, + "loss": 0.4552, + "step": 3481 + }, + { + "epoch": 2.3415987223669834, + "grad_norm": 0.341364864082169, + "learning_rate": 5.8446229058001856e-06, + "loss": 0.4576, + "step": 3482 + }, + { + "epoch": 2.3422711607968396, + "grad_norm": 0.32697453039951657, + "learning_rate": 5.842452925774974e-06, + "loss": 0.4392, + "step": 3483 + }, + { + "epoch": 2.342943599226696, + "grad_norm": 0.3980001762154398, + "learning_rate": 5.840282782422972e-06, + "loss": 0.4551, + "step": 3484 + }, + { + "epoch": 2.343616037656552, + "grad_norm": 0.31000431629498004, + "learning_rate": 5.838112476164908e-06, + "loss": 0.4521, + "step": 3485 + }, + { + "epoch": 2.3442884760864082, + "grad_norm": 0.31545657709084923, + "learning_rate": 5.835942007421538e-06, + "loss": 0.4641, + "step": 3486 + }, + { + "epoch": 2.3449609145162644, + "grad_norm": 0.3679552244830917, + "learning_rate": 5.833771376613654e-06, + "loss": 0.4593, + "step": 3487 + }, + { + "epoch": 2.3456333529461206, + "grad_norm": 0.31157320097469177, + "learning_rate": 5.831600584162076e-06, + "loss": 0.4561, + "step": 3488 + }, + { + "epoch": 2.3463057913759773, + "grad_norm": 0.468879910189735, + "learning_rate": 5.829429630487659e-06, + "loss": 0.4376, + "step": 3489 + }, + { + "epoch": 2.3469782298058335, + "grad_norm": 0.3771545389043398, + "learning_rate": 5.827258516011284e-06, + "loss": 0.4609, + "step": 3490 + }, + { + "epoch": 2.3476506682356897, + "grad_norm": 0.39098431423034563, + "learning_rate": 5.825087241153867e-06, + 
"loss": 0.4572, + "step": 3491 + }, + { + "epoch": 2.348323106665546, + "grad_norm": 0.32933039095880423, + "learning_rate": 5.822915806336355e-06, + "loss": 0.4492, + "step": 3492 + }, + { + "epoch": 2.348995545095402, + "grad_norm": 0.33183804809411027, + "learning_rate": 5.820744211979725e-06, + "loss": 0.4514, + "step": 3493 + }, + { + "epoch": 2.3496679835252583, + "grad_norm": 0.33945276045807116, + "learning_rate": 5.818572458504983e-06, + "loss": 0.442, + "step": 3494 + }, + { + "epoch": 2.3503404219551145, + "grad_norm": 0.3534888950933072, + "learning_rate": 5.816400546333171e-06, + "loss": 0.4535, + "step": 3495 + }, + { + "epoch": 2.351012860384971, + "grad_norm": 0.3096377373774302, + "learning_rate": 5.814228475885357e-06, + "loss": 0.4546, + "step": 3496 + }, + { + "epoch": 2.3516852988148274, + "grad_norm": 0.32904977790765544, + "learning_rate": 5.812056247582643e-06, + "loss": 0.458, + "step": 3497 + }, + { + "epoch": 2.3523577372446836, + "grad_norm": 0.38029547051000695, + "learning_rate": 5.809883861846159e-06, + "loss": 0.4387, + "step": 3498 + }, + { + "epoch": 2.3530301756745398, + "grad_norm": 0.33342204975446, + "learning_rate": 5.8077113190970644e-06, + "loss": 0.4563, + "step": 3499 + }, + { + "epoch": 2.353702614104396, + "grad_norm": 0.34230708427497747, + "learning_rate": 5.8055386197565564e-06, + "loss": 0.4583, + "step": 3500 + }, + { + "epoch": 2.354375052534252, + "grad_norm": 0.2951783161108756, + "learning_rate": 5.803365764245852e-06, + "loss": 0.4583, + "step": 3501 + }, + { + "epoch": 2.3550474909641084, + "grad_norm": 0.3079953132771035, + "learning_rate": 5.801192752986208e-06, + "loss": 0.4503, + "step": 3502 + }, + { + "epoch": 2.355719929393965, + "grad_norm": 0.3570046765039688, + "learning_rate": 5.799019586398904e-06, + "loss": 0.4578, + "step": 3503 + }, + { + "epoch": 2.356392367823821, + "grad_norm": 0.3177125091323911, + "learning_rate": 5.796846264905255e-06, + "loss": 0.4648, + "step": 3504 + }, + { + "epoch": 2.3570648062536774, + "grad_norm": 0.3363125140497039, + "learning_rate": 5.794672788926602e-06, + "loss": 0.4623, + "step": 3505 + }, + { + "epoch": 2.3577372446835336, + "grad_norm": 0.3167853911375675, + "learning_rate": 5.792499158884322e-06, + "loss": 0.4667, + "step": 3506 + }, + { + "epoch": 2.35840968311339, + "grad_norm": 0.6921775759842348, + "learning_rate": 5.790325375199812e-06, + "loss": 0.4543, + "step": 3507 + }, + { + "epoch": 2.359082121543246, + "grad_norm": 0.30236339661586337, + "learning_rate": 5.788151438294509e-06, + "loss": 0.4433, + "step": 3508 + }, + { + "epoch": 2.3597545599731022, + "grad_norm": 0.40941230596207956, + "learning_rate": 5.785977348589875e-06, + "loss": 0.4474, + "step": 3509 + }, + { + "epoch": 2.360426998402959, + "grad_norm": 0.5196512787688972, + "learning_rate": 5.783803106507399e-06, + "loss": 0.4373, + "step": 3510 + }, + { + "epoch": 2.361099436832815, + "grad_norm": 0.2991559649141668, + "learning_rate": 5.781628712468605e-06, + "loss": 0.4538, + "step": 3511 + }, + { + "epoch": 2.3617718752626713, + "grad_norm": 0.3825417809422422, + "learning_rate": 5.779454166895042e-06, + "loss": 0.447, + "step": 3512 + }, + { + "epoch": 2.3624443136925275, + "grad_norm": 0.4769351696146306, + "learning_rate": 5.777279470208294e-06, + "loss": 0.4713, + "step": 3513 + }, + { + "epoch": 2.3631167521223837, + "grad_norm": 0.30023152393422464, + "learning_rate": 5.775104622829965e-06, + "loss": 0.446, + "step": 3514 + }, + { + "epoch": 2.36378919055224, + "grad_norm": 0.33153694221794716, + 
"learning_rate": 5.7729296251817e-06, + "loss": 0.4776, + "step": 3515 + }, + { + "epoch": 2.364461628982096, + "grad_norm": 0.4047692204296205, + "learning_rate": 5.770754477685162e-06, + "loss": 0.4573, + "step": 3516 + }, + { + "epoch": 2.3651340674119528, + "grad_norm": 0.40042975658998164, + "learning_rate": 5.7685791807620505e-06, + "loss": 0.4606, + "step": 3517 + }, + { + "epoch": 2.365806505841809, + "grad_norm": 0.327532117308796, + "learning_rate": 5.766403734834089e-06, + "loss": 0.445, + "step": 3518 + }, + { + "epoch": 2.366478944271665, + "grad_norm": 0.2704078193848059, + "learning_rate": 5.764228140323036e-06, + "loss": 0.4386, + "step": 3519 + }, + { + "epoch": 2.3671513827015214, + "grad_norm": 0.3404895554400373, + "learning_rate": 5.7620523976506715e-06, + "loss": 0.4511, + "step": 3520 + }, + { + "epoch": 2.3678238211313776, + "grad_norm": 0.4333037773933191, + "learning_rate": 5.759876507238811e-06, + "loss": 0.4511, + "step": 3521 + }, + { + "epoch": 2.3684962595612338, + "grad_norm": 1.309105942254137, + "learning_rate": 5.7577004695092945e-06, + "loss": 0.4603, + "step": 3522 + }, + { + "epoch": 2.36916869799109, + "grad_norm": 0.32328978955846294, + "learning_rate": 5.755524284883989e-06, + "loss": 0.4632, + "step": 3523 + }, + { + "epoch": 2.3698411364209466, + "grad_norm": 0.37849843215721984, + "learning_rate": 5.753347953784797e-06, + "loss": 0.4409, + "step": 3524 + }, + { + "epoch": 2.370513574850803, + "grad_norm": 0.3476793256077204, + "learning_rate": 5.751171476633641e-06, + "loss": 0.4486, + "step": 3525 + }, + { + "epoch": 2.371186013280659, + "grad_norm": 0.3750262990064418, + "learning_rate": 5.748994853852479e-06, + "loss": 0.4647, + "step": 3526 + }, + { + "epoch": 2.3718584517105152, + "grad_norm": 0.34635074214628625, + "learning_rate": 5.746818085863292e-06, + "loss": 0.4416, + "step": 3527 + }, + { + "epoch": 2.3725308901403714, + "grad_norm": 0.38434110558426926, + "learning_rate": 5.74464117308809e-06, + "loss": 0.4564, + "step": 3528 + }, + { + "epoch": 2.3732033285702276, + "grad_norm": 0.34561027779242004, + "learning_rate": 5.742464115948915e-06, + "loss": 0.4424, + "step": 3529 + }, + { + "epoch": 2.373875767000084, + "grad_norm": 0.33266273991326856, + "learning_rate": 5.740286914867832e-06, + "loss": 0.4387, + "step": 3530 + }, + { + "epoch": 2.3745482054299405, + "grad_norm": 0.42008457441870206, + "learning_rate": 5.738109570266939e-06, + "loss": 0.4511, + "step": 3531 + }, + { + "epoch": 2.3752206438597967, + "grad_norm": 0.3368066077329263, + "learning_rate": 5.735932082568356e-06, + "loss": 0.4416, + "step": 3532 + }, + { + "epoch": 2.375893082289653, + "grad_norm": 0.30953007129649845, + "learning_rate": 5.733754452194235e-06, + "loss": 0.4515, + "step": 3533 + }, + { + "epoch": 2.376565520719509, + "grad_norm": 0.3584541636729375, + "learning_rate": 5.731576679566754e-06, + "loss": 0.4529, + "step": 3534 + }, + { + "epoch": 2.3772379591493653, + "grad_norm": 0.3819730806896943, + "learning_rate": 5.729398765108118e-06, + "loss": 0.4545, + "step": 3535 + }, + { + "epoch": 2.3779103975792215, + "grad_norm": 0.3393313272383021, + "learning_rate": 5.727220709240563e-06, + "loss": 0.4607, + "step": 3536 + }, + { + "epoch": 2.3785828360090777, + "grad_norm": 0.3407277498003918, + "learning_rate": 5.725042512386347e-06, + "loss": 0.4556, + "step": 3537 + }, + { + "epoch": 2.3792552744389344, + "grad_norm": 0.44335690258193794, + "learning_rate": 5.72286417496776e-06, + "loss": 0.4492, + "step": 3538 + }, + { + "epoch": 
2.3799277128687906, + "grad_norm": 0.360155250476559, + "learning_rate": 5.720685697407118e-06, + "loss": 0.4658, + "step": 3539 + }, + { + "epoch": 2.3806001512986468, + "grad_norm": 0.4081717614851412, + "learning_rate": 5.718507080126761e-06, + "loss": 0.453, + "step": 3540 + }, + { + "epoch": 2.381272589728503, + "grad_norm": 0.32668892308941555, + "learning_rate": 5.71632832354906e-06, + "loss": 0.4399, + "step": 3541 + }, + { + "epoch": 2.381945028158359, + "grad_norm": 0.4516247995817048, + "learning_rate": 5.714149428096413e-06, + "loss": 0.4543, + "step": 3542 + }, + { + "epoch": 2.3826174665882154, + "grad_norm": 0.35628091371878984, + "learning_rate": 5.711970394191241e-06, + "loss": 0.4426, + "step": 3543 + }, + { + "epoch": 2.3832899050180716, + "grad_norm": 0.3202215438454765, + "learning_rate": 5.709791222255996e-06, + "loss": 0.4556, + "step": 3544 + }, + { + "epoch": 2.3839623434479282, + "grad_norm": 0.3217462913974043, + "learning_rate": 5.7076119127131545e-06, + "loss": 0.4578, + "step": 3545 + }, + { + "epoch": 2.3846347818777844, + "grad_norm": 0.3800753121649306, + "learning_rate": 5.705432465985224e-06, + "loss": 0.4543, + "step": 3546 + }, + { + "epoch": 2.3853072203076406, + "grad_norm": 0.3745986832533898, + "learning_rate": 5.703252882494728e-06, + "loss": 0.4564, + "step": 3547 + }, + { + "epoch": 2.385979658737497, + "grad_norm": 0.39388160698750074, + "learning_rate": 5.70107316266423e-06, + "loss": 0.4553, + "step": 3548 + }, + { + "epoch": 2.386652097167353, + "grad_norm": 0.3693568996979446, + "learning_rate": 5.698893306916309e-06, + "loss": 0.4545, + "step": 3549 + }, + { + "epoch": 2.3873245355972093, + "grad_norm": 0.3358204292471544, + "learning_rate": 5.6967133156735785e-06, + "loss": 0.4654, + "step": 3550 + }, + { + "epoch": 2.3879969740270655, + "grad_norm": 0.3879923145296747, + "learning_rate": 5.6945331893586705e-06, + "loss": 0.4497, + "step": 3551 + }, + { + "epoch": 2.388669412456922, + "grad_norm": 0.4702166619666767, + "learning_rate": 5.69235292839425e-06, + "loss": 0.4523, + "step": 3552 + }, + { + "epoch": 2.3893418508867783, + "grad_norm": 0.35151582679612603, + "learning_rate": 5.690172533203005e-06, + "loss": 0.4548, + "step": 3553 + }, + { + "epoch": 2.3900142893166345, + "grad_norm": 0.33623006086909196, + "learning_rate": 5.68799200420765e-06, + "loss": 0.4395, + "step": 3554 + }, + { + "epoch": 2.3906867277464907, + "grad_norm": 0.3180533268004615, + "learning_rate": 5.685811341830924e-06, + "loss": 0.4521, + "step": 3555 + }, + { + "epoch": 2.391359166176347, + "grad_norm": 0.3120770132369895, + "learning_rate": 5.6836305464955945e-06, + "loss": 0.4563, + "step": 3556 + }, + { + "epoch": 2.392031604606203, + "grad_norm": 0.3313207875580303, + "learning_rate": 5.681449618624454e-06, + "loss": 0.4555, + "step": 3557 + }, + { + "epoch": 2.3927040430360593, + "grad_norm": 0.34906684447334524, + "learning_rate": 5.679268558640318e-06, + "loss": 0.4399, + "step": 3558 + }, + { + "epoch": 2.393376481465916, + "grad_norm": 0.3359379283214766, + "learning_rate": 5.677087366966031e-06, + "loss": 0.4499, + "step": 3559 + }, + { + "epoch": 2.394048919895772, + "grad_norm": 0.34212649051370675, + "learning_rate": 5.67490604402446e-06, + "loss": 0.4536, + "step": 3560 + }, + { + "epoch": 2.3947213583256284, + "grad_norm": 0.3968833539019725, + "learning_rate": 5.672724590238502e-06, + "loss": 0.4363, + "step": 3561 + }, + { + "epoch": 2.3953937967554846, + "grad_norm": 0.33982599660113577, + "learning_rate": 5.670543006031075e-06, + "loss": 
0.4588, + "step": 3562 + }, + { + "epoch": 2.396066235185341, + "grad_norm": 0.3126079592635059, + "learning_rate": 5.668361291825124e-06, + "loss": 0.4459, + "step": 3563 + }, + { + "epoch": 2.396738673615197, + "grad_norm": 0.3645127404080134, + "learning_rate": 5.666179448043621e-06, + "loss": 0.4499, + "step": 3564 + }, + { + "epoch": 2.397411112045053, + "grad_norm": 0.3110482341885458, + "learning_rate": 5.663997475109558e-06, + "loss": 0.4678, + "step": 3565 + }, + { + "epoch": 2.39808355047491, + "grad_norm": 0.32199104525287026, + "learning_rate": 5.661815373445959e-06, + "loss": 0.4614, + "step": 3566 + }, + { + "epoch": 2.398755988904766, + "grad_norm": 0.8639742510000906, + "learning_rate": 5.659633143475864e-06, + "loss": 0.4617, + "step": 3567 + }, + { + "epoch": 2.3994284273346222, + "grad_norm": 0.40995524737792327, + "learning_rate": 5.6574507856223474e-06, + "loss": 0.4533, + "step": 3568 + }, + { + "epoch": 2.4001008657644785, + "grad_norm": 0.36762151659653114, + "learning_rate": 5.655268300308502e-06, + "loss": 0.4489, + "step": 3569 + }, + { + "epoch": 2.4007733041943347, + "grad_norm": 0.33318188755161604, + "learning_rate": 5.653085687957449e-06, + "loss": 0.4482, + "step": 3570 + }, + { + "epoch": 2.401445742624191, + "grad_norm": 0.28687255302183695, + "learning_rate": 5.650902948992332e-06, + "loss": 0.4448, + "step": 3571 + }, + { + "epoch": 2.402118181054047, + "grad_norm": 0.3976726902254766, + "learning_rate": 5.648720083836319e-06, + "loss": 0.4601, + "step": 3572 + }, + { + "epoch": 2.4027906194839037, + "grad_norm": 0.3193218278392546, + "learning_rate": 5.646537092912603e-06, + "loss": 0.4585, + "step": 3573 + }, + { + "epoch": 2.40346305791376, + "grad_norm": 0.36282104810944193, + "learning_rate": 5.644353976644402e-06, + "loss": 0.4362, + "step": 3574 + }, + { + "epoch": 2.404135496343616, + "grad_norm": 0.39340217861985266, + "learning_rate": 5.642170735454959e-06, + "loss": 0.4593, + "step": 3575 + }, + { + "epoch": 2.4048079347734723, + "grad_norm": 0.3427481067975898, + "learning_rate": 5.6399873697675385e-06, + "loss": 0.4485, + "step": 3576 + }, + { + "epoch": 2.4054803732033285, + "grad_norm": 0.405186505645815, + "learning_rate": 5.637803880005431e-06, + "loss": 0.4441, + "step": 3577 + }, + { + "epoch": 2.4061528116331847, + "grad_norm": 0.3167912422521965, + "learning_rate": 5.635620266591953e-06, + "loss": 0.4543, + "step": 3578 + }, + { + "epoch": 2.406825250063041, + "grad_norm": 0.35430272710261224, + "learning_rate": 5.63343652995044e-06, + "loss": 0.4806, + "step": 3579 + }, + { + "epoch": 2.4074976884928976, + "grad_norm": 0.39827430646088413, + "learning_rate": 5.631252670504255e-06, + "loss": 0.4627, + "step": 3580 + }, + { + "epoch": 2.408170126922754, + "grad_norm": 0.3536213396008051, + "learning_rate": 5.629068688676785e-06, + "loss": 0.4375, + "step": 3581 + }, + { + "epoch": 2.40884256535261, + "grad_norm": 0.5413875717151687, + "learning_rate": 5.626884584891438e-06, + "loss": 0.4451, + "step": 3582 + }, + { + "epoch": 2.409515003782466, + "grad_norm": 0.41928945425001374, + "learning_rate": 5.624700359571649e-06, + "loss": 0.4569, + "step": 3583 + }, + { + "epoch": 2.4101874422123224, + "grad_norm": 0.3354908193449054, + "learning_rate": 5.622516013140874e-06, + "loss": 0.4486, + "step": 3584 + }, + { + "epoch": 2.4108598806421786, + "grad_norm": 0.3648121009986923, + "learning_rate": 5.620331546022591e-06, + "loss": 0.4332, + "step": 3585 + }, + { + "epoch": 2.411532319072035, + "grad_norm": 0.35255253553402593, + 
"learning_rate": 5.618146958640309e-06, + "loss": 0.4381, + "step": 3586 + }, + { + "epoch": 2.4122047575018915, + "grad_norm": 0.333954692659307, + "learning_rate": 5.615962251417551e-06, + "loss": 0.4526, + "step": 3587 + }, + { + "epoch": 2.4128771959317477, + "grad_norm": 0.37607079229227774, + "learning_rate": 5.613777424777871e-06, + "loss": 0.4626, + "step": 3588 + }, + { + "epoch": 2.413549634361604, + "grad_norm": 0.38095934546167926, + "learning_rate": 5.6115924791448385e-06, + "loss": 0.4528, + "step": 3589 + }, + { + "epoch": 2.41422207279146, + "grad_norm": 0.3993076314402837, + "learning_rate": 5.609407414942053e-06, + "loss": 0.4643, + "step": 3590 + }, + { + "epoch": 2.4148945112213163, + "grad_norm": 0.39223329828299025, + "learning_rate": 5.607222232593131e-06, + "loss": 0.4641, + "step": 3591 + }, + { + "epoch": 2.4155669496511725, + "grad_norm": 0.3591949686390438, + "learning_rate": 5.605036932521718e-06, + "loss": 0.4494, + "step": 3592 + }, + { + "epoch": 2.4162393880810287, + "grad_norm": 0.3133711489094439, + "learning_rate": 5.602851515151477e-06, + "loss": 0.4429, + "step": 3593 + }, + { + "epoch": 2.4169118265108853, + "grad_norm": 0.33802182001683545, + "learning_rate": 5.600665980906099e-06, + "loss": 0.4481, + "step": 3594 + }, + { + "epoch": 2.4175842649407415, + "grad_norm": 0.35171603486105313, + "learning_rate": 5.59848033020929e-06, + "loss": 0.4791, + "step": 3595 + }, + { + "epoch": 2.4182567033705977, + "grad_norm": 0.3591522629231598, + "learning_rate": 5.596294563484787e-06, + "loss": 0.4724, + "step": 3596 + }, + { + "epoch": 2.418929141800454, + "grad_norm": 0.447799486858325, + "learning_rate": 5.594108681156347e-06, + "loss": 0.4495, + "step": 3597 + }, + { + "epoch": 2.41960158023031, + "grad_norm": 0.3290264637523835, + "learning_rate": 5.591922683647744e-06, + "loss": 0.4495, + "step": 3598 + }, + { + "epoch": 2.4202740186601663, + "grad_norm": 0.3379787091837133, + "learning_rate": 5.589736571382781e-06, + "loss": 0.4592, + "step": 3599 + }, + { + "epoch": 2.4209464570900225, + "grad_norm": 0.32521436538097526, + "learning_rate": 5.587550344785278e-06, + "loss": 0.4619, + "step": 3600 + }, + { + "epoch": 2.421618895519879, + "grad_norm": 0.3701978164103939, + "learning_rate": 5.585364004279083e-06, + "loss": 0.4436, + "step": 3601 + }, + { + "epoch": 2.4222913339497354, + "grad_norm": 0.3847282583347662, + "learning_rate": 5.583177550288063e-06, + "loss": 0.457, + "step": 3602 + }, + { + "epoch": 2.4229637723795916, + "grad_norm": 0.31382599299470876, + "learning_rate": 5.580990983236105e-06, + "loss": 0.4668, + "step": 3603 + }, + { + "epoch": 2.423636210809448, + "grad_norm": 0.3389055913276392, + "learning_rate": 5.5788043035471205e-06, + "loss": 0.4558, + "step": 3604 + }, + { + "epoch": 2.424308649239304, + "grad_norm": 1.0617375171131134, + "learning_rate": 5.576617511645044e-06, + "loss": 0.4664, + "step": 3605 + }, + { + "epoch": 2.42498108766916, + "grad_norm": 0.3532471892301109, + "learning_rate": 5.574430607953827e-06, + "loss": 0.4551, + "step": 3606 + }, + { + "epoch": 2.4256535260990164, + "grad_norm": 0.3346822353524649, + "learning_rate": 5.572243592897449e-06, + "loss": 0.4539, + "step": 3607 + }, + { + "epoch": 2.426325964528873, + "grad_norm": 0.36237163805522626, + "learning_rate": 5.570056466899904e-06, + "loss": 0.4612, + "step": 3608 + }, + { + "epoch": 2.4269984029587293, + "grad_norm": 0.3128470088619246, + "learning_rate": 5.567869230385214e-06, + "loss": 0.4469, + "step": 3609 + }, + { + "epoch": 
2.4276708413885855, + "grad_norm": 0.34861094074937193, + "learning_rate": 5.565681883777418e-06, + "loss": 0.4551, + "step": 3610 + }, + { + "epoch": 2.4283432798184417, + "grad_norm": 0.4297843609897647, + "learning_rate": 5.56349442750058e-06, + "loss": 0.4449, + "step": 3611 + }, + { + "epoch": 2.429015718248298, + "grad_norm": 0.3477250631144471, + "learning_rate": 5.561306861978783e-06, + "loss": 0.4398, + "step": 3612 + }, + { + "epoch": 2.429688156678154, + "grad_norm": 0.3509821263654013, + "learning_rate": 5.559119187636129e-06, + "loss": 0.4441, + "step": 3613 + }, + { + "epoch": 2.4303605951080103, + "grad_norm": 0.35737277401120054, + "learning_rate": 5.556931404896747e-06, + "loss": 0.4494, + "step": 3614 + }, + { + "epoch": 2.431033033537867, + "grad_norm": 0.3625938554539421, + "learning_rate": 5.554743514184781e-06, + "loss": 0.4558, + "step": 3615 + }, + { + "epoch": 2.431705471967723, + "grad_norm": 0.36630770612755065, + "learning_rate": 5.552555515924398e-06, + "loss": 0.4443, + "step": 3616 + }, + { + "epoch": 2.4323779103975793, + "grad_norm": 0.33239617734472793, + "learning_rate": 5.550367410539788e-06, + "loss": 0.4478, + "step": 3617 + }, + { + "epoch": 2.4330503488274355, + "grad_norm": 0.7626069649045053, + "learning_rate": 5.5481791984551614e-06, + "loss": 0.4503, + "step": 3618 + }, + { + "epoch": 2.4337227872572917, + "grad_norm": 0.3158254591145617, + "learning_rate": 5.545990880094747e-06, + "loss": 0.4589, + "step": 3619 + }, + { + "epoch": 2.434395225687148, + "grad_norm": 0.3522408038054907, + "learning_rate": 5.543802455882793e-06, + "loss": 0.4539, + "step": 3620 + }, + { + "epoch": 2.435067664117004, + "grad_norm": 0.35852926157212767, + "learning_rate": 5.541613926243573e-06, + "loss": 0.4552, + "step": 3621 + }, + { + "epoch": 2.435740102546861, + "grad_norm": 0.3039993342004631, + "learning_rate": 5.539425291601378e-06, + "loss": 0.4565, + "step": 3622 + }, + { + "epoch": 2.436412540976717, + "grad_norm": 0.34123599831642026, + "learning_rate": 5.537236552380518e-06, + "loss": 0.4699, + "step": 3623 + }, + { + "epoch": 2.437084979406573, + "grad_norm": 0.34602669178981266, + "learning_rate": 5.535047709005327e-06, + "loss": 0.4612, + "step": 3624 + }, + { + "epoch": 2.4377574178364294, + "grad_norm": 0.38467285537375934, + "learning_rate": 5.532858761900156e-06, + "loss": 0.4437, + "step": 3625 + }, + { + "epoch": 2.4384298562662856, + "grad_norm": 0.3498196149534507, + "learning_rate": 5.5306697114893785e-06, + "loss": 0.4524, + "step": 3626 + }, + { + "epoch": 2.439102294696142, + "grad_norm": 0.3548262072706039, + "learning_rate": 5.5284805581973854e-06, + "loss": 0.4547, + "step": 3627 + }, + { + "epoch": 2.439774733125998, + "grad_norm": 0.3799419694111064, + "learning_rate": 5.526291302448589e-06, + "loss": 0.4424, + "step": 3628 + }, + { + "epoch": 2.4404471715558547, + "grad_norm": 0.9342712842749369, + "learning_rate": 5.5241019446674235e-06, + "loss": 0.4591, + "step": 3629 + }, + { + "epoch": 2.441119609985711, + "grad_norm": 0.3836099289086837, + "learning_rate": 5.52191248527834e-06, + "loss": 0.46, + "step": 3630 + }, + { + "epoch": 2.441792048415567, + "grad_norm": 0.3213034553405655, + "learning_rate": 5.519722924705808e-06, + "loss": 0.4524, + "step": 3631 + }, + { + "epoch": 2.4424644868454233, + "grad_norm": 0.3457962788868057, + "learning_rate": 5.517533263374322e-06, + "loss": 0.4429, + "step": 3632 + }, + { + "epoch": 2.4431369252752795, + "grad_norm": 0.30079475711553777, + "learning_rate": 5.51534350170839e-06, + "loss": 
0.4608, + "step": 3633 + }, + { + "epoch": 2.4438093637051357, + "grad_norm": 0.33057774220755387, + "learning_rate": 5.513153640132543e-06, + "loss": 0.4751, + "step": 3634 + }, + { + "epoch": 2.444481802134992, + "grad_norm": 0.6793162709862243, + "learning_rate": 5.5109636790713315e-06, + "loss": 0.4369, + "step": 3635 + }, + { + "epoch": 2.445154240564848, + "grad_norm": 0.30836110440964787, + "learning_rate": 5.508773618949326e-06, + "loss": 0.4536, + "step": 3636 + }, + { + "epoch": 2.4458266789947047, + "grad_norm": 0.3242160993777931, + "learning_rate": 5.50658346019111e-06, + "loss": 0.4322, + "step": 3637 + }, + { + "epoch": 2.446499117424561, + "grad_norm": 0.4923792011120225, + "learning_rate": 5.504393203221294e-06, + "loss": 0.451, + "step": 3638 + }, + { + "epoch": 2.447171555854417, + "grad_norm": 0.3273613311767987, + "learning_rate": 5.502202848464504e-06, + "loss": 0.4545, + "step": 3639 + }, + { + "epoch": 2.4478439942842734, + "grad_norm": 0.37767840103116296, + "learning_rate": 5.500012396345385e-06, + "loss": 0.4412, + "step": 3640 + }, + { + "epoch": 2.4485164327141296, + "grad_norm": 0.39168523194271326, + "learning_rate": 5.497821847288599e-06, + "loss": 0.4675, + "step": 3641 + }, + { + "epoch": 2.4491888711439858, + "grad_norm": 0.38593499549881277, + "learning_rate": 5.4956312017188315e-06, + "loss": 0.4667, + "step": 3642 + }, + { + "epoch": 2.449861309573842, + "grad_norm": 0.3615940397816671, + "learning_rate": 5.493440460060785e-06, + "loss": 0.4512, + "step": 3643 + }, + { + "epoch": 2.450533748003698, + "grad_norm": 0.3281921881336059, + "learning_rate": 5.491249622739177e-06, + "loss": 0.4531, + "step": 3644 + }, + { + "epoch": 2.451206186433555, + "grad_norm": 0.34808784454053615, + "learning_rate": 5.489058690178748e-06, + "loss": 0.4414, + "step": 3645 + }, + { + "epoch": 2.451878624863411, + "grad_norm": 0.35169534590584306, + "learning_rate": 5.4868676628042524e-06, + "loss": 0.4396, + "step": 3646 + }, + { + "epoch": 2.452551063293267, + "grad_norm": 0.32127016672844627, + "learning_rate": 5.484676541040471e-06, + "loss": 0.4441, + "step": 3647 + }, + { + "epoch": 2.4532235017231234, + "grad_norm": 0.42630039919354706, + "learning_rate": 5.482485325312192e-06, + "loss": 0.453, + "step": 3648 + }, + { + "epoch": 2.4538959401529796, + "grad_norm": 0.3802912671793287, + "learning_rate": 5.480294016044232e-06, + "loss": 0.4543, + "step": 3649 + }, + { + "epoch": 2.454568378582836, + "grad_norm": 0.44748190503299723, + "learning_rate": 5.478102613661419e-06, + "loss": 0.4687, + "step": 3650 + }, + { + "epoch": 2.455240817012692, + "grad_norm": 0.3925107294918871, + "learning_rate": 5.475911118588603e-06, + "loss": 0.4542, + "step": 3651 + }, + { + "epoch": 2.4559132554425487, + "grad_norm": 0.30168919377227377, + "learning_rate": 5.473719531250648e-06, + "loss": 0.4424, + "step": 3652 + }, + { + "epoch": 2.456585693872405, + "grad_norm": 0.3977451541372308, + "learning_rate": 5.471527852072439e-06, + "loss": 0.4498, + "step": 3653 + }, + { + "epoch": 2.457258132302261, + "grad_norm": 0.3721355148940516, + "learning_rate": 5.46933608147888e-06, + "loss": 0.4476, + "step": 3654 + }, + { + "epoch": 2.4579305707321173, + "grad_norm": 0.35096306924828724, + "learning_rate": 5.467144219894888e-06, + "loss": 0.4413, + "step": 3655 + }, + { + "epoch": 2.4586030091619735, + "grad_norm": 0.3745171435882103, + "learning_rate": 5.4649522677454025e-06, + "loss": 0.4475, + "step": 3656 + }, + { + "epoch": 2.4592754475918297, + "grad_norm": 0.34740641576534054, + 
"learning_rate": 5.4627602254553756e-06, + "loss": 0.4531, + "step": 3657 + }, + { + "epoch": 2.459947886021686, + "grad_norm": 0.31338871132628493, + "learning_rate": 5.460568093449782e-06, + "loss": 0.454, + "step": 3658 + }, + { + "epoch": 2.4606203244515426, + "grad_norm": 0.3593929671118801, + "learning_rate": 5.458375872153611e-06, + "loss": 0.4474, + "step": 3659 + }, + { + "epoch": 2.4612927628813988, + "grad_norm": 0.33493349981955667, + "learning_rate": 5.45618356199187e-06, + "loss": 0.4511, + "step": 3660 + }, + { + "epoch": 2.461965201311255, + "grad_norm": 0.3787649160828917, + "learning_rate": 5.453991163389583e-06, + "loss": 0.4399, + "step": 3661 + }, + { + "epoch": 2.462637639741111, + "grad_norm": 0.32480355204924527, + "learning_rate": 5.451798676771793e-06, + "loss": 0.4438, + "step": 3662 + }, + { + "epoch": 2.4633100781709674, + "grad_norm": 0.31279420879956654, + "learning_rate": 5.449606102563554e-06, + "loss": 0.4452, + "step": 3663 + }, + { + "epoch": 2.4639825166008236, + "grad_norm": 0.28327007386162856, + "learning_rate": 5.447413441189945e-06, + "loss": 0.4472, + "step": 3664 + }, + { + "epoch": 2.4646549550306798, + "grad_norm": 0.3486201881078406, + "learning_rate": 5.445220693076058e-06, + "loss": 0.4563, + "step": 3665 + }, + { + "epoch": 2.4653273934605364, + "grad_norm": 0.36849582646330364, + "learning_rate": 5.443027858647002e-06, + "loss": 0.4522, + "step": 3666 + }, + { + "epoch": 2.4659998318903926, + "grad_norm": 0.3301145167435137, + "learning_rate": 5.440834938327905e-06, + "loss": 0.4584, + "step": 3667 + }, + { + "epoch": 2.466672270320249, + "grad_norm": 0.3003337613365606, + "learning_rate": 5.438641932543905e-06, + "loss": 0.4464, + "step": 3668 + }, + { + "epoch": 2.467344708750105, + "grad_norm": 0.3347995636249391, + "learning_rate": 5.436448841720166e-06, + "loss": 0.435, + "step": 3669 + }, + { + "epoch": 2.4680171471799612, + "grad_norm": 0.30100475327165044, + "learning_rate": 5.43425566628186e-06, + "loss": 0.4565, + "step": 3670 + }, + { + "epoch": 2.4686895856098174, + "grad_norm": 0.38738774674073373, + "learning_rate": 5.432062406654182e-06, + "loss": 0.4513, + "step": 3671 + }, + { + "epoch": 2.4693620240396736, + "grad_norm": 0.3242614621730323, + "learning_rate": 5.4298690632623375e-06, + "loss": 0.454, + "step": 3672 + }, + { + "epoch": 2.4700344624695303, + "grad_norm": 0.3905562361155058, + "learning_rate": 5.427675636531552e-06, + "loss": 0.444, + "step": 3673 + }, + { + "epoch": 2.4707069008993865, + "grad_norm": 0.34570382106032044, + "learning_rate": 5.425482126887067e-06, + "loss": 0.4396, + "step": 3674 + }, + { + "epoch": 2.4713793393292427, + "grad_norm": 0.3403550891713755, + "learning_rate": 5.423288534754141e-06, + "loss": 0.4522, + "step": 3675 + }, + { + "epoch": 2.472051777759099, + "grad_norm": 0.3318834277282211, + "learning_rate": 5.421094860558045e-06, + "loss": 0.4388, + "step": 3676 + }, + { + "epoch": 2.472724216188955, + "grad_norm": 0.29745058737843016, + "learning_rate": 5.4189011047240655e-06, + "loss": 0.4707, + "step": 3677 + }, + { + "epoch": 2.4733966546188113, + "grad_norm": 0.3149220065122759, + "learning_rate": 5.416707267677512e-06, + "loss": 0.4372, + "step": 3678 + }, + { + "epoch": 2.4740690930486675, + "grad_norm": 0.32808004579387384, + "learning_rate": 5.414513349843702e-06, + "loss": 0.4561, + "step": 3679 + }, + { + "epoch": 2.474741531478524, + "grad_norm": 0.38005276723555714, + "learning_rate": 5.412319351647971e-06, + "loss": 0.4572, + "step": 3680 + }, + { + "epoch": 
2.4754139699083804, + "grad_norm": 0.32430615453417455, + "learning_rate": 5.410125273515672e-06, + "loss": 0.4377, + "step": 3681 + }, + { + "epoch": 2.4760864083382366, + "grad_norm": 0.29363910335500876, + "learning_rate": 5.407931115872169e-06, + "loss": 0.4345, + "step": 3682 + }, + { + "epoch": 2.4767588467680928, + "grad_norm": 0.390361054176332, + "learning_rate": 5.405736879142849e-06, + "loss": 0.45, + "step": 3683 + }, + { + "epoch": 2.477431285197949, + "grad_norm": 0.36684773614787974, + "learning_rate": 5.403542563753108e-06, + "loss": 0.4363, + "step": 3684 + }, + { + "epoch": 2.478103723627805, + "grad_norm": 0.3920056483082055, + "learning_rate": 5.401348170128359e-06, + "loss": 0.4523, + "step": 3685 + }, + { + "epoch": 2.4787761620576614, + "grad_norm": 1.4669592281148975, + "learning_rate": 5.399153698694029e-06, + "loss": 0.4544, + "step": 3686 + }, + { + "epoch": 2.479448600487518, + "grad_norm": 0.33742803366775703, + "learning_rate": 5.396959149875564e-06, + "loss": 0.4525, + "step": 3687 + }, + { + "epoch": 2.4801210389173742, + "grad_norm": 0.43895102427182286, + "learning_rate": 5.3947645240984185e-06, + "loss": 0.4398, + "step": 3688 + }, + { + "epoch": 2.4807934773472304, + "grad_norm": 0.3213947832025373, + "learning_rate": 5.392569821788069e-06, + "loss": 0.4261, + "step": 3689 + }, + { + "epoch": 2.4814659157770866, + "grad_norm": 0.35836800990492956, + "learning_rate": 5.390375043370002e-06, + "loss": 0.4289, + "step": 3690 + }, + { + "epoch": 2.482138354206943, + "grad_norm": 0.41082932360377455, + "learning_rate": 5.388180189269722e-06, + "loss": 0.466, + "step": 3691 + }, + { + "epoch": 2.482810792636799, + "grad_norm": 0.3616481356253951, + "learning_rate": 5.385985259912745e-06, + "loss": 0.4434, + "step": 3692 + }, + { + "epoch": 2.4834832310666553, + "grad_norm": 0.3233270418518311, + "learning_rate": 5.383790255724604e-06, + "loss": 0.4624, + "step": 3693 + }, + { + "epoch": 2.484155669496512, + "grad_norm": 0.3773964032169205, + "learning_rate": 5.381595177130844e-06, + "loss": 0.4633, + "step": 3694 + }, + { + "epoch": 2.484828107926368, + "grad_norm": 0.40606903638666175, + "learning_rate": 5.379400024557029e-06, + "loss": 0.4504, + "step": 3695 + }, + { + "epoch": 2.4855005463562243, + "grad_norm": 0.35668009511381027, + "learning_rate": 5.37720479842873e-06, + "loss": 0.4459, + "step": 3696 + }, + { + "epoch": 2.4861729847860805, + "grad_norm": 0.34781885195436796, + "learning_rate": 5.37500949917154e-06, + "loss": 0.4368, + "step": 3697 + }, + { + "epoch": 2.4868454232159367, + "grad_norm": 0.3977243510705404, + "learning_rate": 5.3728141272110625e-06, + "loss": 0.4397, + "step": 3698 + }, + { + "epoch": 2.487517861645793, + "grad_norm": 0.38920376735507745, + "learning_rate": 5.370618682972914e-06, + "loss": 0.4699, + "step": 3699 + }, + { + "epoch": 2.488190300075649, + "grad_norm": 0.535365852410878, + "learning_rate": 5.368423166882729e-06, + "loss": 0.4579, + "step": 3700 + }, + { + "epoch": 2.4888627385055058, + "grad_norm": 0.3562149949820014, + "learning_rate": 5.366227579366151e-06, + "loss": 0.4407, + "step": 3701 + }, + { + "epoch": 2.489535176935362, + "grad_norm": 0.3354089482329607, + "learning_rate": 5.364031920848841e-06, + "loss": 0.4611, + "step": 3702 + }, + { + "epoch": 2.490207615365218, + "grad_norm": 0.4049203957942453, + "learning_rate": 5.361836191756471e-06, + "loss": 0.4558, + "step": 3703 + }, + { + "epoch": 2.4908800537950744, + "grad_norm": 0.3010025011844406, + "learning_rate": 5.35964039251473e-06, + "loss": 
0.4443, + "step": 3704 + }, + { + "epoch": 2.4915524922249306, + "grad_norm": 0.3885268762782305, + "learning_rate": 5.3574445235493165e-06, + "loss": 0.4522, + "step": 3705 + }, + { + "epoch": 2.492224930654787, + "grad_norm": 0.9568793935382124, + "learning_rate": 5.355248585285946e-06, + "loss": 0.4513, + "step": 3706 + }, + { + "epoch": 2.492897369084643, + "grad_norm": 0.4198708586420078, + "learning_rate": 5.353052578150346e-06, + "loss": 0.454, + "step": 3707 + }, + { + "epoch": 2.4935698075144996, + "grad_norm": 0.3116920558206328, + "learning_rate": 5.350856502568258e-06, + "loss": 0.4531, + "step": 3708 + }, + { + "epoch": 2.494242245944356, + "grad_norm": 0.3530569433463639, + "learning_rate": 5.348660358965438e-06, + "loss": 0.4373, + "step": 3709 + }, + { + "epoch": 2.494914684374212, + "grad_norm": 0.6219596914149117, + "learning_rate": 5.346464147767649e-06, + "loss": 0.47, + "step": 3710 + }, + { + "epoch": 2.4955871228040682, + "grad_norm": 0.7200298627812961, + "learning_rate": 5.344267869400676e-06, + "loss": 0.4562, + "step": 3711 + }, + { + "epoch": 2.4962595612339245, + "grad_norm": 0.3831329382165244, + "learning_rate": 5.342071524290311e-06, + "loss": 0.4503, + "step": 3712 + }, + { + "epoch": 2.4969319996637807, + "grad_norm": 0.3730820369626398, + "learning_rate": 5.339875112862361e-06, + "loss": 0.4544, + "step": 3713 + }, + { + "epoch": 2.497604438093637, + "grad_norm": 0.9308006610674574, + "learning_rate": 5.337678635542641e-06, + "loss": 0.4455, + "step": 3714 + }, + { + "epoch": 2.4982768765234935, + "grad_norm": 0.3900152218356268, + "learning_rate": 5.335482092756991e-06, + "loss": 0.4519, + "step": 3715 + }, + { + "epoch": 2.4989493149533497, + "grad_norm": 0.34510470883456346, + "learning_rate": 5.33328548493125e-06, + "loss": 0.4614, + "step": 3716 + }, + { + "epoch": 2.499621753383206, + "grad_norm": 0.39301377708235846, + "learning_rate": 5.331088812491278e-06, + "loss": 0.4376, + "step": 3717 + }, + { + "epoch": 2.500294191813062, + "grad_norm": 0.4132391443171161, + "learning_rate": 5.3288920758629445e-06, + "loss": 0.4611, + "step": 3718 + }, + { + "epoch": 2.5009666302429183, + "grad_norm": 0.30379365709876277, + "learning_rate": 5.326695275472132e-06, + "loss": 0.462, + "step": 3719 + }, + { + "epoch": 2.5016390686727745, + "grad_norm": 0.3063310686320014, + "learning_rate": 5.324498411744737e-06, + "loss": 0.4438, + "step": 3720 + }, + { + "epoch": 2.5023115071026307, + "grad_norm": 0.3687412953586924, + "learning_rate": 5.322301485106663e-06, + "loss": 0.4485, + "step": 3721 + }, + { + "epoch": 2.5029839455324874, + "grad_norm": 0.30038806642290977, + "learning_rate": 5.320104495983831e-06, + "loss": 0.448, + "step": 3722 + }, + { + "epoch": 2.5036563839623436, + "grad_norm": 0.4042132931618547, + "learning_rate": 5.317907444802174e-06, + "loss": 0.4486, + "step": 3723 + }, + { + "epoch": 2.5043288223922, + "grad_norm": 0.38294548405814394, + "learning_rate": 5.315710331987634e-06, + "loss": 0.459, + "step": 3724 + }, + { + "epoch": 2.505001260822056, + "grad_norm": 0.2917400221339506, + "learning_rate": 5.313513157966166e-06, + "loss": 0.4398, + "step": 3725 + }, + { + "epoch": 2.505673699251912, + "grad_norm": 0.30874946783443197, + "learning_rate": 5.311315923163739e-06, + "loss": 0.4611, + "step": 3726 + }, + { + "epoch": 2.5063461376817684, + "grad_norm": 0.32034127394020057, + "learning_rate": 5.30911862800633e-06, + "loss": 0.4564, + "step": 3727 + }, + { + "epoch": 2.5070185761116246, + "grad_norm": 0.393814502245678, + "learning_rate": 
5.306921272919931e-06, + "loss": 0.4548, + "step": 3728 + }, + { + "epoch": 2.5076910145414812, + "grad_norm": 0.3589397375038359, + "learning_rate": 5.304723858330544e-06, + "loss": 0.4525, + "step": 3729 + }, + { + "epoch": 2.5083634529713374, + "grad_norm": 0.3886193975048878, + "learning_rate": 5.302526384664182e-06, + "loss": 0.4326, + "step": 3730 + }, + { + "epoch": 2.5090358914011937, + "grad_norm": 0.3619242724868831, + "learning_rate": 5.300328852346872e-06, + "loss": 0.4603, + "step": 3731 + }, + { + "epoch": 2.50970832983105, + "grad_norm": 0.3808986117863621, + "learning_rate": 5.29813126180465e-06, + "loss": 0.4587, + "step": 3732 + }, + { + "epoch": 2.510380768260906, + "grad_norm": 0.3818801429897, + "learning_rate": 5.295933613463565e-06, + "loss": 0.4528, + "step": 3733 + }, + { + "epoch": 2.5110532066907623, + "grad_norm": 0.3185246625786046, + "learning_rate": 5.2937359077496756e-06, + "loss": 0.4388, + "step": 3734 + }, + { + "epoch": 2.5117256451206185, + "grad_norm": 0.33417761556505876, + "learning_rate": 5.291538145089051e-06, + "loss": 0.4422, + "step": 3735 + }, + { + "epoch": 2.512398083550475, + "grad_norm": 0.3543133452008666, + "learning_rate": 5.289340325907774e-06, + "loss": 0.4375, + "step": 3736 + }, + { + "epoch": 2.5130705219803313, + "grad_norm": 0.45034041931345337, + "learning_rate": 5.287142450631937e-06, + "loss": 0.4511, + "step": 3737 + }, + { + "epoch": 2.5137429604101875, + "grad_norm": 0.3341749624172436, + "learning_rate": 5.284944519687641e-06, + "loss": 0.4419, + "step": 3738 + }, + { + "epoch": 2.5144153988400437, + "grad_norm": 0.31416689317385305, + "learning_rate": 5.2827465335010034e-06, + "loss": 0.4485, + "step": 3739 + }, + { + "epoch": 2.5150878372699, + "grad_norm": 0.32352161394960716, + "learning_rate": 5.280548492498145e-06, + "loss": 0.4637, + "step": 3740 + }, + { + "epoch": 2.515760275699756, + "grad_norm": 0.3596611050259769, + "learning_rate": 5.278350397105205e-06, + "loss": 0.4492, + "step": 3741 + }, + { + "epoch": 2.5164327141296123, + "grad_norm": 0.38533801199013906, + "learning_rate": 5.276152247748329e-06, + "loss": 0.4591, + "step": 3742 + }, + { + "epoch": 2.517105152559469, + "grad_norm": 0.39893128818506984, + "learning_rate": 5.27395404485367e-06, + "loss": 0.4515, + "step": 3743 + }, + { + "epoch": 2.517777590989325, + "grad_norm": 0.3601291730744921, + "learning_rate": 5.271755788847399e-06, + "loss": 0.4331, + "step": 3744 + }, + { + "epoch": 2.5184500294191814, + "grad_norm": 0.35341859376319246, + "learning_rate": 5.269557480155689e-06, + "loss": 0.4467, + "step": 3745 + }, + { + "epoch": 2.5191224678490376, + "grad_norm": 0.34460484487452686, + "learning_rate": 5.267359119204729e-06, + "loss": 0.4619, + "step": 3746 + }, + { + "epoch": 2.519794906278894, + "grad_norm": 0.3163956703666074, + "learning_rate": 5.2651607064207174e-06, + "loss": 0.4456, + "step": 3747 + }, + { + "epoch": 2.52046734470875, + "grad_norm": 0.38107463823791704, + "learning_rate": 5.262962242229861e-06, + "loss": 0.4604, + "step": 3748 + }, + { + "epoch": 2.521139783138606, + "grad_norm": 0.354401592955316, + "learning_rate": 5.260763727058377e-06, + "loss": 0.4598, + "step": 3749 + }, + { + "epoch": 2.521812221568463, + "grad_norm": 0.30038110187928374, + "learning_rate": 5.258565161332493e-06, + "loss": 0.4312, + "step": 3750 + }, + { + "epoch": 2.522484659998319, + "grad_norm": 0.35161551300999627, + "learning_rate": 5.256366545478444e-06, + "loss": 0.4369, + "step": 3751 + }, + { + "epoch": 2.5231570984281753, + "grad_norm": 
0.4154141265683533, + "learning_rate": 5.2541678799224795e-06, + "loss": 0.4545, + "step": 3752 + }, + { + "epoch": 2.5238295368580315, + "grad_norm": 0.2843520409523974, + "learning_rate": 5.251969165090857e-06, + "loss": 0.4477, + "step": 3753 + }, + { + "epoch": 2.5245019752878877, + "grad_norm": 0.3356459667390433, + "learning_rate": 5.2497704014098375e-06, + "loss": 0.4437, + "step": 3754 + }, + { + "epoch": 2.525174413717744, + "grad_norm": 0.40533334967123485, + "learning_rate": 5.2475715893057e-06, + "loss": 0.4461, + "step": 3755 + }, + { + "epoch": 2.5258468521476, + "grad_norm": 0.351820305830887, + "learning_rate": 5.24537272920473e-06, + "loss": 0.4497, + "step": 3756 + }, + { + "epoch": 2.5265192905774567, + "grad_norm": 0.36700532615869463, + "learning_rate": 5.243173821533221e-06, + "loss": 0.4603, + "step": 3757 + }, + { + "epoch": 2.527191729007313, + "grad_norm": 0.36421125558057277, + "learning_rate": 5.2409748667174746e-06, + "loss": 0.4508, + "step": 3758 + }, + { + "epoch": 2.527864167437169, + "grad_norm": 0.3593637659685638, + "learning_rate": 5.238775865183805e-06, + "loss": 0.4319, + "step": 3759 + }, + { + "epoch": 2.5285366058670253, + "grad_norm": 0.451127830436243, + "learning_rate": 5.236576817358533e-06, + "loss": 0.4504, + "step": 3760 + }, + { + "epoch": 2.5292090442968815, + "grad_norm": 0.3612158552582543, + "learning_rate": 5.2343777236679905e-06, + "loss": 0.4432, + "step": 3761 + }, + { + "epoch": 2.5298814827267377, + "grad_norm": 0.35963170197322686, + "learning_rate": 5.232178584538515e-06, + "loss": 0.4586, + "step": 3762 + }, + { + "epoch": 2.530553921156594, + "grad_norm": 0.3513240604985171, + "learning_rate": 5.229979400396455e-06, + "loss": 0.4506, + "step": 3763 + }, + { + "epoch": 2.5312263595864506, + "grad_norm": 0.40705859055602794, + "learning_rate": 5.227780171668169e-06, + "loss": 0.4413, + "step": 3764 + }, + { + "epoch": 2.531898798016307, + "grad_norm": 0.38056720078199147, + "learning_rate": 5.225580898780022e-06, + "loss": 0.449, + "step": 3765 + }, + { + "epoch": 2.532571236446163, + "grad_norm": 0.3344126266364116, + "learning_rate": 5.223381582158388e-06, + "loss": 0.4499, + "step": 3766 + }, + { + "epoch": 2.533243674876019, + "grad_norm": 0.3450151129401091, + "learning_rate": 5.2211822222296495e-06, + "loss": 0.4584, + "step": 3767 + }, + { + "epoch": 2.5339161133058754, + "grad_norm": 0.3125769248548215, + "learning_rate": 5.218982819420199e-06, + "loss": 0.4434, + "step": 3768 + }, + { + "epoch": 2.5345885517357316, + "grad_norm": 0.3459655360483346, + "learning_rate": 5.216783374156432e-06, + "loss": 0.4543, + "step": 3769 + }, + { + "epoch": 2.535260990165588, + "grad_norm": 0.33743900715413994, + "learning_rate": 5.21458388686476e-06, + "loss": 0.4444, + "step": 3770 + }, + { + "epoch": 2.5359334285954445, + "grad_norm": 0.34409415630074597, + "learning_rate": 5.212384357971599e-06, + "loss": 0.4341, + "step": 3771 + }, + { + "epoch": 2.5366058670253, + "grad_norm": 0.3347872267650656, + "learning_rate": 5.21018478790337e-06, + "loss": 0.4525, + "step": 3772 + }, + { + "epoch": 2.537278305455157, + "grad_norm": 0.32985605985546435, + "learning_rate": 5.207985177086506e-06, + "loss": 0.4302, + "step": 3773 + }, + { + "epoch": 2.537950743885013, + "grad_norm": 0.3319653933091309, + "learning_rate": 5.205785525947445e-06, + "loss": 0.4317, + "step": 3774 + }, + { + "epoch": 2.5386231823148693, + "grad_norm": 0.3318287561536541, + "learning_rate": 5.20358583491264e-06, + "loss": 0.4572, + "step": 3775 + }, + { + "epoch": 
2.5392956207447255, + "grad_norm": 0.2725473101671124, + "learning_rate": 5.20138610440854e-06, + "loss": 0.4362, + "step": 3776 + }, + { + "epoch": 2.5399680591745817, + "grad_norm": 0.6860141812033361, + "learning_rate": 5.199186334861612e-06, + "loss": 0.4323, + "step": 3777 + }, + { + "epoch": 2.5406404976044383, + "grad_norm": 0.3415301890154741, + "learning_rate": 5.1969865266983245e-06, + "loss": 0.4566, + "step": 3778 + }, + { + "epoch": 2.541312936034294, + "grad_norm": 0.3686557719090204, + "learning_rate": 5.1947866803451555e-06, + "loss": 0.4478, + "step": 3779 + }, + { + "epoch": 2.5419853744641507, + "grad_norm": 0.35319179579920057, + "learning_rate": 5.192586796228589e-06, + "loss": 0.4488, + "step": 3780 + }, + { + "epoch": 2.542657812894007, + "grad_norm": 0.3668355608939105, + "learning_rate": 5.190386874775123e-06, + "loss": 0.4484, + "step": 3781 + }, + { + "epoch": 2.543330251323863, + "grad_norm": 0.34965355223710987, + "learning_rate": 5.1881869164112516e-06, + "loss": 0.4337, + "step": 3782 + }, + { + "epoch": 2.5440026897537193, + "grad_norm": 0.35308599479483654, + "learning_rate": 5.185986921563485e-06, + "loss": 0.4373, + "step": 3783 + }, + { + "epoch": 2.5446751281835756, + "grad_norm": 0.324205088399401, + "learning_rate": 5.183786890658337e-06, + "loss": 0.4378, + "step": 3784 + }, + { + "epoch": 2.545347566613432, + "grad_norm": 0.380163684655463, + "learning_rate": 5.181586824122327e-06, + "loss": 0.4706, + "step": 3785 + }, + { + "epoch": 2.546020005043288, + "grad_norm": 0.33074437109441635, + "learning_rate": 5.1793867223819846e-06, + "loss": 0.4704, + "step": 3786 + }, + { + "epoch": 2.5466924434731446, + "grad_norm": 0.3292609650828323, + "learning_rate": 5.177186585863845e-06, + "loss": 0.45, + "step": 3787 + }, + { + "epoch": 2.547364881903001, + "grad_norm": 0.4139902078750704, + "learning_rate": 5.1749864149944485e-06, + "loss": 0.4398, + "step": 3788 + }, + { + "epoch": 2.548037320332857, + "grad_norm": 0.5993488931079183, + "learning_rate": 5.172786210200343e-06, + "loss": 0.4486, + "step": 3789 + }, + { + "epoch": 2.548709758762713, + "grad_norm": 0.42042569553752296, + "learning_rate": 5.170585971908087e-06, + "loss": 0.4456, + "step": 3790 + }, + { + "epoch": 2.5493821971925694, + "grad_norm": 0.3823238256484114, + "learning_rate": 5.168385700544239e-06, + "loss": 0.4457, + "step": 3791 + }, + { + "epoch": 2.550054635622426, + "grad_norm": 0.37123219611674746, + "learning_rate": 5.166185396535366e-06, + "loss": 0.4599, + "step": 3792 + }, + { + "epoch": 2.550727074052282, + "grad_norm": 0.36751036038349477, + "learning_rate": 5.1639850603080435e-06, + "loss": 0.4412, + "step": 3793 + }, + { + "epoch": 2.5513995124821385, + "grad_norm": 0.28656948143169914, + "learning_rate": 5.16178469228885e-06, + "loss": 0.4557, + "step": 3794 + }, + { + "epoch": 2.5520719509119947, + "grad_norm": 0.32446866213313197, + "learning_rate": 5.159584292904375e-06, + "loss": 0.4583, + "step": 3795 + }, + { + "epoch": 2.552744389341851, + "grad_norm": 0.5964162794538592, + "learning_rate": 5.157383862581209e-06, + "loss": 0.4458, + "step": 3796 + }, + { + "epoch": 2.553416827771707, + "grad_norm": 0.3297847044616623, + "learning_rate": 5.155183401745953e-06, + "loss": 0.4544, + "step": 3797 + }, + { + "epoch": 2.5540892662015633, + "grad_norm": 0.3327996968439367, + "learning_rate": 5.152982910825207e-06, + "loss": 0.437, + "step": 3798 + }, + { + "epoch": 2.55476170463142, + "grad_norm": 0.3178463020120286, + "learning_rate": 5.150782390245586e-06, + "loss": 
0.4384, + "step": 3799 + }, + { + "epoch": 2.5554341430612757, + "grad_norm": 0.41892183789374043, + "learning_rate": 5.148581840433703e-06, + "loss": 0.447, + "step": 3800 + }, + { + "epoch": 2.5561065814911323, + "grad_norm": 0.39518782274915, + "learning_rate": 5.1463812618161815e-06, + "loss": 0.4528, + "step": 3801 + }, + { + "epoch": 2.5567790199209885, + "grad_norm": 0.32549332411081666, + "learning_rate": 5.144180654819647e-06, + "loss": 0.4519, + "step": 3802 + }, + { + "epoch": 2.5574514583508448, + "grad_norm": 0.38308949144270205, + "learning_rate": 5.1419800198707335e-06, + "loss": 0.4513, + "step": 3803 + }, + { + "epoch": 2.558123896780701, + "grad_norm": 0.3117899154840055, + "learning_rate": 5.13977935739608e-06, + "loss": 0.4426, + "step": 3804 + }, + { + "epoch": 2.558796335210557, + "grad_norm": 0.38078454486838426, + "learning_rate": 5.13757866782233e-06, + "loss": 0.4416, + "step": 3805 + }, + { + "epoch": 2.559468773640414, + "grad_norm": 0.3892915045158023, + "learning_rate": 5.135377951576129e-06, + "loss": 0.4484, + "step": 3806 + }, + { + "epoch": 2.5601412120702696, + "grad_norm": 0.3922351660486267, + "learning_rate": 5.133177209084135e-06, + "loss": 0.4734, + "step": 3807 + }, + { + "epoch": 2.560813650500126, + "grad_norm": 0.3145125925952972, + "learning_rate": 5.130976440773007e-06, + "loss": 0.4418, + "step": 3808 + }, + { + "epoch": 2.5614860889299824, + "grad_norm": 0.33503272386293115, + "learning_rate": 5.128775647069406e-06, + "loss": 0.4403, + "step": 3809 + }, + { + "epoch": 2.5621585273598386, + "grad_norm": 0.3206081131155908, + "learning_rate": 5.126574828400005e-06, + "loss": 0.4657, + "step": 3810 + }, + { + "epoch": 2.562830965789695, + "grad_norm": 0.3677827674996036, + "learning_rate": 5.124373985191473e-06, + "loss": 0.4607, + "step": 3811 + }, + { + "epoch": 2.563503404219551, + "grad_norm": 0.3313057068776185, + "learning_rate": 5.122173117870493e-06, + "loss": 0.439, + "step": 3812 + }, + { + "epoch": 2.5641758426494077, + "grad_norm": 0.33844242490608817, + "learning_rate": 5.119972226863746e-06, + "loss": 0.4511, + "step": 3813 + }, + { + "epoch": 2.5648482810792634, + "grad_norm": 0.46303435953046407, + "learning_rate": 5.117771312597921e-06, + "loss": 0.4489, + "step": 3814 + }, + { + "epoch": 2.56552071950912, + "grad_norm": 0.3947544136560684, + "learning_rate": 5.11557037549971e-06, + "loss": 0.4497, + "step": 3815 + }, + { + "epoch": 2.5661931579389763, + "grad_norm": 0.32707548217635574, + "learning_rate": 5.113369415995811e-06, + "loss": 0.4528, + "step": 3816 + }, + { + "epoch": 2.5668655963688325, + "grad_norm": 0.3493213925953552, + "learning_rate": 5.1111684345129216e-06, + "loss": 0.4442, + "step": 3817 + }, + { + "epoch": 2.5675380347986887, + "grad_norm": 0.32379197184450675, + "learning_rate": 5.10896743147775e-06, + "loss": 0.4392, + "step": 3818 + }, + { + "epoch": 2.568210473228545, + "grad_norm": 0.3734254339295873, + "learning_rate": 5.106766407317005e-06, + "loss": 0.4531, + "step": 3819 + }, + { + "epoch": 2.5688829116584015, + "grad_norm": 0.40420746016090003, + "learning_rate": 5.104565362457402e-06, + "loss": 0.4493, + "step": 3820 + }, + { + "epoch": 2.5695553500882573, + "grad_norm": 0.35353410656708223, + "learning_rate": 5.102364297325658e-06, + "loss": 0.4496, + "step": 3821 + }, + { + "epoch": 2.570227788518114, + "grad_norm": 0.3325958370960057, + "learning_rate": 5.100163212348492e-06, + "loss": 0.4439, + "step": 3822 + }, + { + "epoch": 2.57090022694797, + "grad_norm": 0.3045565716634522, + 
"learning_rate": 5.097962107952634e-06, + "loss": 0.4547, + "step": 3823 + }, + { + "epoch": 2.5715726653778264, + "grad_norm": 0.32827460477458903, + "learning_rate": 5.0957609845648095e-06, + "loss": 0.4599, + "step": 3824 + }, + { + "epoch": 2.5722451038076826, + "grad_norm": 0.3896500200822667, + "learning_rate": 5.093559842611753e-06, + "loss": 0.4547, + "step": 3825 + }, + { + "epoch": 2.5729175422375388, + "grad_norm": 0.3558175143222838, + "learning_rate": 5.091358682520201e-06, + "loss": 0.4546, + "step": 3826 + }, + { + "epoch": 2.5735899806673954, + "grad_norm": 0.49023083575365556, + "learning_rate": 5.089157504716892e-06, + "loss": 0.4364, + "step": 3827 + }, + { + "epoch": 2.574262419097251, + "grad_norm": 0.46450442763910227, + "learning_rate": 5.086956309628571e-06, + "loss": 0.4422, + "step": 3828 + }, + { + "epoch": 2.574934857527108, + "grad_norm": 0.2988745132917568, + "learning_rate": 5.084755097681986e-06, + "loss": 0.4425, + "step": 3829 + }, + { + "epoch": 2.575607295956964, + "grad_norm": 0.4269834171435638, + "learning_rate": 5.0825538693038855e-06, + "loss": 0.451, + "step": 3830 + }, + { + "epoch": 2.5762797343868202, + "grad_norm": 0.3509948140622454, + "learning_rate": 5.080352624921024e-06, + "loss": 0.4457, + "step": 3831 + }, + { + "epoch": 2.5769521728166764, + "grad_norm": 0.3429616383589686, + "learning_rate": 5.078151364960155e-06, + "loss": 0.4581, + "step": 3832 + }, + { + "epoch": 2.5776246112465326, + "grad_norm": 0.3287023293291114, + "learning_rate": 5.075950089848042e-06, + "loss": 0.4388, + "step": 3833 + }, + { + "epoch": 2.578297049676389, + "grad_norm": 0.39127126728235045, + "learning_rate": 5.0737488000114445e-06, + "loss": 0.4593, + "step": 3834 + }, + { + "epoch": 2.578969488106245, + "grad_norm": 0.3211215536959106, + "learning_rate": 5.0715474958771285e-06, + "loss": 0.4402, + "step": 3835 + }, + { + "epoch": 2.5796419265361017, + "grad_norm": 0.4274164835536321, + "learning_rate": 5.069346177871861e-06, + "loss": 0.432, + "step": 3836 + }, + { + "epoch": 2.580314364965958, + "grad_norm": 0.314409178497314, + "learning_rate": 5.067144846422414e-06, + "loss": 0.4412, + "step": 3837 + }, + { + "epoch": 2.580986803395814, + "grad_norm": 0.30381499647440013, + "learning_rate": 5.064943501955561e-06, + "loss": 0.4449, + "step": 3838 + }, + { + "epoch": 2.5816592418256703, + "grad_norm": 0.3075225485816905, + "learning_rate": 5.062742144898077e-06, + "loss": 0.4333, + "step": 3839 + }, + { + "epoch": 2.5823316802555265, + "grad_norm": 0.31533603082366796, + "learning_rate": 5.060540775676741e-06, + "loss": 0.4426, + "step": 3840 + }, + { + "epoch": 2.5830041186853827, + "grad_norm": 0.40525331303601203, + "learning_rate": 5.058339394718334e-06, + "loss": 0.4429, + "step": 3841 + }, + { + "epoch": 2.583676557115239, + "grad_norm": 0.3258861688476787, + "learning_rate": 5.056138002449637e-06, + "loss": 0.4559, + "step": 3842 + }, + { + "epoch": 2.5843489955450956, + "grad_norm": 0.4413250887046823, + "learning_rate": 5.053936599297434e-06, + "loss": 0.4413, + "step": 3843 + }, + { + "epoch": 2.5850214339749518, + "grad_norm": 0.34107229576023107, + "learning_rate": 5.051735185688517e-06, + "loss": 0.4546, + "step": 3844 + }, + { + "epoch": 2.585693872404808, + "grad_norm": 0.35051767639704207, + "learning_rate": 5.049533762049672e-06, + "loss": 0.429, + "step": 3845 + }, + { + "epoch": 2.586366310834664, + "grad_norm": 0.31965601826054163, + "learning_rate": 5.04733232880769e-06, + "loss": 0.4484, + "step": 3846 + }, + { + "epoch": 
2.5870387492645204, + "grad_norm": 0.31771321814884435, + "learning_rate": 5.045130886389366e-06, + "loss": 0.4491, + "step": 3847 + }, + { + "epoch": 2.5877111876943766, + "grad_norm": 0.4950577405834247, + "learning_rate": 5.04292943522149e-06, + "loss": 0.4632, + "step": 3848 + }, + { + "epoch": 2.588383626124233, + "grad_norm": 0.3789943252896637, + "learning_rate": 5.040727975730866e-06, + "loss": 0.4484, + "step": 3849 + }, + { + "epoch": 2.5890560645540894, + "grad_norm": 0.3349219315480771, + "learning_rate": 5.038526508344286e-06, + "loss": 0.4377, + "step": 3850 + }, + { + "epoch": 2.5897285029839456, + "grad_norm": 0.29139705497556456, + "learning_rate": 5.036325033488552e-06, + "loss": 0.4329, + "step": 3851 + }, + { + "epoch": 2.590400941413802, + "grad_norm": 0.31799540729427056, + "learning_rate": 5.034123551590464e-06, + "loss": 0.4499, + "step": 3852 + }, + { + "epoch": 2.591073379843658, + "grad_norm": 0.46667337205344145, + "learning_rate": 5.031922063076825e-06, + "loss": 0.4435, + "step": 3853 + }, + { + "epoch": 2.5917458182735142, + "grad_norm": 0.3173954396303586, + "learning_rate": 5.029720568374441e-06, + "loss": 0.4434, + "step": 3854 + }, + { + "epoch": 2.5924182567033704, + "grad_norm": 0.4038778883302914, + "learning_rate": 5.027519067910113e-06, + "loss": 0.4562, + "step": 3855 + }, + { + "epoch": 2.5930906951332267, + "grad_norm": 0.3334517592595017, + "learning_rate": 5.0253175621106496e-06, + "loss": 0.4641, + "step": 3856 + }, + { + "epoch": 2.5937631335630833, + "grad_norm": 0.34674178762730656, + "learning_rate": 5.023116051402857e-06, + "loss": 0.4506, + "step": 3857 + }, + { + "epoch": 2.5944355719929395, + "grad_norm": 0.33680663953221074, + "learning_rate": 5.020914536213544e-06, + "loss": 0.4601, + "step": 3858 + }, + { + "epoch": 2.5951080104227957, + "grad_norm": 0.29991687118698995, + "learning_rate": 5.018713016969518e-06, + "loss": 0.4384, + "step": 3859 + }, + { + "epoch": 2.595780448852652, + "grad_norm": 0.6532795206230446, + "learning_rate": 5.016511494097589e-06, + "loss": 0.444, + "step": 3860 + }, + { + "epoch": 2.596452887282508, + "grad_norm": 0.40399908418384883, + "learning_rate": 5.014309968024569e-06, + "loss": 0.4498, + "step": 3861 + }, + { + "epoch": 2.5971253257123643, + "grad_norm": 0.4697959446174812, + "learning_rate": 5.0121084391772675e-06, + "loss": 0.4368, + "step": 3862 + }, + { + "epoch": 2.5977977641422205, + "grad_norm": 0.33135513650808096, + "learning_rate": 5.009906907982498e-06, + "loss": 0.4527, + "step": 3863 + }, + { + "epoch": 2.598470202572077, + "grad_norm": 0.4101478794356378, + "learning_rate": 5.007705374867069e-06, + "loss": 0.46, + "step": 3864 + }, + { + "epoch": 2.5991426410019334, + "grad_norm": 0.38900473343673664, + "learning_rate": 5.005503840257797e-06, + "loss": 0.4541, + "step": 3865 + }, + { + "epoch": 2.5998150794317896, + "grad_norm": 0.35719574910883917, + "learning_rate": 5.003302304581491e-06, + "loss": 0.4462, + "step": 3866 + }, + { + "epoch": 2.600487517861646, + "grad_norm": 0.3964272408590807, + "learning_rate": 5.001100768264967e-06, + "loss": 0.4498, + "step": 3867 + }, + { + "epoch": 2.601159956291502, + "grad_norm": 0.45572001774437354, + "learning_rate": 4.998899231735036e-06, + "loss": 0.4397, + "step": 3868 + }, + { + "epoch": 2.601832394721358, + "grad_norm": 0.3854538081021313, + "learning_rate": 4.996697695418509e-06, + "loss": 0.4431, + "step": 3869 + }, + { + "epoch": 2.6025048331512144, + "grad_norm": 0.46467833571113215, + "learning_rate": 4.9944961597422045e-06, + 
"loss": 0.4464, + "step": 3870 + }, + { + "epoch": 2.603177271581071, + "grad_norm": 0.49315106615561277, + "learning_rate": 4.992294625132931e-06, + "loss": 0.4454, + "step": 3871 + }, + { + "epoch": 2.6038497100109272, + "grad_norm": 0.41126821224026316, + "learning_rate": 4.990093092017505e-06, + "loss": 0.4275, + "step": 3872 + }, + { + "epoch": 2.6045221484407834, + "grad_norm": 0.33764530796775627, + "learning_rate": 4.987891560822734e-06, + "loss": 0.4448, + "step": 3873 + }, + { + "epoch": 2.6051945868706397, + "grad_norm": 0.3960123488009799, + "learning_rate": 4.9856900319754325e-06, + "loss": 0.4433, + "step": 3874 + }, + { + "epoch": 2.605867025300496, + "grad_norm": 0.40780345359310755, + "learning_rate": 4.983488505902412e-06, + "loss": 0.4566, + "step": 3875 + }, + { + "epoch": 2.606539463730352, + "grad_norm": 0.7557178612501396, + "learning_rate": 4.981286983030483e-06, + "loss": 0.4555, + "step": 3876 + }, + { + "epoch": 2.6072119021602083, + "grad_norm": 0.42171713653694304, + "learning_rate": 4.979085463786458e-06, + "loss": 0.4435, + "step": 3877 + }, + { + "epoch": 2.607884340590065, + "grad_norm": 0.3340723064181383, + "learning_rate": 4.976883948597144e-06, + "loss": 0.4303, + "step": 3878 + }, + { + "epoch": 2.608556779019921, + "grad_norm": 0.29293861353368855, + "learning_rate": 4.974682437889351e-06, + "loss": 0.4565, + "step": 3879 + }, + { + "epoch": 2.6092292174497773, + "grad_norm": 0.31253385788814575, + "learning_rate": 4.972480932089887e-06, + "loss": 0.4412, + "step": 3880 + }, + { + "epoch": 2.6099016558796335, + "grad_norm": 0.42975688666696954, + "learning_rate": 4.970279431625562e-06, + "loss": 0.4285, + "step": 3881 + }, + { + "epoch": 2.6105740943094897, + "grad_norm": 0.33551292043635644, + "learning_rate": 4.9680779369231765e-06, + "loss": 0.4223, + "step": 3882 + }, + { + "epoch": 2.611246532739346, + "grad_norm": 0.35565215137443845, + "learning_rate": 4.965876448409538e-06, + "loss": 0.4392, + "step": 3883 + }, + { + "epoch": 2.611918971169202, + "grad_norm": 0.3795433031972901, + "learning_rate": 4.96367496651145e-06, + "loss": 0.4491, + "step": 3884 + }, + { + "epoch": 2.612591409599059, + "grad_norm": 0.33398281760717086, + "learning_rate": 4.961473491655716e-06, + "loss": 0.4491, + "step": 3885 + }, + { + "epoch": 2.613263848028915, + "grad_norm": 0.34789460060253946, + "learning_rate": 4.9592720242691365e-06, + "loss": 0.4449, + "step": 3886 + }, + { + "epoch": 2.613936286458771, + "grad_norm": 0.7521759371904863, + "learning_rate": 4.95707056477851e-06, + "loss": 0.462, + "step": 3887 + }, + { + "epoch": 2.6146087248886274, + "grad_norm": 0.39615418310057654, + "learning_rate": 4.954869113610636e-06, + "loss": 0.4371, + "step": 3888 + }, + { + "epoch": 2.6152811633184836, + "grad_norm": 0.3072143702625055, + "learning_rate": 4.9526676711923105e-06, + "loss": 0.4308, + "step": 3889 + }, + { + "epoch": 2.61595360174834, + "grad_norm": 0.4958232349535628, + "learning_rate": 4.950466237950331e-06, + "loss": 0.4411, + "step": 3890 + }, + { + "epoch": 2.616626040178196, + "grad_norm": 0.3206297035074455, + "learning_rate": 4.948264814311485e-06, + "loss": 0.4503, + "step": 3891 + }, + { + "epoch": 2.6172984786080526, + "grad_norm": 0.34153060861871193, + "learning_rate": 4.9460634007025666e-06, + "loss": 0.4371, + "step": 3892 + }, + { + "epoch": 2.617970917037909, + "grad_norm": 0.34692001072221584, + "learning_rate": 4.943861997550364e-06, + "loss": 0.4316, + "step": 3893 + }, + { + "epoch": 2.618643355467765, + "grad_norm": 
0.30897086469776225, + "learning_rate": 4.941660605281669e-06, + "loss": 0.44, + "step": 3894 + }, + { + "epoch": 2.6193157938976213, + "grad_norm": 0.2964363692863969, + "learning_rate": 4.93945922432326e-06, + "loss": 0.4264, + "step": 3895 + }, + { + "epoch": 2.6199882323274775, + "grad_norm": 0.3451149147036572, + "learning_rate": 4.937257855101924e-06, + "loss": 0.4494, + "step": 3896 + }, + { + "epoch": 2.6206606707573337, + "grad_norm": 0.3723792970190891, + "learning_rate": 4.9350564980444395e-06, + "loss": 0.4466, + "step": 3897 + }, + { + "epoch": 2.62133310918719, + "grad_norm": 0.49347025859965554, + "learning_rate": 4.932855153577586e-06, + "loss": 0.4522, + "step": 3898 + }, + { + "epoch": 2.6220055476170465, + "grad_norm": 0.3619643702258718, + "learning_rate": 4.9306538221281405e-06, + "loss": 0.435, + "step": 3899 + }, + { + "epoch": 2.6226779860469027, + "grad_norm": 0.6146626855230981, + "learning_rate": 4.928452504122873e-06, + "loss": 0.445, + "step": 3900 + }, + { + "epoch": 2.623350424476759, + "grad_norm": 0.38166015510603984, + "learning_rate": 4.926251199988557e-06, + "loss": 0.4572, + "step": 3901 + }, + { + "epoch": 2.624022862906615, + "grad_norm": 0.3070758076618173, + "learning_rate": 4.924049910151959e-06, + "loss": 0.4391, + "step": 3902 + }, + { + "epoch": 2.6246953013364713, + "grad_norm": 0.40800701682714846, + "learning_rate": 4.9218486350398465e-06, + "loss": 0.4461, + "step": 3903 + }, + { + "epoch": 2.6253677397663275, + "grad_norm": 0.2973151821861155, + "learning_rate": 4.919647375078978e-06, + "loss": 0.4479, + "step": 3904 + }, + { + "epoch": 2.6260401781961837, + "grad_norm": 0.38336864739259563, + "learning_rate": 4.917446130696115e-06, + "loss": 0.4475, + "step": 3905 + }, + { + "epoch": 2.6267126166260404, + "grad_norm": 0.3214570330346468, + "learning_rate": 4.915244902318015e-06, + "loss": 0.4498, + "step": 3906 + }, + { + "epoch": 2.6273850550558966, + "grad_norm": 0.6332904874395928, + "learning_rate": 4.913043690371428e-06, + "loss": 0.4369, + "step": 3907 + }, + { + "epoch": 2.628057493485753, + "grad_norm": 0.38335987959677537, + "learning_rate": 4.910842495283109e-06, + "loss": 0.4459, + "step": 3908 + }, + { + "epoch": 2.628729931915609, + "grad_norm": 0.6059650455962325, + "learning_rate": 4.908641317479801e-06, + "loss": 0.4444, + "step": 3909 + }, + { + "epoch": 2.629402370345465, + "grad_norm": 0.3583108368754653, + "learning_rate": 4.906440157388248e-06, + "loss": 0.4574, + "step": 3910 + }, + { + "epoch": 2.6300748087753214, + "grad_norm": 0.4759206529064839, + "learning_rate": 4.904239015435191e-06, + "loss": 0.4556, + "step": 3911 + }, + { + "epoch": 2.6307472472051776, + "grad_norm": 0.2995614698468876, + "learning_rate": 4.902037892047368e-06, + "loss": 0.4504, + "step": 3912 + }, + { + "epoch": 2.6314196856350343, + "grad_norm": 0.44317698613266354, + "learning_rate": 4.899836787651509e-06, + "loss": 0.4493, + "step": 3913 + }, + { + "epoch": 2.6320921240648905, + "grad_norm": 0.349065702885983, + "learning_rate": 4.897635702674344e-06, + "loss": 0.4488, + "step": 3914 + }, + { + "epoch": 2.6327645624947467, + "grad_norm": 0.8225983390548947, + "learning_rate": 4.895434637542598e-06, + "loss": 0.4612, + "step": 3915 + }, + { + "epoch": 2.633437000924603, + "grad_norm": 0.3849712777720729, + "learning_rate": 4.893233592682996e-06, + "loss": 0.4457, + "step": 3916 + }, + { + "epoch": 2.634109439354459, + "grad_norm": 0.37120607081678014, + "learning_rate": 4.8910325685222515e-06, + "loss": 0.4511, + "step": 3917 + }, + { + 
"epoch": 2.6347818777843153, + "grad_norm": 0.3827365226913108, + "learning_rate": 4.88883156548708e-06, + "loss": 0.4467, + "step": 3918 + }, + { + "epoch": 2.6354543162141715, + "grad_norm": 0.3254878222098606, + "learning_rate": 4.886630584004191e-06, + "loss": 0.4584, + "step": 3919 + }, + { + "epoch": 2.636126754644028, + "grad_norm": 0.771189071779267, + "learning_rate": 4.88442962450029e-06, + "loss": 0.44, + "step": 3920 + }, + { + "epoch": 2.6367991930738843, + "grad_norm": 0.3757486231448364, + "learning_rate": 4.882228687402081e-06, + "loss": 0.4406, + "step": 3921 + }, + { + "epoch": 2.6374716315037405, + "grad_norm": 0.3503419583682442, + "learning_rate": 4.880027773136255e-06, + "loss": 0.4483, + "step": 3922 + }, + { + "epoch": 2.6381440699335967, + "grad_norm": 0.3443790198819944, + "learning_rate": 4.877826882129509e-06, + "loss": 0.4395, + "step": 3923 + }, + { + "epoch": 2.638816508363453, + "grad_norm": 0.3113332727795207, + "learning_rate": 4.875626014808528e-06, + "loss": 0.456, + "step": 3924 + }, + { + "epoch": 2.639488946793309, + "grad_norm": 0.3685464703821919, + "learning_rate": 4.873425171599998e-06, + "loss": 0.4355, + "step": 3925 + }, + { + "epoch": 2.6401613852231653, + "grad_norm": 0.36662723975171313, + "learning_rate": 4.871224352930596e-06, + "loss": 0.4294, + "step": 3926 + }, + { + "epoch": 2.640833823653022, + "grad_norm": 0.3117729797433047, + "learning_rate": 4.8690235592269946e-06, + "loss": 0.4317, + "step": 3927 + }, + { + "epoch": 2.641506262082878, + "grad_norm": 0.35979033394621746, + "learning_rate": 4.866822790915865e-06, + "loss": 0.4415, + "step": 3928 + }, + { + "epoch": 2.6421787005127344, + "grad_norm": 0.37009695722503977, + "learning_rate": 4.864622048423871e-06, + "loss": 0.4302, + "step": 3929 + }, + { + "epoch": 2.6428511389425906, + "grad_norm": 0.3568595977172907, + "learning_rate": 4.862421332177674e-06, + "loss": 0.4406, + "step": 3930 + }, + { + "epoch": 2.643523577372447, + "grad_norm": 0.41973788949292573, + "learning_rate": 4.860220642603922e-06, + "loss": 0.4427, + "step": 3931 + }, + { + "epoch": 2.644196015802303, + "grad_norm": 0.36700531410496684, + "learning_rate": 4.858019980129267e-06, + "loss": 0.4534, + "step": 3932 + }, + { + "epoch": 2.644868454232159, + "grad_norm": 0.30269861413872146, + "learning_rate": 4.855819345180354e-06, + "loss": 0.4497, + "step": 3933 + }, + { + "epoch": 2.645540892662016, + "grad_norm": 0.6152798719083605, + "learning_rate": 4.853618738183821e-06, + "loss": 0.4531, + "step": 3934 + }, + { + "epoch": 2.6462133310918716, + "grad_norm": 0.30401248236622574, + "learning_rate": 4.8514181595662984e-06, + "loss": 0.4488, + "step": 3935 + }, + { + "epoch": 2.6468857695217283, + "grad_norm": 0.37429738252667466, + "learning_rate": 4.849217609754415e-06, + "loss": 0.4411, + "step": 3936 + }, + { + "epoch": 2.6475582079515845, + "grad_norm": 0.36301547372734627, + "learning_rate": 4.847017089174793e-06, + "loss": 0.4536, + "step": 3937 + }, + { + "epoch": 2.6482306463814407, + "grad_norm": 0.4981606026371317, + "learning_rate": 4.84481659825405e-06, + "loss": 0.4422, + "step": 3938 + }, + { + "epoch": 2.648903084811297, + "grad_norm": 0.34595349036346, + "learning_rate": 4.842616137418792e-06, + "loss": 0.4351, + "step": 3939 + }, + { + "epoch": 2.649575523241153, + "grad_norm": 0.34087659069215315, + "learning_rate": 4.840415707095626e-06, + "loss": 0.4605, + "step": 3940 + }, + { + "epoch": 2.6502479616710097, + "grad_norm": 0.4107782880869281, + "learning_rate": 4.8382153077111505e-06, + 
"loss": 0.4493, + "step": 3941 + }, + { + "epoch": 2.6509204001008655, + "grad_norm": 0.32892418309754395, + "learning_rate": 4.836014939691957e-06, + "loss": 0.4371, + "step": 3942 + }, + { + "epoch": 2.651592838530722, + "grad_norm": 0.35482669054847, + "learning_rate": 4.8338146034646365e-06, + "loss": 0.4512, + "step": 3943 + }, + { + "epoch": 2.6522652769605783, + "grad_norm": 0.4993082701462041, + "learning_rate": 4.831614299455763e-06, + "loss": 0.4514, + "step": 3944 + }, + { + "epoch": 2.6529377153904345, + "grad_norm": 0.4296586018836682, + "learning_rate": 4.829414028091914e-06, + "loss": 0.4436, + "step": 3945 + }, + { + "epoch": 2.6536101538202908, + "grad_norm": 0.3528586999832636, + "learning_rate": 4.827213789799656e-06, + "loss": 0.4454, + "step": 3946 + }, + { + "epoch": 2.654282592250147, + "grad_norm": 0.3524200784847369, + "learning_rate": 4.825013585005554e-06, + "loss": 0.4505, + "step": 3947 + }, + { + "epoch": 2.6549550306800036, + "grad_norm": 0.3637313457903069, + "learning_rate": 4.822813414136157e-06, + "loss": 0.4193, + "step": 3948 + }, + { + "epoch": 2.6556274691098594, + "grad_norm": 0.30803439934798615, + "learning_rate": 4.820613277618016e-06, + "loss": 0.4236, + "step": 3949 + }, + { + "epoch": 2.656299907539716, + "grad_norm": 0.32356597567643813, + "learning_rate": 4.818413175877674e-06, + "loss": 0.4343, + "step": 3950 + }, + { + "epoch": 2.656972345969572, + "grad_norm": 0.3117213382948113, + "learning_rate": 4.816213109341664e-06, + "loss": 0.4297, + "step": 3951 + }, + { + "epoch": 2.6576447843994284, + "grad_norm": 0.30292637566183234, + "learning_rate": 4.814013078436517e-06, + "loss": 0.4388, + "step": 3952 + }, + { + "epoch": 2.6583172228292846, + "grad_norm": 0.4441875471980197, + "learning_rate": 4.81181308358875e-06, + "loss": 0.4477, + "step": 3953 + }, + { + "epoch": 2.658989661259141, + "grad_norm": 0.35145162184961165, + "learning_rate": 4.8096131252248785e-06, + "loss": 0.4514, + "step": 3954 + }, + { + "epoch": 2.6596620996889975, + "grad_norm": 0.32937856083291966, + "learning_rate": 4.80741320377141e-06, + "loss": 0.4514, + "step": 3955 + }, + { + "epoch": 2.6603345381188532, + "grad_norm": 0.3354037013121662, + "learning_rate": 4.805213319654847e-06, + "loss": 0.4409, + "step": 3956 + }, + { + "epoch": 2.66100697654871, + "grad_norm": 0.354286594317942, + "learning_rate": 4.803013473301677e-06, + "loss": 0.449, + "step": 3957 + }, + { + "epoch": 2.661679414978566, + "grad_norm": 0.3923920764655487, + "learning_rate": 4.800813665138389e-06, + "loss": 0.4352, + "step": 3958 + }, + { + "epoch": 2.6623518534084223, + "grad_norm": 0.4069089239628588, + "learning_rate": 4.7986138955914604e-06, + "loss": 0.4469, + "step": 3959 + }, + { + "epoch": 2.6630242918382785, + "grad_norm": 0.40021340268552197, + "learning_rate": 4.796414165087363e-06, + "loss": 0.4566, + "step": 3960 + }, + { + "epoch": 2.6636967302681347, + "grad_norm": 0.4094469334606063, + "learning_rate": 4.794214474052555e-06, + "loss": 0.4343, + "step": 3961 + }, + { + "epoch": 2.6643691686979913, + "grad_norm": 0.3149962928170417, + "learning_rate": 4.792014822913496e-06, + "loss": 0.4337, + "step": 3962 + }, + { + "epoch": 2.665041607127847, + "grad_norm": 0.34533606893544566, + "learning_rate": 4.789815212096632e-06, + "loss": 0.4378, + "step": 3963 + }, + { + "epoch": 2.6657140455577037, + "grad_norm": 0.42166294954305816, + "learning_rate": 4.787615642028402e-06, + "loss": 0.441, + "step": 3964 + }, + { + "epoch": 2.66638648398756, + "grad_norm": 0.3720335576158297, + 
"learning_rate": 4.785416113135241e-06, + "loss": 0.454, + "step": 3965 + }, + { + "epoch": 2.667058922417416, + "grad_norm": 0.34103569866637795, + "learning_rate": 4.7832166258435684e-06, + "loss": 0.4489, + "step": 3966 + }, + { + "epoch": 2.6677313608472724, + "grad_norm": 0.3144751920124584, + "learning_rate": 4.781017180579803e-06, + "loss": 0.4367, + "step": 3967 + }, + { + "epoch": 2.6684037992771286, + "grad_norm": 0.2934768252750471, + "learning_rate": 4.7788177777703505e-06, + "loss": 0.451, + "step": 3968 + }, + { + "epoch": 2.669076237706985, + "grad_norm": 0.44173091051097085, + "learning_rate": 4.776618417841615e-06, + "loss": 0.4555, + "step": 3969 + }, + { + "epoch": 2.669748676136841, + "grad_norm": 0.3621724409828, + "learning_rate": 4.7744191012199805e-06, + "loss": 0.4462, + "step": 3970 + }, + { + "epoch": 2.6704211145666976, + "grad_norm": 0.4217030265201206, + "learning_rate": 4.772219828331833e-06, + "loss": 0.4401, + "step": 3971 + }, + { + "epoch": 2.671093552996554, + "grad_norm": 0.3258720085996176, + "learning_rate": 4.7700205996035465e-06, + "loss": 0.4438, + "step": 3972 + }, + { + "epoch": 2.67176599142641, + "grad_norm": 0.3286042825081745, + "learning_rate": 4.767821415461487e-06, + "loss": 0.4423, + "step": 3973 + }, + { + "epoch": 2.6724384298562662, + "grad_norm": 0.37240805371753094, + "learning_rate": 4.765622276332013e-06, + "loss": 0.4476, + "step": 3974 + }, + { + "epoch": 2.6731108682861224, + "grad_norm": 0.46705034090135494, + "learning_rate": 4.763423182641469e-06, + "loss": 0.4555, + "step": 3975 + }, + { + "epoch": 2.673783306715979, + "grad_norm": 0.31248097026521715, + "learning_rate": 4.761224134816197e-06, + "loss": 0.4265, + "step": 3976 + }, + { + "epoch": 2.674455745145835, + "grad_norm": 0.3701567562637434, + "learning_rate": 4.759025133282526e-06, + "loss": 0.4493, + "step": 3977 + }, + { + "epoch": 2.6751281835756915, + "grad_norm": 0.334949784468544, + "learning_rate": 4.756826178466783e-06, + "loss": 0.447, + "step": 3978 + }, + { + "epoch": 2.6758006220055477, + "grad_norm": 0.3687722696307002, + "learning_rate": 4.754627270795272e-06, + "loss": 0.4402, + "step": 3979 + }, + { + "epoch": 2.676473060435404, + "grad_norm": 0.33745162534262396, + "learning_rate": 4.752428410694301e-06, + "loss": 0.4341, + "step": 3980 + }, + { + "epoch": 2.67714549886526, + "grad_norm": 0.3030594134472027, + "learning_rate": 4.750229598590163e-06, + "loss": 0.4352, + "step": 3981 + }, + { + "epoch": 2.6778179372951163, + "grad_norm": 0.34207218560456, + "learning_rate": 4.748030834909147e-06, + "loss": 0.4322, + "step": 3982 + }, + { + "epoch": 2.678490375724973, + "grad_norm": 0.518303876053175, + "learning_rate": 4.745832120077521e-06, + "loss": 0.4557, + "step": 3983 + }, + { + "epoch": 2.6791628141548287, + "grad_norm": 0.3883315083460478, + "learning_rate": 4.7436334545215565e-06, + "loss": 0.4499, + "step": 3984 + }, + { + "epoch": 2.6798352525846854, + "grad_norm": 0.33195023039056065, + "learning_rate": 4.741434838667508e-06, + "loss": 0.4194, + "step": 3985 + }, + { + "epoch": 2.6805076910145416, + "grad_norm": 0.3431102125311453, + "learning_rate": 4.739236272941623e-06, + "loss": 0.4675, + "step": 3986 + }, + { + "epoch": 2.6811801294443978, + "grad_norm": 0.3330249236624501, + "learning_rate": 4.737037757770141e-06, + "loss": 0.4536, + "step": 3987 + }, + { + "epoch": 2.681852567874254, + "grad_norm": 0.7319725476689913, + "learning_rate": 4.734839293579284e-06, + "loss": 0.4496, + "step": 3988 + }, + { + "epoch": 2.68252500630411, + 
"grad_norm": 0.374642251156238, + "learning_rate": 4.732640880795272e-06, + "loss": 0.4486, + "step": 3989 + }, + { + "epoch": 2.683197444733967, + "grad_norm": 0.37675456514832767, + "learning_rate": 4.7304425198443125e-06, + "loss": 0.4308, + "step": 3990 + }, + { + "epoch": 2.6838698831638226, + "grad_norm": 0.3512707357224236, + "learning_rate": 4.728244211152604e-06, + "loss": 0.4269, + "step": 3991 + }, + { + "epoch": 2.6845423215936792, + "grad_norm": 0.4393990814568386, + "learning_rate": 4.726045955146331e-06, + "loss": 0.4512, + "step": 3992 + }, + { + "epoch": 2.6852147600235354, + "grad_norm": 0.35171319923804023, + "learning_rate": 4.723847752251673e-06, + "loss": 0.4468, + "step": 3993 + }, + { + "epoch": 2.6858871984533916, + "grad_norm": 0.4101286118141828, + "learning_rate": 4.721649602894795e-06, + "loss": 0.4436, + "step": 3994 + }, + { + "epoch": 2.686559636883248, + "grad_norm": 0.33289556695129713, + "learning_rate": 4.719451507501857e-06, + "loss": 0.4577, + "step": 3995 + }, + { + "epoch": 2.687232075313104, + "grad_norm": 0.3934530287799647, + "learning_rate": 4.717253466499e-06, + "loss": 0.4391, + "step": 3996 + }, + { + "epoch": 2.6879045137429602, + "grad_norm": 0.30849467174103273, + "learning_rate": 4.71505548031236e-06, + "loss": 0.4435, + "step": 3997 + }, + { + "epoch": 2.6885769521728164, + "grad_norm": 0.46370010026944075, + "learning_rate": 4.712857549368065e-06, + "loss": 0.4509, + "step": 3998 + }, + { + "epoch": 2.689249390602673, + "grad_norm": 0.3206942755628773, + "learning_rate": 4.710659674092226e-06, + "loss": 0.448, + "step": 3999 + }, + { + "epoch": 2.6899218290325293, + "grad_norm": 0.46281544863600615, + "learning_rate": 4.7084618549109515e-06, + "loss": 0.4525, + "step": 4000 + }, + { + "epoch": 2.6905942674623855, + "grad_norm": 0.34585129329342007, + "learning_rate": 4.706264092250327e-06, + "loss": 0.4379, + "step": 4001 + }, + { + "epoch": 2.6912667058922417, + "grad_norm": 0.3354546534002248, + "learning_rate": 4.704066386536436e-06, + "loss": 0.4447, + "step": 4002 + }, + { + "epoch": 2.691939144322098, + "grad_norm": 0.4748817548224659, + "learning_rate": 4.70186873819535e-06, + "loss": 0.4485, + "step": 4003 + }, + { + "epoch": 2.692611582751954, + "grad_norm": 0.33706271599700643, + "learning_rate": 4.699671147653129e-06, + "loss": 0.4599, + "step": 4004 + }, + { + "epoch": 2.6932840211818103, + "grad_norm": 0.41419105569679066, + "learning_rate": 4.697473615335819e-06, + "loss": 0.4354, + "step": 4005 + }, + { + "epoch": 2.693956459611667, + "grad_norm": 0.48293969803601555, + "learning_rate": 4.695276141669458e-06, + "loss": 0.443, + "step": 4006 + }, + { + "epoch": 2.694628898041523, + "grad_norm": 0.3852684945316683, + "learning_rate": 4.6930787270800705e-06, + "loss": 0.4492, + "step": 4007 + }, + { + "epoch": 2.6953013364713794, + "grad_norm": 0.35153149593918026, + "learning_rate": 4.690881371993671e-06, + "loss": 0.4336, + "step": 4008 + }, + { + "epoch": 2.6959737749012356, + "grad_norm": 0.31717053577082666, + "learning_rate": 4.688684076836264e-06, + "loss": 0.4458, + "step": 4009 + }, + { + "epoch": 2.696646213331092, + "grad_norm": 0.4152306709052566, + "learning_rate": 4.686486842033836e-06, + "loss": 0.4491, + "step": 4010 + }, + { + "epoch": 2.697318651760948, + "grad_norm": 0.3764797379680534, + "learning_rate": 4.684289668012367e-06, + "loss": 0.4335, + "step": 4011 + }, + { + "epoch": 2.697991090190804, + "grad_norm": 0.32045573355406193, + "learning_rate": 4.682092555197827e-06, + "loss": 0.4396, + "step": 4012 
+ }, + { + "epoch": 2.698663528620661, + "grad_norm": 0.36799773366081795, + "learning_rate": 4.67989550401617e-06, + "loss": 0.4408, + "step": 4013 + }, + { + "epoch": 2.699335967050517, + "grad_norm": 0.3558258065277898, + "learning_rate": 4.677698514893338e-06, + "loss": 0.4302, + "step": 4014 + }, + { + "epoch": 2.7000084054803732, + "grad_norm": 0.3554089068118138, + "learning_rate": 4.675501588255264e-06, + "loss": 0.4367, + "step": 4015 + }, + { + "epoch": 2.7006808439102294, + "grad_norm": 0.3475852676857237, + "learning_rate": 4.673304724527868e-06, + "loss": 0.4486, + "step": 4016 + }, + { + "epoch": 2.7013532823400856, + "grad_norm": 0.3515982513660669, + "learning_rate": 4.671107924137057e-06, + "loss": 0.4401, + "step": 4017 + }, + { + "epoch": 2.702025720769942, + "grad_norm": 0.39299387084846193, + "learning_rate": 4.668911187508723e-06, + "loss": 0.4507, + "step": 4018 + }, + { + "epoch": 2.702698159199798, + "grad_norm": 0.3191081249376199, + "learning_rate": 4.666714515068751e-06, + "loss": 0.4387, + "step": 4019 + }, + { + "epoch": 2.7033705976296547, + "grad_norm": 0.515033568771394, + "learning_rate": 4.664517907243011e-06, + "loss": 0.4438, + "step": 4020 + }, + { + "epoch": 2.704043036059511, + "grad_norm": 0.5144595273589103, + "learning_rate": 4.662321364457359e-06, + "loss": 0.4285, + "step": 4021 + }, + { + "epoch": 2.704715474489367, + "grad_norm": 0.3848515807272575, + "learning_rate": 4.660124887137643e-06, + "loss": 0.4335, + "step": 4022 + }, + { + "epoch": 2.7053879129192233, + "grad_norm": 0.3245188178348665, + "learning_rate": 4.657928475709691e-06, + "loss": 0.4494, + "step": 4023 + }, + { + "epoch": 2.7060603513490795, + "grad_norm": 0.32339811165861526, + "learning_rate": 4.6557321305993246e-06, + "loss": 0.4481, + "step": 4024 + }, + { + "epoch": 2.7067327897789357, + "grad_norm": 0.3965925056893796, + "learning_rate": 4.65353585223235e-06, + "loss": 0.4409, + "step": 4025 + }, + { + "epoch": 2.707405228208792, + "grad_norm": 0.35551982751089395, + "learning_rate": 4.651339641034565e-06, + "loss": 0.4524, + "step": 4026 + }, + { + "epoch": 2.7080776666386486, + "grad_norm": 0.48379414670857646, + "learning_rate": 4.649143497431743e-06, + "loss": 0.4272, + "step": 4027 + }, + { + "epoch": 2.7087501050685048, + "grad_norm": 0.41530286231090446, + "learning_rate": 4.646947421849655e-06, + "loss": 0.4315, + "step": 4028 + }, + { + "epoch": 2.709422543498361, + "grad_norm": 0.439133912286961, + "learning_rate": 4.644751414714056e-06, + "loss": 0.4314, + "step": 4029 + }, + { + "epoch": 2.710094981928217, + "grad_norm": 0.3371496209827778, + "learning_rate": 4.642555476450684e-06, + "loss": 0.4443, + "step": 4030 + }, + { + "epoch": 2.7107674203580734, + "grad_norm": 0.4041486073180736, + "learning_rate": 4.640359607485272e-06, + "loss": 0.4438, + "step": 4031 + }, + { + "epoch": 2.7114398587879296, + "grad_norm": 0.3215829040360271, + "learning_rate": 4.63816380824353e-06, + "loss": 0.4231, + "step": 4032 + }, + { + "epoch": 2.712112297217786, + "grad_norm": 0.3355336267006405, + "learning_rate": 4.63596807915116e-06, + "loss": 0.4408, + "step": 4033 + }, + { + "epoch": 2.7127847356476424, + "grad_norm": 0.374171806207746, + "learning_rate": 4.633772420633849e-06, + "loss": 0.4273, + "step": 4034 + }, + { + "epoch": 2.7134571740774986, + "grad_norm": 0.30441595052025694, + "learning_rate": 4.6315768331172725e-06, + "loss": 0.4489, + "step": 4035 + }, + { + "epoch": 2.714129612507355, + "grad_norm": 0.35620556437115636, + "learning_rate": 
4.629381317027086e-06, + "loss": 0.4401, + "step": 4036 + }, + { + "epoch": 2.714802050937211, + "grad_norm": 0.35531137314619227, + "learning_rate": 4.627185872788938e-06, + "loss": 0.4385, + "step": 4037 + }, + { + "epoch": 2.7154744893670673, + "grad_norm": 0.4110343598892571, + "learning_rate": 4.6249905008284605e-06, + "loss": 0.4344, + "step": 4038 + }, + { + "epoch": 2.7161469277969235, + "grad_norm": 0.3394545822035457, + "learning_rate": 4.622795201571272e-06, + "loss": 0.4559, + "step": 4039 + }, + { + "epoch": 2.7168193662267797, + "grad_norm": 0.3676872378283041, + "learning_rate": 4.620599975442974e-06, + "loss": 0.4232, + "step": 4040 + }, + { + "epoch": 2.7174918046566363, + "grad_norm": 0.4307709856361871, + "learning_rate": 4.618404822869157e-06, + "loss": 0.4552, + "step": 4041 + }, + { + "epoch": 2.7181642430864925, + "grad_norm": 0.39071616092546935, + "learning_rate": 4.616209744275398e-06, + "loss": 0.4494, + "step": 4042 + }, + { + "epoch": 2.7188366815163487, + "grad_norm": 0.3350976454381335, + "learning_rate": 4.614014740087256e-06, + "loss": 0.4365, + "step": 4043 + }, + { + "epoch": 2.719509119946205, + "grad_norm": 0.35175623390334937, + "learning_rate": 4.611819810730281e-06, + "loss": 0.4249, + "step": 4044 + }, + { + "epoch": 2.720181558376061, + "grad_norm": 0.44179988158296624, + "learning_rate": 4.60962495663e-06, + "loss": 0.446, + "step": 4045 + }, + { + "epoch": 2.7208539968059173, + "grad_norm": 0.31338487083319794, + "learning_rate": 4.607430178211933e-06, + "loss": 0.4322, + "step": 4046 + }, + { + "epoch": 2.7215264352357735, + "grad_norm": 0.3153451800447677, + "learning_rate": 4.6052354759015815e-06, + "loss": 0.4327, + "step": 4047 + }, + { + "epoch": 2.72219887366563, + "grad_norm": 0.3394799354136693, + "learning_rate": 4.60304085012444e-06, + "loss": 0.4256, + "step": 4048 + }, + { + "epoch": 2.7228713120954864, + "grad_norm": 0.30607813581454435, + "learning_rate": 4.600846301305973e-06, + "loss": 0.4539, + "step": 4049 + }, + { + "epoch": 2.7235437505253426, + "grad_norm": 0.3282820415912827, + "learning_rate": 4.598651829871643e-06, + "loss": 0.4416, + "step": 4050 + }, + { + "epoch": 2.724216188955199, + "grad_norm": 0.324333649191008, + "learning_rate": 4.5964574362468925e-06, + "loss": 0.4425, + "step": 4051 + }, + { + "epoch": 2.724888627385055, + "grad_norm": 0.34630251064694667, + "learning_rate": 4.594263120857151e-06, + "loss": 0.451, + "step": 4052 + }, + { + "epoch": 2.725561065814911, + "grad_norm": 0.5041706542960473, + "learning_rate": 4.5920688841278315e-06, + "loss": 0.4337, + "step": 4053 + }, + { + "epoch": 2.7262335042447674, + "grad_norm": 0.35387917754802173, + "learning_rate": 4.58987472648433e-06, + "loss": 0.4352, + "step": 4054 + }, + { + "epoch": 2.726905942674624, + "grad_norm": 0.41760733333223016, + "learning_rate": 4.5876806483520305e-06, + "loss": 0.4278, + "step": 4055 + }, + { + "epoch": 2.7275783811044803, + "grad_norm": 0.3397217293758815, + "learning_rate": 4.585486650156299e-06, + "loss": 0.4257, + "step": 4056 + }, + { + "epoch": 2.7282508195343365, + "grad_norm": 0.36874491179924024, + "learning_rate": 4.58329273232249e-06, + "loss": 0.4438, + "step": 4057 + }, + { + "epoch": 2.7289232579641927, + "grad_norm": 0.3774561271535653, + "learning_rate": 4.581098895275935e-06, + "loss": 0.448, + "step": 4058 + }, + { + "epoch": 2.729595696394049, + "grad_norm": 0.33630915462567423, + "learning_rate": 4.578905139441957e-06, + "loss": 0.4407, + "step": 4059 + }, + { + "epoch": 2.730268134823905, + "grad_norm": 
0.3126494069121454, + "learning_rate": 4.57671146524586e-06, + "loss": 0.4402, + "step": 4060 + }, + { + "epoch": 2.7309405732537613, + "grad_norm": 0.3582877240176243, + "learning_rate": 4.574517873112934e-06, + "loss": 0.445, + "step": 4061 + }, + { + "epoch": 2.731613011683618, + "grad_norm": 0.3211952898034876, + "learning_rate": 4.572324363468449e-06, + "loss": 0.4338, + "step": 4062 + }, + { + "epoch": 2.732285450113474, + "grad_norm": 0.3384746843259489, + "learning_rate": 4.570130936737664e-06, + "loss": 0.4574, + "step": 4063 + }, + { + "epoch": 2.7329578885433303, + "grad_norm": 0.38074821139775816, + "learning_rate": 4.56793759334582e-06, + "loss": 0.4278, + "step": 4064 + }, + { + "epoch": 2.7336303269731865, + "grad_norm": 0.3958921748871837, + "learning_rate": 4.565744333718141e-06, + "loss": 0.4445, + "step": 4065 + }, + { + "epoch": 2.7343027654030427, + "grad_norm": 0.3406667599130005, + "learning_rate": 4.563551158279837e-06, + "loss": 0.4668, + "step": 4066 + }, + { + "epoch": 2.734975203832899, + "grad_norm": 0.3326943427455027, + "learning_rate": 4.5613580674560964e-06, + "loss": 0.4531, + "step": 4067 + }, + { + "epoch": 2.735647642262755, + "grad_norm": 0.3079837541279532, + "learning_rate": 4.5591650616720975e-06, + "loss": 0.45, + "step": 4068 + }, + { + "epoch": 2.736320080692612, + "grad_norm": 0.37775753040507126, + "learning_rate": 4.556972141352999e-06, + "loss": 0.4235, + "step": 4069 + }, + { + "epoch": 2.736992519122468, + "grad_norm": 0.3191006036475625, + "learning_rate": 4.554779306923943e-06, + "loss": 0.4478, + "step": 4070 + }, + { + "epoch": 2.737664957552324, + "grad_norm": 0.3218784168108929, + "learning_rate": 4.5525865588100566e-06, + "loss": 0.4358, + "step": 4071 + }, + { + "epoch": 2.7383373959821804, + "grad_norm": 0.3934979110896771, + "learning_rate": 4.550393897436447e-06, + "loss": 0.4494, + "step": 4072 + }, + { + "epoch": 2.7390098344120366, + "grad_norm": 0.43781924786854065, + "learning_rate": 4.54820132322821e-06, + "loss": 0.4463, + "step": 4073 + }, + { + "epoch": 2.739682272841893, + "grad_norm": 0.3497296661145289, + "learning_rate": 4.546008836610417e-06, + "loss": 0.4373, + "step": 4074 + }, + { + "epoch": 2.740354711271749, + "grad_norm": 1.4186194019979426, + "learning_rate": 4.543816438008132e-06, + "loss": 0.449, + "step": 4075 + }, + { + "epoch": 2.7410271497016057, + "grad_norm": 0.39392153382582556, + "learning_rate": 4.54162412784639e-06, + "loss": 0.4452, + "step": 4076 + }, + { + "epoch": 2.741699588131462, + "grad_norm": 0.6244952823781512, + "learning_rate": 4.539431906550219e-06, + "loss": 0.451, + "step": 4077 + }, + { + "epoch": 2.742372026561318, + "grad_norm": 0.36142281857296416, + "learning_rate": 4.537239774544625e-06, + "loss": 0.4408, + "step": 4078 + }, + { + "epoch": 2.7430444649911743, + "grad_norm": 0.5173482600389665, + "learning_rate": 4.535047732254601e-06, + "loss": 0.4525, + "step": 4079 + }, + { + "epoch": 2.7437169034210305, + "grad_norm": 0.4456350759158602, + "learning_rate": 4.532855780105114e-06, + "loss": 0.4397, + "step": 4080 + }, + { + "epoch": 2.7443893418508867, + "grad_norm": 0.3776474889137343, + "learning_rate": 4.530663918521121e-06, + "loss": 0.4316, + "step": 4081 + }, + { + "epoch": 2.745061780280743, + "grad_norm": 0.4622116773312596, + "learning_rate": 4.528472147927561e-06, + "loss": 0.4318, + "step": 4082 + }, + { + "epoch": 2.7457342187105995, + "grad_norm": 0.42715846688985476, + "learning_rate": 4.526280468749355e-06, + "loss": 0.4494, + "step": 4083 + }, + { + "epoch": 
2.7464066571404557, + "grad_norm": 0.33742122624911886, + "learning_rate": 4.5240888814114e-06, + "loss": 0.4285, + "step": 4084 + }, + { + "epoch": 2.747079095570312, + "grad_norm": 0.3831275524199066, + "learning_rate": 4.521897386338582e-06, + "loss": 0.4361, + "step": 4085 + }, + { + "epoch": 2.747751534000168, + "grad_norm": 0.34803028860558366, + "learning_rate": 4.5197059839557694e-06, + "loss": 0.4382, + "step": 4086 + }, + { + "epoch": 2.7484239724300243, + "grad_norm": 0.3358652039154297, + "learning_rate": 4.5175146746878086e-06, + "loss": 0.4402, + "step": 4087 + }, + { + "epoch": 2.7490964108598805, + "grad_norm": 0.3596305282095028, + "learning_rate": 4.515323458959532e-06, + "loss": 0.4499, + "step": 4088 + }, + { + "epoch": 2.7497688492897367, + "grad_norm": 0.3511275929318625, + "learning_rate": 4.513132337195748e-06, + "loss": 0.4516, + "step": 4089 + }, + { + "epoch": 2.7504412877195934, + "grad_norm": 0.31670305524414344, + "learning_rate": 4.510941309821254e-06, + "loss": 0.4397, + "step": 4090 + }, + { + "epoch": 2.7511137261494496, + "grad_norm": 0.32598957982308097, + "learning_rate": 4.508750377260824e-06, + "loss": 0.4392, + "step": 4091 + }, + { + "epoch": 2.751786164579306, + "grad_norm": 0.4135011005395795, + "learning_rate": 4.506559539939218e-06, + "loss": 0.4308, + "step": 4092 + }, + { + "epoch": 2.752458603009162, + "grad_norm": 0.35335225307008133, + "learning_rate": 4.50436879828117e-06, + "loss": 0.444, + "step": 4093 + }, + { + "epoch": 2.753131041439018, + "grad_norm": 0.339990579672587, + "learning_rate": 4.502178152711403e-06, + "loss": 0.4444, + "step": 4094 + }, + { + "epoch": 2.7538034798688744, + "grad_norm": 1.0034503263282213, + "learning_rate": 4.499987603654618e-06, + "loss": 0.4327, + "step": 4095 + }, + { + "epoch": 2.7544759182987306, + "grad_norm": 0.37428166492973364, + "learning_rate": 4.497797151535496e-06, + "loss": 0.4485, + "step": 4096 + }, + { + "epoch": 2.7551483567285873, + "grad_norm": 0.38282626112707663, + "learning_rate": 4.495606796778707e-06, + "loss": 0.4579, + "step": 4097 + }, + { + "epoch": 2.7558207951584435, + "grad_norm": 0.45749960797959477, + "learning_rate": 4.493416539808891e-06, + "loss": 0.4277, + "step": 4098 + }, + { + "epoch": 2.7564932335882997, + "grad_norm": 0.38810401286703083, + "learning_rate": 4.4912263810506765e-06, + "loss": 0.4263, + "step": 4099 + }, + { + "epoch": 2.757165672018156, + "grad_norm": 0.3944196941682623, + "learning_rate": 4.4890363209286685e-06, + "loss": 0.4492, + "step": 4100 + }, + { + "epoch": 2.757838110448012, + "grad_norm": 0.40649470910613944, + "learning_rate": 4.486846359867459e-06, + "loss": 0.4535, + "step": 4101 + }, + { + "epoch": 2.7585105488778683, + "grad_norm": 0.44185871010693245, + "learning_rate": 4.484656498291611e-06, + "loss": 0.4385, + "step": 4102 + }, + { + "epoch": 2.7591829873077245, + "grad_norm": 0.3637943966579024, + "learning_rate": 4.48246673662568e-06, + "loss": 0.4483, + "step": 4103 + }, + { + "epoch": 2.759855425737581, + "grad_norm": 0.37969279378764087, + "learning_rate": 4.480277075294192e-06, + "loss": 0.4399, + "step": 4104 + }, + { + "epoch": 2.760527864167437, + "grad_norm": 0.3138671963424064, + "learning_rate": 4.4780875147216625e-06, + "loss": 0.4369, + "step": 4105 + }, + { + "epoch": 2.7612003025972935, + "grad_norm": 0.3263125250207868, + "learning_rate": 4.475898055332578e-06, + "loss": 0.4395, + "step": 4106 + }, + { + "epoch": 2.7618727410271497, + "grad_norm": 0.5257770684119895, + "learning_rate": 4.473708697551411e-06, + 
"loss": 0.4347, + "step": 4107 + }, + { + "epoch": 2.762545179457006, + "grad_norm": 0.3248907365494214, + "learning_rate": 4.471519441802616e-06, + "loss": 0.4613, + "step": 4108 + }, + { + "epoch": 2.763217617886862, + "grad_norm": 0.32027788836591803, + "learning_rate": 4.469330288510622e-06, + "loss": 0.4398, + "step": 4109 + }, + { + "epoch": 2.7638900563167184, + "grad_norm": 0.3472818383721879, + "learning_rate": 4.4671412380998456e-06, + "loss": 0.45, + "step": 4110 + }, + { + "epoch": 2.764562494746575, + "grad_norm": 0.32326005111267875, + "learning_rate": 4.464952290994674e-06, + "loss": 0.4559, + "step": 4111 + }, + { + "epoch": 2.7652349331764308, + "grad_norm": 0.39181182928822017, + "learning_rate": 4.4627634476194826e-06, + "loss": 0.4388, + "step": 4112 + }, + { + "epoch": 2.7659073716062874, + "grad_norm": 0.31195199944135227, + "learning_rate": 4.4605747083986225e-06, + "loss": 0.4416, + "step": 4113 + }, + { + "epoch": 2.7665798100361436, + "grad_norm": 0.6106923654849684, + "learning_rate": 4.4583860737564285e-06, + "loss": 0.4526, + "step": 4114 + }, + { + "epoch": 2.767252248466, + "grad_norm": 0.3188243064392307, + "learning_rate": 4.456197544117208e-06, + "loss": 0.4384, + "step": 4115 + }, + { + "epoch": 2.767924686895856, + "grad_norm": 0.4046926030229404, + "learning_rate": 4.454009119905255e-06, + "loss": 0.4408, + "step": 4116 + }, + { + "epoch": 2.7685971253257122, + "grad_norm": 0.31559735785847665, + "learning_rate": 4.4518208015448385e-06, + "loss": 0.4272, + "step": 4117 + }, + { + "epoch": 2.769269563755569, + "grad_norm": 0.517474290715874, + "learning_rate": 4.449632589460212e-06, + "loss": 0.4266, + "step": 4118 + }, + { + "epoch": 2.7699420021854246, + "grad_norm": 0.5023251432358923, + "learning_rate": 4.447444484075603e-06, + "loss": 0.4456, + "step": 4119 + }, + { + "epoch": 2.7706144406152813, + "grad_norm": 0.32524193502553655, + "learning_rate": 4.445256485815222e-06, + "loss": 0.4409, + "step": 4120 + }, + { + "epoch": 2.7712868790451375, + "grad_norm": 0.35927764141343127, + "learning_rate": 4.4430685951032546e-06, + "loss": 0.4466, + "step": 4121 + }, + { + "epoch": 2.7719593174749937, + "grad_norm": 0.31890742397512795, + "learning_rate": 4.440880812363871e-06, + "loss": 0.4247, + "step": 4122 + }, + { + "epoch": 2.77263175590485, + "grad_norm": 0.3545582580996859, + "learning_rate": 4.43869313802122e-06, + "loss": 0.436, + "step": 4123 + }, + { + "epoch": 2.773304194334706, + "grad_norm": 0.4796722667815604, + "learning_rate": 4.436505572499422e-06, + "loss": 0.4489, + "step": 4124 + }, + { + "epoch": 2.7739766327645627, + "grad_norm": 0.30288716390684745, + "learning_rate": 4.434318116222583e-06, + "loss": 0.44, + "step": 4125 + }, + { + "epoch": 2.7746490711944185, + "grad_norm": 0.3279704264872385, + "learning_rate": 4.432130769614787e-06, + "loss": 0.4327, + "step": 4126 + }, + { + "epoch": 2.775321509624275, + "grad_norm": 0.45918742036048055, + "learning_rate": 4.429943533100098e-06, + "loss": 0.4224, + "step": 4127 + }, + { + "epoch": 2.7759939480541314, + "grad_norm": 0.3816536568297676, + "learning_rate": 4.427756407102554e-06, + "loss": 0.4453, + "step": 4128 + }, + { + "epoch": 2.7766663864839876, + "grad_norm": 0.3130350219354691, + "learning_rate": 4.425569392046174e-06, + "loss": 0.4357, + "step": 4129 + }, + { + "epoch": 2.7773388249138438, + "grad_norm": 0.31236615723388095, + "learning_rate": 4.423382488354957e-06, + "loss": 0.4327, + "step": 4130 + }, + { + "epoch": 2.7780112633437, + "grad_norm": 0.33840814755852966, + 
"learning_rate": 4.4211956964528795e-06, + "loss": 0.4528, + "step": 4131 + }, + { + "epoch": 2.7786837017735566, + "grad_norm": 0.3498544989130826, + "learning_rate": 4.4190090167638976e-06, + "loss": 0.436, + "step": 4132 + }, + { + "epoch": 2.7793561402034124, + "grad_norm": 0.359188442813231, + "learning_rate": 4.416822449711939e-06, + "loss": 0.4328, + "step": 4133 + }, + { + "epoch": 2.780028578633269, + "grad_norm": 0.4924888241936742, + "learning_rate": 4.414635995720918e-06, + "loss": 0.4611, + "step": 4134 + }, + { + "epoch": 2.7807010170631252, + "grad_norm": 0.3342595068619039, + "learning_rate": 4.4124496552147225e-06, + "loss": 0.4302, + "step": 4135 + }, + { + "epoch": 2.7813734554929814, + "grad_norm": 0.30432160255455765, + "learning_rate": 4.410263428617222e-06, + "loss": 0.4408, + "step": 4136 + }, + { + "epoch": 2.7820458939228376, + "grad_norm": 0.34090270895012315, + "learning_rate": 4.408077316352258e-06, + "loss": 0.4579, + "step": 4137 + }, + { + "epoch": 2.782718332352694, + "grad_norm": 0.342366852309439, + "learning_rate": 4.405891318843655e-06, + "loss": 0.4321, + "step": 4138 + }, + { + "epoch": 2.7833907707825505, + "grad_norm": 0.300326428255073, + "learning_rate": 4.403705436515212e-06, + "loss": 0.4386, + "step": 4139 + }, + { + "epoch": 2.7840632092124062, + "grad_norm": 0.4374188529908497, + "learning_rate": 4.401519669790709e-06, + "loss": 0.4617, + "step": 4140 + }, + { + "epoch": 2.784735647642263, + "grad_norm": 0.36903156045600943, + "learning_rate": 4.399334019093904e-06, + "loss": 0.4501, + "step": 4141 + }, + { + "epoch": 2.785408086072119, + "grad_norm": 0.3469467309885719, + "learning_rate": 4.3971484848485245e-06, + "loss": 0.4469, + "step": 4142 + }, + { + "epoch": 2.7860805245019753, + "grad_norm": 0.3529204570178575, + "learning_rate": 4.394963067478283e-06, + "loss": 0.4364, + "step": 4143 + }, + { + "epoch": 2.7867529629318315, + "grad_norm": 0.3156208007068114, + "learning_rate": 4.39277776740687e-06, + "loss": 0.4405, + "step": 4144 + }, + { + "epoch": 2.7874254013616877, + "grad_norm": 0.3589296480700081, + "learning_rate": 4.390592585057949e-06, + "loss": 0.444, + "step": 4145 + }, + { + "epoch": 2.7880978397915444, + "grad_norm": 0.4860533872791004, + "learning_rate": 4.388407520855162e-06, + "loss": 0.4198, + "step": 4146 + }, + { + "epoch": 2.7887702782214, + "grad_norm": 0.31359406004259816, + "learning_rate": 4.38622257522213e-06, + "loss": 0.444, + "step": 4147 + }, + { + "epoch": 2.7894427166512568, + "grad_norm": 0.3628458626927062, + "learning_rate": 4.384037748582448e-06, + "loss": 0.4365, + "step": 4148 + }, + { + "epoch": 2.790115155081113, + "grad_norm": 0.3971586747602741, + "learning_rate": 4.381853041359693e-06, + "loss": 0.4303, + "step": 4149 + }, + { + "epoch": 2.790787593510969, + "grad_norm": 0.3762934083513401, + "learning_rate": 4.37966845397741e-06, + "loss": 0.4448, + "step": 4150 + }, + { + "epoch": 2.7914600319408254, + "grad_norm": 0.3429231449205495, + "learning_rate": 4.377483986859128e-06, + "loss": 0.442, + "step": 4151 + }, + { + "epoch": 2.7921324703706816, + "grad_norm": 0.5394162778507079, + "learning_rate": 4.375299640428352e-06, + "loss": 0.4355, + "step": 4152 + }, + { + "epoch": 2.792804908800538, + "grad_norm": 0.32743745427999177, + "learning_rate": 4.373115415108563e-06, + "loss": 0.4447, + "step": 4153 + }, + { + "epoch": 2.793477347230394, + "grad_norm": 0.4000900373043025, + "learning_rate": 4.370931311323217e-06, + "loss": 0.4482, + "step": 4154 + }, + { + "epoch": 2.7941497856602506, + 
"grad_norm": 0.35385970730396854, + "learning_rate": 4.368747329495747e-06, + "loss": 0.4324, + "step": 4155 + }, + { + "epoch": 2.794822224090107, + "grad_norm": 0.3965033179606159, + "learning_rate": 4.366563470049561e-06, + "loss": 0.443, + "step": 4156 + }, + { + "epoch": 2.795494662519963, + "grad_norm": 0.34703061683707376, + "learning_rate": 4.364379733408048e-06, + "loss": 0.4407, + "step": 4157 + }, + { + "epoch": 2.7961671009498192, + "grad_norm": 0.31031118789611056, + "learning_rate": 4.36219611999457e-06, + "loss": 0.4496, + "step": 4158 + }, + { + "epoch": 2.7968395393796754, + "grad_norm": 0.356068190864218, + "learning_rate": 4.360012630232463e-06, + "loss": 0.4386, + "step": 4159 + }, + { + "epoch": 2.797511977809532, + "grad_norm": 0.3053172018438645, + "learning_rate": 4.357829264545042e-06, + "loss": 0.4418, + "step": 4160 + }, + { + "epoch": 2.798184416239388, + "grad_norm": 0.34260686402603957, + "learning_rate": 4.355646023355599e-06, + "loss": 0.4234, + "step": 4161 + }, + { + "epoch": 2.7988568546692445, + "grad_norm": 0.37957637036473135, + "learning_rate": 4.353462907087397e-06, + "loss": 0.4349, + "step": 4162 + }, + { + "epoch": 2.7995292930991007, + "grad_norm": 0.41495400538925514, + "learning_rate": 4.351279916163684e-06, + "loss": 0.4217, + "step": 4163 + }, + { + "epoch": 2.800201731528957, + "grad_norm": 0.39348273057249017, + "learning_rate": 4.34909705100767e-06, + "loss": 0.4421, + "step": 4164 + }, + { + "epoch": 2.800874169958813, + "grad_norm": 0.32721102101693095, + "learning_rate": 4.346914312042552e-06, + "loss": 0.4373, + "step": 4165 + }, + { + "epoch": 2.8015466083886693, + "grad_norm": 0.4929749225276872, + "learning_rate": 4.344731699691498e-06, + "loss": 0.4325, + "step": 4166 + }, + { + "epoch": 2.8022190468185255, + "grad_norm": 0.3944224671487195, + "learning_rate": 4.342549214377653e-06, + "loss": 0.4428, + "step": 4167 + }, + { + "epoch": 2.8028914852483817, + "grad_norm": 0.32805134167892863, + "learning_rate": 4.340366856524137e-06, + "loss": 0.4339, + "step": 4168 + }, + { + "epoch": 2.8035639236782384, + "grad_norm": 0.3571698135102629, + "learning_rate": 4.338184626554043e-06, + "loss": 0.448, + "step": 4169 + }, + { + "epoch": 2.8042363621080946, + "grad_norm": 1.1019836197426351, + "learning_rate": 4.336002524890442e-06, + "loss": 0.4559, + "step": 4170 + }, + { + "epoch": 2.8049088005379508, + "grad_norm": 0.5615591083474871, + "learning_rate": 4.333820551956381e-06, + "loss": 0.4432, + "step": 4171 + }, + { + "epoch": 2.805581238967807, + "grad_norm": 0.42330074239721943, + "learning_rate": 4.331638708174877e-06, + "loss": 0.4495, + "step": 4172 + }, + { + "epoch": 2.806253677397663, + "grad_norm": 0.41645460574901194, + "learning_rate": 4.329456993968926e-06, + "loss": 0.431, + "step": 4173 + }, + { + "epoch": 2.8069261158275194, + "grad_norm": 0.3709084412027702, + "learning_rate": 4.327275409761499e-06, + "loss": 0.4378, + "step": 4174 + }, + { + "epoch": 2.8075985542573756, + "grad_norm": 0.353736146094937, + "learning_rate": 4.325093955975541e-06, + "loss": 0.4446, + "step": 4175 + }, + { + "epoch": 2.8082709926872322, + "grad_norm": 0.3818149149271061, + "learning_rate": 4.322912633033972e-06, + "loss": 0.4341, + "step": 4176 + }, + { + "epoch": 2.8089434311170884, + "grad_norm": 0.4666823798883692, + "learning_rate": 4.320731441359684e-06, + "loss": 0.4504, + "step": 4177 + }, + { + "epoch": 2.8096158695469446, + "grad_norm": 0.3905860953030715, + "learning_rate": 4.318550381375548e-06, + "loss": 0.4313, + "step": 4178 
+ }, + { + "epoch": 2.810288307976801, + "grad_norm": 0.4089372449811153, + "learning_rate": 4.3163694535044055e-06, + "loss": 0.4312, + "step": 4179 + }, + { + "epoch": 2.810960746406657, + "grad_norm": 0.37258055042296084, + "learning_rate": 4.314188658169078e-06, + "loss": 0.4441, + "step": 4180 + }, + { + "epoch": 2.8116331848365133, + "grad_norm": 0.35262230137425327, + "learning_rate": 4.312007995792353e-06, + "loss": 0.4423, + "step": 4181 + }, + { + "epoch": 2.8123056232663695, + "grad_norm": 0.4462012393921559, + "learning_rate": 4.309827466796996e-06, + "loss": 0.4647, + "step": 4182 + }, + { + "epoch": 2.812978061696226, + "grad_norm": 0.423231503154175, + "learning_rate": 4.307647071605751e-06, + "loss": 0.4494, + "step": 4183 + }, + { + "epoch": 2.8136505001260823, + "grad_norm": 0.37000476204116944, + "learning_rate": 4.30546681064133e-06, + "loss": 0.4381, + "step": 4184 + }, + { + "epoch": 2.8143229385559385, + "grad_norm": 0.4382581644812144, + "learning_rate": 4.303286684326424e-06, + "loss": 0.4331, + "step": 4185 + }, + { + "epoch": 2.8149953769857947, + "grad_norm": 0.3426587641141498, + "learning_rate": 4.301106693083692e-06, + "loss": 0.4344, + "step": 4186 + }, + { + "epoch": 2.815667815415651, + "grad_norm": 0.4400919829618176, + "learning_rate": 4.298926837335772e-06, + "loss": 0.4499, + "step": 4187 + }, + { + "epoch": 2.816340253845507, + "grad_norm": 0.43886870253233085, + "learning_rate": 4.2967471175052725e-06, + "loss": 0.4469, + "step": 4188 + }, + { + "epoch": 2.8170126922753633, + "grad_norm": 0.35837375545485195, + "learning_rate": 4.2945675340147796e-06, + "loss": 0.4345, + "step": 4189 + }, + { + "epoch": 2.81768513070522, + "grad_norm": 0.3634353514020071, + "learning_rate": 4.292388087286846e-06, + "loss": 0.431, + "step": 4190 + }, + { + "epoch": 2.818357569135076, + "grad_norm": 0.3444790616942859, + "learning_rate": 4.290208777744006e-06, + "loss": 0.4385, + "step": 4191 + }, + { + "epoch": 2.8190300075649324, + "grad_norm": 0.4673667498158592, + "learning_rate": 4.288029605808761e-06, + "loss": 0.4545, + "step": 4192 + }, + { + "epoch": 2.8197024459947886, + "grad_norm": 0.4901550150844472, + "learning_rate": 4.28585057190359e-06, + "loss": 0.4252, + "step": 4193 + }, + { + "epoch": 2.820374884424645, + "grad_norm": 0.34004725640003464, + "learning_rate": 4.283671676450941e-06, + "loss": 0.4339, + "step": 4194 + }, + { + "epoch": 2.821047322854501, + "grad_norm": 0.37589666520860127, + "learning_rate": 4.28149291987324e-06, + "loss": 0.4454, + "step": 4195 + }, + { + "epoch": 2.821719761284357, + "grad_norm": 0.43697203610507124, + "learning_rate": 4.2793143025928835e-06, + "loss": 0.4246, + "step": 4196 + }, + { + "epoch": 2.822392199714214, + "grad_norm": 0.32464672726116545, + "learning_rate": 4.27713582503224e-06, + "loss": 0.4458, + "step": 4197 + }, + { + "epoch": 2.82306463814407, + "grad_norm": 0.362476088230034, + "learning_rate": 4.274957487613654e-06, + "loss": 0.4426, + "step": 4198 + }, + { + "epoch": 2.8237370765739263, + "grad_norm": 0.3578027590759422, + "learning_rate": 4.272779290759439e-06, + "loss": 0.424, + "step": 4199 + }, + { + "epoch": 2.8244095150037825, + "grad_norm": 0.742708640591173, + "learning_rate": 4.270601234891883e-06, + "loss": 0.4413, + "step": 4200 + }, + { + "epoch": 2.8250819534336387, + "grad_norm": 0.3279396144376603, + "learning_rate": 4.268423320433247e-06, + "loss": 0.4392, + "step": 4201 + }, + { + "epoch": 2.825754391863495, + "grad_norm": 0.35208092982367156, + "learning_rate": 
4.266245547805767e-06, + "loss": 0.4416, + "step": 4202 + }, + { + "epoch": 2.826426830293351, + "grad_norm": 0.36251601382145465, + "learning_rate": 4.264067917431645e-06, + "loss": 0.4561, + "step": 4203 + }, + { + "epoch": 2.8270992687232077, + "grad_norm": 0.4306499066372681, + "learning_rate": 4.261890429733063e-06, + "loss": 0.4324, + "step": 4204 + }, + { + "epoch": 2.827771707153064, + "grad_norm": 0.35960347296205575, + "learning_rate": 4.2597130851321685e-06, + "loss": 0.4246, + "step": 4205 + }, + { + "epoch": 2.82844414558292, + "grad_norm": 0.3312427958990518, + "learning_rate": 4.257535884051086e-06, + "loss": 0.4662, + "step": 4206 + }, + { + "epoch": 2.8291165840127763, + "grad_norm": 0.3127332077974222, + "learning_rate": 4.255358826911912e-06, + "loss": 0.4256, + "step": 4207 + }, + { + "epoch": 2.8297890224426325, + "grad_norm": 0.36043322840857506, + "learning_rate": 4.253181914136711e-06, + "loss": 0.4442, + "step": 4208 + }, + { + "epoch": 2.8304614608724887, + "grad_norm": 0.3183818541164439, + "learning_rate": 4.251005146147522e-06, + "loss": 0.4271, + "step": 4209 + }, + { + "epoch": 2.831133899302345, + "grad_norm": 0.4745175769316324, + "learning_rate": 4.248828523366359e-06, + "loss": 0.4312, + "step": 4210 + }, + { + "epoch": 2.8318063377322016, + "grad_norm": 0.33201401995383073, + "learning_rate": 4.246652046215206e-06, + "loss": 0.453, + "step": 4211 + }, + { + "epoch": 2.832478776162058, + "grad_norm": 0.324963818023879, + "learning_rate": 4.244475715116012e-06, + "loss": 0.4339, + "step": 4212 + }, + { + "epoch": 2.833151214591914, + "grad_norm": 0.3568436731790368, + "learning_rate": 4.242299530490708e-06, + "loss": 0.4498, + "step": 4213 + }, + { + "epoch": 2.83382365302177, + "grad_norm": 0.4182822403247682, + "learning_rate": 4.24012349276119e-06, + "loss": 0.4609, + "step": 4214 + }, + { + "epoch": 2.8344960914516264, + "grad_norm": 3.131545947607575, + "learning_rate": 4.23794760234933e-06, + "loss": 0.4352, + "step": 4215 + }, + { + "epoch": 2.8351685298814826, + "grad_norm": 0.4583669298743143, + "learning_rate": 4.2357718596769655e-06, + "loss": 0.4469, + "step": 4216 + }, + { + "epoch": 2.835840968311339, + "grad_norm": 0.3794591328890642, + "learning_rate": 4.233596265165912e-06, + "loss": 0.4319, + "step": 4217 + }, + { + "epoch": 2.8365134067411955, + "grad_norm": 0.4474970862074643, + "learning_rate": 4.231420819237951e-06, + "loss": 0.4468, + "step": 4218 + }, + { + "epoch": 2.8371858451710517, + "grad_norm": 0.4264990089485767, + "learning_rate": 4.229245522314839e-06, + "loss": 0.4457, + "step": 4219 + }, + { + "epoch": 2.837858283600908, + "grad_norm": 0.32880018687738627, + "learning_rate": 4.2270703748183025e-06, + "loss": 0.4394, + "step": 4220 + }, + { + "epoch": 2.838530722030764, + "grad_norm": 0.4347059065127276, + "learning_rate": 4.2248953771700356e-06, + "loss": 0.45, + "step": 4221 + }, + { + "epoch": 2.8392031604606203, + "grad_norm": 0.3539200637449874, + "learning_rate": 4.222720529791708e-06, + "loss": 0.44, + "step": 4222 + }, + { + "epoch": 2.8398755988904765, + "grad_norm": 0.31172589745609186, + "learning_rate": 4.220545833104958e-06, + "loss": 0.4481, + "step": 4223 + }, + { + "epoch": 2.8405480373203327, + "grad_norm": 0.35007131216723775, + "learning_rate": 4.218371287531397e-06, + "loss": 0.4293, + "step": 4224 + }, + { + "epoch": 2.8412204757501893, + "grad_norm": 0.449397329929107, + "learning_rate": 4.216196893492602e-06, + "loss": 0.4361, + "step": 4225 + }, + { + "epoch": 2.8418929141800455, + "grad_norm": 
0.4494250507992524, + "learning_rate": 4.2140226514101265e-06, + "loss": 0.4327, + "step": 4226 + }, + { + "epoch": 2.8425653526099017, + "grad_norm": 0.328475017212245, + "learning_rate": 4.2118485617054916e-06, + "loss": 0.4507, + "step": 4227 + }, + { + "epoch": 2.843237791039758, + "grad_norm": 0.4306809663698929, + "learning_rate": 4.20967462480019e-06, + "loss": 0.4374, + "step": 4228 + }, + { + "epoch": 2.843910229469614, + "grad_norm": 0.35037605536830907, + "learning_rate": 4.207500841115682e-06, + "loss": 0.4409, + "step": 4229 + }, + { + "epoch": 2.8445826678994703, + "grad_norm": 0.33765625836092356, + "learning_rate": 4.205327211073399e-06, + "loss": 0.4381, + "step": 4230 + }, + { + "epoch": 2.8452551063293265, + "grad_norm": 0.5551967692371169, + "learning_rate": 4.203153735094747e-06, + "loss": 0.4328, + "step": 4231 + }, + { + "epoch": 2.845927544759183, + "grad_norm": 0.40994218394479104, + "learning_rate": 4.200980413601097e-06, + "loss": 0.4471, + "step": 4232 + }, + { + "epoch": 2.8465999831890394, + "grad_norm": 0.37418125193428436, + "learning_rate": 4.198807247013794e-06, + "loss": 0.4518, + "step": 4233 + }, + { + "epoch": 2.8472724216188956, + "grad_norm": 0.41210781906159627, + "learning_rate": 4.1966342357541486e-06, + "loss": 0.4363, + "step": 4234 + }, + { + "epoch": 2.847944860048752, + "grad_norm": 0.4155183764811362, + "learning_rate": 4.194461380243445e-06, + "loss": 0.4373, + "step": 4235 + }, + { + "epoch": 2.848617298478608, + "grad_norm": 0.3669197743717589, + "learning_rate": 4.192288680902935e-06, + "loss": 0.4453, + "step": 4236 + }, + { + "epoch": 2.849289736908464, + "grad_norm": 0.425957665347839, + "learning_rate": 4.190116138153844e-06, + "loss": 0.4562, + "step": 4237 + }, + { + "epoch": 2.8499621753383204, + "grad_norm": 0.3404491078149701, + "learning_rate": 4.187943752417359e-06, + "loss": 0.4301, + "step": 4238 + }, + { + "epoch": 2.850634613768177, + "grad_norm": 0.3677165947890313, + "learning_rate": 4.185771524114644e-06, + "loss": 0.4263, + "step": 4239 + }, + { + "epoch": 2.8513070521980333, + "grad_norm": 0.31412292626872296, + "learning_rate": 4.18359945366683e-06, + "loss": 0.4299, + "step": 4240 + }, + { + "epoch": 2.8519794906278895, + "grad_norm": 0.3716609911042492, + "learning_rate": 4.181427541495018e-06, + "loss": 0.4427, + "step": 4241 + }, + { + "epoch": 2.8526519290577457, + "grad_norm": 0.3518355157188309, + "learning_rate": 4.179255788020278e-06, + "loss": 0.4342, + "step": 4242 + }, + { + "epoch": 2.853324367487602, + "grad_norm": 0.35252230576663934, + "learning_rate": 4.177084193663647e-06, + "loss": 0.4337, + "step": 4243 + }, + { + "epoch": 2.853996805917458, + "grad_norm": 0.3379641016136965, + "learning_rate": 4.174912758846134e-06, + "loss": 0.4207, + "step": 4244 + }, + { + "epoch": 2.8546692443473143, + "grad_norm": 0.3524487510511978, + "learning_rate": 4.172741483988717e-06, + "loss": 0.4353, + "step": 4245 + }, + { + "epoch": 2.855341682777171, + "grad_norm": 0.40793087769440395, + "learning_rate": 4.170570369512344e-06, + "loss": 0.4253, + "step": 4246 + }, + { + "epoch": 2.856014121207027, + "grad_norm": 0.422585208007085, + "learning_rate": 4.1683994158379255e-06, + "loss": 0.4295, + "step": 4247 + }, + { + "epoch": 2.8566865596368833, + "grad_norm": 0.45569305713966374, + "learning_rate": 4.166228623386347e-06, + "loss": 0.4325, + "step": 4248 + }, + { + "epoch": 2.8573589980667395, + "grad_norm": 0.5739734252923965, + "learning_rate": 4.164057992578463e-06, + "loss": 0.4316, + "step": 4249 + }, + { + 
"epoch": 2.8580314364965957, + "grad_norm": 0.38539807899996864, + "learning_rate": 4.1618875238350945e-06, + "loss": 0.4518, + "step": 4250 + }, + { + "epoch": 2.858703874926452, + "grad_norm": 0.4344583117890214, + "learning_rate": 4.15971721757703e-06, + "loss": 0.435, + "step": 4251 + }, + { + "epoch": 2.859376313356308, + "grad_norm": 0.37028539101648733, + "learning_rate": 4.157547074225028e-06, + "loss": 0.4475, + "step": 4252 + }, + { + "epoch": 2.860048751786165, + "grad_norm": 0.3612453578542226, + "learning_rate": 4.155377094199816e-06, + "loss": 0.4343, + "step": 4253 + }, + { + "epoch": 2.860721190216021, + "grad_norm": 0.46991632221235885, + "learning_rate": 4.153207277922089e-06, + "loss": 0.433, + "step": 4254 + }, + { + "epoch": 2.861393628645877, + "grad_norm": 0.34980489848240764, + "learning_rate": 4.151037625812513e-06, + "loss": 0.4365, + "step": 4255 + }, + { + "epoch": 2.8620660670757334, + "grad_norm": 0.3750905114580679, + "learning_rate": 4.148868138291714e-06, + "loss": 0.4365, + "step": 4256 + }, + { + "epoch": 2.8627385055055896, + "grad_norm": 0.38187841266966155, + "learning_rate": 4.146698815780295e-06, + "loss": 0.4443, + "step": 4257 + }, + { + "epoch": 2.863410943935446, + "grad_norm": 0.33677571728502814, + "learning_rate": 4.144529658698824e-06, + "loss": 0.4423, + "step": 4258 + }, + { + "epoch": 2.864083382365302, + "grad_norm": 0.402470212628385, + "learning_rate": 4.142360667467838e-06, + "loss": 0.4259, + "step": 4259 + }, + { + "epoch": 2.8647558207951587, + "grad_norm": 0.6996119095805915, + "learning_rate": 4.1401918425078365e-06, + "loss": 0.4397, + "step": 4260 + }, + { + "epoch": 2.865428259225015, + "grad_norm": 0.32779799605137916, + "learning_rate": 4.138023184239294e-06, + "loss": 0.4488, + "step": 4261 + }, + { + "epoch": 2.866100697654871, + "grad_norm": 0.31941981810706765, + "learning_rate": 4.135854693082646e-06, + "loss": 0.4242, + "step": 4262 + }, + { + "epoch": 2.8667731360847273, + "grad_norm": 0.4026405587453875, + "learning_rate": 4.133686369458303e-06, + "loss": 0.437, + "step": 4263 + }, + { + "epoch": 2.8674455745145835, + "grad_norm": 0.3102554702479224, + "learning_rate": 4.131518213786638e-06, + "loss": 0.453, + "step": 4264 + }, + { + "epoch": 2.8681180129444397, + "grad_norm": 0.4258431129398313, + "learning_rate": 4.1293502264879895e-06, + "loss": 0.4534, + "step": 4265 + }, + { + "epoch": 2.868790451374296, + "grad_norm": 0.6528324085825112, + "learning_rate": 4.1271824079826695e-06, + "loss": 0.4371, + "step": 4266 + }, + { + "epoch": 2.8694628898041525, + "grad_norm": 0.376635634239984, + "learning_rate": 4.1250147586909534e-06, + "loss": 0.4323, + "step": 4267 + }, + { + "epoch": 2.8701353282340083, + "grad_norm": 0.45667317364587295, + "learning_rate": 4.122847279033086e-06, + "loss": 0.4647, + "step": 4268 + }, + { + "epoch": 2.870807766663865, + "grad_norm": 0.40142667751818195, + "learning_rate": 4.120679969429274e-06, + "loss": 0.4471, + "step": 4269 + }, + { + "epoch": 2.871480205093721, + "grad_norm": 0.39789184379035186, + "learning_rate": 4.118512830299696e-06, + "loss": 0.4405, + "step": 4270 + }, + { + "epoch": 2.8721526435235774, + "grad_norm": 0.34470586336361553, + "learning_rate": 4.116345862064498e-06, + "loss": 0.4551, + "step": 4271 + }, + { + "epoch": 2.8728250819534336, + "grad_norm": 0.3288657110198987, + "learning_rate": 4.114179065143792e-06, + "loss": 0.4506, + "step": 4272 + }, + { + "epoch": 2.8734975203832898, + "grad_norm": 0.316861492206836, + "learning_rate": 4.112012439957653e-06, + 
"loss": 0.4336, + "step": 4273 + }, + { + "epoch": 2.8741699588131464, + "grad_norm": 0.3757760463793481, + "learning_rate": 4.109845986926127e-06, + "loss": 0.4187, + "step": 4274 + }, + { + "epoch": 2.874842397243002, + "grad_norm": 0.36283664838320717, + "learning_rate": 4.107679706469226e-06, + "loss": 0.4286, + "step": 4275 + }, + { + "epoch": 2.875514835672859, + "grad_norm": 0.3972832536818271, + "learning_rate": 4.105513599006927e-06, + "loss": 0.4417, + "step": 4276 + }, + { + "epoch": 2.876187274102715, + "grad_norm": 0.3395551214278122, + "learning_rate": 4.103347664959179e-06, + "loss": 0.447, + "step": 4277 + }, + { + "epoch": 2.8768597125325712, + "grad_norm": 0.3844461209754712, + "learning_rate": 4.101181904745885e-06, + "loss": 0.4556, + "step": 4278 + }, + { + "epoch": 2.8775321509624274, + "grad_norm": 0.3931268402696742, + "learning_rate": 4.099016318786926e-06, + "loss": 0.4535, + "step": 4279 + }, + { + "epoch": 2.8782045893922836, + "grad_norm": 0.3069355737860178, + "learning_rate": 4.096850907502145e-06, + "loss": 0.4391, + "step": 4280 + }, + { + "epoch": 2.8788770278221403, + "grad_norm": 1.1404325595603302, + "learning_rate": 4.094685671311353e-06, + "loss": 0.4477, + "step": 4281 + }, + { + "epoch": 2.879549466251996, + "grad_norm": 0.3256061095754914, + "learning_rate": 4.092520610634323e-06, + "loss": 0.435, + "step": 4282 + }, + { + "epoch": 2.8802219046818527, + "grad_norm": 0.3556430518345657, + "learning_rate": 4.0903557258907975e-06, + "loss": 0.4357, + "step": 4283 + }, + { + "epoch": 2.880894343111709, + "grad_norm": 0.4232484835520356, + "learning_rate": 4.088191017500484e-06, + "loss": 0.4407, + "step": 4284 + }, + { + "epoch": 2.881566781541565, + "grad_norm": 0.5641353968555147, + "learning_rate": 4.0860264858830545e-06, + "loss": 0.4343, + "step": 4285 + }, + { + "epoch": 2.8822392199714213, + "grad_norm": 0.35668894593439726, + "learning_rate": 4.083862131458152e-06, + "loss": 0.4384, + "step": 4286 + }, + { + "epoch": 2.8829116584012775, + "grad_norm": 0.5268429700767152, + "learning_rate": 4.081697954645375e-06, + "loss": 0.4454, + "step": 4287 + }, + { + "epoch": 2.883584096831134, + "grad_norm": 0.3592019898868432, + "learning_rate": 4.0795339558642964e-06, + "loss": 0.43, + "step": 4288 + }, + { + "epoch": 2.88425653526099, + "grad_norm": 0.3355202443504104, + "learning_rate": 4.077370135534452e-06, + "loss": 0.4256, + "step": 4289 + }, + { + "epoch": 2.8849289736908466, + "grad_norm": 0.35761333702572906, + "learning_rate": 4.075206494075344e-06, + "loss": 0.4093, + "step": 4290 + }, + { + "epoch": 2.8856014121207028, + "grad_norm": 0.36261425680219084, + "learning_rate": 4.073043031906437e-06, + "loss": 0.434, + "step": 4291 + }, + { + "epoch": 2.886273850550559, + "grad_norm": 0.48381493475754367, + "learning_rate": 4.070879749447162e-06, + "loss": 0.4384, + "step": 4292 + }, + { + "epoch": 2.886946288980415, + "grad_norm": 0.3368170714364876, + "learning_rate": 4.0687166471169156e-06, + "loss": 0.4339, + "step": 4293 + }, + { + "epoch": 2.8876187274102714, + "grad_norm": 0.37122650595240925, + "learning_rate": 4.066553725335064e-06, + "loss": 0.4456, + "step": 4294 + }, + { + "epoch": 2.888291165840128, + "grad_norm": 0.472617643332299, + "learning_rate": 4.0643909845209286e-06, + "loss": 0.4395, + "step": 4295 + }, + { + "epoch": 2.8889636042699838, + "grad_norm": 0.4182466531187184, + "learning_rate": 4.062228425093802e-06, + "loss": 0.4311, + "step": 4296 + }, + { + "epoch": 2.8896360426998404, + "grad_norm": 0.4305460750947574, + 
"learning_rate": 4.060066047472942e-06, + "loss": 0.4396, + "step": 4297 + }, + { + "epoch": 2.8903084811296966, + "grad_norm": 0.33529669856356104, + "learning_rate": 4.05790385207757e-06, + "loss": 0.4318, + "step": 4298 + }, + { + "epoch": 2.890980919559553, + "grad_norm": 0.31791553757939606, + "learning_rate": 4.055741839326872e-06, + "loss": 0.4511, + "step": 4299 + }, + { + "epoch": 2.891653357989409, + "grad_norm": 0.4054930402209033, + "learning_rate": 4.053580009639998e-06, + "loss": 0.442, + "step": 4300 + }, + { + "epoch": 2.8923257964192652, + "grad_norm": 0.4424163335137873, + "learning_rate": 4.051418363436062e-06, + "loss": 0.4409, + "step": 4301 + }, + { + "epoch": 2.892998234849122, + "grad_norm": 0.3115659861446721, + "learning_rate": 4.049256901134146e-06, + "loss": 0.4344, + "step": 4302 + }, + { + "epoch": 2.8936706732789776, + "grad_norm": 0.35125570354866636, + "learning_rate": 4.047095623153294e-06, + "loss": 0.4417, + "step": 4303 + }, + { + "epoch": 2.8943431117088343, + "grad_norm": 0.5242489123159184, + "learning_rate": 4.0449345299125105e-06, + "loss": 0.4424, + "step": 4304 + }, + { + "epoch": 2.8950155501386905, + "grad_norm": 0.39148138912349173, + "learning_rate": 4.042773621830769e-06, + "loss": 0.4341, + "step": 4305 + }, + { + "epoch": 2.8956879885685467, + "grad_norm": 0.4032256382439055, + "learning_rate": 4.040612899327007e-06, + "loss": 0.4374, + "step": 4306 + }, + { + "epoch": 2.896360426998403, + "grad_norm": 0.45494911382000336, + "learning_rate": 4.0384523628201246e-06, + "loss": 0.4405, + "step": 4307 + }, + { + "epoch": 2.897032865428259, + "grad_norm": 0.5259742575105177, + "learning_rate": 4.03629201272899e-06, + "loss": 0.4419, + "step": 4308 + }, + { + "epoch": 2.8977053038581158, + "grad_norm": 0.3713800624548828, + "learning_rate": 4.034131849472423e-06, + "loss": 0.4203, + "step": 4309 + }, + { + "epoch": 2.8983777422879715, + "grad_norm": 0.35812631028349245, + "learning_rate": 4.031971873469222e-06, + "loss": 0.4432, + "step": 4310 + }, + { + "epoch": 2.899050180717828, + "grad_norm": 0.3508903930067445, + "learning_rate": 4.0298120851381405e-06, + "loss": 0.4601, + "step": 4311 + }, + { + "epoch": 2.8997226191476844, + "grad_norm": 0.36295881177394324, + "learning_rate": 4.0276524848978985e-06, + "loss": 0.4389, + "step": 4312 + }, + { + "epoch": 2.9003950575775406, + "grad_norm": 0.31969764112024057, + "learning_rate": 4.0254930731671785e-06, + "loss": 0.4254, + "step": 4313 + }, + { + "epoch": 2.9010674960073968, + "grad_norm": 0.3590482150560216, + "learning_rate": 4.0233338503646255e-06, + "loss": 0.4522, + "step": 4314 + }, + { + "epoch": 2.901739934437253, + "grad_norm": 0.42649867839913475, + "learning_rate": 4.021174816908852e-06, + "loss": 0.44, + "step": 4315 + }, + { + "epoch": 2.9024123728671096, + "grad_norm": 0.35706898525852293, + "learning_rate": 4.0190159732184305e-06, + "loss": 0.4449, + "step": 4316 + }, + { + "epoch": 2.9030848112969654, + "grad_norm": 0.3270427305336125, + "learning_rate": 4.016857319711893e-06, + "loss": 0.4353, + "step": 4317 + }, + { + "epoch": 2.903757249726822, + "grad_norm": 0.3247344819275317, + "learning_rate": 4.014698856807744e-06, + "loss": 0.4326, + "step": 4318 + }, + { + "epoch": 2.9044296881566782, + "grad_norm": 0.36525032765947724, + "learning_rate": 4.012540584924442e-06, + "loss": 0.4276, + "step": 4319 + }, + { + "epoch": 2.9051021265865344, + "grad_norm": 0.4000180432810828, + "learning_rate": 4.010382504480415e-06, + "loss": 0.4422, + "step": 4320 + }, + { + "epoch": 
2.9057745650163906, + "grad_norm": 0.3840704237209392, + "learning_rate": 4.00822461589405e-06, + "loss": 0.435, + "step": 4321 + }, + { + "epoch": 2.906447003446247, + "grad_norm": 0.3185174756415056, + "learning_rate": 4.006066919583698e-06, + "loss": 0.4412, + "step": 4322 + }, + { + "epoch": 2.9071194418761035, + "grad_norm": 0.35060641499913836, + "learning_rate": 4.003909415967672e-06, + "loss": 0.446, + "step": 4323 + }, + { + "epoch": 2.9077918803059593, + "grad_norm": 0.3229457585115087, + "learning_rate": 4.001752105464249e-06, + "loss": 0.4338, + "step": 4324 + }, + { + "epoch": 2.908464318735816, + "grad_norm": 0.35988129806189556, + "learning_rate": 3.999594988491672e-06, + "loss": 0.4393, + "step": 4325 + }, + { + "epoch": 2.909136757165672, + "grad_norm": 0.5260762120795631, + "learning_rate": 3.997438065468135e-06, + "loss": 0.4366, + "step": 4326 + }, + { + "epoch": 2.9098091955955283, + "grad_norm": 0.33383556534355013, + "learning_rate": 3.9952813368118054e-06, + "loss": 0.4236, + "step": 4327 + }, + { + "epoch": 2.9104816340253845, + "grad_norm": 0.3549944344912472, + "learning_rate": 3.9931248029408096e-06, + "loss": 0.4194, + "step": 4328 + }, + { + "epoch": 2.9111540724552407, + "grad_norm": 0.340081867293386, + "learning_rate": 3.9909684642732346e-06, + "loss": 0.438, + "step": 4329 + }, + { + "epoch": 2.911826510885097, + "grad_norm": 0.41570221401733837, + "learning_rate": 3.988812321227134e-06, + "loss": 0.4427, + "step": 4330 + }, + { + "epoch": 2.912498949314953, + "grad_norm": 0.39149912394492825, + "learning_rate": 3.986656374220516e-06, + "loss": 0.4401, + "step": 4331 + }, + { + "epoch": 2.9131713877448098, + "grad_norm": 0.5597983452383962, + "learning_rate": 3.984500623671359e-06, + "loss": 0.4343, + "step": 4332 + }, + { + "epoch": 2.913843826174666, + "grad_norm": 0.46909294423105663, + "learning_rate": 3.982345069997597e-06, + "loss": 0.4362, + "step": 4333 + }, + { + "epoch": 2.914516264604522, + "grad_norm": 0.4440529949481681, + "learning_rate": 3.980189713617132e-06, + "loss": 0.447, + "step": 4334 + }, + { + "epoch": 2.9151887030343784, + "grad_norm": 0.3846651417453312, + "learning_rate": 3.9780345549478185e-06, + "loss": 0.4319, + "step": 4335 + }, + { + "epoch": 2.9158611414642346, + "grad_norm": 0.3907841131878748, + "learning_rate": 3.975879594407481e-06, + "loss": 0.4302, + "step": 4336 + }, + { + "epoch": 2.916533579894091, + "grad_norm": 0.34113166552315544, + "learning_rate": 3.9737248324139035e-06, + "loss": 0.4581, + "step": 4337 + }, + { + "epoch": 2.917206018323947, + "grad_norm": 0.4001528551480132, + "learning_rate": 3.971570269384831e-06, + "loss": 0.4256, + "step": 4338 + }, + { + "epoch": 2.9178784567538036, + "grad_norm": 0.3707242145332742, + "learning_rate": 3.969415905737967e-06, + "loss": 0.4358, + "step": 4339 + }, + { + "epoch": 2.91855089518366, + "grad_norm": 0.3583102598519517, + "learning_rate": 3.967261741890982e-06, + "loss": 0.4364, + "step": 4340 + }, + { + "epoch": 2.919223333613516, + "grad_norm": 0.3255702338682238, + "learning_rate": 3.965107778261504e-06, + "loss": 0.4398, + "step": 4341 + }, + { + "epoch": 2.9198957720433723, + "grad_norm": 0.34596311967659127, + "learning_rate": 3.962954015267123e-06, + "loss": 0.4166, + "step": 4342 + }, + { + "epoch": 2.9205682104732285, + "grad_norm": 0.3569938436608133, + "learning_rate": 3.960800453325392e-06, + "loss": 0.4401, + "step": 4343 + }, + { + "epoch": 2.9212406489030847, + "grad_norm": 0.4270059848200292, + "learning_rate": 3.958647092853819e-06, + "loss": 
0.4399, + "step": 4344 + }, + { + "epoch": 2.921913087332941, + "grad_norm": 0.45722521864625526, + "learning_rate": 3.95649393426988e-06, + "loss": 0.4399, + "step": 4345 + }, + { + "epoch": 2.9225855257627975, + "grad_norm": 0.326045276220502, + "learning_rate": 3.954340977991008e-06, + "loss": 0.4371, + "step": 4346 + }, + { + "epoch": 2.9232579641926537, + "grad_norm": 0.5610976445828544, + "learning_rate": 3.9521882244345996e-06, + "loss": 0.443, + "step": 4347 + }, + { + "epoch": 2.92393040262251, + "grad_norm": 0.37308037941952066, + "learning_rate": 3.950035674018008e-06, + "loss": 0.4418, + "step": 4348 + }, + { + "epoch": 2.924602841052366, + "grad_norm": 0.2977451673809849, + "learning_rate": 3.9478833271585494e-06, + "loss": 0.4423, + "step": 4349 + }, + { + "epoch": 2.9252752794822223, + "grad_norm": 0.3645061290192601, + "learning_rate": 3.945731184273502e-06, + "loss": 0.4206, + "step": 4350 + }, + { + "epoch": 2.9259477179120785, + "grad_norm": 0.37103446938698675, + "learning_rate": 3.943579245780101e-06, + "loss": 0.4396, + "step": 4351 + }, + { + "epoch": 2.9266201563419347, + "grad_norm": 0.33635430725600896, + "learning_rate": 3.941427512095548e-06, + "loss": 0.4409, + "step": 4352 + }, + { + "epoch": 2.9272925947717914, + "grad_norm": 0.44784358321516776, + "learning_rate": 3.939275983636995e-06, + "loss": 0.443, + "step": 4353 + }, + { + "epoch": 2.9279650332016476, + "grad_norm": 0.32436690281265607, + "learning_rate": 3.937124660821562e-06, + "loss": 0.4371, + "step": 4354 + }, + { + "epoch": 2.928637471631504, + "grad_norm": 0.33151113447889863, + "learning_rate": 3.934973544066328e-06, + "loss": 0.4312, + "step": 4355 + }, + { + "epoch": 2.92930991006136, + "grad_norm": 0.39549657605539945, + "learning_rate": 3.932822633788334e-06, + "loss": 0.4484, + "step": 4356 + }, + { + "epoch": 2.929982348491216, + "grad_norm": 0.3303839780249808, + "learning_rate": 3.930671930404572e-06, + "loss": 0.4392, + "step": 4357 + }, + { + "epoch": 2.9306547869210724, + "grad_norm": 0.45877058828119155, + "learning_rate": 3.928521434332002e-06, + "loss": 0.4452, + "step": 4358 + }, + { + "epoch": 2.9313272253509286, + "grad_norm": 0.3174532018147674, + "learning_rate": 3.926371145987544e-06, + "loss": 0.4355, + "step": 4359 + }, + { + "epoch": 2.9319996637807852, + "grad_norm": 0.3864330463989357, + "learning_rate": 3.9242210657880745e-06, + "loss": 0.4528, + "step": 4360 + }, + { + "epoch": 2.9326721022106415, + "grad_norm": 0.376800759867609, + "learning_rate": 3.92207119415043e-06, + "loss": 0.438, + "step": 4361 + }, + { + "epoch": 2.9333445406404977, + "grad_norm": 0.32073634660336636, + "learning_rate": 3.919921531491407e-06, + "loss": 0.4443, + "step": 4362 + }, + { + "epoch": 2.934016979070354, + "grad_norm": 0.3172303339676808, + "learning_rate": 3.917772078227763e-06, + "loss": 0.4454, + "step": 4363 + }, + { + "epoch": 2.93468941750021, + "grad_norm": 0.3850781141799853, + "learning_rate": 3.9156228347762115e-06, + "loss": 0.4228, + "step": 4364 + }, + { + "epoch": 2.9353618559300663, + "grad_norm": 0.2930249871479131, + "learning_rate": 3.913473801553433e-06, + "loss": 0.4452, + "step": 4365 + }, + { + "epoch": 2.9360342943599225, + "grad_norm": 0.3473829718278103, + "learning_rate": 3.911324978976054e-06, + "loss": 0.4289, + "step": 4366 + }, + { + "epoch": 2.936706732789779, + "grad_norm": 0.3217487492540504, + "learning_rate": 3.909176367460672e-06, + "loss": 0.4403, + "step": 4367 + }, + { + "epoch": 2.9373791712196353, + "grad_norm": 0.4522829535415397, + 
"learning_rate": 3.907027967423839e-06, + "loss": 0.4355, + "step": 4368 + }, + { + "epoch": 2.9380516096494915, + "grad_norm": 0.3485041332119282, + "learning_rate": 3.904879779282067e-06, + "loss": 0.4336, + "step": 4369 + }, + { + "epoch": 2.9387240480793477, + "grad_norm": 0.4578987985171744, + "learning_rate": 3.9027318034518245e-06, + "loss": 0.4625, + "step": 4370 + }, + { + "epoch": 2.939396486509204, + "grad_norm": 0.4543264370604624, + "learning_rate": 3.900584040349543e-06, + "loss": 0.4496, + "step": 4371 + }, + { + "epoch": 2.94006892493906, + "grad_norm": 0.3338600468951495, + "learning_rate": 3.8984364903916086e-06, + "loss": 0.4394, + "step": 4372 + }, + { + "epoch": 2.9407413633689163, + "grad_norm": 0.3022669546557657, + "learning_rate": 3.896289153994369e-06, + "loss": 0.4248, + "step": 4373 + }, + { + "epoch": 2.941413801798773, + "grad_norm": 0.36615243953150717, + "learning_rate": 3.894142031574131e-06, + "loss": 0.4413, + "step": 4374 + }, + { + "epoch": 2.942086240228629, + "grad_norm": 0.8356858213298126, + "learning_rate": 3.891995123547154e-06, + "loss": 0.437, + "step": 4375 + }, + { + "epoch": 2.9427586786584854, + "grad_norm": 0.3351703752251006, + "learning_rate": 3.889848430329664e-06, + "loss": 0.4423, + "step": 4376 + }, + { + "epoch": 2.9434311170883416, + "grad_norm": 0.29626714008291044, + "learning_rate": 3.887701952337839e-06, + "loss": 0.4284, + "step": 4377 + }, + { + "epoch": 2.944103555518198, + "grad_norm": 0.28788910704281123, + "learning_rate": 3.88555568998782e-06, + "loss": 0.4332, + "step": 4378 + }, + { + "epoch": 2.944775993948054, + "grad_norm": 0.3341832475121459, + "learning_rate": 3.883409643695702e-06, + "loss": 0.4526, + "step": 4379 + }, + { + "epoch": 2.94544843237791, + "grad_norm": 0.3747649741539901, + "learning_rate": 3.881263813877542e-06, + "loss": 0.4448, + "step": 4380 + }, + { + "epoch": 2.946120870807767, + "grad_norm": 0.34952520777191076, + "learning_rate": 3.879118200949352e-06, + "loss": 0.4326, + "step": 4381 + }, + { + "epoch": 2.946793309237623, + "grad_norm": 0.4237596153227539, + "learning_rate": 3.876972805327105e-06, + "loss": 0.4422, + "step": 4382 + }, + { + "epoch": 2.9474657476674793, + "grad_norm": 0.3219451157593167, + "learning_rate": 3.874827627426727e-06, + "loss": 0.4405, + "step": 4383 + }, + { + "epoch": 2.9481381860973355, + "grad_norm": 0.44034715574883926, + "learning_rate": 3.872682667664105e-06, + "loss": 0.4244, + "step": 4384 + }, + { + "epoch": 2.9488106245271917, + "grad_norm": 0.3202514401249179, + "learning_rate": 3.870537926455086e-06, + "loss": 0.4474, + "step": 4385 + }, + { + "epoch": 2.949483062957048, + "grad_norm": 0.3718417568910564, + "learning_rate": 3.868393404215469e-06, + "loss": 0.4301, + "step": 4386 + }, + { + "epoch": 2.950155501386904, + "grad_norm": 0.35650965227510006, + "learning_rate": 3.866249101361018e-06, + "loss": 0.4379, + "step": 4387 + }, + { + "epoch": 2.9508279398167607, + "grad_norm": 0.4331958919885307, + "learning_rate": 3.864105018307446e-06, + "loss": 0.4652, + "step": 4388 + }, + { + "epoch": 2.951500378246617, + "grad_norm": 0.36349900094158416, + "learning_rate": 3.861961155470428e-06, + "loss": 0.4429, + "step": 4389 + }, + { + "epoch": 2.952172816676473, + "grad_norm": 0.34935962224647293, + "learning_rate": 3.8598175132655975e-06, + "loss": 0.4501, + "step": 4390 + }, + { + "epoch": 2.9528452551063293, + "grad_norm": 0.36038031146784655, + "learning_rate": 3.8576740921085455e-06, + "loss": 0.4352, + "step": 4391 + }, + { + "epoch": 
2.9535176935361855, + "grad_norm": 0.39743775222788014, + "learning_rate": 3.855530892414813e-06, + "loss": 0.4352, + "step": 4392 + }, + { + "epoch": 2.9541901319660417, + "grad_norm": 0.32815871640241545, + "learning_rate": 3.853387914599905e-06, + "loss": 0.4474, + "step": 4393 + }, + { + "epoch": 2.954862570395898, + "grad_norm": 0.3279877663767734, + "learning_rate": 3.851245159079283e-06, + "loss": 0.4388, + "step": 4394 + }, + { + "epoch": 2.9555350088257546, + "grad_norm": 0.37024229207542203, + "learning_rate": 3.849102626268364e-06, + "loss": 0.4348, + "step": 4395 + }, + { + "epoch": 2.956207447255611, + "grad_norm": 0.3295138624701636, + "learning_rate": 3.8469603165825226e-06, + "loss": 0.4402, + "step": 4396 + }, + { + "epoch": 2.956879885685467, + "grad_norm": 0.36905622085010975, + "learning_rate": 3.844818230437087e-06, + "loss": 0.4338, + "step": 4397 + }, + { + "epoch": 2.957552324115323, + "grad_norm": 0.32310577320388223, + "learning_rate": 3.842676368247347e-06, + "loss": 0.436, + "step": 4398 + }, + { + "epoch": 2.9582247625451794, + "grad_norm": 0.3368728139063044, + "learning_rate": 3.840534730428545e-06, + "loss": 0.4124, + "step": 4399 + }, + { + "epoch": 2.9588972009750356, + "grad_norm": 0.356940651326783, + "learning_rate": 3.8383933173958835e-06, + "loss": 0.4377, + "step": 4400 + }, + { + "epoch": 2.959569639404892, + "grad_norm": 0.4407601298417127, + "learning_rate": 3.8362521295645164e-06, + "loss": 0.4388, + "step": 4401 + }, + { + "epoch": 2.9602420778347485, + "grad_norm": 0.3699052538556628, + "learning_rate": 3.834111167349558e-06, + "loss": 0.4566, + "step": 4402 + }, + { + "epoch": 2.9609145162646047, + "grad_norm": 0.41370129265662003, + "learning_rate": 3.8319704311660785e-06, + "loss": 0.4443, + "step": 4403 + }, + { + "epoch": 2.961586954694461, + "grad_norm": 0.3563826234769003, + "learning_rate": 3.829829921429103e-06, + "loss": 0.4551, + "step": 4404 + }, + { + "epoch": 2.962259393124317, + "grad_norm": 0.5052402730139329, + "learning_rate": 3.8276896385536145e-06, + "loss": 0.4235, + "step": 4405 + }, + { + "epoch": 2.9629318315541733, + "grad_norm": 0.3742072239640784, + "learning_rate": 3.825549582954548e-06, + "loss": 0.425, + "step": 4406 + }, + { + "epoch": 2.9636042699840295, + "grad_norm": 0.484152781124374, + "learning_rate": 3.8234097550468e-06, + "loss": 0.4319, + "step": 4407 + }, + { + "epoch": 2.9642767084138857, + "grad_norm": 0.41888443325550634, + "learning_rate": 3.821270155245219e-06, + "loss": 0.4564, + "step": 4408 + }, + { + "epoch": 2.9649491468437423, + "grad_norm": 0.3124106039017705, + "learning_rate": 3.81913078396461e-06, + "loss": 0.4434, + "step": 4409 + }, + { + "epoch": 2.9656215852735985, + "grad_norm": 0.3310828872080542, + "learning_rate": 3.816991641619736e-06, + "loss": 0.449, + "step": 4410 + }, + { + "epoch": 2.9662940237034547, + "grad_norm": 0.3283652682944646, + "learning_rate": 3.81485272862531e-06, + "loss": 0.4258, + "step": 4411 + }, + { + "epoch": 2.966966462133311, + "grad_norm": 0.3375407699567821, + "learning_rate": 3.8127140453960065e-06, + "loss": 0.4444, + "step": 4412 + }, + { + "epoch": 2.967638900563167, + "grad_norm": 0.3991924856965535, + "learning_rate": 3.810575592346455e-06, + "loss": 0.4355, + "step": 4413 + }, + { + "epoch": 2.9683113389930234, + "grad_norm": 0.31755611053343813, + "learning_rate": 3.8084373698912334e-06, + "loss": 0.4395, + "step": 4414 + }, + { + "epoch": 2.9689837774228796, + "grad_norm": 0.3789286228953469, + "learning_rate": 3.806299378444884e-06, + "loss": 
0.4415, + "step": 4415 + }, + { + "epoch": 2.969656215852736, + "grad_norm": 0.3697470094575646, + "learning_rate": 3.804161618421899e-06, + "loss": 0.4376, + "step": 4416 + }, + { + "epoch": 2.9703286542825924, + "grad_norm": 0.3486963933380699, + "learning_rate": 3.802024090236727e-06, + "loss": 0.4323, + "step": 4417 + }, + { + "epoch": 2.9710010927124486, + "grad_norm": 0.3036241491697172, + "learning_rate": 3.799886794303773e-06, + "loss": 0.4355, + "step": 4418 + }, + { + "epoch": 2.971673531142305, + "grad_norm": 0.3629807388059824, + "learning_rate": 3.7977497310373946e-06, + "loss": 0.4411, + "step": 4419 + }, + { + "epoch": 2.972345969572161, + "grad_norm": 0.33530428915772964, + "learning_rate": 3.7956129008519046e-06, + "loss": 0.4275, + "step": 4420 + }, + { + "epoch": 2.973018408002017, + "grad_norm": 0.32335921903193104, + "learning_rate": 3.7934763041615717e-06, + "loss": 0.432, + "step": 4421 + }, + { + "epoch": 2.9736908464318734, + "grad_norm": 0.5711599036875669, + "learning_rate": 3.7913399413806227e-06, + "loss": 0.4504, + "step": 4422 + }, + { + "epoch": 2.97436328486173, + "grad_norm": 0.32928993885534397, + "learning_rate": 3.78920381292323e-06, + "loss": 0.428, + "step": 4423 + }, + { + "epoch": 2.9750357232915863, + "grad_norm": 0.3275175794711282, + "learning_rate": 3.787067919203528e-06, + "loss": 0.4517, + "step": 4424 + }, + { + "epoch": 2.9757081617214425, + "grad_norm": 0.4021150224715988, + "learning_rate": 3.784932260635604e-06, + "loss": 0.4355, + "step": 4425 + }, + { + "epoch": 2.9763806001512987, + "grad_norm": 0.4063896786721347, + "learning_rate": 3.7827968376334996e-06, + "loss": 0.432, + "step": 4426 + }, + { + "epoch": 2.977053038581155, + "grad_norm": 0.45938151658635823, + "learning_rate": 3.780661650611209e-06, + "loss": 0.4371, + "step": 4427 + }, + { + "epoch": 2.977725477011011, + "grad_norm": 0.3171925401712923, + "learning_rate": 3.7785266999826826e-06, + "loss": 0.449, + "step": 4428 + }, + { + "epoch": 2.9783979154408673, + "grad_norm": 0.3228017458469971, + "learning_rate": 3.7763919861618247e-06, + "loss": 0.4408, + "step": 4429 + }, + { + "epoch": 2.979070353870724, + "grad_norm": 0.4159499590049895, + "learning_rate": 3.774257509562493e-06, + "loss": 0.4389, + "step": 4430 + }, + { + "epoch": 2.9797427923005797, + "grad_norm": 0.3722689785310936, + "learning_rate": 3.7721232705985022e-06, + "loss": 0.4492, + "step": 4431 + }, + { + "epoch": 2.9804152307304363, + "grad_norm": 0.4000912931409732, + "learning_rate": 3.769989269683614e-06, + "loss": 0.4395, + "step": 4432 + }, + { + "epoch": 2.9810876691602926, + "grad_norm": 0.35943578317497027, + "learning_rate": 3.7678555072315496e-06, + "loss": 0.4355, + "step": 4433 + }, + { + "epoch": 2.9817601075901488, + "grad_norm": 0.31215073655493997, + "learning_rate": 3.765721983655984e-06, + "loss": 0.432, + "step": 4434 + }, + { + "epoch": 2.982432546020005, + "grad_norm": 0.32125224482976794, + "learning_rate": 3.7635886993705443e-06, + "loss": 0.4465, + "step": 4435 + }, + { + "epoch": 2.983104984449861, + "grad_norm": 0.35945927270651473, + "learning_rate": 3.76145565478881e-06, + "loss": 0.4427, + "step": 4436 + }, + { + "epoch": 2.983777422879718, + "grad_norm": 0.3154339889699578, + "learning_rate": 3.7593228503243173e-06, + "loss": 0.4375, + "step": 4437 + }, + { + "epoch": 2.9844498613095736, + "grad_norm": 0.3717430845968554, + "learning_rate": 3.7571902863905524e-06, + "loss": 0.432, + "step": 4438 + }, + { + "epoch": 2.98512229973943, + "grad_norm": 0.3863997070542928, + 
"learning_rate": 3.7550579634009582e-06, + "loss": 0.4455, + "step": 4439 + }, + { + "epoch": 2.9857947381692864, + "grad_norm": 0.2909428722373954, + "learning_rate": 3.752925881768931e-06, + "loss": 0.4362, + "step": 4440 + }, + { + "epoch": 2.9864671765991426, + "grad_norm": 0.34718339418997324, + "learning_rate": 3.7507940419078127e-06, + "loss": 0.447, + "step": 4441 + }, + { + "epoch": 2.987139615028999, + "grad_norm": 0.40098733777174367, + "learning_rate": 3.7486624442309087e-06, + "loss": 0.4382, + "step": 4442 + }, + { + "epoch": 2.987812053458855, + "grad_norm": 0.3695593860512126, + "learning_rate": 3.7465310891514716e-06, + "loss": 0.4378, + "step": 4443 + }, + { + "epoch": 2.9884844918887117, + "grad_norm": 0.3398051541434089, + "learning_rate": 3.74439997708271e-06, + "loss": 0.4374, + "step": 4444 + }, + { + "epoch": 2.9891569303185674, + "grad_norm": 0.34199486719995503, + "learning_rate": 3.7422691084377817e-06, + "loss": 0.4422, + "step": 4445 + }, + { + "epoch": 2.989829368748424, + "grad_norm": 0.35147755808673253, + "learning_rate": 3.7401384836297994e-06, + "loss": 0.4316, + "step": 4446 + }, + { + "epoch": 2.9905018071782803, + "grad_norm": 0.32203234479982756, + "learning_rate": 3.7380081030718296e-06, + "loss": 0.4385, + "step": 4447 + }, + { + "epoch": 2.9911742456081365, + "grad_norm": 0.3200714589833798, + "learning_rate": 3.7358779671768917e-06, + "loss": 0.448, + "step": 4448 + }, + { + "epoch": 2.9918466840379927, + "grad_norm": 0.3481011119507389, + "learning_rate": 3.733748076357952e-06, + "loss": 0.4416, + "step": 4449 + }, + { + "epoch": 2.992519122467849, + "grad_norm": 0.32614372611646925, + "learning_rate": 3.7316184310279356e-06, + "loss": 0.434, + "step": 4450 + }, + { + "epoch": 2.9931915608977055, + "grad_norm": 1.4320532111873607, + "learning_rate": 3.729489031599719e-06, + "loss": 0.4359, + "step": 4451 + }, + { + "epoch": 2.9938639993275613, + "grad_norm": 0.34225309102035456, + "learning_rate": 3.7273598784861288e-06, + "loss": 0.4575, + "step": 4452 + }, + { + "epoch": 2.994536437757418, + "grad_norm": 0.3854212013922402, + "learning_rate": 3.725230972099947e-06, + "loss": 0.438, + "step": 4453 + }, + { + "epoch": 2.995208876187274, + "grad_norm": 0.49023520566482387, + "learning_rate": 3.7231023128539023e-06, + "loss": 0.4372, + "step": 4454 + }, + { + "epoch": 2.9958813146171304, + "grad_norm": 0.343889522075791, + "learning_rate": 3.7209739011606814e-06, + "loss": 0.4305, + "step": 4455 + }, + { + "epoch": 2.9965537530469866, + "grad_norm": 0.3953877234343036, + "learning_rate": 3.71884573743292e-06, + "loss": 0.4341, + "step": 4456 + }, + { + "epoch": 2.9972261914768428, + "grad_norm": 0.3238747253395985, + "learning_rate": 3.716717822083209e-06, + "loss": 0.4341, + "step": 4457 + }, + { + "epoch": 2.9978986299066994, + "grad_norm": 0.3730996167012987, + "learning_rate": 3.714590155524084e-06, + "loss": 0.4407, + "step": 4458 + }, + { + "epoch": 2.998571068336555, + "grad_norm": 0.37637480012700475, + "learning_rate": 3.7124627381680367e-06, + "loss": 0.4266, + "step": 4459 + }, + { + "epoch": 2.999243506766412, + "grad_norm": 0.38744039917159373, + "learning_rate": 3.7103355704275136e-06, + "loss": 0.4359, + "step": 4460 + }, + { + "epoch": 2.999915945196268, + "grad_norm": 0.32943006457297874, + "learning_rate": 3.708208652714912e-06, + "loss": 0.4349, + "step": 4461 + }, + { + "epoch": 3.0005883836261242, + "grad_norm": 0.6176820385280498, + "learning_rate": 3.7060819854425723e-06, + "loss": 0.4499, + "step": 4462 + }, + { + "epoch": 
3.0012608220559804, + "grad_norm": 0.33412830767673857, + "learning_rate": 3.7039555690227963e-06, + "loss": 0.442, + "step": 4463 + }, + { + "epoch": 3.0019332604858366, + "grad_norm": 0.40413572576992607, + "learning_rate": 3.7018294038678326e-06, + "loss": 0.4357, + "step": 4464 + }, + { + "epoch": 3.002605698915693, + "grad_norm": 0.3732340624532239, + "learning_rate": 3.6997034903898826e-06, + "loss": 0.4394, + "step": 4465 + }, + { + "epoch": 3.0032781373455495, + "grad_norm": 0.3435046155970762, + "learning_rate": 3.697577829001099e-06, + "loss": 0.4296, + "step": 4466 + }, + { + "epoch": 3.0039505757754057, + "grad_norm": 0.2881616624216002, + "learning_rate": 3.6954524201135823e-06, + "loss": 0.4372, + "step": 4467 + }, + { + "epoch": 3.004623014205262, + "grad_norm": 0.35940667588121067, + "learning_rate": 3.693327264139388e-06, + "loss": 0.4407, + "step": 4468 + }, + { + "epoch": 3.005295452635118, + "grad_norm": 0.3429020968951988, + "learning_rate": 3.6912023614905218e-06, + "loss": 0.4262, + "step": 4469 + }, + { + "epoch": 3.0059678910649743, + "grad_norm": 0.31452139822598385, + "learning_rate": 3.689077712578941e-06, + "loss": 0.4474, + "step": 4470 + }, + { + "epoch": 3.0066403294948305, + "grad_norm": 0.35211524591414006, + "learning_rate": 3.686953317816548e-06, + "loss": 0.4178, + "step": 4471 + }, + { + "epoch": 3.0073127679246867, + "grad_norm": 0.38554047368580263, + "learning_rate": 3.6848291776152044e-06, + "loss": 0.4281, + "step": 4472 + }, + { + "epoch": 3.0079852063545434, + "grad_norm": 0.37807020266172264, + "learning_rate": 3.682705292386716e-06, + "loss": 0.4405, + "step": 4473 + }, + { + "epoch": 3.0086576447843996, + "grad_norm": 0.5783834918565653, + "learning_rate": 3.6805816625428424e-06, + "loss": 0.4303, + "step": 4474 + }, + { + "epoch": 3.0093300832142558, + "grad_norm": 0.3027429867873164, + "learning_rate": 3.6784582884952935e-06, + "loss": 0.4284, + "step": 4475 + }, + { + "epoch": 3.010002521644112, + "grad_norm": 0.3573843602129143, + "learning_rate": 3.676335170655728e-06, + "loss": 0.4257, + "step": 4476 + }, + { + "epoch": 3.010674960073968, + "grad_norm": 0.5170737055546644, + "learning_rate": 3.674212309435756e-06, + "loss": 0.4343, + "step": 4477 + }, + { + "epoch": 3.0113473985038244, + "grad_norm": 0.39620461894404074, + "learning_rate": 3.6720897052469374e-06, + "loss": 0.4352, + "step": 4478 + }, + { + "epoch": 3.0120198369336806, + "grad_norm": 0.34278914287919265, + "learning_rate": 3.6699673585007858e-06, + "loss": 0.4303, + "step": 4479 + }, + { + "epoch": 3.0126922753635372, + "grad_norm": 0.3574083848789049, + "learning_rate": 3.6678452696087563e-06, + "loss": 0.4235, + "step": 4480 + }, + { + "epoch": 3.0133647137933934, + "grad_norm": 0.4886644604880471, + "learning_rate": 3.6657234389822626e-06, + "loss": 0.4326, + "step": 4481 + }, + { + "epoch": 3.0140371522232496, + "grad_norm": 0.4046875113455944, + "learning_rate": 3.6636018670326646e-06, + "loss": 0.4461, + "step": 4482 + }, + { + "epoch": 3.014709590653106, + "grad_norm": 0.3058637048547598, + "learning_rate": 3.6614805541712726e-06, + "loss": 0.4428, + "step": 4483 + }, + { + "epoch": 3.015382029082962, + "grad_norm": 0.3532756588998309, + "learning_rate": 3.6593595008093464e-06, + "loss": 0.419, + "step": 4484 + }, + { + "epoch": 3.0160544675128182, + "grad_norm": 0.4818989394385238, + "learning_rate": 3.657238707358096e-06, + "loss": 0.4308, + "step": 4485 + }, + { + "epoch": 3.0167269059426745, + "grad_norm": 0.38255122084895843, + "learning_rate": 
3.6551181742286803e-06, + "loss": 0.4438, + "step": 4486 + }, + { + "epoch": 3.017399344372531, + "grad_norm": 0.3754752510920237, + "learning_rate": 3.652997901832208e-06, + "loss": 0.4444, + "step": 4487 + }, + { + "epoch": 3.0180717828023873, + "grad_norm": 0.36663369477192764, + "learning_rate": 3.6508778905797404e-06, + "loss": 0.4342, + "step": 4488 + }, + { + "epoch": 3.0187442212322435, + "grad_norm": 0.31769006125646865, + "learning_rate": 3.648758140882281e-06, + "loss": 0.4397, + "step": 4489 + }, + { + "epoch": 3.0194166596620997, + "grad_norm": 0.37354204632756116, + "learning_rate": 3.646638653150788e-06, + "loss": 0.4223, + "step": 4490 + }, + { + "epoch": 3.020089098091956, + "grad_norm": 0.3219142891575576, + "learning_rate": 3.644519427796168e-06, + "loss": 0.4477, + "step": 4491 + }, + { + "epoch": 3.020761536521812, + "grad_norm": 0.347480754410478, + "learning_rate": 3.6424004652292778e-06, + "loss": 0.4461, + "step": 4492 + }, + { + "epoch": 3.0214339749516683, + "grad_norm": 0.30296795201848176, + "learning_rate": 3.640281765860919e-06, + "loss": 0.4481, + "step": 4493 + }, + { + "epoch": 3.022106413381525, + "grad_norm": 0.39347590564289325, + "learning_rate": 3.638163330101847e-06, + "loss": 0.4327, + "step": 4494 + }, + { + "epoch": 3.022778851811381, + "grad_norm": 0.32219672839312713, + "learning_rate": 3.636045158362763e-06, + "loss": 0.451, + "step": 4495 + }, + { + "epoch": 3.0234512902412374, + "grad_norm": 0.34578066800769225, + "learning_rate": 3.6339272510543184e-06, + "loss": 0.4284, + "step": 4496 + }, + { + "epoch": 3.0241237286710936, + "grad_norm": 0.5388113685987523, + "learning_rate": 3.6318096085871148e-06, + "loss": 0.4295, + "step": 4497 + }, + { + "epoch": 3.02479616710095, + "grad_norm": 0.38941952487264886, + "learning_rate": 3.6296922313716976e-06, + "loss": 0.4391, + "step": 4498 + }, + { + "epoch": 3.025468605530806, + "grad_norm": 0.3382159734636173, + "learning_rate": 3.6275751198185644e-06, + "loss": 0.4447, + "step": 4499 + }, + { + "epoch": 3.026141043960662, + "grad_norm": 0.3602686931463578, + "learning_rate": 3.6254582743381617e-06, + "loss": 0.4295, + "step": 4500 + }, + { + "epoch": 3.0268134823905184, + "grad_norm": 0.44577634985746306, + "learning_rate": 3.6233416953408844e-06, + "loss": 0.4402, + "step": 4501 + }, + { + "epoch": 3.027485920820375, + "grad_norm": 0.35712384379947537, + "learning_rate": 3.6212253832370727e-06, + "loss": 0.4294, + "step": 4502 + }, + { + "epoch": 3.0281583592502312, + "grad_norm": 0.3874521973589743, + "learning_rate": 3.6191093384370173e-06, + "loss": 0.4288, + "step": 4503 + }, + { + "epoch": 3.0288307976800874, + "grad_norm": 0.43785321982355346, + "learning_rate": 3.6169935613509585e-06, + "loss": 0.4489, + "step": 4504 + }, + { + "epoch": 3.0295032361099437, + "grad_norm": 0.35880506553273245, + "learning_rate": 3.6148780523890836e-06, + "loss": 0.4387, + "step": 4505 + }, + { + "epoch": 3.0301756745398, + "grad_norm": 0.31171991877169425, + "learning_rate": 3.6127628119615245e-06, + "loss": 0.4415, + "step": 4506 + }, + { + "epoch": 3.030848112969656, + "grad_norm": 0.5210472160505484, + "learning_rate": 3.610647840478365e-06, + "loss": 0.4404, + "step": 4507 + }, + { + "epoch": 3.0315205513995123, + "grad_norm": 0.3050302944530371, + "learning_rate": 3.6085331383496357e-06, + "loss": 0.4542, + "step": 4508 + }, + { + "epoch": 3.032192989829369, + "grad_norm": 0.3415842205365484, + "learning_rate": 3.6064187059853173e-06, + "loss": 0.4304, + "step": 4509 + }, + { + "epoch": 
3.032865428259225, + "grad_norm": 0.3195233141623651, + "learning_rate": 3.604304543795335e-06, + "loss": 0.424, + "step": 4510 + }, + { + "epoch": 3.0335378666890813, + "grad_norm": 0.3758174522413181, + "learning_rate": 3.60219065218956e-06, + "loss": 0.4417, + "step": 4511 + }, + { + "epoch": 3.0342103051189375, + "grad_norm": 0.3266941100665439, + "learning_rate": 3.6000770315778157e-06, + "loss": 0.432, + "step": 4512 + }, + { + "epoch": 3.0348827435487937, + "grad_norm": 0.3321494460098997, + "learning_rate": 3.5979636823698704e-06, + "loss": 0.4364, + "step": 4513 + }, + { + "epoch": 3.03555518197865, + "grad_norm": 0.390727902210108, + "learning_rate": 3.595850604975441e-06, + "loss": 0.4429, + "step": 4514 + }, + { + "epoch": 3.036227620408506, + "grad_norm": 0.3715492533589567, + "learning_rate": 3.5937377998041888e-06, + "loss": 0.4391, + "step": 4515 + }, + { + "epoch": 3.036900058838363, + "grad_norm": 0.4056712467446377, + "learning_rate": 3.591625267265727e-06, + "loss": 0.4296, + "step": 4516 + }, + { + "epoch": 3.037572497268219, + "grad_norm": 0.9494418041097662, + "learning_rate": 3.589513007769611e-06, + "loss": 0.418, + "step": 4517 + }, + { + "epoch": 3.038244935698075, + "grad_norm": 0.3523303983594626, + "learning_rate": 3.5874010217253473e-06, + "loss": 0.4373, + "step": 4518 + }, + { + "epoch": 3.0389173741279314, + "grad_norm": 0.3689719434747409, + "learning_rate": 3.58528930954239e-06, + "loss": 0.425, + "step": 4519 + }, + { + "epoch": 3.0395898125577876, + "grad_norm": 0.3906852991531941, + "learning_rate": 3.5831778716301325e-06, + "loss": 0.4267, + "step": 4520 + }, + { + "epoch": 3.040262250987644, + "grad_norm": 0.5857781559126515, + "learning_rate": 3.5810667083979228e-06, + "loss": 0.4309, + "step": 4521 + }, + { + "epoch": 3.0409346894175, + "grad_norm": 0.4321142668318707, + "learning_rate": 3.5789558202550533e-06, + "loss": 0.4344, + "step": 4522 + }, + { + "epoch": 3.0416071278473567, + "grad_norm": 0.3307722284146976, + "learning_rate": 3.576845207610765e-06, + "loss": 0.448, + "step": 4523 + }, + { + "epoch": 3.042279566277213, + "grad_norm": 0.4335724324922873, + "learning_rate": 3.5747348708742404e-06, + "loss": 0.4443, + "step": 4524 + }, + { + "epoch": 3.042952004707069, + "grad_norm": 0.4024654859333773, + "learning_rate": 3.572624810454612e-06, + "loss": 0.4296, + "step": 4525 + }, + { + "epoch": 3.0436244431369253, + "grad_norm": 0.34310394163753305, + "learning_rate": 3.5705150267609596e-06, + "loss": 0.4468, + "step": 4526 + }, + { + "epoch": 3.0442968815667815, + "grad_norm": 0.3634416843777775, + "learning_rate": 3.5684055202023093e-06, + "loss": 0.4484, + "step": 4527 + }, + { + "epoch": 3.0449693199966377, + "grad_norm": 0.3003360998378971, + "learning_rate": 3.566296291187629e-06, + "loss": 0.4466, + "step": 4528 + }, + { + "epoch": 3.045641758426494, + "grad_norm": 0.32281364279096514, + "learning_rate": 3.5641873401258377e-06, + "loss": 0.4367, + "step": 4529 + }, + { + "epoch": 3.0463141968563505, + "grad_norm": 0.3521914518525131, + "learning_rate": 3.5620786674257983e-06, + "loss": 0.4342, + "step": 4530 + }, + { + "epoch": 3.0469866352862067, + "grad_norm": 0.3169201581905446, + "learning_rate": 3.559970273496321e-06, + "loss": 0.4329, + "step": 4531 + }, + { + "epoch": 3.047659073716063, + "grad_norm": 0.3902227518137739, + "learning_rate": 3.5578621587461615e-06, + "loss": 0.4413, + "step": 4532 + }, + { + "epoch": 3.048331512145919, + "grad_norm": 0.4085131117285104, + "learning_rate": 3.55575432358402e-06, + "loss": 0.4384, 
+ "step": 4533 + }, + { + "epoch": 3.0490039505757753, + "grad_norm": 0.3264488825660449, + "learning_rate": 3.553646768418544e-06, + "loss": 0.4228, + "step": 4534 + }, + { + "epoch": 3.0496763890056315, + "grad_norm": 0.4476131212157327, + "learning_rate": 3.5515394936583265e-06, + "loss": 0.4418, + "step": 4535 + }, + { + "epoch": 3.0503488274354877, + "grad_norm": 0.5043788346159002, + "learning_rate": 3.549432499711908e-06, + "loss": 0.4466, + "step": 4536 + }, + { + "epoch": 3.0510212658653444, + "grad_norm": 0.3253028263392405, + "learning_rate": 3.547325786987768e-06, + "loss": 0.431, + "step": 4537 + }, + { + "epoch": 3.0516937042952006, + "grad_norm": 0.34369162109844337, + "learning_rate": 3.545219355894339e-06, + "loss": 0.4341, + "step": 4538 + }, + { + "epoch": 3.052366142725057, + "grad_norm": 0.33327263778378285, + "learning_rate": 3.543113206839995e-06, + "loss": 0.4323, + "step": 4539 + }, + { + "epoch": 3.053038581154913, + "grad_norm": 0.3719428111807546, + "learning_rate": 3.5410073402330565e-06, + "loss": 0.4514, + "step": 4540 + }, + { + "epoch": 3.053711019584769, + "grad_norm": 0.37129283477446157, + "learning_rate": 3.53890175648179e-06, + "loss": 0.4186, + "step": 4541 + }, + { + "epoch": 3.0543834580146254, + "grad_norm": 0.37228958234421883, + "learning_rate": 3.5367964559944045e-06, + "loss": 0.4449, + "step": 4542 + }, + { + "epoch": 3.0550558964444816, + "grad_norm": 0.32190002037023063, + "learning_rate": 3.5346914391790566e-06, + "loss": 0.4429, + "step": 4543 + }, + { + "epoch": 3.0557283348743383, + "grad_norm": 0.31796748081394494, + "learning_rate": 3.532586706443846e-06, + "loss": 0.4451, + "step": 4544 + }, + { + "epoch": 3.0564007733041945, + "grad_norm": 0.31043316844484037, + "learning_rate": 3.5304822581968214e-06, + "loss": 0.4321, + "step": 4545 + }, + { + "epoch": 3.0570732117340507, + "grad_norm": 0.44760175036381356, + "learning_rate": 3.528378094845969e-06, + "loss": 0.4357, + "step": 4546 + }, + { + "epoch": 3.057745650163907, + "grad_norm": 0.3789634341908856, + "learning_rate": 3.5262742167992265e-06, + "loss": 0.4279, + "step": 4547 + }, + { + "epoch": 3.058418088593763, + "grad_norm": 0.30327069801108236, + "learning_rate": 3.5241706244644724e-06, + "loss": 0.434, + "step": 4548 + }, + { + "epoch": 3.0590905270236193, + "grad_norm": 0.3517385667754944, + "learning_rate": 3.5220673182495346e-06, + "loss": 0.4369, + "step": 4549 + }, + { + "epoch": 3.0597629654534755, + "grad_norm": 0.3115038237779994, + "learning_rate": 3.5199642985621775e-06, + "loss": 0.4481, + "step": 4550 + }, + { + "epoch": 3.060435403883332, + "grad_norm": 0.4064540175949745, + "learning_rate": 3.517861565810118e-06, + "loss": 0.4277, + "step": 4551 + }, + { + "epoch": 3.0611078423131883, + "grad_norm": 0.40234286779242157, + "learning_rate": 3.5157591204010123e-06, + "loss": 0.4371, + "step": 4552 + }, + { + "epoch": 3.0617802807430445, + "grad_norm": 0.4273834795088497, + "learning_rate": 3.513656962742463e-06, + "loss": 0.4359, + "step": 4553 + }, + { + "epoch": 3.0624527191729007, + "grad_norm": 0.4395651494398156, + "learning_rate": 3.5115550932420194e-06, + "loss": 0.4392, + "step": 4554 + }, + { + "epoch": 3.063125157602757, + "grad_norm": 0.47998674278000614, + "learning_rate": 3.509453512307167e-06, + "loss": 0.4382, + "step": 4555 + }, + { + "epoch": 3.063797596032613, + "grad_norm": 0.2963015918779441, + "learning_rate": 3.507352220345343e-06, + "loss": 0.4365, + "step": 4556 + }, + { + "epoch": 3.0644700344624693, + "grad_norm": 0.3417701165760403, + 
"learning_rate": 3.505251217763925e-06, + "loss": 0.433, + "step": 4557 + }, + { + "epoch": 3.065142472892326, + "grad_norm": 0.5514900169517631, + "learning_rate": 3.5031505049702388e-06, + "loss": 0.4288, + "step": 4558 + }, + { + "epoch": 3.065814911322182, + "grad_norm": 0.3463449584856086, + "learning_rate": 3.5010500823715453e-06, + "loss": 0.4267, + "step": 4559 + }, + { + "epoch": 3.0664873497520384, + "grad_norm": 0.3585885707085387, + "learning_rate": 3.498949950375059e-06, + "loss": 0.4235, + "step": 4560 + }, + { + "epoch": 3.0671597881818946, + "grad_norm": 0.3581184175510112, + "learning_rate": 3.496850109387931e-06, + "loss": 0.4363, + "step": 4561 + }, + { + "epoch": 3.067832226611751, + "grad_norm": 0.3978062858624349, + "learning_rate": 3.494750559817259e-06, + "loss": 0.4301, + "step": 4562 + }, + { + "epoch": 3.068504665041607, + "grad_norm": 0.3134466285837552, + "learning_rate": 3.4926513020700862e-06, + "loss": 0.4457, + "step": 4563 + }, + { + "epoch": 3.069177103471463, + "grad_norm": 0.43058514574800544, + "learning_rate": 3.4905523365533933e-06, + "loss": 0.4355, + "step": 4564 + }, + { + "epoch": 3.06984954190132, + "grad_norm": 0.34477356186422137, + "learning_rate": 3.48845366367411e-06, + "loss": 0.4391, + "step": 4565 + }, + { + "epoch": 3.070521980331176, + "grad_norm": 0.3388934365750416, + "learning_rate": 3.4863552838391063e-06, + "loss": 0.4331, + "step": 4566 + }, + { + "epoch": 3.0711944187610323, + "grad_norm": 0.4684156526808039, + "learning_rate": 3.4842571974551988e-06, + "loss": 0.4234, + "step": 4567 + }, + { + "epoch": 3.0718668571908885, + "grad_norm": 0.3313737505540861, + "learning_rate": 3.4821594049291397e-06, + "loss": 0.4276, + "step": 4568 + }, + { + "epoch": 3.0725392956207447, + "grad_norm": 0.36942260054250975, + "learning_rate": 3.480061906667631e-06, + "loss": 0.4314, + "step": 4569 + }, + { + "epoch": 3.073211734050601, + "grad_norm": 0.32521269600746494, + "learning_rate": 3.477964703077318e-06, + "loss": 0.4495, + "step": 4570 + }, + { + "epoch": 3.073884172480457, + "grad_norm": 0.3608809742566614, + "learning_rate": 3.4758677945647845e-06, + "loss": 0.4246, + "step": 4571 + }, + { + "epoch": 3.0745566109103137, + "grad_norm": 0.3344989550477347, + "learning_rate": 3.47377118153656e-06, + "loss": 0.4497, + "step": 4572 + }, + { + "epoch": 3.07522904934017, + "grad_norm": 0.3572676656882878, + "learning_rate": 3.4716748643991156e-06, + "loss": 0.431, + "step": 4573 + }, + { + "epoch": 3.075901487770026, + "grad_norm": 0.553248539316614, + "learning_rate": 3.469578843558865e-06, + "loss": 0.4304, + "step": 4574 + }, + { + "epoch": 3.0765739261998823, + "grad_norm": 0.35331663767167226, + "learning_rate": 3.4674831194221664e-06, + "loss": 0.4319, + "step": 4575 + }, + { + "epoch": 3.0772463646297386, + "grad_norm": 0.42518322373957207, + "learning_rate": 3.46538769239532e-06, + "loss": 0.4308, + "step": 4576 + }, + { + "epoch": 3.0779188030595948, + "grad_norm": 0.3338628786350596, + "learning_rate": 3.4632925628845627e-06, + "loss": 0.4416, + "step": 4577 + }, + { + "epoch": 3.078591241489451, + "grad_norm": 0.32805731158939805, + "learning_rate": 3.461197731296081e-06, + "loss": 0.4427, + "step": 4578 + }, + { + "epoch": 3.0792636799193076, + "grad_norm": 0.36026355171320523, + "learning_rate": 3.4591031980360014e-06, + "loss": 0.4369, + "step": 4579 + }, + { + "epoch": 3.079936118349164, + "grad_norm": 0.30893179819019195, + "learning_rate": 3.4570089635103934e-06, + "loss": 0.4347, + "step": 4580 + }, + { + "epoch": 
3.08060855677902, + "grad_norm": 0.4345687627764726, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.4427, + "step": 4581 + }, + { + "epoch": 3.081280995208876, + "grad_norm": 0.40103013471154253, + "learning_rate": 3.452821392286567e-06, + "loss": 0.4304, + "step": 4582 + }, + { + "epoch": 3.0819534336387324, + "grad_norm": 0.30194327621664857, + "learning_rate": 3.4507280564001968e-06, + "loss": 0.4434, + "step": 4583 + }, + { + "epoch": 3.0826258720685886, + "grad_norm": 0.332364236177735, + "learning_rate": 3.44863502087199e-06, + "loss": 0.4478, + "step": 4584 + }, + { + "epoch": 3.083298310498445, + "grad_norm": 0.33594200809082103, + "learning_rate": 3.4465422861077267e-06, + "loss": 0.4263, + "step": 4585 + }, + { + "epoch": 3.0839707489283015, + "grad_norm": 0.3689223762605296, + "learning_rate": 3.4444498525131215e-06, + "loss": 0.446, + "step": 4586 + }, + { + "epoch": 3.0846431873581577, + "grad_norm": 0.7450107564695669, + "learning_rate": 3.442357720493838e-06, + "loss": 0.4424, + "step": 4587 + }, + { + "epoch": 3.085315625788014, + "grad_norm": 0.3466911797566331, + "learning_rate": 3.4402658904554785e-06, + "loss": 0.4394, + "step": 4588 + }, + { + "epoch": 3.08598806421787, + "grad_norm": 0.34958332313672374, + "learning_rate": 3.43817436280359e-06, + "loss": 0.4432, + "step": 4589 + }, + { + "epoch": 3.0866605026477263, + "grad_norm": 0.42559182941801493, + "learning_rate": 3.4360831379436533e-06, + "loss": 0.4315, + "step": 4590 + }, + { + "epoch": 3.0873329410775825, + "grad_norm": 0.33547003791092794, + "learning_rate": 3.4339922162810983e-06, + "loss": 0.4357, + "step": 4591 + }, + { + "epoch": 3.0880053795074387, + "grad_norm": 0.38646265904258303, + "learning_rate": 3.4319015982212922e-06, + "loss": 0.4434, + "step": 4592 + }, + { + "epoch": 3.0886778179372953, + "grad_norm": 0.35206075599972964, + "learning_rate": 3.4298112841695477e-06, + "loss": 0.4416, + "step": 4593 + }, + { + "epoch": 3.0893502563671515, + "grad_norm": 0.34848617744389093, + "learning_rate": 3.4277212745311084e-06, + "loss": 0.4366, + "step": 4594 + }, + { + "epoch": 3.0900226947970078, + "grad_norm": 0.3533361661758744, + "learning_rate": 3.425631569711169e-06, + "loss": 0.4401, + "step": 4595 + }, + { + "epoch": 3.090695133226864, + "grad_norm": 0.36406170069571986, + "learning_rate": 3.4235421701148625e-06, + "loss": 0.4273, + "step": 4596 + }, + { + "epoch": 3.09136757165672, + "grad_norm": 1.4039836290122003, + "learning_rate": 3.4214530761472607e-06, + "loss": 0.4357, + "step": 4597 + }, + { + "epoch": 3.0920400100865764, + "grad_norm": 0.3308515481068115, + "learning_rate": 3.419364288213379e-06, + "loss": 0.4441, + "step": 4598 + }, + { + "epoch": 3.0927124485164326, + "grad_norm": 0.3119654188294005, + "learning_rate": 3.4172758067181687e-06, + "loss": 0.4208, + "step": 4599 + }, + { + "epoch": 3.0933848869462888, + "grad_norm": 0.2867191112981178, + "learning_rate": 3.4151876320665276e-06, + "loss": 0.4422, + "step": 4600 + }, + { + "epoch": 3.0940573253761454, + "grad_norm": 0.35474956609831393, + "learning_rate": 3.4130997646632895e-06, + "loss": 0.4314, + "step": 4601 + }, + { + "epoch": 3.0947297638060016, + "grad_norm": 0.3159002316785881, + "learning_rate": 3.411012204913233e-06, + "loss": 0.4459, + "step": 4602 + }, + { + "epoch": 3.095402202235858, + "grad_norm": 0.3194766649563873, + "learning_rate": 3.4089249532210717e-06, + "loss": 0.4496, + "step": 4603 + }, + { + "epoch": 3.096074640665714, + "grad_norm": 0.6258388924333276, + "learning_rate": 
3.406838009991462e-06, + "loss": 0.4433, + "step": 4604 + }, + { + "epoch": 3.0967470790955702, + "grad_norm": 0.3838652445382094, + "learning_rate": 3.404751375629002e-06, + "loss": 0.4416, + "step": 4605 + }, + { + "epoch": 3.0974195175254264, + "grad_norm": 0.32257798582969266, + "learning_rate": 3.402665050538228e-06, + "loss": 0.4256, + "step": 4606 + }, + { + "epoch": 3.0980919559552826, + "grad_norm": 0.31236500506479076, + "learning_rate": 3.4005790351236185e-06, + "loss": 0.4342, + "step": 4607 + }, + { + "epoch": 3.0987643943851393, + "grad_norm": 0.6161557458503163, + "learning_rate": 3.3984933297895876e-06, + "loss": 0.4219, + "step": 4608 + }, + { + "epoch": 3.0994368328149955, + "grad_norm": 0.31925387633231034, + "learning_rate": 3.3964079349404937e-06, + "loss": 0.4293, + "step": 4609 + }, + { + "epoch": 3.1001092712448517, + "grad_norm": 0.43522890072392556, + "learning_rate": 3.3943228509806337e-06, + "loss": 0.4343, + "step": 4610 + }, + { + "epoch": 3.100781709674708, + "grad_norm": 0.345784014529943, + "learning_rate": 3.392238078314245e-06, + "loss": 0.4335, + "step": 4611 + }, + { + "epoch": 3.101454148104564, + "grad_norm": 0.42119438239765744, + "learning_rate": 3.390153617345501e-06, + "loss": 0.4222, + "step": 4612 + }, + { + "epoch": 3.1021265865344203, + "grad_norm": 0.32674762135053875, + "learning_rate": 3.388069468478516e-06, + "loss": 0.4317, + "step": 4613 + }, + { + "epoch": 3.1027990249642765, + "grad_norm": 0.4307593993758202, + "learning_rate": 3.385985632117349e-06, + "loss": 0.4522, + "step": 4614 + }, + { + "epoch": 3.103471463394133, + "grad_norm": 0.432891284456854, + "learning_rate": 3.3839021086659944e-06, + "loss": 0.4315, + "step": 4615 + }, + { + "epoch": 3.1041439018239894, + "grad_norm": 0.48510915643605573, + "learning_rate": 3.3818188985283827e-06, + "loss": 0.4209, + "step": 4616 + }, + { + "epoch": 3.1048163402538456, + "grad_norm": 0.5217158789525718, + "learning_rate": 3.379736002108388e-06, + "loss": 0.4352, + "step": 4617 + }, + { + "epoch": 3.1054887786837018, + "grad_norm": 0.38286786080522917, + "learning_rate": 3.3776534198098245e-06, + "loss": 0.4428, + "step": 4618 + }, + { + "epoch": 3.106161217113558, + "grad_norm": 0.39261664609612695, + "learning_rate": 3.375571152036441e-06, + "loss": 0.4394, + "step": 4619 + }, + { + "epoch": 3.106833655543414, + "grad_norm": 0.9636478000474139, + "learning_rate": 3.37348919919193e-06, + "loss": 0.4277, + "step": 4620 + }, + { + "epoch": 3.1075060939732704, + "grad_norm": 0.5307538668903624, + "learning_rate": 3.371407561679919e-06, + "loss": 0.4312, + "step": 4621 + }, + { + "epoch": 3.108178532403127, + "grad_norm": 0.39308857470248204, + "learning_rate": 3.3693262399039765e-06, + "loss": 0.422, + "step": 4622 + }, + { + "epoch": 3.1088509708329832, + "grad_norm": 0.42116258159068504, + "learning_rate": 3.3672452342676104e-06, + "loss": 0.4508, + "step": 4623 + }, + { + "epoch": 3.1095234092628394, + "grad_norm": 0.34333009940872494, + "learning_rate": 3.3651645451742677e-06, + "loss": 0.4381, + "step": 4624 + }, + { + "epoch": 3.1101958476926956, + "grad_norm": 0.32574292490361406, + "learning_rate": 3.363084173027329e-06, + "loss": 0.4373, + "step": 4625 + }, + { + "epoch": 3.110868286122552, + "grad_norm": 0.47585240478550994, + "learning_rate": 3.3610041182301185e-06, + "loss": 0.4407, + "step": 4626 + }, + { + "epoch": 3.111540724552408, + "grad_norm": 0.30577248963550213, + "learning_rate": 3.358924381185898e-06, + "loss": 0.4431, + "step": 4627 + }, + { + "epoch": 
3.1122131629822642, + "grad_norm": 0.34454421287433445, + "learning_rate": 3.3568449622978672e-06, + "loss": 0.4249, + "step": 4628 + }, + { + "epoch": 3.112885601412121, + "grad_norm": 0.3124320595245157, + "learning_rate": 3.354765861969165e-06, + "loss": 0.428, + "step": 4629 + }, + { + "epoch": 3.113558039841977, + "grad_norm": 0.33255926005497843, + "learning_rate": 3.352687080602866e-06, + "loss": 0.4339, + "step": 4630 + }, + { + "epoch": 3.1142304782718333, + "grad_norm": 0.3263425191480317, + "learning_rate": 3.3506086186019853e-06, + "loss": 0.4396, + "step": 4631 + }, + { + "epoch": 3.1149029167016895, + "grad_norm": 0.3311160032209623, + "learning_rate": 3.348530476369476e-06, + "loss": 0.4413, + "step": 4632 + }, + { + "epoch": 3.1155753551315457, + "grad_norm": 0.3542262654933314, + "learning_rate": 3.3464526543082294e-06, + "loss": 0.4459, + "step": 4633 + }, + { + "epoch": 3.116247793561402, + "grad_norm": 0.325571930550553, + "learning_rate": 3.344375152821071e-06, + "loss": 0.4362, + "step": 4634 + }, + { + "epoch": 3.116920231991258, + "grad_norm": 0.3358903620675028, + "learning_rate": 3.3422979723107684e-06, + "loss": 0.4424, + "step": 4635 + }, + { + "epoch": 3.1175926704211148, + "grad_norm": 0.305451735662324, + "learning_rate": 3.3402211131800267e-06, + "loss": 0.4259, + "step": 4636 + }, + { + "epoch": 3.118265108850971, + "grad_norm": 0.42671107335206243, + "learning_rate": 3.338144575831488e-06, + "loss": 0.4215, + "step": 4637 + }, + { + "epoch": 3.118937547280827, + "grad_norm": 0.43720263887286137, + "learning_rate": 3.336068360667729e-06, + "loss": 0.4331, + "step": 4638 + }, + { + "epoch": 3.1196099857106834, + "grad_norm": 0.32109468966367455, + "learning_rate": 3.333992468091268e-06, + "loss": 0.4357, + "step": 4639 + }, + { + "epoch": 3.1202824241405396, + "grad_norm": 0.3038497432055437, + "learning_rate": 3.3319168985045613e-06, + "loss": 0.4396, + "step": 4640 + }, + { + "epoch": 3.120954862570396, + "grad_norm": 0.34822304470317245, + "learning_rate": 3.329841652309998e-06, + "loss": 0.4324, + "step": 4641 + }, + { + "epoch": 3.121627301000252, + "grad_norm": 0.33377784623106194, + "learning_rate": 3.3277667299099104e-06, + "loss": 0.4339, + "step": 4642 + }, + { + "epoch": 3.1222997394301086, + "grad_norm": 0.3474041561897187, + "learning_rate": 3.3256921317065603e-06, + "loss": 0.4531, + "step": 4643 + }, + { + "epoch": 3.122972177859965, + "grad_norm": 0.35275400166282933, + "learning_rate": 3.3236178581021543e-06, + "loss": 0.4435, + "step": 4644 + }, + { + "epoch": 3.123644616289821, + "grad_norm": 0.3766660078631167, + "learning_rate": 3.3215439094988315e-06, + "loss": 0.4408, + "step": 4645 + }, + { + "epoch": 3.1243170547196772, + "grad_norm": 0.34077492250456387, + "learning_rate": 3.3194702862986716e-06, + "loss": 0.4237, + "step": 4646 + }, + { + "epoch": 3.1249894931495334, + "grad_norm": 0.3990472748064627, + "learning_rate": 3.3173969889036858e-06, + "loss": 0.4252, + "step": 4647 + }, + { + "epoch": 3.1256619315793897, + "grad_norm": 0.3324141727579477, + "learning_rate": 3.3153240177158267e-06, + "loss": 0.4459, + "step": 4648 + }, + { + "epoch": 3.126334370009246, + "grad_norm": 0.3740768618563349, + "learning_rate": 3.3132513731369832e-06, + "loss": 0.4497, + "step": 4649 + }, + { + "epoch": 3.127006808439102, + "grad_norm": 0.5408116642166466, + "learning_rate": 3.311179055568979e-06, + "loss": 0.433, + "step": 4650 + }, + { + "epoch": 3.1276792468689587, + "grad_norm": 0.29868137271676387, + "learning_rate": 
3.3091070654135777e-06, + "loss": 0.4381, + "step": 4651 + }, + { + "epoch": 3.128351685298815, + "grad_norm": 0.31638763072719994, + "learning_rate": 3.3070354030724735e-06, + "loss": 0.4414, + "step": 4652 + }, + { + "epoch": 3.129024123728671, + "grad_norm": 0.3291056686507435, + "learning_rate": 3.3049640689473015e-06, + "loss": 0.4389, + "step": 4653 + }, + { + "epoch": 3.1296965621585273, + "grad_norm": 0.3468954593487525, + "learning_rate": 3.302893063439634e-06, + "loss": 0.4297, + "step": 4654 + }, + { + "epoch": 3.1303690005883835, + "grad_norm": 0.3827760044042586, + "learning_rate": 3.300822386950978e-06, + "loss": 0.4352, + "step": 4655 + }, + { + "epoch": 3.1310414390182397, + "grad_norm": 0.35013542676495674, + "learning_rate": 3.298752039882774e-06, + "loss": 0.4232, + "step": 4656 + }, + { + "epoch": 3.131713877448096, + "grad_norm": 0.3148957999424402, + "learning_rate": 3.2966820226364037e-06, + "loss": 0.4345, + "step": 4657 + }, + { + "epoch": 3.1323863158779526, + "grad_norm": 0.31776486418488736, + "learning_rate": 3.2946123356131817e-06, + "loss": 0.4474, + "step": 4658 + }, + { + "epoch": 3.133058754307809, + "grad_norm": 0.336646129660973, + "learning_rate": 3.292542979214361e-06, + "loss": 0.4159, + "step": 4659 + }, + { + "epoch": 3.133731192737665, + "grad_norm": 0.3276513923565536, + "learning_rate": 3.2904739538411253e-06, + "loss": 0.4296, + "step": 4660 + }, + { + "epoch": 3.134403631167521, + "grad_norm": 0.4116312387173316, + "learning_rate": 3.288405259894599e-06, + "loss": 0.4351, + "step": 4661 + }, + { + "epoch": 3.1350760695973774, + "grad_norm": 0.3403757223273257, + "learning_rate": 3.2863368977758408e-06, + "loss": 0.4371, + "step": 4662 + }, + { + "epoch": 3.1357485080272336, + "grad_norm": 0.3659654866043058, + "learning_rate": 3.284268867885847e-06, + "loss": 0.4295, + "step": 4663 + }, + { + "epoch": 3.13642094645709, + "grad_norm": 0.4652447138237276, + "learning_rate": 3.2822011706255473e-06, + "loss": 0.4324, + "step": 4664 + }, + { + "epoch": 3.1370933848869464, + "grad_norm": 0.33165855878747724, + "learning_rate": 3.2801338063958055e-06, + "loss": 0.4229, + "step": 4665 + }, + { + "epoch": 3.1377658233168026, + "grad_norm": 0.3838062127845923, + "learning_rate": 3.278066775597423e-06, + "loss": 0.4432, + "step": 4666 + }, + { + "epoch": 3.138438261746659, + "grad_norm": 0.3439324081941863, + "learning_rate": 3.276000078631136e-06, + "loss": 0.4308, + "step": 4667 + }, + { + "epoch": 3.139110700176515, + "grad_norm": 0.3777137266840129, + "learning_rate": 3.2739337158976183e-06, + "loss": 0.438, + "step": 4668 + }, + { + "epoch": 3.1397831386063713, + "grad_norm": 0.3770860508041263, + "learning_rate": 3.2718676877974732e-06, + "loss": 0.4284, + "step": 4669 + }, + { + "epoch": 3.1404555770362275, + "grad_norm": 0.4052584725799753, + "learning_rate": 3.2698019947312447e-06, + "loss": 0.419, + "step": 4670 + }, + { + "epoch": 3.1411280154660837, + "grad_norm": 0.33877127331626405, + "learning_rate": 3.26773663709941e-06, + "loss": 0.4375, + "step": 4671 + }, + { + "epoch": 3.1418004538959403, + "grad_norm": 0.38442473662802157, + "learning_rate": 3.2656716153023806e-06, + "loss": 0.4204, + "step": 4672 + }, + { + "epoch": 3.1424728923257965, + "grad_norm": 0.38480592954673515, + "learning_rate": 3.263606929740505e-06, + "loss": 0.4339, + "step": 4673 + }, + { + "epoch": 3.1431453307556527, + "grad_norm": 0.4169876926992838, + "learning_rate": 3.2615425808140617e-06, + "loss": 0.4431, + "step": 4674 + }, + { + "epoch": 3.143817769185509, + 
"grad_norm": 0.3583867437549412, + "learning_rate": 3.259478568923269e-06, + "loss": 0.4353, + "step": 4675 + }, + { + "epoch": 3.144490207615365, + "grad_norm": 0.36524560376873244, + "learning_rate": 3.257414894468277e-06, + "loss": 0.4236, + "step": 4676 + }, + { + "epoch": 3.1451626460452213, + "grad_norm": 0.4022822932069097, + "learning_rate": 3.2553515578491727e-06, + "loss": 0.4367, + "step": 4677 + }, + { + "epoch": 3.1458350844750775, + "grad_norm": 0.34481558379751426, + "learning_rate": 3.2532885594659756e-06, + "loss": 0.4174, + "step": 4678 + }, + { + "epoch": 3.146507522904934, + "grad_norm": 0.3678747660739935, + "learning_rate": 3.2512258997186396e-06, + "loss": 0.4448, + "step": 4679 + }, + { + "epoch": 3.1471799613347904, + "grad_norm": 0.36854708191939056, + "learning_rate": 3.249163579007054e-06, + "loss": 0.4303, + "step": 4680 + }, + { + "epoch": 3.1478523997646466, + "grad_norm": 0.33787152789012104, + "learning_rate": 3.247101597731045e-06, + "loss": 0.4429, + "step": 4681 + }, + { + "epoch": 3.148524838194503, + "grad_norm": 0.3562201680510042, + "learning_rate": 3.245039956290365e-06, + "loss": 0.4181, + "step": 4682 + }, + { + "epoch": 3.149197276624359, + "grad_norm": 0.3401836758777454, + "learning_rate": 3.2429786550847074e-06, + "loss": 0.4351, + "step": 4683 + }, + { + "epoch": 3.149869715054215, + "grad_norm": 0.4177430294864894, + "learning_rate": 3.240917694513699e-06, + "loss": 0.4335, + "step": 4684 + }, + { + "epoch": 3.1505421534840714, + "grad_norm": 0.4272694717550732, + "learning_rate": 3.238857074976898e-06, + "loss": 0.4403, + "step": 4685 + }, + { + "epoch": 3.151214591913928, + "grad_norm": 0.5249306400567656, + "learning_rate": 3.2367967968737982e-06, + "loss": 0.4442, + "step": 4686 + }, + { + "epoch": 3.1518870303437843, + "grad_norm": 0.29953421863302154, + "learning_rate": 3.2347368606038265e-06, + "loss": 0.4291, + "step": 4687 + }, + { + "epoch": 3.1525594687736405, + "grad_norm": 0.37863611117307133, + "learning_rate": 3.2326772665663443e-06, + "loss": 0.4394, + "step": 4688 + }, + { + "epoch": 3.1532319072034967, + "grad_norm": 0.3344756042740373, + "learning_rate": 3.230618015160646e-06, + "loss": 0.4205, + "step": 4689 + }, + { + "epoch": 3.153904345633353, + "grad_norm": 0.3554230676368455, + "learning_rate": 3.228559106785961e-06, + "loss": 0.4313, + "step": 4690 + }, + { + "epoch": 3.154576784063209, + "grad_norm": 0.36182306292558464, + "learning_rate": 3.2265005418414486e-06, + "loss": 0.4412, + "step": 4691 + }, + { + "epoch": 3.1552492224930653, + "grad_norm": 0.3154854946548312, + "learning_rate": 3.2244423207262047e-06, + "loss": 0.4303, + "step": 4692 + }, + { + "epoch": 3.155921660922922, + "grad_norm": 0.42983282243562276, + "learning_rate": 3.2223844438392583e-06, + "loss": 0.4446, + "step": 4693 + }, + { + "epoch": 3.156594099352778, + "grad_norm": 0.30269250195619035, + "learning_rate": 3.220326911579571e-06, + "loss": 0.432, + "step": 4694 + }, + { + "epoch": 3.1572665377826343, + "grad_norm": 0.424479235474068, + "learning_rate": 3.218269724346037e-06, + "loss": 0.4561, + "step": 4695 + }, + { + "epoch": 3.1579389762124905, + "grad_norm": 0.33574116245111074, + "learning_rate": 3.216212882537484e-06, + "loss": 0.4264, + "step": 4696 + }, + { + "epoch": 3.1586114146423467, + "grad_norm": 0.3498294739538529, + "learning_rate": 3.214156386552674e-06, + "loss": 0.4368, + "step": 4697 + }, + { + "epoch": 3.159283853072203, + "grad_norm": 0.37375534804380817, + "learning_rate": 3.2121002367903005e-06, + "loss": 0.4257, + 
"step": 4698 + }, + { + "epoch": 3.159956291502059, + "grad_norm": 0.3090641749528765, + "learning_rate": 3.2100444336489923e-06, + "loss": 0.4339, + "step": 4699 + }, + { + "epoch": 3.160628729931916, + "grad_norm": 0.9572396401829548, + "learning_rate": 3.2079889775273053e-06, + "loss": 0.4307, + "step": 4700 + }, + { + "epoch": 3.161301168361772, + "grad_norm": 0.45192994900712563, + "learning_rate": 3.205933868823734e-06, + "loss": 0.4384, + "step": 4701 + }, + { + "epoch": 3.161973606791628, + "grad_norm": 0.36941191503540555, + "learning_rate": 3.2038791079367025e-06, + "loss": 0.4322, + "step": 4702 + }, + { + "epoch": 3.1626460452214844, + "grad_norm": 4.38470789270265, + "learning_rate": 3.20182469526457e-06, + "loss": 0.4199, + "step": 4703 + }, + { + "epoch": 3.1633184836513406, + "grad_norm": 0.34675448007122195, + "learning_rate": 3.1997706312056254e-06, + "loss": 0.4402, + "step": 4704 + }, + { + "epoch": 3.163990922081197, + "grad_norm": 0.40977690757777413, + "learning_rate": 3.1977169161580905e-06, + "loss": 0.4452, + "step": 4705 + }, + { + "epoch": 3.164663360511053, + "grad_norm": 0.4220151900113677, + "learning_rate": 3.1956635505201217e-06, + "loss": 0.4345, + "step": 4706 + }, + { + "epoch": 3.1653357989409097, + "grad_norm": 0.478450870024991, + "learning_rate": 3.193610534689805e-06, + "loss": 0.4352, + "step": 4707 + }, + { + "epoch": 3.166008237370766, + "grad_norm": 0.3188804493495357, + "learning_rate": 3.1915578690651614e-06, + "loss": 0.4395, + "step": 4708 + }, + { + "epoch": 3.166680675800622, + "grad_norm": 0.3183761201737437, + "learning_rate": 3.189505554044139e-06, + "loss": 0.4423, + "step": 4709 + }, + { + "epoch": 3.1673531142304783, + "grad_norm": 0.30600258196367147, + "learning_rate": 3.1874535900246232e-06, + "loss": 0.4316, + "step": 4710 + }, + { + "epoch": 3.1680255526603345, + "grad_norm": 0.3562444091319478, + "learning_rate": 3.1854019774044293e-06, + "loss": 0.4238, + "step": 4711 + }, + { + "epoch": 3.1686979910901907, + "grad_norm": 0.3773529419732106, + "learning_rate": 3.183350716581305e-06, + "loss": 0.4271, + "step": 4712 + }, + { + "epoch": 3.169370429520047, + "grad_norm": 0.4118248790288195, + "learning_rate": 3.181299807952928e-06, + "loss": 0.4305, + "step": 4713 + }, + { + "epoch": 3.1700428679499035, + "grad_norm": 0.4317878872718693, + "learning_rate": 3.1792492519169094e-06, + "loss": 0.4404, + "step": 4714 + }, + { + "epoch": 3.1707153063797597, + "grad_norm": 0.3565325816330687, + "learning_rate": 3.177199048870792e-06, + "loss": 0.4442, + "step": 4715 + }, + { + "epoch": 3.171387744809616, + "grad_norm": 0.5234862327575597, + "learning_rate": 3.175149199212052e-06, + "loss": 0.428, + "step": 4716 + }, + { + "epoch": 3.172060183239472, + "grad_norm": 0.38414172568977173, + "learning_rate": 3.173099703338089e-06, + "loss": 0.4614, + "step": 4717 + }, + { + "epoch": 3.1727326216693283, + "grad_norm": 0.43272101090757875, + "learning_rate": 3.171050561646244e-06, + "loss": 0.4336, + "step": 4718 + }, + { + "epoch": 3.1734050600991845, + "grad_norm": 0.3643160602771628, + "learning_rate": 3.169001774533785e-06, + "loss": 0.4373, + "step": 4719 + }, + { + "epoch": 3.1740774985290408, + "grad_norm": 0.388990393612831, + "learning_rate": 3.1669533423979105e-06, + "loss": 0.4231, + "step": 4720 + }, + { + "epoch": 3.1747499369588974, + "grad_norm": 0.38034827680550565, + "learning_rate": 3.1649052656357536e-06, + "loss": 0.4433, + "step": 4721 + }, + { + "epoch": 3.1754223753887536, + "grad_norm": 0.3678383641889589, + 
"learning_rate": 3.1628575446443722e-06, + "loss": 0.4442, + "step": 4722 + }, + { + "epoch": 3.17609481381861, + "grad_norm": 0.3981919440636663, + "learning_rate": 3.16081017982076e-06, + "loss": 0.4163, + "step": 4723 + }, + { + "epoch": 3.176767252248466, + "grad_norm": 0.38403716118646863, + "learning_rate": 3.158763171561842e-06, + "loss": 0.4341, + "step": 4724 + }, + { + "epoch": 3.177439690678322, + "grad_norm": 0.3583236464931998, + "learning_rate": 3.1567165202644733e-06, + "loss": 0.441, + "step": 4725 + }, + { + "epoch": 3.1781121291081784, + "grad_norm": 0.36116918921683977, + "learning_rate": 3.154670226325437e-06, + "loss": 0.4566, + "step": 4726 + }, + { + "epoch": 3.1787845675380346, + "grad_norm": 0.34424155111991867, + "learning_rate": 3.15262429014145e-06, + "loss": 0.4348, + "step": 4727 + }, + { + "epoch": 3.1794570059678913, + "grad_norm": 0.36997385256532983, + "learning_rate": 3.1505787121091595e-06, + "loss": 0.4195, + "step": 4728 + }, + { + "epoch": 3.1801294443977475, + "grad_norm": 0.3275366678701998, + "learning_rate": 3.1485334926251433e-06, + "loss": 0.4444, + "step": 4729 + }, + { + "epoch": 3.1808018828276037, + "grad_norm": 0.37805491290009186, + "learning_rate": 3.1464886320859096e-06, + "loss": 0.4262, + "step": 4730 + }, + { + "epoch": 3.18147432125746, + "grad_norm": 0.30860103148911594, + "learning_rate": 3.1444441308878935e-06, + "loss": 0.4292, + "step": 4731 + }, + { + "epoch": 3.182146759687316, + "grad_norm": 0.32504355552537767, + "learning_rate": 3.142399989427466e-06, + "loss": 0.4366, + "step": 4732 + }, + { + "epoch": 3.1828191981171723, + "grad_norm": 0.3226580089600235, + "learning_rate": 3.1403562081009252e-06, + "loss": 0.4199, + "step": 4733 + }, + { + "epoch": 3.1834916365470285, + "grad_norm": 0.4895344773329832, + "learning_rate": 3.1383127873045016e-06, + "loss": 0.4197, + "step": 4734 + }, + { + "epoch": 3.184164074976885, + "grad_norm": 0.3564439182384866, + "learning_rate": 3.1362697274343512e-06, + "loss": 0.4396, + "step": 4735 + }, + { + "epoch": 3.1848365134067413, + "grad_norm": 0.37691746552945693, + "learning_rate": 3.1342270288865655e-06, + "loss": 0.431, + "step": 4736 + }, + { + "epoch": 3.1855089518365975, + "grad_norm": 0.4162265918187403, + "learning_rate": 3.1321846920571627e-06, + "loss": 0.4268, + "step": 4737 + }, + { + "epoch": 3.1861813902664538, + "grad_norm": 0.39649204601421295, + "learning_rate": 3.1301427173420935e-06, + "loss": 0.4447, + "step": 4738 + }, + { + "epoch": 3.18685382869631, + "grad_norm": 0.49702336290092375, + "learning_rate": 3.1281011051372327e-06, + "loss": 0.4637, + "step": 4739 + }, + { + "epoch": 3.187526267126166, + "grad_norm": 0.3006983383812492, + "learning_rate": 3.1260598558383913e-06, + "loss": 0.4224, + "step": 4740 + }, + { + "epoch": 3.1881987055560224, + "grad_norm": 0.40406229463083043, + "learning_rate": 3.124018969841307e-06, + "loss": 0.4494, + "step": 4741 + }, + { + "epoch": 3.188871143985879, + "grad_norm": 0.3584595005860944, + "learning_rate": 3.121978447541648e-06, + "loss": 0.432, + "step": 4742 + }, + { + "epoch": 3.189543582415735, + "grad_norm": 0.3686743773892617, + "learning_rate": 3.1199382893350115e-06, + "loss": 0.4333, + "step": 4743 + }, + { + "epoch": 3.1902160208455914, + "grad_norm": 0.3359050197895751, + "learning_rate": 3.1178984956169225e-06, + "loss": 0.4175, + "step": 4744 + }, + { + "epoch": 3.1908884592754476, + "grad_norm": 0.31833285001091, + "learning_rate": 3.1158590667828376e-06, + "loss": 0.4212, + "step": 4745 + }, + { + "epoch": 
3.191560897705304, + "grad_norm": 0.4891485403436755, + "learning_rate": 3.113820003228142e-06, + "loss": 0.4414, + "step": 4746 + }, + { + "epoch": 3.19223333613516, + "grad_norm": 0.4269682563934399, + "learning_rate": 3.111781305348153e-06, + "loss": 0.4449, + "step": 4747 + }, + { + "epoch": 3.1929057745650162, + "grad_norm": 0.3584339588087221, + "learning_rate": 3.109742973538108e-06, + "loss": 0.4255, + "step": 4748 + }, + { + "epoch": 3.193578212994873, + "grad_norm": 0.40390970804608317, + "learning_rate": 3.1077050081931835e-06, + "loss": 0.4405, + "step": 4749 + }, + { + "epoch": 3.194250651424729, + "grad_norm": 0.40806604809088715, + "learning_rate": 3.10566740970848e-06, + "loss": 0.4223, + "step": 4750 + }, + { + "epoch": 3.1949230898545853, + "grad_norm": 0.3056085110647207, + "learning_rate": 3.103630178479028e-06, + "loss": 0.4376, + "step": 4751 + }, + { + "epoch": 3.1955955282844415, + "grad_norm": 0.34709925682469184, + "learning_rate": 3.1015933148997868e-06, + "loss": 0.4374, + "step": 4752 + }, + { + "epoch": 3.1962679667142977, + "grad_norm": 0.30168107092410623, + "learning_rate": 3.0995568193656435e-06, + "loss": 0.4348, + "step": 4753 + }, + { + "epoch": 3.196940405144154, + "grad_norm": 0.45845343718385345, + "learning_rate": 3.097520692271414e-06, + "loss": 0.4438, + "step": 4754 + }, + { + "epoch": 3.19761284357401, + "grad_norm": 0.33802532153175735, + "learning_rate": 3.0954849340118454e-06, + "loss": 0.4197, + "step": 4755 + }, + { + "epoch": 3.1982852820038667, + "grad_norm": 0.4163329714506242, + "learning_rate": 3.0934495449816117e-06, + "loss": 0.439, + "step": 4756 + }, + { + "epoch": 3.198957720433723, + "grad_norm": 0.3638694248396867, + "learning_rate": 3.09141452557531e-06, + "loss": 0.4208, + "step": 4757 + }, + { + "epoch": 3.199630158863579, + "grad_norm": 0.36166532189611367, + "learning_rate": 3.0893798761874754e-06, + "loss": 0.4257, + "step": 4758 + }, + { + "epoch": 3.2003025972934354, + "grad_norm": 0.35809717619963644, + "learning_rate": 3.0873455972125644e-06, + "loss": 0.4317, + "step": 4759 + }, + { + "epoch": 3.2009750357232916, + "grad_norm": 0.35426721678748335, + "learning_rate": 3.0853116890449646e-06, + "loss": 0.4367, + "step": 4760 + }, + { + "epoch": 3.2016474741531478, + "grad_norm": 0.3126887790627608, + "learning_rate": 3.08327815207899e-06, + "loss": 0.4274, + "step": 4761 + }, + { + "epoch": 3.202319912583004, + "grad_norm": 0.39450856417405994, + "learning_rate": 3.0812449867088833e-06, + "loss": 0.4224, + "step": 4762 + }, + { + "epoch": 3.2029923510128606, + "grad_norm": 0.32818810265606196, + "learning_rate": 3.0792121933288162e-06, + "loss": 0.4395, + "step": 4763 + }, + { + "epoch": 3.203664789442717, + "grad_norm": 0.35599959312630275, + "learning_rate": 3.0771797723328868e-06, + "loss": 0.4312, + "step": 4764 + }, + { + "epoch": 3.204337227872573, + "grad_norm": 0.4090587251102353, + "learning_rate": 3.075147724115124e-06, + "loss": 0.4372, + "step": 4765 + }, + { + "epoch": 3.2050096663024292, + "grad_norm": 0.468838029036378, + "learning_rate": 3.073116049069478e-06, + "loss": 0.4244, + "step": 4766 + }, + { + "epoch": 3.2056821047322854, + "grad_norm": 0.37953456910976113, + "learning_rate": 3.071084747589832e-06, + "loss": 0.4327, + "step": 4767 + }, + { + "epoch": 3.2063545431621416, + "grad_norm": 0.3686688100535242, + "learning_rate": 3.0690538200699973e-06, + "loss": 0.4229, + "step": 4768 + }, + { + "epoch": 3.207026981591998, + "grad_norm": 0.31807479799014965, + "learning_rate": 3.0670232669037112e-06, 
+ "loss": 0.4385, + "step": 4769 + }, + { + "epoch": 3.207699420021854, + "grad_norm": 0.3307981654169214, + "learning_rate": 3.0649930884846348e-06, + "loss": 0.4304, + "step": 4770 + }, + { + "epoch": 3.2083718584517107, + "grad_norm": 0.4029112853490653, + "learning_rate": 3.0629632852063616e-06, + "loss": 0.4358, + "step": 4771 + }, + { + "epoch": 3.209044296881567, + "grad_norm": 0.3732050035030929, + "learning_rate": 3.060933857462411e-06, + "loss": 0.421, + "step": 4772 + }, + { + "epoch": 3.209716735311423, + "grad_norm": 0.3742535869075959, + "learning_rate": 3.058904805646229e-06, + "loss": 0.4622, + "step": 4773 + }, + { + "epoch": 3.2103891737412793, + "grad_norm": 0.3659673176100718, + "learning_rate": 3.0568761301511894e-06, + "loss": 0.4347, + "step": 4774 + }, + { + "epoch": 3.2110616121711355, + "grad_norm": 0.3358417737771693, + "learning_rate": 3.0548478313705917e-06, + "loss": 0.4521, + "step": 4775 + }, + { + "epoch": 3.2117340506009917, + "grad_norm": 0.4766779055355382, + "learning_rate": 3.052819909697663e-06, + "loss": 0.4451, + "step": 4776 + }, + { + "epoch": 3.212406489030848, + "grad_norm": 0.3932548693099987, + "learning_rate": 3.0507923655255588e-06, + "loss": 0.4352, + "step": 4777 + }, + { + "epoch": 3.2130789274607046, + "grad_norm": 0.3385982906612951, + "learning_rate": 3.048765199247361e-06, + "loss": 0.431, + "step": 4778 + }, + { + "epoch": 3.2137513658905608, + "grad_norm": 0.5054161825659945, + "learning_rate": 3.046738411256074e-06, + "loss": 0.4342, + "step": 4779 + }, + { + "epoch": 3.214423804320417, + "grad_norm": 0.43428867622105366, + "learning_rate": 3.044712001944634e-06, + "loss": 0.4428, + "step": 4780 + }, + { + "epoch": 3.215096242750273, + "grad_norm": 0.5603923248105427, + "learning_rate": 3.042685971705903e-06, + "loss": 0.4547, + "step": 4781 + }, + { + "epoch": 3.2157686811801294, + "grad_norm": 0.35307316001332334, + "learning_rate": 3.040660320932668e-06, + "loss": 0.4331, + "step": 4782 + }, + { + "epoch": 3.2164411196099856, + "grad_norm": 0.3614379883327843, + "learning_rate": 3.038635050017642e-06, + "loss": 0.4428, + "step": 4783 + }, + { + "epoch": 3.217113558039842, + "grad_norm": 0.3314145010300879, + "learning_rate": 3.036610159353466e-06, + "loss": 0.4218, + "step": 4784 + }, + { + "epoch": 3.2177859964696984, + "grad_norm": 0.31371322318633665, + "learning_rate": 3.0345856493327066e-06, + "loss": 0.4411, + "step": 4785 + }, + { + "epoch": 3.2184584348995546, + "grad_norm": 0.3213531388938575, + "learning_rate": 3.0325615203478563e-06, + "loss": 0.4182, + "step": 4786 + }, + { + "epoch": 3.219130873329411, + "grad_norm": 0.41555390009757326, + "learning_rate": 3.0305377727913366e-06, + "loss": 0.4341, + "step": 4787 + }, + { + "epoch": 3.219803311759267, + "grad_norm": 0.5289558142311216, + "learning_rate": 3.0285144070554884e-06, + "loss": 0.4428, + "step": 4788 + }, + { + "epoch": 3.2204757501891232, + "grad_norm": 0.39461029318470636, + "learning_rate": 3.0264914235325847e-06, + "loss": 0.4287, + "step": 4789 + }, + { + "epoch": 3.2211481886189794, + "grad_norm": 0.4359852144425983, + "learning_rate": 3.024468822614822e-06, + "loss": 0.4289, + "step": 4790 + }, + { + "epoch": 3.2218206270488357, + "grad_norm": 0.3223360264707932, + "learning_rate": 3.0224466046943245e-06, + "loss": 0.4259, + "step": 4791 + }, + { + "epoch": 3.2224930654786923, + "grad_norm": 0.3298543675630738, + "learning_rate": 3.020424770163138e-06, + "loss": 0.4435, + "step": 4792 + }, + { + "epoch": 3.2231655039085485, + "grad_norm": 
0.3712648442771377, + "learning_rate": 3.018403319413238e-06, + "loss": 0.4399, + "step": 4793 + }, + { + "epoch": 3.2238379423384047, + "grad_norm": 0.35017531257266965, + "learning_rate": 3.016382252836525e-06, + "loss": 0.4117, + "step": 4794 + }, + { + "epoch": 3.224510380768261, + "grad_norm": 0.3906473263678258, + "learning_rate": 3.014361570824823e-06, + "loss": 0.4473, + "step": 4795 + }, + { + "epoch": 3.225182819198117, + "grad_norm": 0.31963992523794227, + "learning_rate": 3.012341273769885e-06, + "loss": 0.4388, + "step": 4796 + }, + { + "epoch": 3.2258552576279733, + "grad_norm": 0.3652391401530828, + "learning_rate": 3.010321362063383e-06, + "loss": 0.4234, + "step": 4797 + }, + { + "epoch": 3.2265276960578295, + "grad_norm": 0.40334376458618504, + "learning_rate": 3.0083018360969213e-06, + "loss": 0.4531, + "step": 4798 + }, + { + "epoch": 3.227200134487686, + "grad_norm": 0.3849107781967533, + "learning_rate": 3.0062826962620252e-06, + "loss": 0.4355, + "step": 4799 + }, + { + "epoch": 3.2278725729175424, + "grad_norm": 0.41994305213491073, + "learning_rate": 3.004263942950148e-06, + "loss": 0.4298, + "step": 4800 + } + ], + "logging_steps": 1.0, + "max_steps": 7435, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.434020777725788e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}
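
Note (editorial addition, not part of the checkpoint file): the JSON above is a standard HuggingFace Trainer state dump, with one dict per logged step under "log_history" and the run-level settings ("max_steps", "num_train_epochs", "train_batch_size", "save_steps") at the top level. A minimal sketch of reading it back for inspection follows; it assumes the file is saved as trainer_state.json next to the script and uses only the key names visible in the file plus the standard-library json module.

# Minimal sketch: summarize the logged training history from trainer_state.json.
# Key names (log_history, loss, learning_rate, step, epoch, max_steps,
# num_train_epochs) are taken from the file contents above; nothing else is assumed.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]                      # one dict per logged step
losses = [h["loss"] for h in history if "loss" in h]
steps = [h["step"] for h in history if "loss" in h]

last = history[-1]
recent = losses[-100:]                              # average the last 100 logged losses
print(f"logged steps : {steps[0]}..{steps[-1]} of max_steps={state['max_steps']}")
print(f"epoch        : {last['epoch']:.4f} (num_train_epochs={state['num_train_epochs']})")
print(f"current lr   : {last['learning_rate']:.3e}")
print(f"recent loss  : {sum(recent) / len(recent):.4f}")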