diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,5688 +1,3832 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.9988810145468108, + "epoch": 2.0, "eval_steps": 500, - "global_step": 4020, + "global_step": 2702, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 0.353515625, - "learning_rate": 9.950248756218907e-07, - "loss": 2.6395, + "grad_norm": 0.34765625, + "learning_rate": 1.4760147601476015e-06, + "loss": 2.6485, "step": 1 }, { "epoch": 0.0, - "grad_norm": 0.328125, - "learning_rate": 4.975124378109453e-06, - "loss": 2.6482, + "grad_norm": 0.36328125, + "learning_rate": 7.380073800738008e-06, + "loss": 2.6486, "step": 5 }, { "epoch": 0.01, - "grad_norm": 0.337890625, - "learning_rate": 9.950248756218906e-06, - "loss": 2.6389, + "grad_norm": 0.37890625, + "learning_rate": 1.4760147601476015e-05, + "loss": 2.6449, "step": 10 }, { "epoch": 0.01, - "grad_norm": 0.3203125, - "learning_rate": 1.4925373134328357e-05, - "loss": 2.629, + "grad_norm": 0.400390625, + "learning_rate": 2.2140221402214025e-05, + "loss": 2.6688, "step": 15 }, { "epoch": 0.01, - "grad_norm": 0.328125, - "learning_rate": 1.990049751243781e-05, - "loss": 2.6301, + "grad_norm": 0.34375, + "learning_rate": 2.952029520295203e-05, + "loss": 2.6268, "step": 20 }, { "epoch": 0.02, - "grad_norm": 0.36328125, - "learning_rate": 2.4875621890547266e-05, - "loss": 2.6048, + "grad_norm": 0.271484375, + "learning_rate": 3.690036900369004e-05, + "loss": 2.6019, "step": 25 }, { "epoch": 0.02, - "grad_norm": 0.298828125, - "learning_rate": 2.9850746268656714e-05, - "loss": 2.5818, + "grad_norm": 0.25, + "learning_rate": 4.428044280442805e-05, + "loss": 2.5532, "step": 30 }, { "epoch": 0.03, - "grad_norm": 0.26171875, - "learning_rate": 3.4825870646766175e-05, - "loss": 2.5372, + "grad_norm": 0.2578125, + "learning_rate": 5.166051660516605e-05, + "loss": 2.508, "step": 35 }, { "epoch": 0.03, - "grad_norm": 0.271484375, - "learning_rate": 3.980099502487562e-05, - "loss": 2.507, + "grad_norm": 0.19140625, + "learning_rate": 5.904059040590406e-05, + "loss": 2.4591, "step": 40 }, { "epoch": 0.03, - "grad_norm": 0.1748046875, - "learning_rate": 4.477611940298508e-05, - "loss": 2.4718, + "grad_norm": 0.1708984375, + "learning_rate": 6.642066420664207e-05, + "loss": 2.4235, "step": 45 }, { "epoch": 0.04, - "grad_norm": 0.1640625, - "learning_rate": 4.975124378109453e-05, - "loss": 2.4398, + "grad_norm": 0.1787109375, + "learning_rate": 7.380073800738008e-05, + "loss": 2.3654, "step": 50 }, { "epoch": 0.04, - "grad_norm": 0.16015625, - "learning_rate": 5.472636815920398e-05, - "loss": 2.4118, + "grad_norm": 0.1337890625, + "learning_rate": 8.118081180811809e-05, + "loss": 2.3604, "step": 55 }, { "epoch": 0.04, - "grad_norm": 0.1728515625, - "learning_rate": 5.970149253731343e-05, - "loss": 2.3949, + "grad_norm": 0.142578125, + "learning_rate": 8.85608856088561e-05, + "loss": 2.3221, "step": 60 }, { "epoch": 0.05, - "grad_norm": 0.13671875, - "learning_rate": 6.46766169154229e-05, - "loss": 2.3556, + "grad_norm": 0.11767578125, + "learning_rate": 9.59409594095941e-05, + "loss": 2.3041, "step": 65 }, { "epoch": 0.05, - "grad_norm": 0.1357421875, - "learning_rate": 6.965174129353235e-05, - "loss": 2.3644, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001033210332103321, + "loss": 2.3051, "step": 70 }, { "epoch": 0.06, - "grad_norm": 0.1474609375, - "learning_rate": 7.46268656716418e-05, - "loss": 2.3262, + "grad_norm": 0.109375, + "learning_rate": 0.00011070110701107013, + "loss": 2.2795, "step": 75 }, { "epoch": 0.06, - "grad_norm": 0.11865234375, - "learning_rate": 7.960199004975125e-05, - "loss": 2.3181, + "grad_norm": 0.0986328125, + "learning_rate": 0.00011808118081180812, + "loss": 2.2926, "step": 80 }, { "epoch": 0.06, - "grad_norm": 0.11376953125, - "learning_rate": 8.45771144278607e-05, - "loss": 2.3195, + "grad_norm": 0.103515625, + "learning_rate": 0.00012546125461254613, + "loss": 2.2594, "step": 85 }, { "epoch": 0.07, - "grad_norm": 0.10693359375, - "learning_rate": 8.955223880597016e-05, - "loss": 2.2922, + "grad_norm": 0.09814453125, + "learning_rate": 0.00013284132841328414, + "loss": 2.2639, "step": 90 }, { "epoch": 0.07, "grad_norm": 0.10009765625, - "learning_rate": 9.452736318407961e-05, - "loss": 2.2996, + "learning_rate": 0.00014022140221402215, + "loss": 2.2496, "step": 95 }, { "epoch": 0.07, - "grad_norm": 0.095703125, - "learning_rate": 9.950248756218906e-05, - "loss": 2.2923, + "grad_norm": 0.103515625, + "learning_rate": 0.00014760147601476016, + "loss": 2.2428, "step": 100 }, { "epoch": 0.08, - "grad_norm": 0.09912109375, - "learning_rate": 0.0001044776119402985, - "loss": 2.2908, + "grad_norm": 0.1044921875, + "learning_rate": 0.00015498154981549817, + "loss": 2.2572, "step": 105 }, { "epoch": 0.08, - "grad_norm": 0.11328125, - "learning_rate": 0.00010945273631840796, - "loss": 2.2892, + "grad_norm": 0.1025390625, + "learning_rate": 0.00016236162361623618, + "loss": 2.2447, "step": 110 }, { "epoch": 0.09, - "grad_norm": 0.10107421875, - "learning_rate": 0.00011442786069651741, - "loss": 2.2772, + "grad_norm": 0.10888671875, + "learning_rate": 0.0001697416974169742, + "loss": 2.2258, "step": 115 }, { "epoch": 0.09, - "grad_norm": 0.09814453125, - "learning_rate": 0.00011940298507462686, - "loss": 2.2492, + "grad_norm": 0.11181640625, + "learning_rate": 0.0001771217712177122, + "loss": 2.2284, "step": 120 }, { "epoch": 0.09, - "grad_norm": 0.10693359375, - "learning_rate": 0.0001243781094527363, - "loss": 2.2669, + "grad_norm": 0.109375, + "learning_rate": 0.0001845018450184502, + "loss": 2.2346, "step": 125 }, { "epoch": 0.1, - "grad_norm": 0.1025390625, - "learning_rate": 0.0001293532338308458, - "loss": 2.2486, + "grad_norm": 0.10595703125, + "learning_rate": 0.0001918819188191882, + "loss": 2.2365, "step": 130 }, { "epoch": 0.1, - "grad_norm": 0.10205078125, - "learning_rate": 0.00013432835820895525, - "loss": 2.2655, + "grad_norm": 0.11083984375, + "learning_rate": 0.00019926199261992622, + "loss": 2.2068, "step": 135 }, { "epoch": 0.1, - "grad_norm": 0.1005859375, - "learning_rate": 0.0001393034825870647, - "loss": 2.2695, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002066420664206642, + "loss": 2.2316, "step": 140 }, { "epoch": 0.11, - "grad_norm": 0.10546875, - "learning_rate": 0.00014427860696517416, - "loss": 2.2627, + "grad_norm": 0.10986328125, + "learning_rate": 0.00021402214022140222, + "loss": 2.2427, "step": 145 }, { "epoch": 0.11, - "grad_norm": 0.1064453125, - "learning_rate": 0.0001492537313432836, - "loss": 2.2453, + "grad_norm": 0.11181640625, + "learning_rate": 0.00022140221402214025, + "loss": 2.2405, "step": 150 }, { - "epoch": 0.12, - "grad_norm": 0.1162109375, - "learning_rate": 0.00015422885572139304, - "loss": 2.2462, + "epoch": 0.11, + "grad_norm": 0.115234375, + "learning_rate": 0.00022878228782287826, + "loss": 2.2197, "step": 155 }, { "epoch": 0.12, - "grad_norm": 0.119140625, - "learning_rate": 0.0001592039800995025, - "loss": 2.2574, + "grad_norm": 0.10693359375, + "learning_rate": 0.00023616236162361624, + "loss": 2.2411, "step": 160 }, { "epoch": 0.12, - "grad_norm": 0.11474609375, - "learning_rate": 0.00016417910447761195, - "loss": 2.2522, + "grad_norm": 0.1181640625, + "learning_rate": 0.00024354243542435425, + "loss": 2.2195, "step": 165 }, { "epoch": 0.13, - "grad_norm": 0.12255859375, - "learning_rate": 0.0001691542288557214, - "loss": 2.2428, + "grad_norm": 0.10693359375, + "learning_rate": 0.00025092250922509226, + "loss": 2.2147, "step": 170 }, { "epoch": 0.13, - "grad_norm": 0.11669921875, - "learning_rate": 0.00017412935323383086, - "loss": 2.2617, + "grad_norm": 0.10888671875, + "learning_rate": 0.00025830258302583027, + "loss": 2.2341, "step": 175 }, { "epoch": 0.13, - "grad_norm": 0.11328125, - "learning_rate": 0.0001791044776119403, - "loss": 2.2604, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002656826568265683, + "loss": 2.2116, "step": 180 }, { "epoch": 0.14, - "grad_norm": 0.1142578125, - "learning_rate": 0.00018407960199004977, - "loss": 2.2437, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002730627306273063, + "loss": 2.1973, "step": 185 }, { "epoch": 0.14, - "grad_norm": 0.1171875, - "learning_rate": 0.00018905472636815922, - "loss": 2.2489, + "grad_norm": 0.10693359375, + "learning_rate": 0.0002804428044280443, + "loss": 2.2137, "step": 190 }, { - "epoch": 0.15, - "grad_norm": 0.11181640625, - "learning_rate": 0.00019402985074626867, - "loss": 2.2788, + "epoch": 0.14, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002878228782287823, + "loss": 2.2388, "step": 195 }, { "epoch": 0.15, - "grad_norm": 0.10986328125, - "learning_rate": 0.00019900497512437813, - "loss": 2.2438, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002952029520295203, + "loss": 2.2111, "step": 200 }, { "epoch": 0.15, - "grad_norm": 0.11181640625, - "learning_rate": 0.00020398009950248756, - "loss": 2.2603, + "grad_norm": 0.10205078125, + "learning_rate": 0.00030258302583025833, + "loss": 2.2361, "step": 205 }, { "epoch": 0.16, - "grad_norm": 0.11181640625, - "learning_rate": 0.000208955223880597, - "loss": 2.242, + "grad_norm": 0.10986328125, + "learning_rate": 0.00030996309963099634, + "loss": 2.1882, "step": 210 }, { "epoch": 0.16, - "grad_norm": 0.10986328125, - "learning_rate": 0.00021393034825870647, - "loss": 2.2205, + "grad_norm": 0.1044921875, + "learning_rate": 0.00031734317343173435, + "loss": 2.2399, "step": 215 }, { "epoch": 0.16, - "grad_norm": 0.11669921875, - "learning_rate": 0.00021890547263681592, - "loss": 2.2329, + "grad_norm": 0.10546875, + "learning_rate": 0.00032472324723247236, + "loss": 2.2062, "step": 220 }, { "epoch": 0.17, - "grad_norm": 0.115234375, - "learning_rate": 0.00022388059701492538, - "loss": 2.2511, + "grad_norm": 0.10546875, + "learning_rate": 0.0003321033210332103, + "loss": 2.2068, "step": 225 }, { "epoch": 0.17, - "grad_norm": 0.1103515625, - "learning_rate": 0.00022885572139303483, - "loss": 2.2369, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003394833948339484, + "loss": 2.2058, "step": 230 }, { - "epoch": 0.18, - "grad_norm": 0.1123046875, - "learning_rate": 0.00023383084577114426, - "loss": 2.2116, + "epoch": 0.17, + "grad_norm": 0.10791015625, + "learning_rate": 0.0003468634686346864, + "loss": 2.2137, "step": 235 }, { "epoch": 0.18, - "grad_norm": 0.11572265625, - "learning_rate": 0.0002388059701492537, - "loss": 2.2502, + "grad_norm": 0.103515625, + "learning_rate": 0.0003542435424354244, + "loss": 2.2127, "step": 240 }, { "epoch": 0.18, - "grad_norm": 0.10986328125, - "learning_rate": 0.00024378109452736317, - "loss": 2.2344, + "grad_norm": 0.09716796875, + "learning_rate": 0.00036162361623616235, + "loss": 2.2093, "step": 245 }, { "epoch": 0.19, - "grad_norm": 0.1142578125, - "learning_rate": 0.0002487562189054726, - "loss": 2.2336, + "grad_norm": 0.1015625, + "learning_rate": 0.0003690036900369004, + "loss": 2.2063, "step": 250 }, { "epoch": 0.19, - "grad_norm": 0.11669921875, - "learning_rate": 0.0002537313432835821, - "loss": 2.2274, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003763837638376384, + "loss": 2.2347, "step": 255 }, { "epoch": 0.19, - "grad_norm": 0.109375, - "learning_rate": 0.0002587064676616916, - "loss": 2.2365, + "grad_norm": 0.103515625, + "learning_rate": 0.0003837638376383764, + "loss": 2.1907, "step": 260 }, { "epoch": 0.2, - "grad_norm": 0.1044921875, - "learning_rate": 0.000263681592039801, - "loss": 2.2357, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003911439114391144, + "loss": 2.2137, "step": 265 }, { "epoch": 0.2, - "grad_norm": 0.10498046875, - "learning_rate": 0.0002686567164179105, - "loss": 2.2314, + "grad_norm": 0.0986328125, + "learning_rate": 0.00039852398523985245, + "loss": 2.1975, "step": 270 }, { - "epoch": 0.21, - "grad_norm": 0.10791015625, - "learning_rate": 0.0002736318407960199, - "loss": 2.2381, + "epoch": 0.2, + "grad_norm": 0.09619140625, + "learning_rate": 0.00039999732792377247, + "loss": 2.203, "step": 275 }, { "epoch": 0.21, - "grad_norm": 0.10888671875, - "learning_rate": 0.0002786069651741294, - "loss": 2.2481, + "grad_norm": 0.09765625, + "learning_rate": 0.00039998647273646793, + "loss": 2.2062, "step": 280 }, { "epoch": 0.21, - "grad_norm": 0.1142578125, - "learning_rate": 0.00028358208955223883, - "loss": 2.2104, + "grad_norm": 0.10107421875, + "learning_rate": 0.00039996726788618994, + "loss": 2.2064, "step": 285 }, { - "epoch": 0.22, - "grad_norm": 0.10302734375, - "learning_rate": 0.0002885572139303483, - "loss": 2.2203, + "epoch": 0.21, + "grad_norm": 0.09619140625, + "learning_rate": 0.00039993971417476293, + "loss": 2.1835, "step": 290 }, { "epoch": 0.22, - "grad_norm": 0.10986328125, - "learning_rate": 0.00029353233830845774, - "loss": 2.2426, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003999038127525856, + "loss": 2.1989, "step": 295 }, { "epoch": 0.22, - "grad_norm": 0.11083984375, - "learning_rate": 0.0002985074626865672, + "grad_norm": 0.09716796875, + "learning_rate": 0.00039985956511858335, "loss": 2.2137, "step": 300 }, { "epoch": 0.23, - "grad_norm": 0.1064453125, - "learning_rate": 0.00030348258706467665, - "loss": 2.2393, + "grad_norm": 0.09619140625, + "learning_rate": 0.00039980697312014523, + "loss": 2.1926, "step": 305 }, { "epoch": 0.23, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003084577114427861, - "loss": 2.2369, + "grad_norm": 0.095703125, + "learning_rate": 0.00039974603895304704, + "loss": 2.1829, "step": 310 }, { "epoch": 0.23, - "grad_norm": 0.10595703125, - "learning_rate": 0.00031343283582089556, - "loss": 2.2205, + "grad_norm": 0.1005859375, + "learning_rate": 0.00039967676516135974, + "loss": 2.1946, "step": 315 }, { "epoch": 0.24, - "grad_norm": 0.10400390625, - "learning_rate": 0.000318407960199005, - "loss": 2.2607, + "grad_norm": 0.099609375, + "learning_rate": 0.00039959915463734285, + "loss": 2.1814, "step": 320 }, { "epoch": 0.24, - "grad_norm": 0.10400390625, - "learning_rate": 0.00032338308457711447, - "loss": 2.201, + "grad_norm": 0.099609375, + "learning_rate": 0.00039951321062132425, + "loss": 2.187, "step": 325 }, { - "epoch": 0.25, - "grad_norm": 0.10595703125, - "learning_rate": 0.0003283582089552239, - "loss": 2.2304, + "epoch": 0.24, + "grad_norm": 0.09814453125, + "learning_rate": 0.00039941893670156453, + "loss": 2.2039, "step": 330 }, { "epoch": 0.25, - "grad_norm": 0.10595703125, - "learning_rate": 0.0003333333333333334, - "loss": 2.214, + "grad_norm": 0.09423828125, + "learning_rate": 0.0003993163368141071, + "loss": 2.1839, "step": 335 }, { "epoch": 0.25, - "grad_norm": 0.1015625, - "learning_rate": 0.0003383084577114428, - "loss": 2.2251, + "grad_norm": 0.099609375, + "learning_rate": 0.0003992054152426141, + "loss": 2.2013, "step": 340 }, { "epoch": 0.26, - "grad_norm": 0.0986328125, - "learning_rate": 0.0003432835820895523, - "loss": 2.2159, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003990861766181874, + "loss": 2.1924, "step": 345 }, { "epoch": 0.26, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003482587064676617, - "loss": 2.2085, + "grad_norm": 0.09619140625, + "learning_rate": 0.0003989586259191755, + "loss": 2.1955, "step": 350 }, { "epoch": 0.26, - "grad_norm": 0.10302734375, - "learning_rate": 0.0003532338308457712, - "loss": 2.2165, + "grad_norm": 0.0966796875, + "learning_rate": 0.0003988227684709653, + "loss": 2.1965, "step": 355 }, { "epoch": 0.27, - "grad_norm": 0.109375, - "learning_rate": 0.0003582089552238806, - "loss": 2.2196, + "grad_norm": 0.095703125, + "learning_rate": 0.00039867860994575994, + "loss": 2.1946, "step": 360 }, { "epoch": 0.27, - "grad_norm": 0.103515625, - "learning_rate": 0.0003631840796019901, - "loss": 2.212, + "grad_norm": 0.09765625, + "learning_rate": 0.000398526156362342, + "loss": 2.1955, "step": 365 }, { - "epoch": 0.28, - "grad_norm": 0.1044921875, - "learning_rate": 0.00036815920398009953, - "loss": 2.2246, + "epoch": 0.27, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003983654140858221, + "loss": 2.1897, "step": 370 }, { "epoch": 0.28, - "grad_norm": 0.10009765625, - "learning_rate": 0.00037313432835820896, - "loss": 2.2166, + "grad_norm": 0.09765625, + "learning_rate": 0.00039819638982737353, + "loss": 2.189, "step": 375 }, { "epoch": 0.28, - "grad_norm": 0.10498046875, - "learning_rate": 0.00037810945273631844, - "loss": 2.2207, + "grad_norm": 0.09423828125, + "learning_rate": 0.0003980190906439514, + "loss": 2.1812, "step": 380 }, { - "epoch": 0.29, - "grad_norm": 0.10693359375, - "learning_rate": 0.00038308457711442787, - "loss": 2.2283, + "epoch": 0.28, + "grad_norm": 0.0986328125, + "learning_rate": 0.00039783352393799856, + "loss": 2.188, "step": 385 }, { "epoch": 0.29, - "grad_norm": 0.09765625, - "learning_rate": 0.00038805970149253735, - "loss": 2.2168, + "grad_norm": 0.0927734375, + "learning_rate": 0.00039763969745713635, + "loss": 2.1859, "step": 390 }, { "epoch": 0.29, - "grad_norm": 0.0986328125, - "learning_rate": 0.0003930348258706468, - "loss": 2.2358, + "grad_norm": 0.095703125, + "learning_rate": 0.0003974376192938411, + "loss": 2.1697, "step": 395 }, { "epoch": 0.3, - "grad_norm": 0.1015625, - "learning_rate": 0.00039800995024875626, - "loss": 2.2289, + "grad_norm": 0.09765625, + "learning_rate": 0.0003972272978851061, + "loss": 2.2024, "step": 400 }, { "epoch": 0.3, - "grad_norm": 0.099609375, - "learning_rate": 0.00039999932141401753, - "loss": 2.2386, + "grad_norm": 0.09375, + "learning_rate": 0.00039700874201208976, + "loss": 2.1922, "step": 405 }, { - "epoch": 0.31, - "grad_norm": 0.1015625, - "learning_rate": 0.000399995174516356, - "loss": 2.2128, + "epoch": 0.3, + "grad_norm": 0.095703125, + "learning_rate": 0.00039678196079974865, + "loss": 2.1752, "step": 410 }, { "epoch": 0.31, - "grad_norm": 0.09765625, - "learning_rate": 0.00039998725779131805, - "loss": 2.2145, + "grad_norm": 0.09375, + "learning_rate": 0.00039654696371645663, + "loss": 2.1947, "step": 415 }, { "epoch": 0.31, - "grad_norm": 0.10107421875, - "learning_rate": 0.0003999755713881306, - "loss": 2.2056, + "grad_norm": 0.09619140625, + "learning_rate": 0.0003963037605736096, + "loss": 2.194, "step": 420 }, { - "epoch": 0.32, - "grad_norm": 0.099609375, - "learning_rate": 0.0003999601155270777, - "loss": 2.2149, + "epoch": 0.31, + "grad_norm": 0.09375, + "learning_rate": 0.0003960523615252156, + "loss": 2.2065, "step": 425 }, { "epoch": 0.32, - "grad_norm": 0.099609375, - "learning_rate": 0.00039994089049949597, - "loss": 2.2165, + "grad_norm": 0.09619140625, + "learning_rate": 0.00039579277706747125, + "loss": 2.1707, "step": 430 }, { "epoch": 0.32, - "grad_norm": 0.10107421875, - "learning_rate": 0.0003999178966677693, - "loss": 2.2031, + "grad_norm": 0.0947265625, + "learning_rate": 0.00039552501803832336, + "loss": 2.1889, "step": 435 }, { "epoch": 0.33, - "grad_norm": 0.1025390625, - "learning_rate": 0.00039989113446532205, - "loss": 2.2134, + "grad_norm": 0.09521484375, + "learning_rate": 0.0003952490956170161, + "loss": 2.1876, "step": 440 }, { "epoch": 0.33, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003998606043966108, - "loss": 2.1896, + "grad_norm": 0.09375, + "learning_rate": 0.00039496502132362494, + "loss": 2.1877, "step": 445 }, { - "epoch": 0.34, - "grad_norm": 0.10400390625, - "learning_rate": 0.00039982630703711496, - "loss": 2.2205, + "epoch": 0.33, + "grad_norm": 0.09765625, + "learning_rate": 0.000394672807018575, + "loss": 2.177, "step": 450 }, { "epoch": 0.34, - "grad_norm": 0.1005859375, - "learning_rate": 0.0003997882430333256, - "loss": 2.2203, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003943724649021464, + "loss": 2.2009, "step": 455 }, { "epoch": 0.34, - "grad_norm": 0.09912109375, - "learning_rate": 0.00039974641310273386, - "loss": 2.2134, + "grad_norm": 0.09423828125, + "learning_rate": 0.00039406400751396445, + "loss": 2.1802, "step": 460 }, { - "epoch": 0.35, - "grad_norm": 0.099609375, - "learning_rate": 0.0003997008180338166, - "loss": 2.2344, + "epoch": 0.34, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003937474477324764, + "loss": 2.1789, "step": 465 }, { "epoch": 0.35, - "grad_norm": 0.10302734375, - "learning_rate": 0.00039965145868602243, - "loss": 2.1976, + "grad_norm": 0.09521484375, + "learning_rate": 0.00039342279877441357, + "loss": 2.1829, "step": 470 }, { "epoch": 0.35, - "grad_norm": 0.09716796875, - "learning_rate": 0.0003995983359897548, - "loss": 2.2175, + "grad_norm": 0.09814453125, + "learning_rate": 0.00039309007419423964, + "loss": 2.1871, "step": 475 }, { "epoch": 0.36, - "grad_norm": 0.0947265625, - "learning_rate": 0.000399541450946355, - "loss": 2.2145, + "grad_norm": 0.09716796875, + "learning_rate": 0.00039274928788358477, + "loss": 2.2183, "step": 480 }, { "epoch": 0.36, - "grad_norm": 0.09619140625, - "learning_rate": 0.00039948080462808266, - "loss": 2.2186, + "grad_norm": 0.0966796875, + "learning_rate": 0.00039240045407066556, + "loss": 2.1888, "step": 485 }, { - "epoch": 0.37, - "grad_norm": 0.10009765625, - "learning_rate": 0.0003994163981780963, - "loss": 2.1911, + "epoch": 0.36, + "grad_norm": 0.09423828125, + "learning_rate": 0.00039204358731969083, + "loss": 2.1962, "step": 490 }, { "epoch": 0.37, - "grad_norm": 0.09716796875, - "learning_rate": 0.00039934823281043103, - "loss": 2.2123, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003916787025302538, + "loss": 2.1746, "step": 495 }, { "epoch": 0.37, - "grad_norm": 0.09716796875, - "learning_rate": 0.00039927630980997634, - "loss": 2.2019, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003913058149367101, + "loss": 2.1784, "step": 500 }, { - "epoch": 0.38, - "grad_norm": 0.09814453125, - "learning_rate": 0.00039920063053245145, - "loss": 2.2251, + "epoch": 0.37, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003909249401075413, + "loss": 2.1903, "step": 505 }, { "epoch": 0.38, - "grad_norm": 0.10009765625, - "learning_rate": 0.00039912119640437963, - "loss": 2.2116, + "grad_norm": 0.09423828125, + "learning_rate": 0.0003905360939447052, + "loss": 2.1772, "step": 510 }, { "epoch": 0.38, - "grad_norm": 0.1005859375, - "learning_rate": 0.00039903800892306194, - "loss": 2.2142, + "grad_norm": 0.103515625, + "learning_rate": 0.00039013929268297195, + "loss": 2.2023, "step": 515 }, { - "epoch": 0.39, - "grad_norm": 0.0986328125, - "learning_rate": 0.00039895106965654836, - "loss": 2.2006, + "epoch": 0.38, + "grad_norm": 0.099609375, + "learning_rate": 0.00038973455288924614, + "loss": 2.1887, "step": 520 }, { "epoch": 0.39, "grad_norm": 0.09765625, - "learning_rate": 0.0003988603802436086, - "loss": 2.2168, + "learning_rate": 0.0003893218914618749, + "loss": 2.1764, "step": 525 }, { - "epoch": 0.4, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003987659423937011, - "loss": 2.2054, + "epoch": 0.39, + "grad_norm": 0.10107421875, + "learning_rate": 0.00038890132562994286, + "loss": 2.1911, "step": 530 }, { "epoch": 0.4, - "grad_norm": 0.1044921875, - "learning_rate": 0.0003986677578869407, - "loss": 2.2115, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003884728729525525, + "loss": 2.2008, "step": 535 }, { "epoch": 0.4, - "grad_norm": 0.0986328125, - "learning_rate": 0.00039856582857406524, - "loss": 2.1906, + "grad_norm": 0.0966796875, + "learning_rate": 0.0003880365513180908, + "loss": 2.1832, "step": 540 }, { - "epoch": 0.41, - "grad_norm": 0.0986328125, - "learning_rate": 0.0003984601563764007, - "loss": 2.2139, + "epoch": 0.4, + "grad_norm": 0.09619140625, + "learning_rate": 0.00038759237894348306, + "loss": 2.1775, "step": 545 }, { "epoch": 0.41, - "grad_norm": 0.10302734375, - "learning_rate": 0.0003983507432858249, - "loss": 2.1912, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003871403743734316, + "loss": 2.1768, "step": 550 }, { "epoch": 0.41, - "grad_norm": 0.09765625, - "learning_rate": 0.0003982375913647298, - "loss": 2.2201, + "grad_norm": 0.095703125, + "learning_rate": 0.0003866805564796421, + "loss": 2.1825, "step": 555 }, { - "epoch": 0.42, - "grad_norm": 0.0966796875, - "learning_rate": 0.000398120702745983, - "loss": 2.1941, + "epoch": 0.41, + "grad_norm": 0.09521484375, + "learning_rate": 0.000386212944460035, + "loss": 2.191, "step": 560 }, { "epoch": 0.42, - "grad_norm": 0.09716796875, - "learning_rate": 0.0003980000796328872, - "loss": 2.2206, + "grad_norm": 0.0966796875, + "learning_rate": 0.0003857375578379449, + "loss": 2.168, "step": 565 }, { - "epoch": 0.43, - "grad_norm": 0.1005859375, - "learning_rate": 0.0003978757242991389, - "loss": 2.2062, - "step": 570 + "epoch": 0.42, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003852544164613043, + "loss": 2.1844, + "step": 570 }, { "epoch": 0.43, - "grad_norm": 0.0986328125, - "learning_rate": 0.00039774763908878525, - "loss": 2.2098, + "grad_norm": 0.09765625, + "learning_rate": 0.0003847635405018162, + "loss": 2.1926, "step": 575 }, { "epoch": 0.43, - "grad_norm": 0.09912109375, - "learning_rate": 0.0003976158264161802, - "loss": 2.2109, + "grad_norm": 0.099609375, + "learning_rate": 0.00038426495045411064, + "loss": 2.1935, "step": 580 }, { - "epoch": 0.44, - "grad_norm": 0.0986328125, - "learning_rate": 0.0003974802887659389, - "loss": 2.2148, + "epoch": 0.43, + "grad_norm": 0.0966796875, + "learning_rate": 0.0003837586671348901, + "loss": 2.1654, "step": 585 }, { "epoch": 0.44, - "grad_norm": 0.09765625, - "learning_rate": 0.0003973410286928906, - "loss": 2.199, + "grad_norm": 0.0986328125, + "learning_rate": 0.00038324471168205945, + "loss": 2.1816, "step": 590 }, { "epoch": 0.44, - "grad_norm": 0.09765625, - "learning_rate": 0.0003971980488220308, - "loss": 2.2271, + "grad_norm": 0.095703125, + "learning_rate": 0.0003827231055538443, + "loss": 2.1697, "step": 595 }, { - "epoch": 0.45, - "grad_norm": 0.103515625, - "learning_rate": 0.0003970513518484718, - "loss": 2.2221, + "epoch": 0.44, + "grad_norm": 0.095703125, + "learning_rate": 0.0003821938705278944, + "loss": 2.1581, "step": 600 }, { "epoch": 0.45, - "grad_norm": 0.09912109375, - "learning_rate": 0.00039690094053739157, - "loss": 2.1961, + "grad_norm": 0.0966796875, + "learning_rate": 0.00038165702870037485, + "loss": 2.1588, "step": 605 }, { - "epoch": 0.46, - "grad_norm": 0.1015625, - "learning_rate": 0.0003967468177239819, - "loss": 2.2078, + "epoch": 0.45, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003811126024850432, + "loss": 2.1842, "step": 610 }, { "epoch": 0.46, - "grad_norm": 0.10205078125, - "learning_rate": 0.00039658898631339496, - "loss": 2.2026, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003805606146123139, + "loss": 2.169, "step": 615 }, { "epoch": 0.46, - "grad_norm": 0.099609375, - "learning_rate": 0.0003964274492806883, - "loss": 2.2094, + "grad_norm": 0.09765625, + "learning_rate": 0.0003800010881283093, + "loss": 2.17, "step": 620 }, { - "epoch": 0.47, - "grad_norm": 0.09814453125, - "learning_rate": 0.00039626220967076917, - "loss": 2.2022, + "epoch": 0.46, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003794340463938972, + "loss": 2.1613, "step": 625 }, { "epoch": 0.47, - "grad_norm": 0.0986328125, - "learning_rate": 0.0003960932705983365, - "loss": 2.1984, + "grad_norm": 0.0966796875, + "learning_rate": 0.0003788595130837157, + "loss": 2.1806, "step": 630 }, { "epoch": 0.47, - "grad_norm": 0.10009765625, - "learning_rate": 0.00039592063524782306, - "loss": 2.1981, + "grad_norm": 0.09814453125, + "learning_rate": 0.00037827751218518494, + "loss": 2.1841, "step": 635 }, { - "epoch": 0.48, - "grad_norm": 0.1044921875, - "learning_rate": 0.00039574430687333464, - "loss": 2.2084, + "epoch": 0.47, + "grad_norm": 0.0966796875, + "learning_rate": 0.000377688067997505, + "loss": 2.1827, "step": 640 }, { "epoch": 0.48, - "grad_norm": 0.10546875, - "learning_rate": 0.00039556428879858904, - "loss": 2.1912, + "grad_norm": 0.09814453125, + "learning_rate": 0.000377091205130642, + "loss": 2.1846, "step": 645 }, { "epoch": 0.48, "grad_norm": 0.1005859375, - "learning_rate": 0.00039538058441685353, - "loss": 2.1871, + "learning_rate": 0.0003764869485043003, + "loss": 2.1877, "step": 650 }, { - "epoch": 0.49, - "grad_norm": 0.09912109375, - "learning_rate": 0.0003951931971908807, - "loss": 2.187, + "epoch": 0.48, + "grad_norm": 0.09765625, + "learning_rate": 0.0003758753233468823, + "loss": 2.1702, "step": 655 }, { "epoch": 0.49, "grad_norm": 0.0966796875, - "learning_rate": 0.0003950021306528432, - "loss": 2.1915, + "learning_rate": 0.00037525635519443466, + "loss": 2.1755, "step": 660 }, { - "epoch": 0.5, - "grad_norm": 0.09814453125, - "learning_rate": 0.0003948073884042673, - "loss": 2.1892, + "epoch": 0.49, + "grad_norm": 0.099609375, + "learning_rate": 0.00037463006988958266, + "loss": 2.1691, "step": 665 }, { "epoch": 0.5, - "grad_norm": 0.09912109375, - "learning_rate": 0.00039460897411596477, - "loss": 2.2194, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003739964935804509, + "loss": 2.1713, "step": 670 }, { "epoch": 0.5, - "grad_norm": 0.10107421875, - "learning_rate": 0.00039440689152796406, - "loss": 2.2103, + "grad_norm": 0.095703125, + "learning_rate": 0.0003733556527195719, + "loss": 2.1749, "step": 675 }, { - "epoch": 0.51, - "grad_norm": 0.10107421875, - "learning_rate": 0.00039420114444943934, - "loss": 2.2032, + "epoch": 0.5, + "grad_norm": 0.0986328125, + "learning_rate": 0.00037270757406278126, + "loss": 2.1685, "step": 680 }, { "epoch": 0.51, - "grad_norm": 0.1005859375, - "learning_rate": 0.0003939917367586391, - "loss": 2.1989, + "grad_norm": 0.0986328125, + "learning_rate": 0.00037205228466810094, + "loss": 2.1818, "step": 685 }, { "epoch": 0.51, - "grad_norm": 0.1005859375, - "learning_rate": 0.00039377867240281275, - "loss": 2.1929, + "grad_norm": 0.0986328125, + "learning_rate": 0.00037138981189460945, + "loss": 2.1661, "step": 690 }, { - "epoch": 0.52, - "grad_norm": 0.1015625, - "learning_rate": 0.0003935619553981364, - "loss": 2.1961, + "epoch": 0.51, + "grad_norm": 0.10009765625, + "learning_rate": 0.00037072018340129936, + "loss": 2.1733, "step": 695 }, { "epoch": 0.52, - "grad_norm": 0.09912109375, - "learning_rate": 0.0003933415898296372, - "loss": 2.1941, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003700434271459229, + "loss": 2.1749, "step": 700 }, { - "epoch": 0.53, + "epoch": 0.52, "grad_norm": 0.10009765625, - "learning_rate": 0.000393117579851116, - "loss": 2.1983, + "learning_rate": 0.0003693595713838243, + "loss": 2.1902, "step": 705 }, { "epoch": 0.53, - "grad_norm": 0.10107421875, - "learning_rate": 0.0003928899296850695, - "loss": 2.1912, + "grad_norm": 0.099609375, + "learning_rate": 0.0003686686446667605, + "loss": 2.1807, "step": 710 }, { "epoch": 0.53, - "grad_norm": 0.10107421875, - "learning_rate": 0.0003926586436226103, - "loss": 2.2096, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003679706758417087, + "loss": 2.1834, "step": 715 }, { - "epoch": 0.54, - "grad_norm": 0.10400390625, - "learning_rate": 0.0003924237260233863, - "loss": 2.2007, + "epoch": 0.53, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003672656940496621, + "loss": 2.1786, "step": 720 }, { "epoch": 0.54, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003921851813154983, - "loss": 2.2171, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003665537287244134, + "loss": 2.1742, "step": 725 }, { "epoch": 0.54, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003919430139954167, - "loss": 2.2002, + "grad_norm": 0.09765625, + "learning_rate": 0.00036583480959132567, + "loss": 2.1676, "step": 730 }, { - "epoch": 0.55, + "epoch": 0.54, "grad_norm": 0.10009765625, - "learning_rate": 0.00039169722862789644, - "loss": 2.1913, + "learning_rate": 0.0003651089666660914, + "loss": 2.1633, "step": 735 }, { "epoch": 0.55, - "grad_norm": 0.1005859375, - "learning_rate": 0.0003914478298458916, - "loss": 2.1765, + "grad_norm": 0.09765625, + "learning_rate": 0.0003643762302534792, + "loss": 2.1699, "step": 740 }, { - "epoch": 0.56, - "grad_norm": 0.099609375, - "learning_rate": 0.00039119482235046716, - "loss": 2.1971, + "epoch": 0.55, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003636366309460688, + "loss": 2.1772, "step": 745 }, { "epoch": 0.56, - "grad_norm": 0.10205078125, - "learning_rate": 0.00039093821091071117, - "loss": 2.1978, + "grad_norm": 0.1005859375, + "learning_rate": 0.00036289019962297347, + "loss": 2.1602, "step": 750 }, { "epoch": 0.56, - "grad_norm": 0.1005859375, - "learning_rate": 0.00039067800036364443, - "loss": 2.1954, + "grad_norm": 0.1015625, + "learning_rate": 0.000362136967448551, + "loss": 2.1592, "step": 755 }, { - "epoch": 0.57, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003904141956141297, - "loss": 2.2025, + "epoch": 0.56, + "grad_norm": 0.099609375, + "learning_rate": 0.00036137696587110234, + "loss": 2.1497, "step": 760 }, { "epoch": 0.57, - "grad_norm": 0.1015625, - "learning_rate": 0.0003901468016347786, - "loss": 2.2045, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003606102266215589, + "loss": 2.1816, "step": 765 }, { "epoch": 0.57, - "grad_norm": 0.103515625, - "learning_rate": 0.00038987582346585847, - "loss": 2.1939, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003598367817121574, + "loss": 2.1681, "step": 770 }, { - "epoch": 0.58, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003896012662151972, - "loss": 2.175, + "epoch": 0.57, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003590566634351036, + "loss": 2.179, "step": 775 }, { "epoch": 0.58, - "grad_norm": 0.099609375, - "learning_rate": 0.00038932313505808685, - "loss": 2.185, + "grad_norm": 0.09912109375, + "learning_rate": 0.000358269904361224, + "loss": 2.186, "step": 780 }, { - "epoch": 0.59, - "grad_norm": 0.10107421875, - "learning_rate": 0.00038904143523718615, - "loss": 2.1939, + "epoch": 0.58, + "grad_norm": 0.09765625, + "learning_rate": 0.00035747653733860576, + "loss": 2.1635, "step": 785 }, { - "epoch": 0.59, - "grad_norm": 0.099609375, - "learning_rate": 0.00038875617206242174, - "loss": 2.1634, + "epoch": 0.58, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003566765954912256, + "loss": 2.1863, "step": 790 }, { "epoch": 0.59, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003884673509108879, - "loss": 2.1947, + "grad_norm": 0.099609375, + "learning_rate": 0.0003558701122175666, + "loss": 2.1639, "step": 795 }, { - "epoch": 0.6, - "grad_norm": 0.10107421875, - "learning_rate": 0.00038817497722674546, - "loss": 2.1925, + "epoch": 0.59, + "grad_norm": 0.103515625, + "learning_rate": 0.0003550571211892238, + "loss": 2.1795, "step": 800 }, { "epoch": 0.6, - "grad_norm": 0.1015625, - "learning_rate": 0.000387879056521119, - "loss": 2.1809, + "grad_norm": 0.10009765625, + "learning_rate": 0.00035423765634949844, + "loss": 2.1634, "step": 805 }, { "epoch": 0.6, - "grad_norm": 0.099609375, - "learning_rate": 0.0003875795943719929, - "loss": 2.1763, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003534117519119807, + "loss": 2.1673, "step": 810 }, { - "epoch": 0.61, - "grad_norm": 0.1025390625, - "learning_rate": 0.00038727659642410654, - "loss": 2.2132, + "epoch": 0.6, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003525794423591214, + "loss": 2.1722, "step": 815 }, { "epoch": 0.61, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003869700683888474, - "loss": 2.194, + "grad_norm": 0.09765625, + "learning_rate": 0.00035174076244079216, + "loss": 2.1687, "step": 820 }, { - "epoch": 0.62, - "grad_norm": 0.10107421875, - "learning_rate": 0.0003866600160441438, - "loss": 2.1861, + "epoch": 0.61, + "grad_norm": 0.10498046875, + "learning_rate": 0.00035089574717283466, + "loss": 2.1736, "step": 825 }, { - "epoch": 0.62, - "grad_norm": 0.1005859375, - "learning_rate": 0.00038634644523435587, - "loss": 2.2093, + "epoch": 0.61, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003500444318355986, + "loss": 2.1784, "step": 830 }, { "epoch": 0.62, "grad_norm": 0.1005859375, - "learning_rate": 0.0003860293618701653, - "loss": 2.1894, + "learning_rate": 0.0003491868519724688, + "loss": 2.1783, "step": 835 }, { - "epoch": 0.63, - "grad_norm": 0.09912109375, - "learning_rate": 0.0003857087719284641, - "loss": 2.2219, + "epoch": 0.62, + "grad_norm": 0.10107421875, + "learning_rate": 0.000348323043388381, + "loss": 2.1674, "step": 840 }, { "epoch": 0.63, - "grad_norm": 0.10009765625, - "learning_rate": 0.00038538468145224165, - "loss": 2.1982, + "grad_norm": 0.09912109375, + "learning_rate": 0.00034745304214832726, + "loss": 2.1791, "step": 845 }, { "epoch": 0.63, - "grad_norm": 0.10205078125, - "learning_rate": 0.00038505709655047113, - "loss": 2.1954, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003465768845758502, + "loss": 2.1573, "step": 850 }, { - "epoch": 0.64, - "grad_norm": 0.1005859375, - "learning_rate": 0.00038472602339799427, - "loss": 2.2064, + "epoch": 0.63, + "grad_norm": 0.09814453125, + "learning_rate": 0.00034569460725152615, + "loss": 2.1842, "step": 855 }, { "epoch": 0.64, - "grad_norm": 0.10107421875, - "learning_rate": 0.0003843914682354047, - "loss": 2.2166, + "grad_norm": 0.09765625, + "learning_rate": 0.00034480624701143807, + "loss": 2.1624, "step": 860 }, { - "epoch": 0.65, - "grad_norm": 0.09912109375, - "learning_rate": 0.00038405343736893065, - "loss": 2.2001, + "epoch": 0.64, + "grad_norm": 0.10107421875, + "learning_rate": 0.00034391184094563764, + "loss": 2.1892, "step": 865 }, { - "epoch": 0.65, - "grad_norm": 0.1015625, - "learning_rate": 0.000383711937170316, - "loss": 2.1765, + "epoch": 0.64, + "grad_norm": 0.09814453125, + "learning_rate": 0.00034301142639659663, + "loss": 2.1788, "step": 870 }, { "epoch": 0.65, - "grad_norm": 0.10302734375, - "learning_rate": 0.00038336697407669994, - "loss": 2.1826, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003421050409576478, + "loss": 2.174, "step": 875 }, { - "epoch": 0.66, - "grad_norm": 0.10302734375, - "learning_rate": 0.000383018554590496, - "loss": 2.1964, + "epoch": 0.65, + "grad_norm": 0.103515625, + "learning_rate": 0.00034119272247141536, + "loss": 2.171, "step": 880 }, { "epoch": 0.66, - "grad_norm": 0.09912109375, - "learning_rate": 0.0003826666852792692, - "loss": 2.1954, + "grad_norm": 0.1015625, + "learning_rate": 0.0003402745090282351, + "loss": 2.1571, "step": 885 }, { "epoch": 0.66, - "grad_norm": 0.1044921875, - "learning_rate": 0.00038231137277561244, - "loss": 2.2015, + "grad_norm": 0.10498046875, + "learning_rate": 0.00033935043896456384, + "loss": 2.1716, "step": 890 }, { - "epoch": 0.67, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003819526237770212, - "loss": 2.1932, + "epoch": 0.66, + "grad_norm": 0.09912109375, + "learning_rate": 0.000338420550861379, + "loss": 2.1631, "step": 895 }, { "epoch": 0.67, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003815904450457677, - "loss": 2.1906, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003374848835425679, + "loss": 2.1652, "step": 900 }, { - "epoch": 0.68, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003812248434087732, - "loss": 2.1776, + "epoch": 0.67, + "grad_norm": 0.099609375, + "learning_rate": 0.00033654347607330656, + "loss": 2.1661, "step": 905 }, { - "epoch": 0.68, - "grad_norm": 0.10107421875, - "learning_rate": 0.00038085582575747914, - "loss": 2.1936, + "epoch": 0.67, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003355963677584288, + "loss": 2.1697, "step": 910 }, { "epoch": 0.68, - "grad_norm": 0.1015625, - "learning_rate": 0.0003804833990477177, - "loss": 2.1819, + "grad_norm": 0.1005859375, + "learning_rate": 0.00033464359814078536, + "loss": 2.1937, "step": 915 }, { - "epoch": 0.69, - "grad_norm": 0.10107421875, - "learning_rate": 0.00038010757029958016, - "loss": 2.1918, + "epoch": 0.68, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003336852069995927, + "loss": 2.1646, "step": 920 }, { - "epoch": 0.69, - "grad_norm": 0.10498046875, - "learning_rate": 0.0003797283465972851, - "loss": 2.1776, + "epoch": 0.68, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003327212343487725, + "loss": 2.1635, "step": 925 }, { "epoch": 0.69, - "grad_norm": 0.10302734375, - "learning_rate": 0.0003793457350890443, - "loss": 2.1786, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003317517204352804, + "loss": 2.149, "step": 930 }, { - "epoch": 0.7, - "grad_norm": 0.1015625, - "learning_rate": 0.0003789597429869286, - "loss": 2.2086, + "epoch": 0.69, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003307767057374266, + "loss": 2.1678, "step": 935 }, { "epoch": 0.7, - "grad_norm": 0.10400390625, - "learning_rate": 0.0003785703775667314, - "loss": 2.2138, + "grad_norm": 0.1015625, + "learning_rate": 0.00032979623096318487, + "loss": 2.1643, "step": 940 }, { "epoch": 0.7, - "grad_norm": 0.099609375, - "learning_rate": 0.00037817764616783196, - "loss": 2.2123, + "grad_norm": 0.1005859375, + "learning_rate": 0.00032881033704849357, + "loss": 2.1748, "step": 945 }, { - "epoch": 0.71, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003777815561930568, - "loss": 2.1889, + "epoch": 0.7, + "grad_norm": 0.10107421875, + "learning_rate": 0.00032781906515554646, + "loss": 2.1617, "step": 950 }, { "epoch": 0.71, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003773821151085401, - "loss": 2.1946, + "grad_norm": 0.1015625, + "learning_rate": 0.0003268224566710738, + "loss": 2.1529, "step": 955 }, { - "epoch": 0.72, + "epoch": 0.71, "grad_norm": 0.1025390625, - "learning_rate": 0.00037697933044358335, - "loss": 2.1688, + "learning_rate": 0.00032582055320461465, + "loss": 2.1781, "step": 960 }, { - "epoch": 0.72, - "grad_norm": 0.10302734375, - "learning_rate": 0.0003765732097905129, - "loss": 2.1795, + "epoch": 0.71, + "grad_norm": 0.10546875, + "learning_rate": 0.0003248133965867798, + "loss": 2.1429, "step": 965 }, { "epoch": 0.72, - "grad_norm": 0.1005859375, - "learning_rate": 0.00037616376080453737, - "loss": 2.1966, + "grad_norm": 0.10009765625, + "learning_rate": 0.00032380102886750493, + "loss": 2.1629, "step": 970 }, { - "epoch": 0.73, - "grad_norm": 0.10107421875, - "learning_rate": 0.0003757509912036028, - "loss": 2.1901, + "epoch": 0.72, + "grad_norm": 0.1005859375, + "learning_rate": 0.000322783492314295, + "loss": 2.1655, "step": 975 }, { "epoch": 0.73, - "grad_norm": 0.103515625, - "learning_rate": 0.0003753349087682477, - "loss": 2.1685, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003217608294104601, + "loss": 2.1758, "step": 980 }, { "epoch": 0.73, - "grad_norm": 0.1015625, - "learning_rate": 0.000374915521341456, - "loss": 2.1923, + "grad_norm": 0.09912109375, + "learning_rate": 0.00032073308285334085, + "loss": 2.1565, "step": 985 }, { - "epoch": 0.74, - "grad_norm": 0.10205078125, - "learning_rate": 0.00037449283682850957, - "loss": 2.1995, + "epoch": 0.73, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003197002955525264, + "loss": 2.1806, "step": 990 }, { "epoch": 0.74, - "grad_norm": 0.10400390625, - "learning_rate": 0.00037406686319683887, - "loss": 2.1921, + "grad_norm": 0.099609375, + "learning_rate": 0.00031866251062806267, + "loss": 2.1701, "step": 995 }, { - "epoch": 0.75, - "grad_norm": 0.10546875, - "learning_rate": 0.00037363760847587284, - "loss": 2.178, + "epoch": 0.74, + "grad_norm": 0.10107421875, + "learning_rate": 0.00031761977140865207, + "loss": 2.1675, "step": 1000 }, { - "epoch": 0.75, - "grad_norm": 0.10400390625, - "learning_rate": 0.00037320508075688776, - "loss": 2.1711, + "epoch": 0.74, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003165721214298444, + "loss": 2.1671, "step": 1005 }, { "epoch": 0.75, - "grad_norm": 0.10205078125, - "learning_rate": 0.00037276928819285446, - "loss": 2.1825, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003155196044322193, + "loss": 2.1575, "step": 1010 }, { - "epoch": 0.76, + "epoch": 0.75, "grad_norm": 0.1015625, - "learning_rate": 0.0003723302389982849, - "loss": 2.1925, + "learning_rate": 0.00031446226435956, + "loss": 2.1381, "step": 1015 }, { - "epoch": 0.76, - "grad_norm": 0.1044921875, - "learning_rate": 0.0003718879414490771, - "loss": 2.1758, + "epoch": 0.75, + "grad_norm": 0.1015625, + "learning_rate": 0.0003134001453570186, + "loss": 2.1626, "step": 1020 }, { "epoch": 0.76, "grad_norm": 0.10205078125, - "learning_rate": 0.0003714424038823592, - "loss": 2.1837, + "learning_rate": 0.00031233329176927295, + "loss": 2.1648, "step": 1025 }, { - "epoch": 0.77, - "grad_norm": 0.10302734375, - "learning_rate": 0.00037099363469633245, - "loss": 2.1914, + "epoch": 0.76, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003112617481386752, + "loss": 2.1671, "step": 1030 }, { "epoch": 0.77, - "grad_norm": 0.1044921875, - "learning_rate": 0.0003705416423501128, - "loss": 2.1667, + "grad_norm": 0.103515625, + "learning_rate": 0.0003101855592033922, + "loss": 2.1648, "step": 1035 }, { - "epoch": 0.78, - "grad_norm": 0.1015625, - "learning_rate": 0.0003700864353635714, - "loss": 2.1911, + "epoch": 0.77, + "grad_norm": 0.1044921875, + "learning_rate": 0.0003091047698955375, + "loss": 2.1746, "step": 1040 }, { - "epoch": 0.78, - "grad_norm": 0.10205078125, - "learning_rate": 0.00036962802231717403, - "loss": 2.1867, + "epoch": 0.77, + "grad_norm": 0.10302734375, + "learning_rate": 0.00030801942533929545, + "loss": 2.1603, "step": 1045 }, { "epoch": 0.78, - "grad_norm": 0.10498046875, - "learning_rate": 0.0003691664118518195, - "loss": 2.1717, + "grad_norm": 0.1005859375, + "learning_rate": 0.00030692957084903726, + "loss": 2.172, "step": 1050 }, { - "epoch": 0.79, - "grad_norm": 0.10302734375, - "learning_rate": 0.0003687016126686765, - "loss": 2.1794, + "epoch": 0.78, + "grad_norm": 0.10107421875, + "learning_rate": 0.00030583525192742897, + "loss": 2.1643, "step": 1055 }, { - "epoch": 0.79, - "grad_norm": 0.1044921875, - "learning_rate": 0.00036823363352901997, - "loss": 2.2012, + "epoch": 0.78, + "grad_norm": 0.1005859375, + "learning_rate": 0.00030473651426353167, + "loss": 2.1756, "step": 1060 }, { "epoch": 0.79, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003677624832540655, - "loss": 2.1957, + "grad_norm": 0.10107421875, + "learning_rate": 0.00030363340373089413, + "loss": 2.1637, "step": 1065 }, { - "epoch": 0.8, - "grad_norm": 0.10888671875, - "learning_rate": 0.0003672881707248034, - "loss": 2.1762, + "epoch": 0.79, + "grad_norm": 0.10400390625, + "learning_rate": 0.00030252596638563714, + "loss": 2.1718, "step": 1070 }, { "epoch": 0.8, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003668107048818312, - "loss": 2.2005, + "grad_norm": 0.1015625, + "learning_rate": 0.000301414248464531, + "loss": 2.1552, "step": 1075 }, { - "epoch": 0.81, - "grad_norm": 0.10693359375, - "learning_rate": 0.0003663300947251851, - "loss": 2.2066, + "epoch": 0.8, + "grad_norm": 0.1025390625, + "learning_rate": 0.000300298296383065, + "loss": 2.187, "step": 1080 }, { - "epoch": 0.81, - "grad_norm": 0.10302734375, - "learning_rate": 0.0003658463493141703, - "loss": 2.1813, + "epoch": 0.8, + "grad_norm": 0.10205078125, + "learning_rate": 0.00029917815673350935, + "loss": 2.1747, "step": 1085 }, { "epoch": 0.81, - "grad_norm": 0.10498046875, - "learning_rate": 0.00036535947776719017, - "loss": 2.1659, + "grad_norm": 0.103515625, + "learning_rate": 0.0002980538762829698, + "loss": 2.1677, "step": 1090 }, { - "epoch": 0.82, - "grad_norm": 0.103515625, - "learning_rate": 0.0003648694892615747, - "loss": 2.191, + "epoch": 0.81, + "grad_norm": 0.10205078125, + "learning_rate": 0.00029692550197143563, + "loss": 2.1476, "step": 1095 }, { - "epoch": 0.82, - "grad_norm": 0.1025390625, - "learning_rate": 0.0003643763930334071, - "loss": 2.1706, + "epoch": 0.81, + "grad_norm": 0.10400390625, + "learning_rate": 0.00029579308090981913, + "loss": 2.163, "step": 1100 }, { "epoch": 0.82, - "grad_norm": 0.10546875, - "learning_rate": 0.00036388019837734994, - "loss": 2.1885, + "grad_norm": 0.1025390625, + "learning_rate": 0.00029465666037798935, + "loss": 2.1673, "step": 1105 }, { - "epoch": 0.83, - "grad_norm": 0.1015625, - "learning_rate": 0.00036338091464646984, - "loss": 2.1711, + "epoch": 0.82, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002935162878227975, + "loss": 2.1737, "step": 1110 }, { "epoch": 0.83, - "grad_norm": 0.10400390625, - "learning_rate": 0.0003628785512520613, - "loss": 2.1687, + "grad_norm": 0.103515625, + "learning_rate": 0.0002923720108560964, + "loss": 2.1412, "step": 1115 }, { - "epoch": 0.84, - "grad_norm": 0.1015625, - "learning_rate": 0.0003623731176634691, - "loss": 2.2004, + "epoch": 0.83, + "grad_norm": 0.1044921875, + "learning_rate": 0.00029122387725275244, + "loss": 2.1455, "step": 1120 }, { - "epoch": 0.84, - "grad_norm": 0.1025390625, - "learning_rate": 0.00036186462340791014, - "loss": 2.188, + "epoch": 0.83, + "grad_norm": 0.103515625, + "learning_rate": 0.00029007193494865103, + "loss": 2.1477, "step": 1125 }, { "epoch": 0.84, "grad_norm": 0.1025390625, - "learning_rate": 0.0003613530780702934, - "loss": 2.1822, + "learning_rate": 0.00028891623203869523, + "loss": 2.1486, "step": 1130 }, { - "epoch": 0.85, - "grad_norm": 0.10400390625, - "learning_rate": 0.00036083849129303966, - "loss": 2.1848, + "epoch": 0.84, + "grad_norm": 0.103515625, + "learning_rate": 0.0002877568167747975, + "loss": 2.1703, "step": 1135 }, { - "epoch": 0.85, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003603208727758995, - "loss": 2.1607, + "epoch": 0.84, + "grad_norm": 0.1044921875, + "learning_rate": 0.0002865937375638654, + "loss": 2.1819, "step": 1140 }, { "epoch": 0.85, - "grad_norm": 0.1025390625, - "learning_rate": 0.00035980023227577063, - "loss": 2.1863, + "grad_norm": 0.10205078125, + "learning_rate": 0.0002854270429657805, + "loss": 2.1564, "step": 1145 }, { - "epoch": 0.86, + "epoch": 0.85, "grad_norm": 0.10302734375, - "learning_rate": 0.00035927657960651394, - "loss": 2.1711, + "learning_rate": 0.0002842567816913708, + "loss": 2.1723, "step": 1150 }, { - "epoch": 0.86, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003587499246387684, - "loss": 2.1806, + "epoch": 0.85, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002830830026003773, + "loss": 2.1585, "step": 1155 }, { - "epoch": 0.87, - "grad_norm": 0.10693359375, - "learning_rate": 0.00035822027729976504, - "loss": 2.1735, + "epoch": 0.86, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002819057546994135, + "loss": 2.1758, "step": 1160 }, { - "epoch": 0.87, - "grad_norm": 0.1044921875, - "learning_rate": 0.00035768764757314, - "loss": 2.1989, + "epoch": 0.86, + "grad_norm": 0.10302734375, + "learning_rate": 0.00028072508713992007, + "loss": 2.1637, "step": 1165 }, { "epoch": 0.87, - "grad_norm": 0.103515625, - "learning_rate": 0.00035715204549874617, - "loss": 2.1728, + "grad_norm": 0.10107421875, + "learning_rate": 0.00027954104921611194, + "loss": 2.1478, "step": 1170 }, { - "epoch": 0.88, + "epoch": 0.87, "grad_norm": 0.10302734375, - "learning_rate": 0.0003566134811724639, - "loss": 2.1933, + "learning_rate": 0.00027835369036292087, + "loss": 2.1585, "step": 1175 }, { - "epoch": 0.88, + "epoch": 0.87, "grad_norm": 0.10302734375, - "learning_rate": 0.00035607196474601074, - "loss": 2.1886, + "learning_rate": 0.000277163060153931, + "loss": 2.1481, "step": 1180 }, { "epoch": 0.88, - "grad_norm": 0.10205078125, - "learning_rate": 0.00035552750642675043, - "loss": 2.1829, + "grad_norm": 0.1015625, + "learning_rate": 0.0002759692082993095, + "loss": 2.1636, "step": 1185 }, { - "epoch": 0.89, - "grad_norm": 0.1044921875, - "learning_rate": 0.00035498011647749976, - "loss": 2.1755, + "epoch": 0.88, + "grad_norm": 0.10205078125, + "learning_rate": 0.00027477218464373076, + "loss": 2.1594, "step": 1190 }, { - "epoch": 0.89, - "grad_norm": 0.10302734375, - "learning_rate": 0.00035442980521633595, - "loss": 2.2011, + "epoch": 0.88, + "grad_norm": 0.1044921875, + "learning_rate": 0.0002735720391642956, + "loss": 2.153, "step": 1195 }, { - "epoch": 0.9, - "grad_norm": 0.1015625, - "learning_rate": 0.00035387658301640136, - "loss": 2.2043, + "epoch": 0.89, + "grad_norm": 0.1025390625, + "learning_rate": 0.0002723688219684443, + "loss": 2.1666, "step": 1200 }, { - "epoch": 0.9, - "grad_norm": 0.10302734375, - "learning_rate": 0.0003533204603057088, - "loss": 2.1782, + "epoch": 0.89, + "grad_norm": 0.1015625, + "learning_rate": 0.00027116258329186514, + "loss": 2.1728, "step": 1205 }, { "epoch": 0.9, - "grad_norm": 0.10498046875, - "learning_rate": 0.00035276144756694406, - "loss": 2.179, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002699533734963964, + "loss": 2.158, "step": 1210 }, { - "epoch": 0.91, - "grad_norm": 0.10205078125, - "learning_rate": 0.00035219955533726915, - "loss": 2.1841, + "epoch": 0.9, + "grad_norm": 0.10546875, + "learning_rate": 0.000268741243067924, + "loss": 2.1384, "step": 1215 }, { - "epoch": 0.91, - "grad_norm": 0.10498046875, - "learning_rate": 0.0003516347942081232, - "loss": 2.1646, + "epoch": 0.9, + "grad_norm": 0.10009765625, + "learning_rate": 0.00026752624261427375, + "loss": 2.1691, "step": 1220 }, { "epoch": 0.91, - "grad_norm": 0.10205078125, - "learning_rate": 0.00035106717482502267, - "loss": 2.1878, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002663084228630982, + "loss": 2.1531, "step": 1225 }, { - "epoch": 0.92, - "grad_norm": 0.10546875, - "learning_rate": 0.0003504967078873613, - "loss": 2.1753, + "epoch": 0.91, + "grad_norm": 0.103515625, + "learning_rate": 0.0002650878346597586, + "loss": 2.1576, "step": 1230 }, { - "epoch": 0.92, - "grad_norm": 0.103515625, - "learning_rate": 0.000349923404148208, - "loss": 2.1854, + "epoch": 0.91, + "grad_norm": 0.10107421875, + "learning_rate": 0.0002638645289652025, + "loss": 2.1653, "step": 1235 }, { - "epoch": 0.93, - "grad_norm": 0.1044921875, - "learning_rate": 0.0003493472744141041, - "loss": 2.1878, + "epoch": 0.92, + "grad_norm": 0.109375, + "learning_rate": 0.0002626385568538358, + "loss": 2.15, "step": 1240 }, { - "epoch": 0.93, - "grad_norm": 0.10498046875, - "learning_rate": 0.0003487683295448598, - "loss": 2.1675, + "epoch": 0.92, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002614099695113901, + "loss": 2.1738, "step": 1245 }, { "epoch": 0.93, - "grad_norm": 0.103515625, - "learning_rate": 0.0003481865804533494, - "loss": 2.1902, + "grad_norm": 0.10107421875, + "learning_rate": 0.00026017881823278607, + "loss": 2.1608, "step": 1250 }, { - "epoch": 0.94, - "grad_norm": 0.10546875, - "learning_rate": 0.00034760203810530594, - "loss": 2.1848, + "epoch": 0.93, + "grad_norm": 0.103515625, + "learning_rate": 0.00025894515441999156, + "loss": 2.1914, "step": 1255 }, { - "epoch": 0.94, - "grad_norm": 0.103515625, - "learning_rate": 0.00034701471351911395, - "loss": 2.1638, + "epoch": 0.93, + "grad_norm": 0.10400390625, + "learning_rate": 0.00025770902957987556, + "loss": 2.1659, "step": 1260 }, { "epoch": 0.94, - "grad_norm": 0.10546875, - "learning_rate": 0.000346424617765602, - "loss": 2.1763, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002564704953220578, + "loss": 2.1719, "step": 1265 }, { - "epoch": 0.95, - "grad_norm": 0.1025390625, - "learning_rate": 0.000345831761967834, - "loss": 2.165, + "epoch": 0.94, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002552296033567541, + "loss": 2.1676, "step": 1270 }, { - "epoch": 0.95, - "grad_norm": 0.1044921875, - "learning_rate": 0.00034523615730089986, - "loss": 2.1875, + "epoch": 0.94, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002539864054926169, + "loss": 2.165, "step": 1275 }, { "epoch": 0.95, - "grad_norm": 0.10205078125, - "learning_rate": 0.0003446378149917042, - "loss": 2.1595, + "grad_norm": 0.1025390625, + "learning_rate": 0.0002527409536345728, + "loss": 2.1561, "step": 1280 }, { - "epoch": 0.96, - "grad_norm": 0.107421875, - "learning_rate": 0.0003440367463187553, - "loss": 2.1841, + "epoch": 0.95, + "grad_norm": 0.1025390625, + "learning_rate": 0.00025149329978165516, + "loss": 2.1635, "step": 1285 }, { - "epoch": 0.96, - "grad_norm": 0.10546875, - "learning_rate": 0.00034343296261195224, - "loss": 2.1882, + "epoch": 0.95, + "grad_norm": 0.1044921875, + "learning_rate": 0.0002502434960248331, + "loss": 2.1513, "step": 1290 }, { - "epoch": 0.97, - "grad_norm": 0.10595703125, - "learning_rate": 0.0003428264752523712, - "loss": 2.1671, + "epoch": 0.96, + "grad_norm": 0.10205078125, + "learning_rate": 0.00024899159454483665, + "loss": 2.1599, "step": 1295 }, { - "epoch": 0.97, - "grad_norm": 0.10595703125, - "learning_rate": 0.0003422172956720514, - "loss": 2.1671, + "epoch": 0.96, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002477376476099784, + "loss": 2.1674, "step": 1300 }, { "epoch": 0.97, - "grad_norm": 0.103515625, - "learning_rate": 0.00034160543535377926, - "loss": 2.1607, + "grad_norm": 0.1025390625, + "learning_rate": 0.00024648170757397055, + "loss": 2.14, "step": 1305 }, { - "epoch": 0.98, - "grad_norm": 0.1083984375, - "learning_rate": 0.0003409909058308718, - "loss": 2.1827, + "epoch": 0.97, + "grad_norm": 0.103515625, + "learning_rate": 0.00024522382687374, + "loss": 2.1519, "step": 1310 }, { - "epoch": 0.98, + "epoch": 0.97, "grad_norm": 0.10595703125, - "learning_rate": 0.0003403737186869596, - "loss": 2.183, + "learning_rate": 0.0002439640580272384, + "loss": 2.1622, "step": 1315 }, { "epoch": 0.98, - "grad_norm": 0.10205078125, - "learning_rate": 0.00033975388555576835, - "loss": 2.1692, + "grad_norm": 0.1015625, + "learning_rate": 0.0002427024536312496, + "loss": 2.1535, "step": 1320 }, { - "epoch": 0.99, - "grad_norm": 0.1064453125, - "learning_rate": 0.0003391314181208995, - "loss": 2.1786, + "epoch": 0.98, + "grad_norm": 0.10302734375, + "learning_rate": 0.00024143906635919383, + "loss": 2.1383, "step": 1325 }, { - "epoch": 0.99, - "grad_norm": 0.10546875, - "learning_rate": 0.00033850632811561, - "loss": 2.166, + "epoch": 0.98, + "grad_norm": 0.1025390625, + "learning_rate": 0.00024017394895892838, + "loss": 2.1282, "step": 1330 }, { - "epoch": 1.0, - "grad_norm": 0.1044921875, - "learning_rate": 0.00033787862732259123, - "loss": 2.1767, + "epoch": 0.99, + "grad_norm": 0.10595703125, + "learning_rate": 0.00023890715425054545, + "loss": 2.1554, "step": 1335 }, { - "epoch": 1.0, - "grad_norm": 0.10986328125, - "learning_rate": 0.0003372483275737468, - "loss": 2.1729, + "epoch": 0.99, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002376387351241666, + "loss": 2.157, "step": 1340 }, { "epoch": 1.0, - "eval_loss": 2.1778452396392822, - "eval_runtime": 189.5201, - "eval_samples_per_second": 25.443, - "eval_steps_per_second": 3.182, - "step": 1340 + "grad_norm": 0.10107421875, + "learning_rate": 0.00023636874453773475, + "loss": 2.1471, + "step": 1345 }, { "epoch": 1.0, "grad_norm": 0.10546875, - "learning_rate": 0.0003366154407499695, - "loss": 2.1415, - "step": 1345 + "learning_rate": 0.00023509723551480325, + "loss": 2.1383, + "step": 1350 }, { - "epoch": 1.01, - "grad_norm": 0.10595703125, - "learning_rate": 0.0003359799787809179, - "loss": 2.1377, - "step": 1350 + "epoch": 1.0, + "eval_loss": 2.1541337966918945, + "eval_runtime": 188.5948, + "eval_samples_per_second": 25.785, + "eval_steps_per_second": 3.224, + "step": 1351 }, { - "epoch": 1.01, - "grad_norm": 0.1103515625, - "learning_rate": 0.0003353419536447902, - "loss": 2.1349, + "epoch": 1.0, + "grad_norm": 0.103515625, + "learning_rate": 0.00023382426114232162, + "loss": 2.1347, "step": 1355 }, { "epoch": 1.01, - "grad_norm": 0.1123046875, - "learning_rate": 0.00033470137736809995, - "loss": 2.1339, + "grad_norm": 0.10595703125, + "learning_rate": 0.00023254987456841956, + "loss": 2.1403, "step": 1360 }, { - "epoch": 1.02, - "grad_norm": 0.11328125, - "learning_rate": 0.0003340582620254484, - "loss": 2.1364, + "epoch": 1.01, + "grad_norm": 0.107421875, + "learning_rate": 0.00023127412900018782, + "loss": 2.1468, "step": 1365 }, { - "epoch": 1.02, - "grad_norm": 0.107421875, - "learning_rate": 0.000333412619739297, - "loss": 2.1484, + "epoch": 1.01, + "grad_norm": 0.10498046875, + "learning_rate": 0.00022999707770145653, + "loss": 2.1267, "step": 1370 }, { - "epoch": 1.03, - "grad_norm": 0.10888671875, - "learning_rate": 0.0003327644626797394, - "loss": 2.1753, + "epoch": 1.02, + "grad_norm": 0.1064453125, + "learning_rate": 0.0002287187739905717, + "loss": 2.1406, "step": 1375 }, { - "epoch": 1.03, - "grad_norm": 0.107421875, - "learning_rate": 0.0003321138030642714, - "loss": 2.165, + "epoch": 1.02, + "grad_norm": 0.10595703125, + "learning_rate": 0.00022743927123816899, + "loss": 2.1304, "step": 1380 }, { "epoch": 1.03, - "grad_norm": 0.1103515625, - "learning_rate": 0.00033146065315756113, - "loss": 2.1622, + "grad_norm": 0.10791015625, + "learning_rate": 0.00022615862286494537, + "loss": 2.1199, "step": 1385 }, { - "epoch": 1.04, - "grad_norm": 0.1123046875, - "learning_rate": 0.00033080502527121756, - "loss": 2.1704, + "epoch": 1.03, + "grad_norm": 0.109375, + "learning_rate": 0.00022487688233942862, + "loss": 2.1216, "step": 1390 }, { - "epoch": 1.04, - "grad_norm": 0.111328125, - "learning_rate": 0.0003301469317635587, - "loss": 2.162, + "epoch": 1.03, + "grad_norm": 0.10693359375, + "learning_rate": 0.00022359410317574548, + "loss": 2.1188, "step": 1395 }, { "epoch": 1.04, - "grad_norm": 0.11181640625, - "learning_rate": 0.00032948638503937846, - "loss": 2.1461, + "grad_norm": 0.10791015625, + "learning_rate": 0.00022231033893138668, + "loss": 2.123, "step": 1400 }, { - "epoch": 1.05, - "grad_norm": 0.10693359375, - "learning_rate": 0.0003288233975497128, - "loss": 2.1515, + "epoch": 1.04, + "grad_norm": 0.1083984375, + "learning_rate": 0.0002210256432049714, + "loss": 2.1401, "step": 1405 }, { - "epoch": 1.05, - "grad_norm": 0.10986328125, - "learning_rate": 0.00032815798179160524, - "loss": 2.1306, + "epoch": 1.04, + "grad_norm": 0.10888671875, + "learning_rate": 0.0002197400696340091, + "loss": 2.1212, "step": 1410 }, { - "epoch": 1.06, - "grad_norm": 0.11083984375, - "learning_rate": 0.0003274901503078711, - "loss": 2.1343, + "epoch": 1.05, + "grad_norm": 0.10791015625, + "learning_rate": 0.00021845367189266042, + "loss": 2.1297, "step": 1415 }, { - "epoch": 1.06, - "grad_norm": 0.11328125, - "learning_rate": 0.0003268199156868612, - "loss": 2.1235, + "epoch": 1.05, + "grad_norm": 0.107421875, + "learning_rate": 0.0002171665036894959, + "loss": 2.1237, "step": 1420 }, { - "epoch": 1.06, - "grad_norm": 0.11083984375, - "learning_rate": 0.0003261472905622244, - "loss": 2.1496, + "epoch": 1.05, + "grad_norm": 0.11181640625, + "learning_rate": 0.00021587861876525377, + "loss": 2.1281, "step": 1425 }, { - "epoch": 1.07, - "grad_norm": 0.1103515625, - "learning_rate": 0.0003254722876126697, - "loss": 2.1503, + "epoch": 1.06, + "grad_norm": 0.1083984375, + "learning_rate": 0.00021459007089059625, + "loss": 2.0898, "step": 1430 }, { - "epoch": 1.07, - "grad_norm": 0.11572265625, - "learning_rate": 0.00032479491956172705, - "loss": 2.1566, + "epoch": 1.06, + "grad_norm": 0.10888671875, + "learning_rate": 0.0002133009138638645, + "loss": 2.1212, "step": 1435 }, { "epoch": 1.07, - "grad_norm": 0.10986328125, - "learning_rate": 0.0003241151991775076, - "loss": 2.1638, + "grad_norm": 0.10693359375, + "learning_rate": 0.00021201120150883234, + "loss": 2.1285, "step": 1440 }, { - "epoch": 1.08, - "grad_norm": 0.11279296875, - "learning_rate": 0.0003234331392724631, - "loss": 2.1497, + "epoch": 1.07, + "grad_norm": 0.10888671875, + "learning_rate": 0.00021072098767245932, + "loss": 2.1302, "step": 1445 }, { - "epoch": 1.08, - "grad_norm": 0.11376953125, - "learning_rate": 0.00032274875270314426, - "loss": 2.1478, + "epoch": 1.07, + "grad_norm": 0.10888671875, + "learning_rate": 0.00020943032622264238, + "loss": 2.1079, "step": 1450 }, { - "epoch": 1.09, - "grad_norm": 0.1103515625, - "learning_rate": 0.00032206205236995843, - "loss": 2.149, + "epoch": 1.08, + "grad_norm": 0.11181640625, + "learning_rate": 0.00020813927104596666, + "loss": 2.1353, "step": 1455 }, { - "epoch": 1.09, - "grad_norm": 0.11181640625, - "learning_rate": 0.00032137305121692655, - "loss": 2.1514, + "epoch": 1.08, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002068478760454562, + "loss": 2.1402, "step": 1460 }, { - "epoch": 1.09, - "grad_norm": 0.1123046875, - "learning_rate": 0.00032068176223143884, - "loss": 2.1817, + "epoch": 1.08, + "grad_norm": 0.1064453125, + "learning_rate": 0.0002055561951383227, + "loss": 2.1276, "step": 1465 }, { - "epoch": 1.1, - "grad_norm": 0.11083984375, - "learning_rate": 0.0003199881984440106, - "loss": 2.1721, + "epoch": 1.09, + "grad_norm": 0.109375, + "learning_rate": 0.00020426428225371496, + "loss": 2.1241, "step": 1470 }, { - "epoch": 1.1, - "grad_norm": 0.111328125, - "learning_rate": 0.000319292372928036, - "loss": 2.1319, + "epoch": 1.09, + "grad_norm": 0.10791015625, + "learning_rate": 0.00020297219133046714, + "loss": 2.1255, "step": 1475 }, { "epoch": 1.1, - "grad_norm": 0.11181640625, - "learning_rate": 0.0003185942987995418, - "loss": 2.15, + "grad_norm": 0.11279296875, + "learning_rate": 0.0002016799763148467, + "loss": 2.1191, "step": 1480 }, { - "epoch": 1.11, - "grad_norm": 0.11279296875, - "learning_rate": 0.0003178939892169403, - "loss": 2.1696, + "epoch": 1.1, + "grad_norm": 0.1083984375, + "learning_rate": 0.00020038769115830198, + "loss": 2.1308, "step": 1485 }, { - "epoch": 1.11, - "grad_norm": 0.11376953125, - "learning_rate": 0.0003171914573807813, - "loss": 2.1567, + "epoch": 1.1, + "grad_norm": 0.111328125, + "learning_rate": 0.00019909538981521002, + "loss": 2.1422, "step": 1490 }, { - "epoch": 1.12, - "grad_norm": 0.115234375, - "learning_rate": 0.0003164867165335029, - "loss": 2.1454, + "epoch": 1.11, + "grad_norm": 0.1083984375, + "learning_rate": 0.00019780312624062326, + "loss": 2.1358, "step": 1495 }, { - "epoch": 1.12, - "grad_norm": 0.11474609375, - "learning_rate": 0.0003157797799591823, - "loss": 2.1482, + "epoch": 1.11, + "grad_norm": 0.11083984375, + "learning_rate": 0.00019651095438801775, + "loss": 2.1292, "step": 1500 }, { - "epoch": 1.12, - "grad_norm": 0.11328125, - "learning_rate": 0.0003150706609832854, - "loss": 2.1297, + "epoch": 1.11, + "grad_norm": 0.11083984375, + "learning_rate": 0.00019521892820703975, + "loss": 2.1308, "step": 1505 }, { - "epoch": 1.13, - "grad_norm": 0.11474609375, - "learning_rate": 0.00031435937297241527, - "loss": 2.1142, + "epoch": 1.12, + "grad_norm": 0.111328125, + "learning_rate": 0.0001939271016412536, + "loss": 2.1339, "step": 1510 }, { - "epoch": 1.13, - "grad_norm": 0.1142578125, - "learning_rate": 0.0003136459293340605, - "loss": 2.1509, + "epoch": 1.12, + "grad_norm": 0.111328125, + "learning_rate": 0.00019263552862588948, + "loss": 2.126, "step": 1515 }, { "epoch": 1.13, - "grad_norm": 0.11376953125, - "learning_rate": 0.00031293034351634227, - "loss": 2.1382, + "grad_norm": 0.1103515625, + "learning_rate": 0.00019134426308559162, + "loss": 2.1325, "step": 1520 }, { - "epoch": 1.14, - "grad_norm": 0.115234375, - "learning_rate": 0.00031221262900776116, - "loss": 2.1552, + "epoch": 1.13, + "grad_norm": 0.10986328125, + "learning_rate": 0.00019005335893216665, + "loss": 2.1302, "step": 1525 }, { - "epoch": 1.14, - "grad_norm": 0.11328125, - "learning_rate": 0.0003114927993369425, - "loss": 2.1434, + "epoch": 1.13, + "grad_norm": 0.11376953125, + "learning_rate": 0.0001887628700623332, + "loss": 2.125, "step": 1530 }, { - "epoch": 1.15, - "grad_norm": 0.1123046875, - "learning_rate": 0.00031077086807238175, - "loss": 2.1364, + "epoch": 1.14, + "grad_norm": 0.11083984375, + "learning_rate": 0.000187472850355471, + "loss": 2.1217, "step": 1535 }, { - "epoch": 1.15, - "grad_norm": 0.11279296875, - "learning_rate": 0.00031004684882218845, - "loss": 2.1569, + "epoch": 1.14, + "grad_norm": 0.1103515625, + "learning_rate": 0.00018618335367137195, + "loss": 2.147, "step": 1540 }, { - "epoch": 1.15, - "grad_norm": 0.11376953125, - "learning_rate": 0.0003093207552338298, - "loss": 2.1756, + "epoch": 1.14, + "grad_norm": 0.11181640625, + "learning_rate": 0.0001848944338479909, + "loss": 2.1437, "step": 1545 }, { - "epoch": 1.16, - "grad_norm": 0.11474609375, - "learning_rate": 0.0003085926009938735, - "loss": 2.1448, + "epoch": 1.15, + "grad_norm": 0.10986328125, + "learning_rate": 0.00018360614469919835, + "loss": 2.117, "step": 1550 }, { - "epoch": 1.16, - "grad_norm": 0.115234375, - "learning_rate": 0.0003078623998277296, - "loss": 2.1403, + "epoch": 1.15, + "grad_norm": 0.1162109375, + "learning_rate": 0.0001823185400125333, + "loss": 2.1119, "step": 1555 }, { - "epoch": 1.16, - "grad_norm": 0.11376953125, - "learning_rate": 0.0003071301654993919, - "loss": 2.1391, + "epoch": 1.15, + "grad_norm": 0.11083984375, + "learning_rate": 0.00018103167354695756, + "loss": 2.1242, "step": 1560 }, { - "epoch": 1.17, - "grad_norm": 0.1162109375, - "learning_rate": 0.0003063959118111785, - "loss": 2.154, + "epoch": 1.16, + "grad_norm": 0.11279296875, + "learning_rate": 0.00017974559903061172, + "loss": 2.115, "step": 1565 }, { - "epoch": 1.17, - "grad_norm": 0.11572265625, - "learning_rate": 0.0003056596526034717, - "loss": 2.1494, + "epoch": 1.16, + "grad_norm": 0.11474609375, + "learning_rate": 0.00017846037015857127, + "loss": 2.1332, "step": 1570 }, { "epoch": 1.17, - "grad_norm": 0.1142578125, - "learning_rate": 0.0003049214017544569, - "loss": 2.1319, + "grad_norm": 0.111328125, + "learning_rate": 0.00017717604059060518, + "loss": 2.1049, "step": 1575 }, { - "epoch": 1.18, - "grad_norm": 0.12060546875, - "learning_rate": 0.0003041811731798611, - "loss": 2.1384, + "epoch": 1.17, + "grad_norm": 0.109375, + "learning_rate": 0.0001758926639489354, + "loss": 2.1246, "step": 1580 }, { - "epoch": 1.18, - "grad_norm": 0.115234375, - "learning_rate": 0.0003034389808326907, - "loss": 2.1642, + "epoch": 1.17, + "grad_norm": 0.11181640625, + "learning_rate": 0.00017461029381599832, + "loss": 2.1229, "step": 1585 }, { - "epoch": 1.19, - "grad_norm": 0.1142578125, - "learning_rate": 0.0003026948387029684, - "loss": 2.1557, + "epoch": 1.18, + "grad_norm": 0.11083984375, + "learning_rate": 0.00017332898373220707, + "loss": 2.1136, "step": 1590 }, { - "epoch": 1.19, - "grad_norm": 0.11376953125, - "learning_rate": 0.0003019487608174695, - "loss": 2.1392, + "epoch": 1.18, + "grad_norm": 0.1103515625, + "learning_rate": 0.000172048787193717, + "loss": 2.1204, "step": 1595 }, { - "epoch": 1.19, - "grad_norm": 0.11474609375, - "learning_rate": 0.0003012007612394575, - "loss": 2.1431, + "epoch": 1.18, + "grad_norm": 0.11279296875, + "learning_rate": 0.00017076975765019134, + "loss": 2.1239, "step": 1600 }, { - "epoch": 1.2, - "grad_norm": 0.11474609375, - "learning_rate": 0.000300450854068419, - "loss": 2.1507, + "epoch": 1.19, + "grad_norm": 0.1103515625, + "learning_rate": 0.00016949194850257002, + "loss": 2.1308, "step": 1605 }, { - "epoch": 1.2, - "grad_norm": 0.11474609375, - "learning_rate": 0.000299699053439798, - "loss": 2.147, + "epoch": 1.19, + "grad_norm": 0.111328125, + "learning_rate": 0.00016821541310084006, + "loss": 2.1219, "step": 1610 }, { "epoch": 1.2, - "grad_norm": 0.11474609375, - "learning_rate": 0.00029894537352472927, - "loss": 2.1361, + "grad_norm": 0.11181640625, + "learning_rate": 0.00016694020474180814, + "loss": 2.1299, "step": 1615 }, { - "epoch": 1.21, - "grad_norm": 0.11572265625, - "learning_rate": 0.00029818982852977157, - "loss": 2.1514, + "epoch": 1.2, + "grad_norm": 0.115234375, + "learning_rate": 0.00016566637666687547, + "loss": 2.127, "step": 1620 }, { - "epoch": 1.21, - "grad_norm": 0.12451171875, - "learning_rate": 0.00029743243269663957, - "loss": 2.1597, + "epoch": 1.2, + "grad_norm": 0.11328125, + "learning_rate": 0.00016439398205981472, + "loss": 2.1255, "step": 1625 }, { - "epoch": 1.22, - "grad_norm": 0.11474609375, - "learning_rate": 0.0002966732003019353, - "loss": 2.1449, + "epoch": 1.21, + "grad_norm": 0.11328125, + "learning_rate": 0.00016312307404454967, + "loss": 2.1149, "step": 1630 }, { - "epoch": 1.22, - "grad_norm": 0.1142578125, - "learning_rate": 0.0002959121456568796, - "loss": 2.1392, + "epoch": 1.21, + "grad_norm": 0.111328125, + "learning_rate": 0.0001618537056829373, + "loss": 2.1105, "step": 1635 }, { - "epoch": 1.22, - "grad_norm": 0.11474609375, - "learning_rate": 0.00029514928310704164, - "loss": 2.1396, + "epoch": 1.21, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016058592997255215, + "loss": 2.1477, "step": 1640 }, { - "epoch": 1.23, - "grad_norm": 0.11328125, - "learning_rate": 0.000294384627032069, - "loss": 2.1509, + "epoch": 1.22, + "grad_norm": 0.1123046875, + "learning_rate": 0.00015931979984447385, + "loss": 2.13, "step": 1645 }, { - "epoch": 1.23, - "grad_norm": 0.11376953125, - "learning_rate": 0.0002936181918454164, - "loss": 2.1538, + "epoch": 1.22, + "grad_norm": 0.111328125, + "learning_rate": 0.00015805536816107703, + "loss": 2.1188, "step": 1650 }, { "epoch": 1.23, - "grad_norm": 0.11376953125, - "learning_rate": 0.0002928499919940743, - "loss": 2.1337, + "grad_norm": 0.111328125, + "learning_rate": 0.00015679268771382428, + "loss": 2.1194, "step": 1655 }, { - "epoch": 1.24, - "grad_norm": 0.11669921875, - "learning_rate": 0.0002920800419582961, - "loss": 2.1475, + "epoch": 1.23, + "grad_norm": 0.11328125, + "learning_rate": 0.00015553181122106234, + "loss": 2.1449, "step": 1660 }, { - "epoch": 1.24, - "grad_norm": 0.1201171875, - "learning_rate": 0.0002913083562513257, - "loss": 2.1293, + "epoch": 1.23, + "grad_norm": 0.11181640625, + "learning_rate": 0.00015427279132582055, + "loss": 2.122, "step": 1665 }, { - "epoch": 1.25, - "grad_norm": 0.11474609375, - "learning_rate": 0.0002905349494191235, - "loss": 2.1539, + "epoch": 1.24, + "grad_norm": 0.111328125, + "learning_rate": 0.00015301568059361323, + "loss": 2.117, "step": 1670 }, { - "epoch": 1.25, - "grad_norm": 0.11669921875, - "learning_rate": 0.00028975983604009246, - "loss": 2.1215, + "epoch": 1.24, + "grad_norm": 0.1103515625, + "learning_rate": 0.00015176053151024502, + "loss": 2.1244, "step": 1675 }, { - "epoch": 1.25, - "grad_norm": 0.11669921875, - "learning_rate": 0.0002889830307248033, - "loss": 2.1578, + "epoch": 1.24, + "grad_norm": 0.11181640625, + "learning_rate": 0.0001505073964796194, + "loss": 2.1206, "step": 1680 }, { - "epoch": 1.26, - "grad_norm": 0.11474609375, - "learning_rate": 0.00028820454811571907, - "loss": 2.1492, + "epoch": 1.25, + "grad_norm": 0.11376953125, + "learning_rate": 0.000149256327821551, + "loss": 2.1345, "step": 1685 }, { - "epoch": 1.26, - "grad_norm": 0.11474609375, - "learning_rate": 0.0002874244028869191, - "loss": 2.1536, + "epoch": 1.25, + "grad_norm": 0.11376953125, + "learning_rate": 0.00014800737776958097, + "loss": 2.1259, "step": 1690 }, { - "epoch": 1.26, + "epoch": 1.25, "grad_norm": 0.11328125, - "learning_rate": 0.0002866426097438222, - "loss": 2.1584, + "learning_rate": 0.00014676059846879615, + "loss": 2.125, "step": 1695 }, { - "epoch": 1.27, - "grad_norm": 0.1142578125, - "learning_rate": 0.0002858591834229102, - "loss": 2.163, + "epoch": 1.26, + "grad_norm": 0.11474609375, + "learning_rate": 0.00014551604197365222, + "loss": 2.124, "step": 1700 }, { - "epoch": 1.27, - "grad_norm": 0.1123046875, - "learning_rate": 0.000285074138691449, - "loss": 2.1423, + "epoch": 1.26, + "grad_norm": 0.11279296875, + "learning_rate": 0.0001442737602458001, + "loss": 2.1176, "step": 1705 }, { - "epoch": 1.28, - "grad_norm": 0.1162109375, - "learning_rate": 0.0002842874903472115, - "loss": 2.1499, + "epoch": 1.27, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001430338051519165, + "loss": 2.1208, "step": 1710 }, { - "epoch": 1.28, + "epoch": 1.27, "grad_norm": 0.1162109375, - "learning_rate": 0.00028349925321819776, - "loss": 2.1552, + "learning_rate": 0.00014179622846153872, + "loss": 2.1309, "step": 1715 }, { - "epoch": 1.28, - "grad_norm": 0.11572265625, - "learning_rate": 0.00028270944216235574, - "loss": 2.1471, + "epoch": 1.27, + "grad_norm": 0.11181640625, + "learning_rate": 0.0001405610818449027, + "loss": 2.1126, "step": 1720 }, { - "epoch": 1.29, - "grad_norm": 0.11767578125, - "learning_rate": 0.0002819180720673013, - "loss": 2.1362, + "epoch": 1.28, + "grad_norm": 0.11181640625, + "learning_rate": 0.0001393284168707864, + "loss": 2.1075, "step": 1725 }, { - "epoch": 1.29, - "grad_norm": 0.1162109375, - "learning_rate": 0.0002811251578500377, - "loss": 2.173, + "epoch": 1.28, + "grad_norm": 0.11376953125, + "learning_rate": 0.0001380982850043562, + "loss": 2.1107, "step": 1730 }, { - "epoch": 1.29, - "grad_norm": 0.11572265625, - "learning_rate": 0.0002803307144566741, - "loss": 2.1381, + "epoch": 1.28, + "grad_norm": 0.1123046875, + "learning_rate": 0.00013687073760501828, + "loss": 2.1447, "step": 1735 }, { - "epoch": 1.3, - "grad_norm": 0.11376953125, - "learning_rate": 0.00027953475686214404, - "loss": 2.1409, + "epoch": 1.29, + "grad_norm": 0.11181640625, + "learning_rate": 0.00013564582592427444, + "loss": 2.1151, "step": 1740 }, { - "epoch": 1.3, - "grad_norm": 0.115234375, - "learning_rate": 0.0002787373000699232, - "loss": 2.1486, + "epoch": 1.29, + "grad_norm": 0.11669921875, + "learning_rate": 0.00013442360110358224, + "loss": 2.1212, "step": 1745 }, { - "epoch": 1.31, - "grad_norm": 0.11767578125, - "learning_rate": 0.00027793835911174656, - "loss": 2.1659, + "epoch": 1.3, + "grad_norm": 0.1123046875, + "learning_rate": 0.00013320411417221974, + "loss": 2.1249, "step": 1750 }, { - "epoch": 1.31, - "grad_norm": 0.11474609375, - "learning_rate": 0.00027713794904732483, - "loss": 2.1639, + "epoch": 1.3, + "grad_norm": 0.11328125, + "learning_rate": 0.0001319874160451551, + "loss": 2.1384, "step": 1755 }, { - "epoch": 1.31, - "grad_norm": 0.1171875, - "learning_rate": 0.00027633608496406103, - "loss": 2.1667, + "epoch": 1.3, + "grad_norm": 0.1123046875, + "learning_rate": 0.00013077355752092061, + "loss": 2.1235, "step": 1760 }, { - "epoch": 1.32, - "grad_norm": 0.1142578125, - "learning_rate": 0.00027553278197676567, - "loss": 2.1442, + "epoch": 1.31, + "grad_norm": 0.11279296875, + "learning_rate": 0.00012956258927949196, + "loss": 2.1457, "step": 1765 }, { - "epoch": 1.32, - "grad_norm": 0.115234375, - "learning_rate": 0.00027472805522737195, - "loss": 2.117, + "epoch": 1.31, + "grad_norm": 0.11328125, + "learning_rate": 0.00012835456188017226, + "loss": 2.1232, "step": 1770 }, { - "epoch": 1.32, - "grad_norm": 0.11669921875, - "learning_rate": 0.00027392191988465065, - "loss": 2.1499, + "epoch": 1.31, + "grad_norm": 0.11328125, + "learning_rate": 0.00012714952575948102, + "loss": 2.1135, "step": 1775 }, { - "epoch": 1.33, - "grad_norm": 0.11474609375, - "learning_rate": 0.0002731143911439237, - "loss": 2.15, + "epoch": 1.32, + "grad_norm": 0.11376953125, + "learning_rate": 0.00012594753122904858, + "loss": 2.1268, "step": 1780 }, { - "epoch": 1.33, - "grad_norm": 0.11328125, - "learning_rate": 0.00027230548422677817, - "loss": 2.1542, + "epoch": 1.32, + "grad_norm": 0.11279296875, + "learning_rate": 0.00012474862847351527, + "loss": 2.1204, "step": 1785 }, { - "epoch": 1.34, - "grad_norm": 0.11767578125, - "learning_rate": 0.0002714952143807792, - "loss": 2.1437, + "epoch": 1.32, + "grad_norm": 0.111328125, + "learning_rate": 0.00012355286754843654, + "loss": 2.1317, "step": 1790 }, { - "epoch": 1.34, - "grad_norm": 0.1162109375, - "learning_rate": 0.0002706835968791824, - "loss": 2.1627, + "epoch": 1.33, + "grad_norm": 0.11328125, + "learning_rate": 0.00012236029837819264, + "loss": 2.1233, "step": 1795 }, { - "epoch": 1.34, - "grad_norm": 0.11669921875, - "learning_rate": 0.0002698706470206464, - "loss": 2.1453, + "epoch": 1.33, + "grad_norm": 0.11279296875, + "learning_rate": 0.00012117097075390449, + "loss": 2.1264, "step": 1800 }, { - "epoch": 1.35, - "grad_norm": 0.11474609375, - "learning_rate": 0.00026905638012894405, - "loss": 2.1482, + "epoch": 1.34, + "grad_norm": 0.11181640625, + "learning_rate": 0.00011998493433135474, + "loss": 2.1372, "step": 1805 }, { - "epoch": 1.35, - "grad_norm": 0.115234375, - "learning_rate": 0.00026824081155267374, - "loss": 2.1707, + "epoch": 1.34, + "grad_norm": 0.11376953125, + "learning_rate": 0.00011880223862891462, + "loss": 2.1296, "step": 1810 }, { - "epoch": 1.35, - "grad_norm": 0.11669921875, - "learning_rate": 0.00026742395666497015, - "loss": 2.1583, + "epoch": 1.34, + "grad_norm": 0.11376953125, + "learning_rate": 0.00011762293302547649, + "loss": 2.1252, "step": 1815 }, { - "epoch": 1.36, - "grad_norm": 0.11767578125, - "learning_rate": 0.0002666058308632144, - "loss": 2.1528, + "epoch": 1.35, + "grad_norm": 0.11181640625, + "learning_rate": 0.00011644706675839232, + "loss": 2.1107, "step": 1820 }, { - "epoch": 1.36, - "grad_norm": 0.1171875, - "learning_rate": 0.0002657864495687437, - "loss": 2.1546, + "epoch": 1.35, + "grad_norm": 0.1142578125, + "learning_rate": 0.00011527468892141785, + "loss": 2.1107, "step": 1825 }, { - "epoch": 1.37, + "epoch": 1.35, "grad_norm": 0.1162109375, - "learning_rate": 0.00026496582822656094, - "loss": 2.145, + "learning_rate": 0.00011410584846266266, + "loss": 2.1142, "step": 1830 }, { - "epoch": 1.37, - "grad_norm": 0.1162109375, - "learning_rate": 0.00026414398230504335, - "loss": 2.1581, + "epoch": 1.36, + "grad_norm": 0.11376953125, + "learning_rate": 0.0001129405941825471, + "loss": 2.1143, "step": 1835 }, { - "epoch": 1.37, - "grad_norm": 0.11669921875, - "learning_rate": 0.0002633209272956509, - "loss": 2.1352, + "epoch": 1.36, + "grad_norm": 0.11572265625, + "learning_rate": 0.00011177897473176413, + "loss": 2.1129, "step": 1840 }, { - "epoch": 1.38, - "grad_norm": 0.11572265625, - "learning_rate": 0.0002624966787126345, - "loss": 2.1661, + "epoch": 1.37, + "grad_norm": 0.111328125, + "learning_rate": 0.00011062103860924873, + "loss": 2.1186, "step": 1845 }, { - "epoch": 1.38, - "grad_norm": 0.11962890625, - "learning_rate": 0.0002616712520927434, - "loss": 2.15, + "epoch": 1.37, + "grad_norm": 0.11474609375, + "learning_rate": 0.00010946683416015264, + "loss": 2.1213, "step": 1850 }, { - "epoch": 1.38, - "grad_norm": 0.11865234375, - "learning_rate": 0.00026084466299493227, - "loss": 2.1389, + "epoch": 1.37, + "grad_norm": 0.1123046875, + "learning_rate": 0.00010831640957382601, + "loss": 2.1192, "step": 1855 }, { - "epoch": 1.39, - "grad_norm": 0.12109375, - "learning_rate": 0.0002600169270000682, - "loss": 2.1566, + "epoch": 1.38, + "grad_norm": 0.11376953125, + "learning_rate": 0.00010716981288180526, + "loss": 2.1373, "step": 1860 }, { - "epoch": 1.39, - "grad_norm": 0.11865234375, - "learning_rate": 0.0002591880597106365, - "loss": 2.155, + "epoch": 1.38, + "grad_norm": 0.11572265625, + "learning_rate": 0.000106027091955808, + "loss": 2.1416, "step": 1865 }, { - "epoch": 1.4, - "grad_norm": 0.1171875, - "learning_rate": 0.0002583580767504474, - "loss": 2.1491, + "epoch": 1.38, + "grad_norm": 0.11279296875, + "learning_rate": 0.00010488829450573434, + "loss": 2.1381, "step": 1870 }, { - "epoch": 1.4, - "grad_norm": 0.1171875, - "learning_rate": 0.0002575269937643406, - "loss": 2.1399, + "epoch": 1.39, + "grad_norm": 0.11474609375, + "learning_rate": 0.0001037534680776744, + "loss": 2.1228, "step": 1875 }, { - "epoch": 1.4, - "grad_norm": 0.1162109375, - "learning_rate": 0.00025669482641789106, - "loss": 2.1316, + "epoch": 1.39, + "grad_norm": 0.11279296875, + "learning_rate": 0.00010262266005192399, + "loss": 2.1245, "step": 1880 }, { - "epoch": 1.41, - "grad_norm": 0.1181640625, - "learning_rate": 0.0002558615903971135, - "loss": 2.1265, + "epoch": 1.4, + "grad_norm": 0.1123046875, + "learning_rate": 0.00010149591764100586, + "loss": 2.1272, "step": 1885 }, { - "epoch": 1.41, - "grad_norm": 0.1181640625, - "learning_rate": 0.00025502730140816666, - "loss": 2.1384, + "epoch": 1.4, + "grad_norm": 0.11328125, + "learning_rate": 0.00010037328788769884, + "loss": 2.1265, "step": 1890 }, { - "epoch": 1.41, - "grad_norm": 0.1220703125, - "learning_rate": 0.0002541919751770574, - "loss": 2.1535, + "epoch": 1.4, + "grad_norm": 0.11279296875, + "learning_rate": 9.925481766307341e-05, + "loss": 2.1364, "step": 1895 }, { - "epoch": 1.42, - "grad_norm": 0.1201171875, - "learning_rate": 0.00025335562744934403, - "loss": 2.1292, + "epoch": 1.41, + "grad_norm": 0.1162109375, + "learning_rate": 9.814055366453523e-05, + "loss": 2.1193, "step": 1900 }, { - "epoch": 1.42, - "grad_norm": 0.1181640625, - "learning_rate": 0.0002525182739898397, - "loss": 2.1489, + "epoch": 1.41, + "grad_norm": 0.1142578125, + "learning_rate": 9.703054241387499e-05, + "loss": 2.1262, "step": 1905 }, { - "epoch": 1.42, - "grad_norm": 0.1171875, - "learning_rate": 0.00025167993058231524, - "loss": 2.1454, + "epoch": 1.41, + "grad_norm": 0.11279296875, + "learning_rate": 9.592483025532652e-05, + "loss": 2.144, "step": 1910 }, { - "epoch": 1.43, - "grad_norm": 0.1181640625, - "learning_rate": 0.00025084061302920146, - "loss": 2.1436, + "epoch": 1.42, + "grad_norm": 0.11279296875, + "learning_rate": 9.48234633536316e-05, + "loss": 2.1294, "step": 1915 }, { - "epoch": 1.43, - "grad_norm": 0.11865234375, - "learning_rate": 0.0002500003371512917, - "loss": 2.1461, + "epoch": 1.42, + "grad_norm": 0.11181640625, + "learning_rate": 9.372648769211258e-05, + "loss": 2.1113, "step": 1920 }, { - "epoch": 1.44, - "grad_norm": 0.11572265625, - "learning_rate": 0.000249159118787443, - "loss": 2.1515, + "epoch": 1.42, + "grad_norm": 0.1123046875, + "learning_rate": 9.263394907075244e-05, + "loss": 2.1247, "step": 1925 }, { - "epoch": 1.44, + "epoch": 1.43, "grad_norm": 0.1142578125, - "learning_rate": 0.00024831697379427807, - "loss": 2.1204, + "learning_rate": 9.154589310428288e-05, + "loss": 2.1161, "step": 1930 }, { - "epoch": 1.44, - "grad_norm": 0.11767578125, - "learning_rate": 0.0002474739180458863, - "loss": 2.1579, + "epoch": 1.43, + "grad_norm": 0.1142578125, + "learning_rate": 9.046236522027938e-05, + "loss": 2.13, "step": 1935 }, { - "epoch": 1.45, - "grad_norm": 0.11865234375, - "learning_rate": 0.0002466299674335241, - "loss": 2.1382, + "epoch": 1.44, + "grad_norm": 0.11376953125, + "learning_rate": 8.938341065726508e-05, + "loss": 2.1218, "step": 1940 }, { - "epoch": 1.45, - "grad_norm": 0.12060546875, - "learning_rate": 0.00024578513786531605, - "loss": 2.1551, + "epoch": 1.44, + "grad_norm": 0.111328125, + "learning_rate": 8.830907446282162e-05, + "loss": 2.1196, "step": 1945 }, { - "epoch": 1.45, - "grad_norm": 0.1171875, - "learning_rate": 0.0002449394452659544, - "loss": 2.1509, + "epoch": 1.44, + "grad_norm": 0.115234375, + "learning_rate": 8.723940149170853e-05, + "loss": 2.132, "step": 1950 }, { - "epoch": 1.46, + "epoch": 1.45, "grad_norm": 0.1171875, - "learning_rate": 0.00024409290557639947, - "loss": 2.1462, + "learning_rate": 8.617443640399056e-05, + "loss": 2.12, "step": 1955 }, { - "epoch": 1.46, - "grad_norm": 0.1162109375, - "learning_rate": 0.00024324553475357866, - "loss": 2.1404, + "epoch": 1.45, + "grad_norm": 0.11279296875, + "learning_rate": 8.51142236631727e-05, + "loss": 2.1178, "step": 1960 }, { - "epoch": 1.47, - "grad_norm": 0.11669921875, - "learning_rate": 0.00024239734877008604, - "loss": 2.1677, + "epoch": 1.45, + "grad_norm": 0.1142578125, + "learning_rate": 8.405880753434434e-05, + "loss": 2.1231, "step": 1965 }, { - "epoch": 1.47, - "grad_norm": 0.11962890625, - "learning_rate": 0.000241548363613881, - "loss": 2.1602, + "epoch": 1.46, + "grad_norm": 0.11474609375, + "learning_rate": 8.300823208233062e-05, + "loss": 2.1259, "step": 1970 }, { - "epoch": 1.47, - "grad_norm": 0.11767578125, - "learning_rate": 0.00024069859528798714, - "loss": 2.1534, + "epoch": 1.46, + "grad_norm": 0.11181640625, + "learning_rate": 8.196254116985303e-05, + "loss": 2.1061, "step": 1975 }, { - "epoch": 1.48, - "grad_norm": 0.115234375, - "learning_rate": 0.0002398480598101903, - "loss": 2.1448, + "epoch": 1.47, + "grad_norm": 0.11376953125, + "learning_rate": 8.0921778455698e-05, + "loss": 2.1365, "step": 1980 }, { - "epoch": 1.48, - "grad_norm": 0.11474609375, - "learning_rate": 0.00023899677321273714, - "loss": 2.1356, + "epoch": 1.47, + "grad_norm": 0.11376953125, + "learning_rate": 7.988598739289408e-05, + "loss": 2.1183, "step": 1985 }, { - "epoch": 1.48, - "grad_norm": 0.115234375, - "learning_rate": 0.00023814475154203222, - "loss": 2.154, + "epoch": 1.47, + "grad_norm": 0.11376953125, + "learning_rate": 7.885521122689753e-05, + "loss": 2.1135, "step": 1990 }, { - "epoch": 1.49, - "grad_norm": 0.11865234375, - "learning_rate": 0.00023729201085833626, - "loss": 2.1383, + "epoch": 1.48, + "grad_norm": 0.11181640625, + "learning_rate": 7.782949299378724e-05, + "loss": 2.1114, "step": 1995 }, { - "epoch": 1.49, - "grad_norm": 0.1181640625, - "learning_rate": 0.00023643856723546295, - "loss": 2.1611, + "epoch": 1.48, + "grad_norm": 0.11376953125, + "learning_rate": 7.68088755184673e-05, + "loss": 2.1381, "step": 2000 }, { - "epoch": 1.5, - "grad_norm": 0.1201171875, - "learning_rate": 0.00023558443676047596, - "loss": 2.1302, + "epoch": 1.48, + "grad_norm": 0.1162109375, + "learning_rate": 7.579340141287965e-05, + "loss": 2.1033, "step": 2005 }, { - "epoch": 1.5, - "grad_norm": 0.1171875, - "learning_rate": 0.00023472963553338613, - "loss": 2.1535, + "epoch": 1.49, + "grad_norm": 0.11474609375, + "learning_rate": 7.478311307422456e-05, + "loss": 2.1169, "step": 2010 }, { - "epoch": 1.5, - "grad_norm": 0.11962890625, - "learning_rate": 0.00023387417966684742, - "loss": 2.1414, + "epoch": 1.49, + "grad_norm": 0.11376953125, + "learning_rate": 7.377805268319076e-05, + "loss": 2.1155, "step": 2015 }, { - "epoch": 1.51, - "grad_norm": 0.11767578125, - "learning_rate": 0.00023301808528585375, - "loss": 2.1352, + "epoch": 1.5, + "grad_norm": 0.1162109375, + "learning_rate": 7.27782622021939e-05, + "loss": 2.1077, "step": 2020 }, { - "epoch": 1.51, - "grad_norm": 0.11962890625, - "learning_rate": 0.0002321613685274346, - "loss": 2.152, + "epoch": 1.5, + "grad_norm": 0.11376953125, + "learning_rate": 7.178378337362519e-05, + "loss": 2.1162, "step": 2025 }, { - "epoch": 1.51, - "grad_norm": 0.11865234375, - "learning_rate": 0.00023130404554035102, - "loss": 2.142, + "epoch": 1.5, + "grad_norm": 0.11279296875, + "learning_rate": 7.079465771810828e-05, + "loss": 2.1294, "step": 2030 }, { - "epoch": 1.52, - "grad_norm": 0.12158203125, - "learning_rate": 0.00023044613248479116, - "loss": 2.1588, + "epoch": 1.51, + "grad_norm": 0.11083984375, + "learning_rate": 6.981092653276547e-05, + "loss": 2.1204, "step": 2035 }, { - "epoch": 1.52, - "grad_norm": 0.1171875, - "learning_rate": 0.000229587645532066, - "loss": 2.1475, + "epoch": 1.51, + "grad_norm": 0.1142578125, + "learning_rate": 6.88326308894941e-05, + "loss": 2.1197, "step": 2040 }, { - "epoch": 1.53, - "grad_norm": 0.119140625, - "learning_rate": 0.00022872860086430393, - "loss": 2.1593, + "epoch": 1.51, + "grad_norm": 0.11279296875, + "learning_rate": 6.78598116332513e-05, + "loss": 2.1183, "step": 2045 }, { - "epoch": 1.53, - "grad_norm": 0.1181640625, - "learning_rate": 0.00022786901467414619, - "loss": 2.1467, + "epoch": 1.52, + "grad_norm": 0.11181640625, + "learning_rate": 6.68925093803489e-05, + "loss": 2.1259, "step": 2050 }, { - "epoch": 1.53, - "grad_norm": 0.11669921875, - "learning_rate": 0.0002270089031644415, - "loss": 2.1364, + "epoch": 1.52, + "grad_norm": 0.11474609375, + "learning_rate": 6.593076451675734e-05, + "loss": 2.1203, "step": 2055 }, { - "epoch": 1.54, - "grad_norm": 0.119140625, - "learning_rate": 0.00022614828254794055, - "loss": 2.1384, + "epoch": 1.52, + "grad_norm": 0.11376953125, + "learning_rate": 6.497461719642003e-05, + "loss": 2.1051, "step": 2060 }, { - "epoch": 1.54, - "grad_norm": 0.11669921875, - "learning_rate": 0.00022528716904699056, - "loss": 2.1428, + "epoch": 1.53, + "grad_norm": 0.11328125, + "learning_rate": 6.402410733957627e-05, + "loss": 2.1393, "step": 2065 }, { - "epoch": 1.54, - "grad_norm": 0.1181640625, - "learning_rate": 0.00022442557889322946, - "loss": 2.1517, + "epoch": 1.53, + "grad_norm": 0.11572265625, + "learning_rate": 6.307927463109504e-05, + "loss": 2.131, "step": 2070 }, { - "epoch": 1.55, - "grad_norm": 0.12060546875, - "learning_rate": 0.00022356352832727985, - "loss": 2.1474, + "epoch": 1.54, + "grad_norm": 0.115234375, + "learning_rate": 6.214015851881788e-05, + "loss": 2.1138, "step": 2075 }, { - "epoch": 1.55, - "grad_norm": 0.1201171875, - "learning_rate": 0.00022270103359844283, - "loss": 2.1684, + "epoch": 1.54, + "grad_norm": 0.1142578125, + "learning_rate": 6.120679821191193e-05, + "loss": 2.1361, "step": 2080 }, { - "epoch": 1.56, - "grad_norm": 0.11669921875, - "learning_rate": 0.00022183811096439194, - "loss": 2.1616, + "epoch": 1.54, + "grad_norm": 0.115234375, + "learning_rate": 6.027923267923279e-05, + "loss": 2.1242, "step": 2085 }, { - "epoch": 1.56, - "grad_norm": 0.12109375, - "learning_rate": 0.00022097477669086638, - "loss": 2.1468, + "epoch": 1.55, + "grad_norm": 0.11376953125, + "learning_rate": 5.9357500647697786e-05, + "loss": 2.1396, "step": 2090 }, { - "epoch": 1.56, - "grad_norm": 0.1162109375, - "learning_rate": 0.00022011104705136475, - "loss": 2.1374, + "epoch": 1.55, + "grad_norm": 0.1142578125, + "learning_rate": 5.8441640600668924e-05, + "loss": 2.1037, "step": 2095 }, { - "epoch": 1.57, - "grad_norm": 0.11962890625, - "learning_rate": 0.00021924693832683806, - "loss": 2.1539, + "epoch": 1.55, + "grad_norm": 0.1123046875, + "learning_rate": 5.75316907763459e-05, + "loss": 2.1335, "step": 2100 }, { - "epoch": 1.57, - "grad_norm": 0.1171875, - "learning_rate": 0.00021838246680538293, - "loss": 2.1514, + "epoch": 1.56, + "grad_norm": 0.111328125, + "learning_rate": 5.6627689166170364e-05, + "loss": 2.1064, "step": 2105 }, { - "epoch": 1.57, - "grad_norm": 0.12158203125, - "learning_rate": 0.00021751764878193459, - "loss": 2.1407, + "epoch": 1.56, + "grad_norm": 0.111328125, + "learning_rate": 5.5729673513238814e-05, + "loss": 2.1039, "step": 2110 }, { - "epoch": 1.58, - "grad_norm": 0.1181640625, - "learning_rate": 0.00021665250055795957, - "loss": 2.1485, + "epoch": 1.57, + "grad_norm": 0.11279296875, + "learning_rate": 5.4837681310727464e-05, + "loss": 2.1427, "step": 2115 }, { - "epoch": 1.58, - "grad_norm": 0.11865234375, - "learning_rate": 0.0002157870384411487, - "loss": 2.1496, + "epoch": 1.57, + "grad_norm": 0.1142578125, + "learning_rate": 5.395174980032645e-05, + "loss": 2.1386, "step": 2120 }, { - "epoch": 1.59, - "grad_norm": 0.1181640625, - "learning_rate": 0.00021492127874510946, - "loss": 2.143, + "epoch": 1.57, + "grad_norm": 0.1142578125, + "learning_rate": 5.307191597068531e-05, + "loss": 2.124, "step": 2125 }, { - "epoch": 1.59, - "grad_norm": 0.119140625, - "learning_rate": 0.0002140552377890586, - "loss": 2.1498, + "epoch": 1.58, + "grad_norm": 0.11376953125, + "learning_rate": 5.2198216555868206e-05, + "loss": 2.1275, "step": 2130 }, { - "epoch": 1.59, - "grad_norm": 0.1201171875, - "learning_rate": 0.00021318893189751457, - "loss": 2.1586, + "epoch": 1.58, + "grad_norm": 0.1142578125, + "learning_rate": 5.133068803382073e-05, + "loss": 2.1182, "step": 2135 }, { - "epoch": 1.6, - "grad_norm": 0.119140625, - "learning_rate": 0.00021232237739998965, - "loss": 2.139, - "step": 2140 - }, + "epoch": 1.58, + "grad_norm": 0.11279296875, + "learning_rate": 5.046936662484658e-05, + "loss": 2.1199, + "step": 2140 + }, { - "epoch": 1.6, - "grad_norm": 0.11865234375, - "learning_rate": 0.00021145559063068223, - "loss": 2.1481, + "epoch": 1.59, + "grad_norm": 0.11572265625, + "learning_rate": 4.9614288290095467e-05, + "loss": 2.1488, "step": 2145 }, { - "epoch": 1.6, - "grad_norm": 0.12060546875, - "learning_rate": 0.00021058858792816904, - "loss": 2.1449, + "epoch": 1.59, + "grad_norm": 0.11376953125, + "learning_rate": 4.8765488730061485e-05, + "loss": 2.1171, "step": 2150 }, { - "epoch": 1.61, - "grad_norm": 0.11865234375, - "learning_rate": 0.00020972138563509708, - "loss": 2.1629, + "epoch": 1.6, + "grad_norm": 0.1142578125, + "learning_rate": 4.792300338309288e-05, + "loss": 2.1305, "step": 2155 }, { - "epoch": 1.61, - "grad_norm": 0.1181640625, - "learning_rate": 0.00020885400009787528, - "loss": 2.1458, + "epoch": 1.6, + "grad_norm": 0.11279296875, + "learning_rate": 4.70868674239124e-05, + "loss": 2.1236, "step": 2160 }, { - "epoch": 1.62, - "grad_norm": 0.119140625, - "learning_rate": 0.000207986447666367, - "loss": 2.1531, + "epoch": 1.6, + "grad_norm": 0.115234375, + "learning_rate": 4.625711576214831e-05, + "loss": 2.1145, "step": 2165 }, { - "epoch": 1.62, - "grad_norm": 0.11767578125, - "learning_rate": 0.0002071187446935813, - "loss": 2.1339, + "epoch": 1.61, + "grad_norm": 0.1162109375, + "learning_rate": 4.543378304087746e-05, + "loss": 2.1362, "step": 2170 }, { - "epoch": 1.62, - "grad_norm": 0.11767578125, - "learning_rate": 0.00020625090753536492, - "loss": 2.1525, + "epoch": 1.61, + "grad_norm": 0.11572265625, + "learning_rate": 4.461690363517857e-05, + "loss": 2.1243, "step": 2175 }, { - "epoch": 1.63, - "grad_norm": 0.11962890625, - "learning_rate": 0.00020538295255009384, - "loss": 2.1522, + "epoch": 1.61, + "grad_norm": 0.1123046875, + "learning_rate": 4.380651165069707e-05, + "loss": 2.1126, "step": 2180 }, { - "epoch": 1.63, - "grad_norm": 0.1171875, - "learning_rate": 0.0002045148960983652, - "loss": 2.1358, + "epoch": 1.62, + "grad_norm": 0.11328125, + "learning_rate": 4.3002640922221084e-05, + "loss": 2.1095, "step": 2185 }, { - "epoch": 1.63, - "grad_norm": 0.1181640625, - "learning_rate": 0.0002036467545426886, - "loss": 2.149, + "epoch": 1.62, + "grad_norm": 0.115234375, + "learning_rate": 4.220532501226902e-05, + "loss": 2.1351, "step": 2190 }, { - "epoch": 1.64, - "grad_norm": 0.12109375, - "learning_rate": 0.00020277854424717803, - "loss": 2.1394, + "epoch": 1.62, + "grad_norm": 0.1123046875, + "learning_rate": 4.141459720968792e-05, + "loss": 2.1225, "step": 2195 }, { - "epoch": 1.64, - "grad_norm": 0.1171875, - "learning_rate": 0.00020191028157724294, - "loss": 2.1424, + "epoch": 1.63, + "grad_norm": 0.115234375, + "learning_rate": 4.0630490528264196e-05, + "loss": 2.1168, "step": 2200 }, { - "epoch": 1.64, - "grad_norm": 0.125, - "learning_rate": 0.0002010419828992801, - "loss": 2.1615, + "epoch": 1.63, + "grad_norm": 0.11376953125, + "learning_rate": 3.985303770534459e-05, + "loss": 2.1339, "step": 2205 }, { - "epoch": 1.65, - "grad_norm": 0.11572265625, - "learning_rate": 0.00020017366458036513, - "loss": 2.1549, + "epoch": 1.64, + "grad_norm": 0.1142578125, + "learning_rate": 3.908227120046983e-05, + "loss": 2.1018, "step": 2210 }, { - "epoch": 1.65, - "grad_norm": 0.11767578125, - "learning_rate": 0.00019930534298794365, - "loss": 2.1115, + "epoch": 1.64, + "grad_norm": 0.11328125, + "learning_rate": 3.831822319401916e-05, + "loss": 2.1128, "step": 2215 }, { - "epoch": 1.66, - "grad_norm": 0.12109375, - "learning_rate": 0.0001984370344895232, - "loss": 2.1267, + "epoch": 1.64, + "grad_norm": 0.11474609375, + "learning_rate": 3.756092558586694e-05, + "loss": 2.1124, "step": 2220 }, { - "epoch": 1.66, - "grad_norm": 0.11962890625, - "learning_rate": 0.00019756875545236453, - "loss": 2.1387, + "epoch": 1.65, + "grad_norm": 0.11328125, + "learning_rate": 3.681040999405079e-05, + "loss": 2.1334, "step": 2225 }, { - "epoch": 1.66, - "grad_norm": 0.11962890625, - "learning_rate": 0.00019670052224317274, - "loss": 2.1365, + "epoch": 1.65, + "grad_norm": 0.1142578125, + "learning_rate": 3.606670775345116e-05, + "loss": 2.1247, "step": 2230 }, { - "epoch": 1.67, - "grad_norm": 0.12109375, - "learning_rate": 0.0001958323512277895, - "loss": 2.1511, + "epoch": 1.65, + "grad_norm": 0.11328125, + "learning_rate": 3.532984991448356e-05, + "loss": 2.1211, "step": 2235 }, { - "epoch": 1.67, - "grad_norm": 0.11865234375, - "learning_rate": 0.0001949642587708838, - "loss": 2.1503, + "epoch": 1.66, + "grad_norm": 0.1162109375, + "learning_rate": 3.459986724180189e-05, + "loss": 2.1455, "step": 2240 }, { - "epoch": 1.67, - "grad_norm": 0.119140625, - "learning_rate": 0.00019409626123564403, - "loss": 2.1469, + "epoch": 1.66, + "grad_norm": 0.115234375, + "learning_rate": 3.387679021301406e-05, + "loss": 2.1305, "step": 2245 }, { - "epoch": 1.68, - "grad_norm": 0.119140625, - "learning_rate": 0.00019322837498346934, - "loss": 2.125, + "epoch": 1.67, + "grad_norm": 0.11328125, + "learning_rate": 3.316064901740934e-05, + "loss": 2.1421, "step": 2250 }, { - "epoch": 1.68, - "grad_norm": 0.11865234375, - "learning_rate": 0.00019236061637366124, - "loss": 2.1331, + "epoch": 1.67, + "grad_norm": 0.11279296875, + "learning_rate": 3.245147355469822e-05, + "loss": 2.1317, "step": 2255 }, { - "epoch": 1.69, - "grad_norm": 0.11865234375, - "learning_rate": 0.00019149300176311504, - "loss": 2.1354, + "epoch": 1.67, + "grad_norm": 0.11279296875, + "learning_rate": 3.174929343376374e-05, + "loss": 2.1087, "step": 2260 }, { - "epoch": 1.69, + "epoch": 1.68, "grad_norm": 0.119140625, - "learning_rate": 0.00019062554750601198, - "loss": 2.1512, + "learning_rate": 3.105413797142576e-05, + "loss": 2.1278, "step": 2265 }, { - "epoch": 1.69, - "grad_norm": 0.11767578125, - "learning_rate": 0.0001897582699535107, - "loss": 2.1151, + "epoch": 1.68, + "grad_norm": 0.1123046875, + "learning_rate": 3.0366036191216274e-05, + "loss": 2.1105, "step": 2270 }, { - "epoch": 1.7, - "grad_norm": 0.12255859375, - "learning_rate": 0.00018889118545343877, - "loss": 2.1239, + "epoch": 1.68, + "grad_norm": 0.11376953125, + "learning_rate": 2.9685016822168287e-05, + "loss": 2.1215, "step": 2275 }, { - "epoch": 1.7, - "grad_norm": 0.1181640625, - "learning_rate": 0.000188024310349985, - "loss": 2.1381, + "epoch": 1.69, + "grad_norm": 0.11328125, + "learning_rate": 2.9011108297615908e-05, + "loss": 2.1419, "step": 2280 }, { - "epoch": 1.7, - "grad_norm": 0.11669921875, - "learning_rate": 0.00018715766098339117, - "loss": 2.1306, + "epoch": 1.69, + "grad_norm": 0.11279296875, + "learning_rate": 2.834433875400755e-05, + "loss": 2.1307, "step": 2285 }, { - "epoch": 1.71, - "grad_norm": 0.11669921875, - "learning_rate": 0.00018629125368964405, - "loss": 2.1489, + "epoch": 1.7, + "grad_norm": 0.11376953125, + "learning_rate": 2.768473602973083e-05, + "loss": 2.1006, "step": 2290 }, { - "epoch": 1.71, - "grad_norm": 0.1220703125, - "learning_rate": 0.00018542510480016713, - "loss": 2.1547, + "epoch": 1.7, + "grad_norm": 0.1142578125, + "learning_rate": 2.7032327663950675e-05, + "loss": 2.1167, "step": 2295 }, { - "epoch": 1.72, - "grad_norm": 0.119140625, - "learning_rate": 0.00018455923064151342, - "loss": 2.1343, + "epoch": 1.7, + "grad_norm": 0.11328125, + "learning_rate": 2.6387140895459284e-05, + "loss": 2.1192, "step": 2300 }, { - "epoch": 1.72, - "grad_norm": 0.12353515625, - "learning_rate": 0.00018369364753505728, - "loss": 2.144, + "epoch": 1.71, + "grad_norm": 0.11181640625, + "learning_rate": 2.5749202661538972e-05, + "loss": 2.1193, "step": 2305 }, { - "epoch": 1.72, - "grad_norm": 0.11962890625, - "learning_rate": 0.00018282837179668679, - "loss": 2.1494, + "epoch": 1.71, + "grad_norm": 0.1142578125, + "learning_rate": 2.511853959683752e-05, + "loss": 2.119, "step": 2310 }, { - "epoch": 1.73, - "grad_norm": 0.1220703125, - "learning_rate": 0.00018196341973649637, - "loss": 2.1511, + "epoch": 1.71, + "grad_norm": 0.11328125, + "learning_rate": 2.4495178032255918e-05, + "loss": 2.1334, "step": 2315 }, { - "epoch": 1.73, - "grad_norm": 0.11962890625, - "learning_rate": 0.00018109880765847906, - "loss": 2.148, + "epoch": 1.72, + "grad_norm": 0.11328125, + "learning_rate": 2.3879143993849474e-05, + "loss": 2.1016, "step": 2320 }, { - "epoch": 1.73, - "grad_norm": 0.12109375, - "learning_rate": 0.00018023455186021961, - "loss": 2.142, + "epoch": 1.72, + "grad_norm": 0.11279296875, + "learning_rate": 2.3270463201740665e-05, + "loss": 2.1279, "step": 2325 }, { - "epoch": 1.74, - "grad_norm": 0.1201171875, - "learning_rate": 0.0001793706686325868, - "loss": 2.1445, + "epoch": 1.72, + "grad_norm": 0.11376953125, + "learning_rate": 2.2669161069045863e-05, + "loss": 2.1051, "step": 2330 }, { - "epoch": 1.74, - "grad_norm": 0.12158203125, - "learning_rate": 0.0001785071742594268, - "loss": 2.1344, + "epoch": 1.73, + "grad_norm": 0.1142578125, + "learning_rate": 2.2075262700813747e-05, + "loss": 2.1403, "step": 2335 }, { - "epoch": 1.75, - "grad_norm": 0.11669921875, - "learning_rate": 0.00017764408501725593, - "loss": 2.1214, + "epoch": 1.73, + "grad_norm": 0.1142578125, + "learning_rate": 2.148879289297756e-05, + "loss": 2.1177, "step": 2340 }, { - "epoch": 1.75, - "grad_norm": 0.11865234375, - "learning_rate": 0.00017678141717495394, - "loss": 2.1232, + "epoch": 1.74, + "grad_norm": 0.1181640625, + "learning_rate": 2.0909776131319548e-05, + "loss": 2.1135, "step": 2345 }, { - "epoch": 1.75, - "grad_norm": 0.11962890625, - "learning_rate": 0.00017591918699345755, - "loss": 2.129, + "epoch": 1.74, + "grad_norm": 0.1142578125, + "learning_rate": 2.0338236590448978e-05, + "loss": 2.1412, "step": 2350 }, { - "epoch": 1.76, - "grad_norm": 0.119140625, - "learning_rate": 0.00017505741072545346, - "loss": 2.1462, + "epoch": 1.74, + "grad_norm": 0.11376953125, + "learning_rate": 1.9774198132792353e-05, + "loss": 2.1297, "step": 2355 }, { - "epoch": 1.76, - "grad_norm": 0.119140625, - "learning_rate": 0.00017419610461507254, - "loss": 2.1401, + "epoch": 1.75, + "grad_norm": 0.1142578125, + "learning_rate": 1.9217684307597806e-05, + "loss": 2.1109, "step": 2360 }, { - "epoch": 1.76, - "grad_norm": 0.119140625, - "learning_rate": 0.0001733352848975832, - "loss": 2.1497, + "epoch": 1.75, + "grad_norm": 0.11376953125, + "learning_rate": 1.866871834995112e-05, + "loss": 2.1333, "step": 2365 }, { - "epoch": 1.77, - "grad_norm": 0.11767578125, - "learning_rate": 0.00017247496779908565, - "loss": 2.1356, + "epoch": 1.75, + "grad_norm": 0.11474609375, + "learning_rate": 1.8127323179806234e-05, + "loss": 2.1076, "step": 2370 }, { - "epoch": 1.77, - "grad_norm": 0.1201171875, - "learning_rate": 0.0001716151695362059, - "loss": 2.1436, + "epoch": 1.76, + "grad_norm": 0.11181640625, + "learning_rate": 1.7593521401027967e-05, + "loss": 2.1145, "step": 2375 }, { - "epoch": 1.78, - "grad_norm": 0.12060546875, - "learning_rate": 0.00017075590631579019, - "loss": 2.1538, + "epoch": 1.76, + "grad_norm": 0.11279296875, + "learning_rate": 1.7067335300448506e-05, + "loss": 2.1214, "step": 2380 }, { - "epoch": 1.78, - "grad_norm": 0.11865234375, - "learning_rate": 0.00016989719433459924, - "loss": 2.1497, + "epoch": 1.77, + "grad_norm": 0.11572265625, + "learning_rate": 1.654878684693677e-05, + "loss": 2.1398, "step": 2385 }, { - "epoch": 1.78, - "grad_norm": 0.1171875, - "learning_rate": 0.00016903904977900333, - "loss": 2.1333, + "epoch": 1.77, + "grad_norm": 0.1142578125, + "learning_rate": 1.6037897690481075e-05, + "loss": 2.0997, "step": 2390 }, { - "epoch": 1.79, - "grad_norm": 0.1220703125, - "learning_rate": 0.000168181488824677, - "loss": 2.1542, + "epoch": 1.77, + "grad_norm": 0.11572265625, + "learning_rate": 1.5534689161285643e-05, + "loss": 2.1291, "step": 2395 }, { - "epoch": 1.79, - "grad_norm": 0.1171875, - "learning_rate": 0.00016732452763629395, - "loss": 2.1197, + "epoch": 1.78, + "grad_norm": 0.115234375, + "learning_rate": 1.5039182268879504e-05, + "loss": 2.1358, "step": 2400 }, { - "epoch": 1.79, - "grad_norm": 0.123046875, - "learning_rate": 0.00016646818236722282, - "loss": 2.1151, + "epoch": 1.78, + "grad_norm": 0.11279296875, + "learning_rate": 1.4551397701239721e-05, + "loss": 2.1091, "step": 2405 }, { - "epoch": 1.8, - "grad_norm": 0.12060546875, - "learning_rate": 0.00016561246915922204, - "loss": 2.1505, + "epoch": 1.78, + "grad_norm": 0.11474609375, + "learning_rate": 1.4071355823927424e-05, + "loss": 2.1098, "step": 2410 }, { - "epoch": 1.8, - "grad_norm": 0.11865234375, - "learning_rate": 0.00016475740414213642, - "loss": 2.1501, + "epoch": 1.79, + "grad_norm": 0.115234375, + "learning_rate": 1.3599076679237676e-05, + "loss": 2.1321, "step": 2415 }, { - "epoch": 1.81, - "grad_norm": 0.1201171875, - "learning_rate": 0.00016390300343359216, - "loss": 2.1556, + "epoch": 1.79, + "grad_norm": 0.1123046875, + "learning_rate": 1.3134579985362517e-05, + "loss": 2.1004, "step": 2420 }, { - "epoch": 1.81, - "grad_norm": 0.119140625, - "learning_rate": 0.0001630492831386939, - "loss": 2.133, + "epoch": 1.79, + "grad_norm": 0.11328125, + "learning_rate": 1.2677885135567979e-05, + "loss": 2.1175, "step": 2425 }, { - "epoch": 1.81, - "grad_norm": 0.123046875, - "learning_rate": 0.0001621962593497205, - "loss": 2.162, + "epoch": 1.8, + "grad_norm": 0.11328125, + "learning_rate": 1.2229011197384021e-05, + "loss": 2.1235, "step": 2430 }, { - "epoch": 1.82, - "grad_norm": 0.12060546875, - "learning_rate": 0.0001613439481458221, - "loss": 2.1333, + "epoch": 1.8, + "grad_norm": 0.115234375, + "learning_rate": 1.1787976911808773e-05, + "loss": 2.1341, "step": 2435 }, { - "epoch": 1.82, - "grad_norm": 0.12060546875, - "learning_rate": 0.000160492365592717, - "loss": 2.1566, + "epoch": 1.81, + "grad_norm": 0.11328125, + "learning_rate": 1.1354800692525835e-05, + "loss": 2.1321, "step": 2440 }, { - "epoch": 1.82, - "grad_norm": 0.11865234375, - "learning_rate": 0.00015964152774238842, - "loss": 2.1692, + "epoch": 1.81, + "grad_norm": 0.11279296875, + "learning_rate": 1.092950062513567e-05, + "loss": 2.0928, "step": 2445 }, { - "epoch": 1.83, - "grad_norm": 0.119140625, - "learning_rate": 0.00015879145063278256, - "loss": 2.1413, + "epoch": 1.81, + "grad_norm": 0.11328125, + "learning_rate": 1.0512094466400402e-05, + "loss": 2.1177, "step": 2450 }, { - "epoch": 1.83, - "grad_norm": 0.12255859375, - "learning_rate": 0.00015794215028750567, - "loss": 2.1564, + "epoch": 1.82, + "grad_norm": 0.11279296875, + "learning_rate": 1.0102599643502508e-05, + "loss": 2.1335, "step": 2455 }, { - "epoch": 1.84, - "grad_norm": 0.11962890625, - "learning_rate": 0.00015709364271552262, - "loss": 2.1305, + "epoch": 1.82, + "grad_norm": 0.11572265625, + "learning_rate": 9.70103325331717e-06, + "loss": 2.111, "step": 2460 }, { - "epoch": 1.84, - "grad_norm": 0.12060546875, - "learning_rate": 0.00015624594391085457, - "loss": 2.1526, + "epoch": 1.82, + "grad_norm": 0.11328125, + "learning_rate": 9.307412061698428e-06, + "loss": 2.1119, "step": 2465 }, { - "epoch": 1.84, - "grad_norm": 0.12353515625, - "learning_rate": 0.00015539906985227798, - "loss": 2.138, + "epoch": 1.83, + "grad_norm": 0.11279296875, + "learning_rate": 8.92175250277929e-06, + "loss": 2.1151, "step": 2470 }, { - "epoch": 1.85, - "grad_norm": 0.12353515625, - "learning_rate": 0.0001545530365030229, - "loss": 2.1432, + "epoch": 1.83, + "grad_norm": 0.1123046875, + "learning_rate": 8.54407067828551e-06, + "loss": 2.1228, "step": 2475 }, { - "epoch": 1.85, - "grad_norm": 0.1201171875, - "learning_rate": 0.00015370785981047252, - "loss": 2.1508, + "epoch": 1.84, + "grad_norm": 0.115234375, + "learning_rate": 8.174382356863365e-06, + "loss": 2.1171, "step": 2480 }, { - "epoch": 1.85, - "grad_norm": 0.1201171875, - "learning_rate": 0.00015286355570586255, - "loss": 2.1347, + "epoch": 1.84, + "grad_norm": 0.11376953125, + "learning_rate": 7.812702973421182e-06, + "loss": 2.1102, "step": 2485 }, { - "epoch": 1.86, - "grad_norm": 0.12060546875, - "learning_rate": 0.00015202014010398042, - "loss": 2.1497, + "epoch": 1.84, + "grad_norm": 0.11572265625, + "learning_rate": 7.4590476284852165e-06, + "loss": 2.1014, "step": 2490 }, { - "epoch": 1.86, - "grad_norm": 0.12060546875, - "learning_rate": 0.00015117762890286602, - "loss": 2.1377, + "epoch": 1.85, + "grad_norm": 0.11328125, + "learning_rate": 7.113431087568745e-06, + "loss": 2.1109, "step": 2495 }, { - "epoch": 1.86, - "grad_norm": 0.1171875, - "learning_rate": 0.0001503360379835113, - "loss": 2.1337, + "epoch": 1.85, + "grad_norm": 0.11474609375, + "learning_rate": 6.775867780555989e-06, + "loss": 2.1115, "step": 2500 }, { - "epoch": 1.87, - "grad_norm": 0.11865234375, - "learning_rate": 0.00014949538320956158, - "loss": 2.156, + "epoch": 1.85, + "grad_norm": 0.11376953125, + "learning_rate": 6.446371801099371e-06, + "loss": 2.1103, "step": 2505 }, { - "epoch": 1.87, - "grad_norm": 0.11962890625, - "learning_rate": 0.00014865568042701592, - "loss": 2.1386, + "epoch": 1.86, + "grad_norm": 0.11328125, + "learning_rate": 6.124956906031276e-06, + "loss": 2.1203, "step": 2510 }, { - "epoch": 1.88, - "grad_norm": 0.119140625, - "learning_rate": 0.0001478169454639291, - "loss": 2.1468, + "epoch": 1.86, + "grad_norm": 0.1181640625, + "learning_rate": 5.811636514789598e-06, + "loss": 2.1227, "step": 2515 }, { - "epoch": 1.88, - "grad_norm": 0.119140625, - "learning_rate": 0.00014697919413011253, - "loss": 2.1566, + "epoch": 1.87, + "grad_norm": 0.11328125, + "learning_rate": 5.506423708857455e-06, + "loss": 2.1251, "step": 2520 }, { - "epoch": 1.88, - "grad_norm": 0.123046875, - "learning_rate": 0.00014614244221683686, - "loss": 2.1441, + "epoch": 1.87, + "grad_norm": 0.11376953125, + "learning_rate": 5.209331231217052e-06, + "loss": 2.1305, "step": 2525 }, { - "epoch": 1.89, - "grad_norm": 0.12060546875, - "learning_rate": 0.00014530670549653407, - "loss": 2.1473, + "epoch": 1.87, + "grad_norm": 0.11328125, + "learning_rate": 4.920371485817632e-06, + "loss": 2.1124, "step": 2530 }, { - "epoch": 1.89, - "grad_norm": 0.12060546875, - "learning_rate": 0.00014447199972249987, - "loss": 2.1481, + "epoch": 1.88, + "grad_norm": 0.11376953125, + "learning_rate": 4.639556537057677e-06, + "loss": 2.11, "step": 2535 }, { - "epoch": 1.89, - "grad_norm": 0.1201171875, - "learning_rate": 0.00014363834062859748, - "loss": 2.1546, + "epoch": 1.88, + "grad_norm": 0.11279296875, + "learning_rate": 4.3668981092810365e-06, + "loss": 2.1314, "step": 2540 }, { - "epoch": 1.9, - "grad_norm": 0.12060546875, - "learning_rate": 0.00014280574392896032, - "loss": 2.1314, + "epoch": 1.88, + "grad_norm": 0.1142578125, + "learning_rate": 4.102407586287571e-06, + "loss": 2.1307, "step": 2545 }, { - "epoch": 1.9, - "grad_norm": 0.11767578125, - "learning_rate": 0.0001419742253176962, - "loss": 2.1129, + "epoch": 1.89, + "grad_norm": 0.11328125, + "learning_rate": 3.846096010857791e-06, + "loss": 2.1042, "step": 2550 }, { - "epoch": 1.91, - "grad_norm": 0.119140625, - "learning_rate": 0.00014114380046859138, + "epoch": 1.89, + "grad_norm": 0.1142578125, + "learning_rate": 3.5979740842918995e-06, "loss": 2.1353, "step": 2555 }, { - "epoch": 1.91, - "grad_norm": 0.1201171875, - "learning_rate": 0.00014031448503481532, - "loss": 2.1423, + "epoch": 1.89, + "grad_norm": 0.11376953125, + "learning_rate": 3.3580521659628106e-06, + "loss": 2.1164, "step": 2560 }, { - "epoch": 1.91, - "grad_norm": 0.1201171875, - "learning_rate": 0.00013948629464862516, - "loss": 2.1173, + "epoch": 1.9, + "grad_norm": 0.11572265625, + "learning_rate": 3.126340272883899e-06, + "loss": 2.1268, "step": 2565 }, { - "epoch": 1.92, - "grad_norm": 0.1201171875, - "learning_rate": 0.00013865924492107153, - "loss": 2.1468, + "epoch": 1.9, + "grad_norm": 0.11376953125, + "learning_rate": 2.902848079290488e-06, + "loss": 2.119, "step": 2570 }, { - "epoch": 1.92, - "grad_norm": 0.12158203125, - "learning_rate": 0.00013783335144170418, - "loss": 2.1517, + "epoch": 1.91, + "grad_norm": 0.11474609375, + "learning_rate": 2.687584916236241e-06, + "loss": 2.1196, "step": 2575 }, { - "epoch": 1.92, - "grad_norm": 0.119140625, - "learning_rate": 0.0001370086297782779, - "loss": 2.153, + "epoch": 1.91, + "grad_norm": 0.1162109375, + "learning_rate": 2.4805597712032946e-06, + "loss": 2.1185, "step": 2580 }, { - "epoch": 1.93, - "grad_norm": 0.12109375, - "learning_rate": 0.0001361850954764594, - "loss": 2.1427, + "epoch": 1.91, + "grad_norm": 0.1123046875, + "learning_rate": 2.281781287727247e-06, + "loss": 2.1372, "step": 2585 }, { - "epoch": 1.93, - "grad_norm": 0.1181640625, - "learning_rate": 0.0001353627640595338, - "loss": 2.1477, + "epoch": 1.92, + "grad_norm": 0.11279296875, + "learning_rate": 2.091257765036092e-06, + "loss": 2.1092, "step": 2590 }, { - "epoch": 1.94, - "grad_norm": 0.12353515625, - "learning_rate": 0.00013454165102811272, - "loss": 2.1414, + "epoch": 1.92, + "grad_norm": 0.11328125, + "learning_rate": 1.908997157703918e-06, + "loss": 2.1357, "step": 2595 }, { - "epoch": 1.94, - "grad_norm": 0.1220703125, - "learning_rate": 0.00013372177185984134, - "loss": 2.1579, + "epoch": 1.92, + "grad_norm": 0.11181640625, + "learning_rate": 1.7350070753186176e-06, + "loss": 2.1145, "step": 2600 }, { - "epoch": 1.94, - "grad_norm": 0.11865234375, - "learning_rate": 0.00013290314200910735, - "loss": 2.123, + "epoch": 1.93, + "grad_norm": 0.115234375, + "learning_rate": 1.5692947821642324e-06, + "loss": 2.114, "step": 2605 }, { - "epoch": 1.95, - "grad_norm": 0.119140625, - "learning_rate": 0.00013208577690674905, - "loss": 2.1311, + "epoch": 1.93, + "grad_norm": 0.1123046875, + "learning_rate": 1.4118671969177265e-06, + "loss": 2.1391, "step": 2610 }, { - "epoch": 1.95, - "grad_norm": 0.119140625, - "learning_rate": 0.00013126969195976495, - "loss": 2.1314, + "epoch": 1.94, + "grad_norm": 0.1142578125, + "learning_rate": 1.2627308923600644e-06, + "loss": 2.1302, "step": 2615 }, { - "epoch": 1.95, - "grad_norm": 0.11865234375, - "learning_rate": 0.00013045490255102316, - "loss": 2.1374, + "epoch": 1.94, + "grad_norm": 0.11328125, + "learning_rate": 1.1218920951018064e-06, + "loss": 2.135, "step": 2620 }, { - "epoch": 1.96, - "grad_norm": 0.12060546875, - "learning_rate": 0.00012964142403897112, - "loss": 2.1489, + "epoch": 1.94, + "grad_norm": 0.115234375, + "learning_rate": 9.893566853230951e-07, + "loss": 2.1385, "step": 2625 }, { - "epoch": 1.96, - "grad_norm": 0.1181640625, - "learning_rate": 0.0001288292717573468, - "loss": 2.145, + "epoch": 1.95, + "grad_norm": 0.111328125, + "learning_rate": 8.651301965282077e-07, + "loss": 2.1302, "step": 2630 }, { - "epoch": 1.97, - "grad_norm": 0.1181640625, - "learning_rate": 0.00012801846101488898, - "loss": 2.1288, + "epoch": 1.95, + "grad_norm": 0.11669921875, + "learning_rate": 7.492178153145402e-07, + "loss": 2.125, "step": 2635 }, { - "epoch": 1.97, - "grad_norm": 0.119140625, - "learning_rate": 0.00012720900709504917, - "loss": 2.1468, + "epoch": 1.95, + "grad_norm": 0.11572265625, + "learning_rate": 6.416243811559808e-07, + "loss": 2.1311, "step": 2640 }, { - "epoch": 1.97, - "grad_norm": 0.12109375, - "learning_rate": 0.00012640092525570312, - "loss": 2.1201, + "epoch": 1.96, + "grad_norm": 0.11279296875, + "learning_rate": 5.423543862009384e-07, + "loss": 2.1087, "step": 2645 }, { - "epoch": 1.98, - "grad_norm": 0.1220703125, - "learning_rate": 0.0001255942307288637, - "loss": 2.1523, + "epoch": 1.96, + "grad_norm": 0.11328125, + "learning_rate": 4.5141197508475894e-07, + "loss": 2.105, "step": 2650 }, { - "epoch": 1.98, - "grad_norm": 0.1201171875, - "learning_rate": 0.00012478893872039314, - "loss": 2.146, + "epoch": 1.97, + "grad_norm": 0.115234375, + "learning_rate": 3.6880094475664204e-07, + "loss": 2.1256, "step": 2655 }, { - "epoch": 1.98, - "grad_norm": 0.11962890625, - "learning_rate": 0.00012398506440971713, - "loss": 2.1387, + "epoch": 1.97, + "grad_norm": 0.11328125, + "learning_rate": 2.945247443212118e-07, + "loss": 2.1204, "step": 2660 }, { - "epoch": 1.99, - "grad_norm": 0.11767578125, - "learning_rate": 0.00012318262294953815, - "loss": 2.1272, + "epoch": 1.97, + "grad_norm": 0.11474609375, + "learning_rate": 2.2858647489441e-07, + "loss": 2.1264, "step": 2665 }, { - "epoch": 1.99, - "grad_norm": 0.119140625, - "learning_rate": 0.00012238162946555002, - "loss": 2.1488, + "epoch": 1.98, + "grad_norm": 0.11279296875, + "learning_rate": 1.7098888947404412e-07, + "loss": 2.1144, "step": 2670 }, { - "epoch": 2.0, - "grad_norm": 0.11962890625, - "learning_rate": 0.00012158209905615301, - "loss": 2.1445, + "epoch": 1.98, + "grad_norm": 0.1142578125, + "learning_rate": 1.217343928249237e-07, + "loss": 2.0979, "step": 2675 }, { - "epoch": 2.0, - "grad_norm": 0.1201171875, - "learning_rate": 0.00012078404679216864, - "loss": 2.1327, + "epoch": 1.98, + "grad_norm": 0.1123046875, + "learning_rate": 8.082504137836288e-08, + "loss": 2.136, "step": 2680 }, { - "epoch": 2.0, - "eval_loss": 2.1632144451141357, - "eval_runtime": 186.9566, - "eval_samples_per_second": 25.792, - "eval_steps_per_second": 3.225, - "step": 2681 - }, - { - "epoch": 2.0, - "grad_norm": 0.12158203125, - "learning_rate": 0.0001199874877165564, - "loss": 2.1156, + "epoch": 1.99, + "grad_norm": 0.11279296875, + "learning_rate": 4.8262543146382345e-08, + "loss": 2.1116, "step": 2685 }, { - "epoch": 2.01, - "grad_norm": 0.1181640625, - "learning_rate": 0.00011919243684412948, - "loss": 2.115, + "epoch": 1.99, + "grad_norm": 0.11474609375, + "learning_rate": 2.404825765034424e-08, + "loss": 2.1157, "step": 2690 }, { - "epoch": 2.01, - "grad_norm": 0.11962890625, - "learning_rate": 0.00011839890916127228, - "loss": 2.0971, + "epoch": 1.99, + "grad_norm": 0.12255859375, + "learning_rate": 8.183195864264192e-09, + "loss": 2.1259, "step": 2695 }, { - "epoch": 2.01, - "grad_norm": 0.12109375, - "learning_rate": 0.00011760691962565752, - "loss": 2.119, + "epoch": 2.0, + "grad_norm": 0.1123046875, + "learning_rate": 6.680201725117741e-10, + "loss": 2.1135, "step": 2700 }, { - "epoch": 2.02, - "grad_norm": 0.1220703125, - "learning_rate": 0.00011681648316596461, - "loss": 2.1282, - "step": 2705 - }, - { - "epoch": 2.02, - "grad_norm": 0.12255859375, - "learning_rate": 0.00011602761468159813, - "loss": 2.1151, - "step": 2710 - }, - { - "epoch": 2.03, - "grad_norm": 0.1201171875, - "learning_rate": 0.00011524032904240671, - "loss": 2.101, - "step": 2715 - }, - { - "epoch": 2.03, - "grad_norm": 0.12255859375, - "learning_rate": 0.00011445464108840345, - "loss": 2.113, - "step": 2720 - }, - { - "epoch": 2.03, - "grad_norm": 0.123046875, - "learning_rate": 0.0001136705656294851, - "loss": 2.118, - "step": 2725 - }, - { - "epoch": 2.04, - "grad_norm": 0.126953125, - "learning_rate": 0.00011288811744515433, - "loss": 2.1045, - "step": 2730 - }, - { - "epoch": 2.04, - "grad_norm": 0.123046875, - "learning_rate": 0.0001121073112842395, - "loss": 2.108, - "step": 2735 - }, - { - "epoch": 2.04, - "grad_norm": 0.12109375, - "learning_rate": 0.00011132816186461821, - "loss": 2.0919, - "step": 2740 - }, - { - "epoch": 2.05, - "grad_norm": 0.1240234375, - "learning_rate": 0.00011055068387293879, - "loss": 2.1063, - "step": 2745 - }, - { - "epoch": 2.05, - "grad_norm": 0.123046875, - "learning_rate": 0.00010977489196434381, - "loss": 2.1175, - "step": 2750 - }, - { - "epoch": 2.06, - "grad_norm": 0.123046875, - "learning_rate": 0.00010900080076219426, - "loss": 2.1103, - "step": 2755 - }, - { - "epoch": 2.06, - "grad_norm": 0.1220703125, - "learning_rate": 0.00010822842485779285, - "loss": 2.1111, - "step": 2760 + "epoch": 2.0, + "eval_loss": 2.14662766456604, + "eval_runtime": 188.6485, + "eval_samples_per_second": 25.778, + "eval_steps_per_second": 3.223, + "step": 2702 }, { - "epoch": 2.06, - "grad_norm": 0.12158203125, - "learning_rate": 0.00010745777881011027, - "loss": 2.0899, - "step": 2765 - }, - { - "epoch": 2.07, - "grad_norm": 0.123046875, - "learning_rate": 0.00010668887714550974, - "loss": 2.0935, - "step": 2770 - }, - { - "epoch": 2.07, - "grad_norm": 0.1201171875, - "learning_rate": 0.0001059217343574737, - "loss": 2.0919, - "step": 2775 - }, - { - "epoch": 2.07, - "grad_norm": 0.123046875, - "learning_rate": 0.00010515636490633043, - "loss": 2.1157, - "step": 2780 - }, - { - "epoch": 2.08, - "grad_norm": 0.12255859375, - "learning_rate": 0.00010439278321898153, - "loss": 2.1024, - "step": 2785 - }, - { - "epoch": 2.08, - "grad_norm": 0.1240234375, - "learning_rate": 0.00010363100368863021, - "loss": 2.1038, - "step": 2790 - }, - { - "epoch": 2.09, - "grad_norm": 0.12255859375, - "learning_rate": 0.00010287104067450928, - "loss": 2.1088, - "step": 2795 - }, - { - "epoch": 2.09, - "grad_norm": 0.12353515625, - "learning_rate": 0.00010211290850161144, - "loss": 2.0861, - "step": 2800 - }, - { - "epoch": 2.09, - "grad_norm": 0.12451171875, - "learning_rate": 0.00010135662146041855, - "loss": 2.1215, - "step": 2805 - }, - { - "epoch": 2.1, - "grad_norm": 0.123046875, - "learning_rate": 0.0001006021938066325, - "loss": 2.1062, - "step": 2810 - }, - { - "epoch": 2.1, - "grad_norm": 0.12451171875, - "learning_rate": 9.984963976090651e-05, - "loss": 2.1014, - "step": 2815 - }, - { - "epoch": 2.1, - "grad_norm": 0.123046875, - "learning_rate": 9.909897350857706e-05, - "loss": 2.1023, - "step": 2820 - }, - { - "epoch": 2.11, - "grad_norm": 0.12158203125, - "learning_rate": 9.83502091993965e-05, - "loss": 2.1025, - "step": 2825 - }, - { - "epoch": 2.11, - "grad_norm": 0.1279296875, - "learning_rate": 9.760336094726624e-05, - "loss": 2.1241, - "step": 2830 - }, - { - "epoch": 2.11, - "grad_norm": 0.12451171875, - "learning_rate": 9.6858442829971e-05, - "loss": 2.1155, - "step": 2835 - }, - { - "epoch": 2.12, - "grad_norm": 0.123046875, - "learning_rate": 9.611546888891307e-05, - "loss": 2.1012, - "step": 2840 - }, - { - "epoch": 2.12, - "grad_norm": 0.12451171875, - "learning_rate": 9.537445312884788e-05, - "loss": 2.1058, - "step": 2845 - }, - { - "epoch": 2.13, - "grad_norm": 0.12353515625, - "learning_rate": 9.463540951761989e-05, - "loss": 2.0876, - "step": 2850 - }, - { - "epoch": 2.13, - "grad_norm": 0.12451171875, - "learning_rate": 9.389835198589944e-05, - "loss": 2.1222, - "step": 2855 - }, - { - "epoch": 2.13, - "grad_norm": 0.126953125, - "learning_rate": 9.316329442691995e-05, - "loss": 2.1107, - "step": 2860 - }, - { - "epoch": 2.14, - "grad_norm": 0.1240234375, - "learning_rate": 9.243025069621649e-05, - "loss": 2.1065, - "step": 2865 - }, - { - "epoch": 2.14, - "grad_norm": 0.12451171875, - "learning_rate": 9.169923461136376e-05, - "loss": 2.1193, - "step": 2870 - }, - { - "epoch": 2.14, - "grad_norm": 0.125, - "learning_rate": 9.097025995171669e-05, - "loss": 2.1154, - "step": 2875 - }, - { - "epoch": 2.15, - "grad_norm": 0.12255859375, - "learning_rate": 9.024334045814988e-05, - "loss": 2.1055, - "step": 2880 - }, - { - "epoch": 2.15, - "grad_norm": 0.12255859375, - "learning_rate": 8.951848983279898e-05, - "loss": 2.1039, - "step": 2885 - }, - { - "epoch": 2.16, - "grad_norm": 0.123046875, - "learning_rate": 8.87957217388023e-05, - "loss": 2.1249, - "step": 2890 - }, - { - "epoch": 2.16, - "grad_norm": 0.125, - "learning_rate": 8.80750498000432e-05, - "loss": 2.1231, - "step": 2895 - }, - { - "epoch": 2.16, - "grad_norm": 0.1240234375, - "learning_rate": 8.735648760089367e-05, - "loss": 2.1346, - "step": 2900 - }, - { - "epoch": 2.17, - "grad_norm": 0.1259765625, - "learning_rate": 8.66400486859575e-05, - "loss": 2.107, - "step": 2905 - }, - { - "epoch": 2.17, - "grad_norm": 0.12255859375, - "learning_rate": 8.592574655981594e-05, - "loss": 2.1189, - "step": 2910 - }, - { - "epoch": 2.17, - "grad_norm": 0.1259765625, - "learning_rate": 8.521359468677214e-05, - "loss": 2.1061, - "step": 2915 - }, - { - "epoch": 2.18, - "grad_norm": 0.1259765625, - "learning_rate": 8.450360649059834e-05, - "loss": 2.1297, - "step": 2920 - }, - { - "epoch": 2.18, - "grad_norm": 0.1279296875, - "learning_rate": 8.379579535428203e-05, - "loss": 2.1119, - "step": 2925 - }, - { - "epoch": 2.19, - "grad_norm": 0.12353515625, - "learning_rate": 8.309017461977409e-05, - "loss": 2.0947, - "step": 2930 - }, - { - "epoch": 2.19, - "grad_norm": 0.1259765625, - "learning_rate": 8.23867575877374e-05, - "loss": 2.1072, - "step": 2935 - }, - { - "epoch": 2.19, - "grad_norm": 0.123046875, - "learning_rate": 8.168555751729551e-05, - "loss": 2.106, - "step": 2940 - }, - { - "epoch": 2.2, - "grad_norm": 0.1259765625, - "learning_rate": 8.098658762578369e-05, - "loss": 2.1183, - "step": 2945 - }, - { - "epoch": 2.2, - "grad_norm": 0.125, - "learning_rate": 8.028986108849887e-05, - "loss": 2.1103, - "step": 2950 - }, - { - "epoch": 2.2, - "grad_norm": 0.1279296875, - "learning_rate": 7.959539103845184e-05, - "loss": 2.1414, - "step": 2955 - }, - { - "epoch": 2.21, - "grad_norm": 0.125, - "learning_rate": 7.890319056611942e-05, - "loss": 2.11, - "step": 2960 - }, - { - "epoch": 2.21, - "grad_norm": 0.12255859375, - "learning_rate": 7.82132727191978e-05, - "loss": 2.1178, - "step": 2965 - }, - { - "epoch": 2.22, - "grad_norm": 0.1240234375, - "learning_rate": 7.752565050235694e-05, - "loss": 2.1018, - "step": 2970 - }, - { - "epoch": 2.22, - "grad_norm": 0.1259765625, - "learning_rate": 7.684033687699455e-05, - "loss": 2.1184, - "step": 2975 - }, - { - "epoch": 2.22, - "grad_norm": 0.1259765625, - "learning_rate": 7.615734476099284e-05, - "loss": 2.1208, - "step": 2980 - }, - { - "epoch": 2.23, - "grad_norm": 0.1279296875, - "learning_rate": 7.547668702847421e-05, - "loss": 2.1201, - "step": 2985 - }, - { - "epoch": 2.23, - "grad_norm": 0.12451171875, - "learning_rate": 7.479837650955906e-05, - "loss": 2.123, - "step": 2990 - }, - { - "epoch": 2.23, - "grad_norm": 0.1220703125, - "learning_rate": 7.412242599012366e-05, - "loss": 2.1324, - "step": 2995 - }, - { - "epoch": 2.24, - "grad_norm": 0.123046875, - "learning_rate": 7.34488482115593e-05, - "loss": 2.1275, - "step": 3000 - }, - { - "epoch": 2.24, - "grad_norm": 0.12353515625, - "learning_rate": 7.277765587053206e-05, - "loss": 2.1193, - "step": 3005 - }, - { - "epoch": 2.25, - "grad_norm": 0.12451171875, - "learning_rate": 7.210886161874344e-05, - "loss": 2.1165, - "step": 3010 - }, - { - "epoch": 2.25, - "grad_norm": 0.12353515625, - "learning_rate": 7.144247806269213e-05, - "loss": 2.1136, - "step": 3015 - }, - { - "epoch": 2.25, - "grad_norm": 0.12353515625, - "learning_rate": 7.0778517763436e-05, - "loss": 2.0813, - "step": 3020 - }, - { - "epoch": 2.26, - "grad_norm": 0.12451171875, - "learning_rate": 7.011699323635559e-05, - "loss": 2.0982, - "step": 3025 - }, - { - "epoch": 2.26, - "grad_norm": 0.12451171875, - "learning_rate": 6.94579169509181e-05, - "loss": 2.1135, - "step": 3030 - }, - { - "epoch": 2.26, - "grad_norm": 0.12255859375, - "learning_rate": 6.88013013304424e-05, - "loss": 2.1057, - "step": 3035 - }, - { - "epoch": 2.27, - "grad_norm": 0.12353515625, - "learning_rate": 6.814715875186475e-05, - "loss": 2.1319, - "step": 3040 - }, - { - "epoch": 2.27, - "grad_norm": 0.12353515625, - "learning_rate": 6.749550154550585e-05, - "loss": 2.1206, - "step": 3045 - }, - { - "epoch": 2.28, - "grad_norm": 0.126953125, - "learning_rate": 6.684634199483773e-05, - "loss": 2.123, - "step": 3050 - }, - { - "epoch": 2.28, - "grad_norm": 0.126953125, - "learning_rate": 6.619969233625298e-05, - "loss": 2.1197, - "step": 3055 - }, - { - "epoch": 2.28, - "grad_norm": 0.1259765625, - "learning_rate": 6.55555647588336e-05, - "loss": 2.1075, - "step": 3060 - }, - { - "epoch": 2.29, - "grad_norm": 0.125, - "learning_rate": 6.491397140412139e-05, - "loss": 2.1185, - "step": 3065 - }, - { - "epoch": 2.29, - "grad_norm": 0.1259765625, - "learning_rate": 6.42749243658891e-05, - "loss": 2.1114, - "step": 3070 - }, - { - "epoch": 2.29, - "grad_norm": 0.1259765625, - "learning_rate": 6.363843568991243e-05, - "loss": 2.0937, - "step": 3075 - }, - { - "epoch": 2.3, - "grad_norm": 0.123046875, - "learning_rate": 6.300451737374322e-05, - "loss": 2.0954, - "step": 3080 - }, - { - "epoch": 2.3, - "grad_norm": 0.125, - "learning_rate": 6.237318136648258e-05, - "loss": 2.1127, - "step": 3085 - }, - { - "epoch": 2.31, - "grad_norm": 0.12451171875, - "learning_rate": 6.174443956855671e-05, - "loss": 2.1174, - "step": 3090 - }, - { - "epoch": 2.31, - "grad_norm": 0.1328125, - "learning_rate": 6.111830383149164e-05, - "loss": 2.1148, - "step": 3095 - }, - { - "epoch": 2.31, - "grad_norm": 0.1259765625, - "learning_rate": 6.04947859576904e-05, - "loss": 2.1382, - "step": 3100 - }, - { - "epoch": 2.32, - "grad_norm": 0.126953125, - "learning_rate": 5.9873897700210304e-05, - "loss": 2.1125, - "step": 3105 - }, - { - "epoch": 2.32, - "grad_norm": 0.125, - "learning_rate": 5.92556507625414e-05, - "loss": 2.1068, - "step": 3110 - }, - { - "epoch": 2.32, - "grad_norm": 0.12451171875, - "learning_rate": 5.86400567983862e-05, - "loss": 2.1204, - "step": 3115 - }, - { - "epoch": 2.33, - "grad_norm": 0.1240234375, - "learning_rate": 5.802712741143934e-05, - "loss": 2.1046, - "step": 3120 - }, - { - "epoch": 2.33, - "grad_norm": 0.126953125, - "learning_rate": 5.741687415516968e-05, - "loss": 2.1179, - "step": 3125 - }, - { - "epoch": 2.33, - "grad_norm": 0.125, - "learning_rate": 5.680930853260182e-05, - "loss": 2.1184, - "step": 3130 - }, - { - "epoch": 2.34, - "grad_norm": 0.12451171875, - "learning_rate": 5.6204441996099686e-05, - "loss": 2.1204, - "step": 3135 - }, - { - "epoch": 2.34, - "grad_norm": 0.125, - "learning_rate": 5.560228594715049e-05, - "loss": 2.1097, - "step": 3140 - }, - { - "epoch": 2.35, - "grad_norm": 0.1328125, - "learning_rate": 5.500285173614985e-05, - "loss": 2.1141, - "step": 3145 - }, - { - "epoch": 2.35, - "grad_norm": 0.125, - "learning_rate": 5.4406150662188035e-05, - "loss": 2.1096, - "step": 3150 - }, - { - "epoch": 2.35, - "grad_norm": 0.1259765625, - "learning_rate": 5.3812193972836436e-05, - "loss": 2.1134, - "step": 3155 - }, - { - "epoch": 2.36, - "grad_norm": 0.126953125, - "learning_rate": 5.322099286393625e-05, - "loss": 2.1132, - "step": 3160 - }, - { - "epoch": 2.36, - "grad_norm": 0.126953125, - "learning_rate": 5.263255847938693e-05, - "loss": 2.1083, - "step": 3165 - }, - { - "epoch": 2.36, - "grad_norm": 0.126953125, - "learning_rate": 5.204690191093635e-05, - "loss": 2.1093, - "step": 3170 - }, - { - "epoch": 2.37, - "grad_norm": 0.1318359375, - "learning_rate": 5.1464034197971726e-05, - "loss": 2.1123, - "step": 3175 - }, - { - "epoch": 2.37, - "grad_norm": 0.1240234375, - "learning_rate": 5.08839663273114e-05, - "loss": 2.1115, - "step": 3180 - }, - { - "epoch": 2.38, - "grad_norm": 0.12451171875, - "learning_rate": 5.030670923299785e-05, - "loss": 2.1129, - "step": 3185 - }, - { - "epoch": 2.38, - "grad_norm": 0.125, - "learning_rate": 4.9732273796091685e-05, - "loss": 2.1259, - "step": 3190 - }, - { - "epoch": 2.38, - "grad_norm": 0.12451171875, - "learning_rate": 4.916067084446632e-05, - "loss": 2.1305, - "step": 3195 - }, - { - "epoch": 2.39, - "grad_norm": 0.1259765625, - "learning_rate": 4.859191115260393e-05, - "loss": 2.098, - "step": 3200 - }, - { - "epoch": 2.39, - "grad_norm": 0.126953125, - "learning_rate": 4.8026005441392505e-05, - "loss": 2.1109, - "step": 3205 - }, - { - "epoch": 2.39, - "grad_norm": 0.1259765625, - "learning_rate": 4.7462964377923635e-05, - "loss": 2.1043, - "step": 3210 - }, - { - "epoch": 2.4, - "grad_norm": 0.1240234375, - "learning_rate": 4.690279857529145e-05, - "loss": 2.0896, - "step": 3215 - }, - { - "epoch": 2.4, - "grad_norm": 0.125, - "learning_rate": 4.634551859239254e-05, - "loss": 2.1266, - "step": 3220 - }, - { - "epoch": 2.41, - "grad_norm": 0.125, - "learning_rate": 4.57911349337272e-05, - "loss": 2.0982, - "step": 3225 - }, - { - "epoch": 2.41, - "grad_norm": 0.1279296875, - "learning_rate": 4.523965804920078e-05, - "loss": 2.1275, - "step": 3230 - }, - { - "epoch": 2.41, - "grad_norm": 0.1279296875, - "learning_rate": 4.469109833392759e-05, - "loss": 2.1323, - "step": 3235 - }, - { - "epoch": 2.42, - "grad_norm": 0.12451171875, - "learning_rate": 4.414546612803421e-05, - "loss": 2.1026, - "step": 3240 - }, - { - "epoch": 2.42, - "grad_norm": 0.12353515625, - "learning_rate": 4.3602771716464874e-05, - "loss": 2.085, - "step": 3245 - }, - { - "epoch": 2.42, - "grad_norm": 0.126953125, - "learning_rate": 4.3063025328787676e-05, - "loss": 2.1026, - "step": 3250 - }, - { - "epoch": 2.43, - "grad_norm": 0.12451171875, - "learning_rate": 4.252623713900159e-05, - "loss": 2.1093, - "step": 3255 - }, - { - "epoch": 2.43, - "grad_norm": 0.125, - "learning_rate": 4.199241726534495e-05, - "loss": 2.1026, - "step": 3260 - }, - { - "epoch": 2.44, - "grad_norm": 0.1279296875, - "learning_rate": 4.146157577010421e-05, - "loss": 2.1192, - "step": 3265 - }, - { - "epoch": 2.44, - "grad_norm": 0.12451171875, - "learning_rate": 4.0933722659424945e-05, - "loss": 2.1114, - "step": 3270 - }, - { - "epoch": 2.44, - "grad_norm": 0.1259765625, - "learning_rate": 4.040886788312268e-05, - "loss": 2.1239, - "step": 3275 - }, - { - "epoch": 2.45, - "grad_norm": 0.1259765625, - "learning_rate": 3.9887021334495625e-05, - "loss": 2.0952, - "step": 3280 - }, - { - "epoch": 2.45, - "grad_norm": 0.1259765625, - "learning_rate": 3.936819285013826e-05, - "loss": 2.114, - "step": 3285 - }, - { - "epoch": 2.45, - "grad_norm": 0.12890625, - "learning_rate": 3.885239220975547e-05, - "loss": 2.1189, - "step": 3290 - }, - { - "epoch": 2.46, - "grad_norm": 0.125, - "learning_rate": 3.833962913597893e-05, - "loss": 2.0974, - "step": 3295 - }, - { - "epoch": 2.46, - "grad_norm": 0.1259765625, - "learning_rate": 3.7829913294183e-05, - "loss": 2.1085, - "step": 3300 - }, - { - "epoch": 2.47, - "grad_norm": 0.1240234375, - "learning_rate": 3.73232542923033e-05, - "loss": 2.1023, - "step": 3305 - }, - { - "epoch": 2.47, - "grad_norm": 0.125, - "learning_rate": 3.681966168065509e-05, - "loss": 2.1213, - "step": 3310 - }, - { - "epoch": 2.47, - "grad_norm": 0.12451171875, - "learning_rate": 3.6319144951753436e-05, - "loss": 2.1312, - "step": 3315 - }, - { - "epoch": 2.48, - "grad_norm": 0.1259765625, - "learning_rate": 3.582171354013444e-05, - "loss": 2.1268, - "step": 3320 - }, - { - "epoch": 2.48, - "grad_norm": 0.126953125, - "learning_rate": 3.5327376822176885e-05, - "loss": 2.1068, - "step": 3325 - }, - { - "epoch": 2.48, - "grad_norm": 0.126953125, - "learning_rate": 3.483614411592628e-05, - "loss": 2.1057, - "step": 3330 - }, - { - "epoch": 2.49, - "grad_norm": 0.12890625, - "learning_rate": 3.434802468091836e-05, - "loss": 2.103, - "step": 3335 - }, - { - "epoch": 2.49, - "grad_norm": 0.1259765625, - "learning_rate": 3.386302771800527e-05, - "loss": 2.1166, - "step": 3340 - }, - { - "epoch": 2.5, - "grad_norm": 0.1240234375, - "learning_rate": 3.3381162369181717e-05, - "loss": 2.1176, - "step": 3345 - }, - { - "epoch": 2.5, - "grad_norm": 0.125, - "learning_rate": 3.290243771741275e-05, - "loss": 2.1137, - "step": 3350 - }, - { - "epoch": 2.5, - "grad_norm": 0.1259765625, - "learning_rate": 3.2426862786462565e-05, - "loss": 2.1017, - "step": 3355 - }, - { - "epoch": 2.51, - "grad_norm": 0.1259765625, - "learning_rate": 3.195444654072439e-05, - "loss": 2.1049, - "step": 3360 - }, - { - "epoch": 2.51, - "grad_norm": 0.125, - "learning_rate": 3.148519788505166e-05, - "loss": 2.1144, - "step": 3365 - }, - { - "epoch": 2.51, - "grad_norm": 0.125, - "learning_rate": 3.101912566458989e-05, - "loss": 2.0956, - "step": 3370 - }, - { - "epoch": 2.52, - "grad_norm": 0.1259765625, - "learning_rate": 3.0556238664610105e-05, - "loss": 2.1077, - "step": 3375 - }, - { - "epoch": 2.52, - "grad_norm": 0.126953125, - "learning_rate": 3.009654561034323e-05, - "loss": 2.1178, - "step": 3380 - }, - { - "epoch": 2.53, - "grad_norm": 0.12451171875, - "learning_rate": 2.9640055166815673e-05, - "loss": 2.105, - "step": 3385 - }, - { - "epoch": 2.53, - "grad_norm": 0.12451171875, - "learning_rate": 2.918677593868586e-05, - "loss": 2.1051, - "step": 3390 - }, - { - "epoch": 2.53, - "grad_norm": 0.12451171875, - "learning_rate": 2.8736716470082204e-05, - "loss": 2.0967, - "step": 3395 - }, - { - "epoch": 2.54, - "grad_norm": 0.12451171875, - "learning_rate": 2.8289885244441803e-05, - "loss": 2.1174, - "step": 3400 - }, - { - "epoch": 2.54, - "grad_norm": 0.12890625, - "learning_rate": 2.7846290684350963e-05, - "loss": 2.1216, - "step": 3405 - }, - { - "epoch": 2.54, - "grad_norm": 0.1259765625, - "learning_rate": 2.740594115138595e-05, - "loss": 2.1199, - "step": 3410 - }, - { - "epoch": 2.55, - "grad_norm": 0.1240234375, - "learning_rate": 2.6968844945955617e-05, - "loss": 2.1112, - "step": 3415 - }, - { - "epoch": 2.55, - "grad_norm": 0.1259765625, - "learning_rate": 2.6535010307145002e-05, - "loss": 2.1374, - "step": 3420 - }, - { - "epoch": 2.56, - "grad_norm": 0.1259765625, - "learning_rate": 2.6104445412559876e-05, - "loss": 2.1233, - "step": 3425 - }, - { - "epoch": 2.56, - "grad_norm": 0.1259765625, - "learning_rate": 2.5677158378172707e-05, - "loss": 2.1049, - "step": 3430 - }, - { - "epoch": 2.56, - "grad_norm": 0.1279296875, - "learning_rate": 2.5253157258169567e-05, - "loss": 2.1058, - "step": 3435 - }, - { - "epoch": 2.57, - "grad_norm": 0.125, - "learning_rate": 2.4832450044798573e-05, - "loss": 2.1154, - "step": 3440 - }, - { - "epoch": 2.57, - "grad_norm": 0.126953125, - "learning_rate": 2.4415044668218735e-05, - "loss": 2.1126, - "step": 3445 - }, - { - "epoch": 2.57, - "grad_norm": 0.1259765625, - "learning_rate": 2.4000948996351104e-05, - "loss": 2.132, - "step": 3450 - }, - { - "epoch": 2.58, - "grad_norm": 0.1279296875, - "learning_rate": 2.359017083472994e-05, - "loss": 2.1093, - "step": 3455 - }, - { - "epoch": 2.58, - "grad_norm": 0.12353515625, - "learning_rate": 2.3182717926355845e-05, - "loss": 2.0929, - "step": 3460 - }, - { - "epoch": 2.58, - "grad_norm": 0.1259765625, - "learning_rate": 2.277859795154986e-05, - "loss": 2.1068, - "step": 3465 - }, - { - "epoch": 2.59, - "grad_norm": 0.1259765625, - "learning_rate": 2.237781852780838e-05, - "loss": 2.1095, - "step": 3470 - }, - { - "epoch": 2.59, - "grad_norm": 0.125, - "learning_rate": 2.1980387209660026e-05, - "loss": 2.1148, - "step": 3475 - }, - { - "epoch": 2.6, - "grad_norm": 0.126953125, - "learning_rate": 2.1586311488522702e-05, - "loss": 2.1104, - "step": 3480 - }, - { - "epoch": 2.6, - "grad_norm": 0.1259765625, - "learning_rate": 2.1195598792562964e-05, - "loss": 2.1175, - "step": 3485 - }, - { - "epoch": 2.6, - "grad_norm": 0.1259765625, - "learning_rate": 2.0808256486555554e-05, - "loss": 2.1094, - "step": 3490 - }, - { - "epoch": 2.61, - "grad_norm": 0.126953125, - "learning_rate": 2.042429187174475e-05, - "loss": 2.121, - "step": 3495 - }, - { - "epoch": 2.61, - "grad_norm": 0.1240234375, - "learning_rate": 2.0043712185706863e-05, - "loss": 2.1047, - "step": 3500 - }, - { - "epoch": 2.61, - "grad_norm": 0.126953125, - "learning_rate": 1.966652460221341e-05, - "loss": 2.1098, - "step": 3505 - }, - { - "epoch": 2.62, - "grad_norm": 0.1279296875, - "learning_rate": 1.9292736231096464e-05, - "loss": 2.1114, - "step": 3510 - }, - { - "epoch": 2.62, - "grad_norm": 0.12890625, - "learning_rate": 1.8922354118114138e-05, - "loss": 2.1267, - "step": 3515 - }, - { - "epoch": 2.63, - "grad_norm": 0.125, - "learning_rate": 1.8555385244818035e-05, - "loss": 2.0916, - "step": 3520 - }, - { - "epoch": 2.63, - "grad_norm": 0.126953125, - "learning_rate": 1.8191836528421558e-05, - "loss": 2.0985, - "step": 3525 - }, - { - "epoch": 2.63, - "grad_norm": 0.123046875, - "learning_rate": 1.7831714821669588e-05, - "loss": 2.1129, - "step": 3530 - }, - { - "epoch": 2.64, - "grad_norm": 0.1279296875, - "learning_rate": 1.7475026912709235e-05, - "loss": 2.0889, - "step": 3535 - }, - { - "epoch": 2.64, - "grad_norm": 0.12890625, - "learning_rate": 1.71217795249619e-05, - "loss": 2.1067, - "step": 3540 - }, - { - "epoch": 2.64, - "grad_norm": 0.1240234375, - "learning_rate": 1.6771979316996677e-05, - "loss": 2.0987, - "step": 3545 - }, - { - "epoch": 2.65, - "grad_norm": 0.1259765625, - "learning_rate": 1.6425632882404618e-05, - "loss": 2.099, - "step": 3550 - }, - { - "epoch": 2.65, - "grad_norm": 0.1279296875, - "learning_rate": 1.6082746749674604e-05, - "loss": 2.1211, - "step": 3555 - }, - { - "epoch": 2.66, - "grad_norm": 0.12451171875, - "learning_rate": 1.5743327382070206e-05, - "loss": 2.1099, - "step": 3560 - }, - { - "epoch": 2.66, - "grad_norm": 0.125, - "learning_rate": 1.540738117750793e-05, - "loss": 2.1109, - "step": 3565 - }, - { - "epoch": 2.66, - "grad_norm": 0.126953125, - "learning_rate": 1.507491446843654e-05, - "loss": 2.1064, - "step": 3570 - }, - { - "epoch": 2.67, - "grad_norm": 0.1259765625, - "learning_rate": 1.4745933521717781e-05, - "loss": 2.0908, - "step": 3575 - }, - { - "epoch": 2.67, - "grad_norm": 0.1259765625, - "learning_rate": 1.4420444538508083e-05, - "loss": 2.119, - "step": 3580 - }, - { - "epoch": 2.67, - "grad_norm": 0.125, - "learning_rate": 1.4098453654141975e-05, - "loss": 2.1008, - "step": 3585 - }, - { - "epoch": 2.68, - "grad_norm": 0.130859375, - "learning_rate": 1.377996693801611e-05, - "loss": 2.1134, - "step": 3590 - }, - { - "epoch": 2.68, - "grad_norm": 0.126953125, - "learning_rate": 1.346499039347504e-05, - "loss": 2.1147, - "step": 3595 - }, - { - "epoch": 2.69, - "grad_norm": 0.1259765625, - "learning_rate": 1.3153529957698008e-05, - "loss": 2.1141, - "step": 3600 - }, - { - "epoch": 2.69, - "grad_norm": 0.1259765625, - "learning_rate": 1.2845591501587017e-05, - "loss": 2.0835, - "step": 3605 - }, - { - "epoch": 2.69, - "grad_norm": 0.126953125, - "learning_rate": 1.254118082965634e-05, - "loss": 2.1109, - "step": 3610 - }, - { - "epoch": 2.7, - "grad_norm": 0.1259765625, - "learning_rate": 1.2240303679922727e-05, - "loss": 2.1165, - "step": 3615 - }, - { - "epoch": 2.7, - "grad_norm": 0.1240234375, - "learning_rate": 1.1942965723797671e-05, - "loss": 2.1035, - "step": 3620 - }, - { - "epoch": 2.7, - "grad_norm": 0.1259765625, - "learning_rate": 1.164917256598017e-05, - "loss": 2.1112, - "step": 3625 - }, - { - "epoch": 2.71, - "grad_norm": 0.12451171875, - "learning_rate": 1.1358929744351332e-05, - "loss": 2.1051, - "step": 3630 - }, - { - "epoch": 2.71, - "grad_norm": 0.1279296875, - "learning_rate": 1.1072242729869819e-05, - "loss": 2.1133, - "step": 3635 - }, - { - "epoch": 2.72, - "grad_norm": 0.123046875, - "learning_rate": 1.0789116926468756e-05, - "loss": 2.1097, - "step": 3640 - }, - { - "epoch": 2.72, - "grad_norm": 0.125, - "learning_rate": 1.050955767095403e-05, - "loss": 2.1082, - "step": 3645 - }, - { - "epoch": 2.72, - "grad_norm": 0.1259765625, - "learning_rate": 1.0233570232903323e-05, - "loss": 2.1208, - "step": 3650 - }, - { - "epoch": 2.73, - "grad_norm": 0.1240234375, - "learning_rate": 9.961159814567267e-06, - "loss": 2.097, - "step": 3655 - }, - { - "epoch": 2.73, - "grad_norm": 0.1259765625, - "learning_rate": 9.692331550770918e-06, - "loss": 2.1102, - "step": 3660 - }, - { - "epoch": 2.73, - "grad_norm": 0.126953125, - "learning_rate": 9.42709050881736e-06, - "loss": 2.0946, - "step": 3665 - }, - { - "epoch": 2.74, - "grad_norm": 0.125, - "learning_rate": 9.165441688391885e-06, - "loss": 2.1129, - "step": 3670 - }, - { - "epoch": 2.74, - "grad_norm": 0.1259765625, - "learning_rate": 8.907390021467921e-06, - "loss": 2.1016, - "step": 3675 - }, - { - "epoch": 2.75, - "grad_norm": 0.12451171875, - "learning_rate": 8.652940372214069e-06, - "loss": 2.0817, - "step": 3680 - }, - { - "epoch": 2.75, - "grad_norm": 0.125, - "learning_rate": 8.40209753690222e-06, - "loss": 2.1327, - "step": 3685 - }, - { - "epoch": 2.75, - "grad_norm": 0.125, - "learning_rate": 8.154866243817494e-06, - "loss": 2.1231, - "step": 3690 - }, - { - "epoch": 2.76, - "grad_norm": 0.125, - "learning_rate": 7.911251153168752e-06, - "loss": 2.1175, - "step": 3695 - }, - { - "epoch": 2.76, - "grad_norm": 0.1259765625, - "learning_rate": 7.67125685700103e-06, - "loss": 2.113, - "step": 3700 - }, - { - "epoch": 2.76, - "grad_norm": 0.1240234375, - "learning_rate": 7.434887879108776e-06, - "loss": 2.0957, - "step": 3705 - }, - { - "epoch": 2.77, - "grad_norm": 0.126953125, - "learning_rate": 7.202148674950704e-06, - "loss": 2.1117, - "step": 3710 - }, - { - "epoch": 2.77, - "grad_norm": 0.12451171875, - "learning_rate": 6.97304363156579e-06, - "loss": 2.1021, - "step": 3715 - }, - { - "epoch": 2.78, - "grad_norm": 0.12451171875, - "learning_rate": 6.747577067490563e-06, - "loss": 2.1233, - "step": 3720 - }, - { - "epoch": 2.78, - "grad_norm": 0.126953125, - "learning_rate": 6.525753232677678e-06, - "loss": 2.1289, - "step": 3725 - }, - { - "epoch": 2.78, - "grad_norm": 0.125, - "learning_rate": 6.307576308415852e-06, - "loss": 2.1155, - "step": 3730 - }, - { - "epoch": 2.79, - "grad_norm": 0.12451171875, - "learning_rate": 6.093050407251033e-06, - "loss": 2.108, - "step": 3735 - }, - { - "epoch": 2.79, - "grad_norm": 0.130859375, - "learning_rate": 5.882179572908841e-06, - "loss": 2.1112, - "step": 3740 - }, - { - "epoch": 2.79, - "grad_norm": 0.125, - "learning_rate": 5.6749677802184095e-06, - "loss": 2.118, - "step": 3745 - }, - { - "epoch": 2.8, - "grad_norm": 0.12353515625, - "learning_rate": 5.471418935037398e-06, - "loss": 2.096, - "step": 3750 - }, - { - "epoch": 2.8, - "grad_norm": 0.1279296875, - "learning_rate": 5.271536874178451e-06, - "loss": 2.1087, - "step": 3755 - }, - { - "epoch": 2.8, - "grad_norm": 0.1259765625, - "learning_rate": 5.075325365336791e-06, - "loss": 2.1044, - "step": 3760 - }, - { - "epoch": 2.81, - "grad_norm": 0.1279296875, - "learning_rate": 4.882788107019231e-06, - "loss": 2.127, - "step": 3765 - }, - { - "epoch": 2.81, - "grad_norm": 0.12451171875, - "learning_rate": 4.693928728474517e-06, - "loss": 2.0874, - "step": 3770 - }, - { - "epoch": 2.82, - "grad_norm": 0.1259765625, - "learning_rate": 4.5087507896247605e-06, - "loss": 2.1114, - "step": 3775 - }, - { - "epoch": 2.82, - "grad_norm": 0.12890625, - "learning_rate": 4.327257780998517e-06, - "loss": 2.1198, - "step": 3780 - }, - { - "epoch": 2.82, - "grad_norm": 0.1259765625, - "learning_rate": 4.149453123664881e-06, - "loss": 2.1174, - "step": 3785 - }, - { - "epoch": 2.83, - "grad_norm": 0.1259765625, - "learning_rate": 3.975340169169095e-06, - "loss": 2.0976, - "step": 3790 - }, - { - "epoch": 2.83, - "grad_norm": 0.125, - "learning_rate": 3.804922199469174e-06, - "loss": 2.0964, - "step": 3795 - }, - { - "epoch": 2.83, - "grad_norm": 0.1259765625, - "learning_rate": 3.6382024268743153e-06, - "loss": 2.1217, - "step": 3800 - }, - { - "epoch": 2.84, - "grad_norm": 0.126953125, - "learning_rate": 3.4751839939841435e-06, - "loss": 2.1284, - "step": 3805 - }, - { - "epoch": 2.84, - "grad_norm": 0.12451171875, - "learning_rate": 3.3158699736295375e-06, - "loss": 2.1031, - "step": 3810 - }, - { - "epoch": 2.85, - "grad_norm": 0.1240234375, - "learning_rate": 3.160263368814764e-06, - "loss": 2.1215, - "step": 3815 - }, - { - "epoch": 2.85, - "grad_norm": 0.1298828125, - "learning_rate": 3.0083671126607484e-06, - "loss": 2.1315, - "step": 3820 - }, - { - "epoch": 2.85, - "grad_norm": 0.126953125, - "learning_rate": 2.860184068349958e-06, - "loss": 2.1137, - "step": 3825 - }, - { - "epoch": 2.86, - "grad_norm": 0.1259765625, - "learning_rate": 2.7157170290721625e-06, - "loss": 2.1069, - "step": 3830 - }, - { - "epoch": 2.86, - "grad_norm": 0.12451171875, - "learning_rate": 2.5749687179721815e-06, - "loss": 2.1111, - "step": 3835 - }, - { - "epoch": 2.86, - "grad_norm": 0.12451171875, - "learning_rate": 2.4379417880981304e-06, - "loss": 2.0955, - "step": 3840 - }, - { - "epoch": 2.87, - "grad_norm": 0.12451171875, - "learning_rate": 2.304638822351701e-06, - "loss": 2.1026, - "step": 3845 - }, - { - "epoch": 2.87, - "grad_norm": 0.125, - "learning_rate": 2.1750623334393816e-06, - "loss": 2.0882, - "step": 3850 - }, - { - "epoch": 2.88, - "grad_norm": 0.125, - "learning_rate": 2.049214763825069e-06, - "loss": 2.1075, - "step": 3855 - }, - { - "epoch": 2.88, - "grad_norm": 0.126953125, - "learning_rate": 1.9270984856840867e-06, - "loss": 2.1132, - "step": 3860 - }, - { - "epoch": 2.88, - "grad_norm": 0.125, - "learning_rate": 1.8087158008583515e-06, - "loss": 2.1085, - "step": 3865 - }, - { - "epoch": 2.89, - "grad_norm": 0.125, - "learning_rate": 1.6940689408132092e-06, - "loss": 2.0978, - "step": 3870 - }, - { - "epoch": 2.89, - "grad_norm": 0.126953125, - "learning_rate": 1.583160066595113e-06, - "loss": 2.1095, - "step": 3875 - }, - { - "epoch": 2.89, - "grad_norm": 0.1240234375, - "learning_rate": 1.4759912687910771e-06, - "loss": 2.1079, - "step": 3880 - }, - { - "epoch": 2.9, - "grad_norm": 0.1279296875, - "learning_rate": 1.3725645674891762e-06, - "loss": 2.1186, - "step": 3885 - }, - { - "epoch": 2.9, - "grad_norm": 0.12890625, - "learning_rate": 1.2728819122404646e-06, - "loss": 2.1165, - "step": 3890 - }, - { - "epoch": 2.91, - "grad_norm": 0.12353515625, - "learning_rate": 1.1769451820223376e-06, - "loss": 2.1125, - "step": 3895 - }, - { - "epoch": 2.91, - "grad_norm": 0.126953125, - "learning_rate": 1.084756185202962e-06, - "loss": 2.1132, - "step": 3900 - }, - { - "epoch": 2.91, - "grad_norm": 0.126953125, - "learning_rate": 9.963166595073014e-07, - "loss": 2.1062, - "step": 3905 - }, - { - "epoch": 2.92, - "grad_norm": 0.1240234375, - "learning_rate": 9.116282719842772e-07, - "loss": 2.1068, - "step": 3910 - }, - { - "epoch": 2.92, - "grad_norm": 0.12451171875, - "learning_rate": 8.306926189754372e-07, - "loss": 2.1134, - "step": 3915 - }, - { - "epoch": 2.92, - "grad_norm": 0.1279296875, - "learning_rate": 7.535112260847799e-07, - "loss": 2.1227, - "step": 3920 - }, - { - "epoch": 2.93, - "grad_norm": 0.1240234375, - "learning_rate": 6.800855481500445e-07, - "loss": 2.1204, - "step": 3925 - }, - { - "epoch": 2.93, - "grad_norm": 0.126953125, - "learning_rate": 6.104169692153105e-07, - "loss": 2.1101, - "step": 3930 - }, - { - "epoch": 2.94, - "grad_norm": 0.1279296875, - "learning_rate": 5.44506802504774e-07, - "loss": 2.1111, - "step": 3935 - }, - { - "epoch": 2.94, - "grad_norm": 0.125, - "learning_rate": 4.823562903982337e-07, - "loss": 2.1068, - "step": 3940 - }, - { - "epoch": 2.94, - "grad_norm": 0.1298828125, - "learning_rate": 4.239666044074442e-07, - "loss": 2.1416, - "step": 3945 - }, - { - "epoch": 2.95, - "grad_norm": 0.12451171875, - "learning_rate": 3.693388451541102e-07, - "loss": 2.1217, - "step": 3950 - }, - { - "epoch": 2.95, - "grad_norm": 0.1240234375, - "learning_rate": 3.1847404234923715e-07, - "loss": 2.1201, - "step": 3955 - }, - { - "epoch": 2.95, - "grad_norm": 0.1259765625, - "learning_rate": 2.713731547735687e-07, - "loss": 2.1135, - "step": 3960 - }, - { - "epoch": 2.96, - "grad_norm": 0.125, - "learning_rate": 2.280370702596013e-07, - "loss": 2.0992, - "step": 3965 - }, - { - "epoch": 2.96, - "grad_norm": 0.12353515625, - "learning_rate": 1.8846660567484186e-07, - "loss": 2.1167, - "step": 3970 - }, - { - "epoch": 2.97, - "grad_norm": 0.12451171875, - "learning_rate": 1.5266250690635363e-07, - "loss": 2.0971, - "step": 3975 - }, - { - "epoch": 2.97, - "grad_norm": 0.1240234375, - "learning_rate": 1.2062544884683391e-07, - "loss": 2.1277, - "step": 3980 - }, - { - "epoch": 2.97, - "grad_norm": 0.125, - "learning_rate": 9.235603538171322e-08, - "loss": 2.1004, - "step": 3985 - }, - { - "epoch": 2.98, - "grad_norm": 0.12451171875, - "learning_rate": 6.785479937789773e-08, - "loss": 2.0967, - "step": 3990 - }, - { - "epoch": 2.98, - "grad_norm": 0.1259765625, - "learning_rate": 4.712220267366618e-08, - "loss": 2.1211, - "step": 3995 - }, - { - "epoch": 2.98, - "grad_norm": 0.126953125, - "learning_rate": 3.015863607003233e-08, - "loss": 2.1347, - "step": 4000 - }, - { - "epoch": 2.99, - "grad_norm": 0.125, - "learning_rate": 1.69644193232843e-08, - "loss": 2.1306, - "step": 4005 - }, - { - "epoch": 2.99, - "grad_norm": 0.1240234375, - "learning_rate": 7.539801139011538e-09, - "loss": 2.1013, - "step": 4010 - }, - { - "epoch": 3.0, - "grad_norm": 0.1240234375, - "learning_rate": 1.884959167419709e-09, - "loss": 2.099, - "step": 4015 - }, - { - "epoch": 3.0, - "grad_norm": 0.1259765625, - "learning_rate": 0.0, - "loss": 2.0989, - "step": 4020 - }, - { - "epoch": 3.0, - "eval_loss": 2.1637070178985596, - "eval_runtime": 187.0083, - "eval_samples_per_second": 25.785, - "eval_steps_per_second": 3.224, - "step": 4020 - }, - { - "epoch": 3.0, - "step": 4020, - "total_flos": 9.114826330351862e+17, - "train_loss": 2.1598364234563725, - "train_runtime": 32657.7671, - "train_samples_per_second": 7.879, + "epoch": 2.0, + "step": 2702, + "total_flos": 6.137045069867254e+17, + "train_loss": 2.1601854102158, + "train_runtime": 21984.2259, + "train_samples_per_second": 7.864, "train_steps_per_second": 0.123 } ], "logging_steps": 5, - "max_steps": 4020, + "max_steps": 2702, "num_input_tokens_seen": 0, - "num_train_epochs": 3, + "num_train_epochs": 2, "save_steps": 100, - "total_flos": 9.114826330351862e+17, + "total_flos": 6.137045069867254e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null