diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,982 +1,5688 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.999627004848937, + "epoch": 2.9988810145468108, "eval_steps": 500, - "global_step": 670, + "global_step": 4020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 0.337890625, - "learning_rate": 9.477611940298507e-07, - "loss": 2.637, + "grad_norm": 0.353515625, + "learning_rate": 9.950248756218907e-07, + "loss": 2.6395, "step": 1 }, { - "epoch": 0.01, - "grad_norm": 0.349609375, - "learning_rate": 4.738805970149253e-06, - "loss": 2.6461, + "epoch": 0.0, + "grad_norm": 0.328125, + "learning_rate": 4.975124378109453e-06, + "loss": 2.6482, "step": 5 }, { "epoch": 0.01, - "grad_norm": 0.32421875, - "learning_rate": 9.477611940298506e-06, - "loss": 2.6439, + "grad_norm": 0.337890625, + "learning_rate": 9.950248756218906e-06, + "loss": 2.6389, "step": 10 }, { - "epoch": 0.02, - "grad_norm": 0.35546875, - "learning_rate": 1.4216417910447761e-05, - "loss": 2.642, + "epoch": 0.01, + "grad_norm": 0.3203125, + "learning_rate": 1.4925373134328357e-05, + "loss": 2.629, "step": 15 }, { - "epoch": 0.03, - "grad_norm": 0.365234375, - "learning_rate": 1.895522388059701e-05, - "loss": 2.6207, + "epoch": 0.01, + "grad_norm": 0.328125, + "learning_rate": 1.990049751243781e-05, + "loss": 2.6301, "step": 20 }, { - "epoch": 0.04, - "grad_norm": 0.353515625, - "learning_rate": 2.369402985074627e-05, - "loss": 2.6062, + "epoch": 0.02, + "grad_norm": 0.36328125, + "learning_rate": 2.4875621890547266e-05, + "loss": 2.6048, "step": 25 }, { - "epoch": 0.04, - "grad_norm": 0.291015625, - "learning_rate": 2.8432835820895522e-05, - "loss": 2.5733, + "epoch": 0.02, + "grad_norm": 0.298828125, + "learning_rate": 2.9850746268656714e-05, + "loss": 2.5818, "step": 30 }, { - "epoch": 0.05, - "grad_norm": 0.255859375, - "learning_rate": 3.317164179104477e-05, - "loss": 2.531, + "epoch": 0.03, + "grad_norm": 0.26171875, + "learning_rate": 3.4825870646766175e-05, + "loss": 2.5372, "step": 35 }, { - "epoch": 0.06, - "grad_norm": 0.259765625, - "learning_rate": 3.791044776119402e-05, - "loss": 2.495, + "epoch": 0.03, + "grad_norm": 0.271484375, + "learning_rate": 3.980099502487562e-05, + "loss": 2.507, "step": 40 }, { - "epoch": 0.07, - "grad_norm": 0.177734375, - "learning_rate": 4.2649253731343286e-05, - "loss": 2.4618, + "epoch": 0.03, + "grad_norm": 0.1748046875, + "learning_rate": 4.477611940298508e-05, + "loss": 2.4718, "step": 45 }, { - "epoch": 0.07, - "grad_norm": 0.1494140625, - "learning_rate": 4.738805970149254e-05, - "loss": 2.4374, + "epoch": 0.04, + "grad_norm": 0.1640625, + "learning_rate": 4.975124378109453e-05, + "loss": 2.4398, "step": 50 }, { - "epoch": 0.08, - "grad_norm": 0.1513671875, - "learning_rate": 5.2126865671641794e-05, - "loss": 2.4205, + "epoch": 0.04, + "grad_norm": 0.16015625, + "learning_rate": 5.472636815920398e-05, + "loss": 2.4118, "step": 55 }, { - "epoch": 0.09, - "grad_norm": 0.15234375, - "learning_rate": 5.6865671641791044e-05, - "loss": 2.3846, + "epoch": 0.04, + "grad_norm": 0.1728515625, + "learning_rate": 5.970149253731343e-05, + "loss": 2.3949, "step": 60 }, { - "epoch": 0.1, - "grad_norm": 0.1298828125, - "learning_rate": 6.16044776119403e-05, - "loss": 2.3675, + "epoch": 0.05, + "grad_norm": 0.13671875, + "learning_rate": 6.46766169154229e-05, + "loss": 2.3556, "step": 65 }, { - "epoch": 0.1, - "grad_norm": 0.10888671875, - "learning_rate": 6.349612195786573e-05, - "loss": 2.3648, + "epoch": 0.05, + "grad_norm": 0.1357421875, + "learning_rate": 6.965174129353235e-05, + "loss": 2.3644, "step": 70 }, { - "epoch": 0.11, - "grad_norm": 0.138671875, - "learning_rate": 6.3472426242142e-05, - "loss": 2.3394, + "epoch": 0.06, + "grad_norm": 0.1474609375, + "learning_rate": 7.46268656716418e-05, + "loss": 2.3262, "step": 75 }, { - "epoch": 0.12, - "grad_norm": 0.09912109375, - "learning_rate": 6.342720533773385e-05, - "loss": 2.3241, + "epoch": 0.06, + "grad_norm": 0.11865234375, + "learning_rate": 7.960199004975125e-05, + "loss": 2.3181, "step": 80 }, { - "epoch": 0.13, - "grad_norm": 0.083984375, - "learning_rate": 6.336048992919527e-05, - "loss": 2.3123, + "epoch": 0.06, + "grad_norm": 0.11376953125, + "learning_rate": 8.45771144278607e-05, + "loss": 2.3195, "step": 85 }, { - "epoch": 0.13, - "grad_norm": 0.09228515625, - "learning_rate": 6.327232528613285e-05, - "loss": 2.3201, + "epoch": 0.07, + "grad_norm": 0.10693359375, + "learning_rate": 8.955223880597016e-05, + "loss": 2.2922, "step": 90 }, { - "epoch": 0.14, - "grad_norm": 0.0908203125, - "learning_rate": 6.316277123248829e-05, - "loss": 2.2999, + "epoch": 0.07, + "grad_norm": 0.10009765625, + "learning_rate": 9.452736318407961e-05, + "loss": 2.2996, "step": 95 }, { - "epoch": 0.15, - "grad_norm": 0.07958984375, - "learning_rate": 6.303190210594489e-05, - "loss": 2.3114, + "epoch": 0.07, + "grad_norm": 0.095703125, + "learning_rate": 9.950248756218906e-05, + "loss": 2.2923, "step": 100 }, { - "epoch": 0.16, - "grad_norm": 0.0712890625, - "learning_rate": 6.287980670748592e-05, - "loss": 2.3002, + "epoch": 0.08, + "grad_norm": 0.09912109375, + "learning_rate": 0.0001044776119402985, + "loss": 2.2908, "step": 105 }, { - "epoch": 0.16, - "grad_norm": 0.072265625, - "learning_rate": 6.270658824113884e-05, - "loss": 2.2732, + "epoch": 0.08, + "grad_norm": 0.11328125, + "learning_rate": 0.00010945273631840796, + "loss": 2.2892, "step": 110 }, { - "epoch": 0.17, - "grad_norm": 0.07275390625, - "learning_rate": 6.251236424394651e-05, - "loss": 2.2867, + "epoch": 0.09, + "grad_norm": 0.10107421875, + "learning_rate": 0.00011442786069651741, + "loss": 2.2772, "step": 115 }, { - "epoch": 0.18, - "grad_norm": 0.0732421875, - "learning_rate": 6.229726650621257e-05, - "loss": 2.2718, + "epoch": 0.09, + "grad_norm": 0.09814453125, + "learning_rate": 0.00011940298507462686, + "loss": 2.2492, "step": 120 }, { - "epoch": 0.19, - "grad_norm": 0.0751953125, - "learning_rate": 6.20614409820754e-05, - "loss": 2.274, + "epoch": 0.09, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001243781094527363, + "loss": 2.2669, "step": 125 }, { - "epoch": 0.19, - "grad_norm": 0.072265625, - "learning_rate": 6.180504769047129e-05, - "loss": 2.2688, + "epoch": 0.1, + "grad_norm": 0.1025390625, + "learning_rate": 0.0001293532338308458, + "loss": 2.2486, "step": 130 }, { - "epoch": 0.2, - "grad_norm": 0.0830078125, - "learning_rate": 6.152826060655387e-05, - "loss": 2.272, + "epoch": 0.1, + "grad_norm": 0.10205078125, + "learning_rate": 0.00013432835820895525, + "loss": 2.2655, "step": 135 }, { - "epoch": 0.21, - "grad_norm": 0.07861328125, - "learning_rate": 6.123126754364366e-05, - "loss": 2.2807, + "epoch": 0.1, + "grad_norm": 0.1005859375, + "learning_rate": 0.0001393034825870647, + "loss": 2.2695, "step": 140 }, { - "epoch": 0.22, - "grad_norm": 0.072265625, - "learning_rate": 6.091427002578765e-05, - "loss": 2.2519, + "epoch": 0.11, + "grad_norm": 0.10546875, + "learning_rate": 0.00014427860696517416, + "loss": 2.2627, "step": 145 }, { - "epoch": 0.22, - "grad_norm": 0.0732421875, - "learning_rate": 6.057748315101562e-05, - "loss": 2.2648, + "epoch": 0.11, + "grad_norm": 0.1064453125, + "learning_rate": 0.0001492537313432836, + "loss": 2.2453, "step": 150 }, { - "epoch": 0.23, - "grad_norm": 0.07177734375, - "learning_rate": 6.0221135445385774e-05, - "loss": 2.2739, + "epoch": 0.12, + "grad_norm": 0.1162109375, + "learning_rate": 0.00015422885572139304, + "loss": 2.2462, "step": 155 }, { - "epoch": 0.24, - "grad_norm": 0.07470703125, - "learning_rate": 5.984546870791885e-05, - "loss": 2.2764, + "epoch": 0.12, + "grad_norm": 0.119140625, + "learning_rate": 0.0001592039800995025, + "loss": 2.2574, "step": 160 }, { - "epoch": 0.25, - "grad_norm": 0.0751953125, - "learning_rate": 5.945073784652589e-05, - "loss": 2.2514, + "epoch": 0.12, + "grad_norm": 0.11474609375, + "learning_rate": 0.00016417910447761195, + "loss": 2.2522, "step": 165 }, { - "epoch": 0.25, - "grad_norm": 0.078125, - "learning_rate": 5.9037210705040984e-05, - "loss": 2.2544, + "epoch": 0.13, + "grad_norm": 0.12255859375, + "learning_rate": 0.0001691542288557214, + "loss": 2.2428, "step": 170 }, { - "epoch": 0.26, - "grad_norm": 0.07470703125, - "learning_rate": 5.860516788147634e-05, - "loss": 2.2479, + "epoch": 0.13, + "grad_norm": 0.11669921875, + "learning_rate": 0.00017412935323383086, + "loss": 2.2617, "step": 175 }, { - "epoch": 0.27, - "grad_norm": 0.07763671875, - "learning_rate": 5.815490253762313e-05, - "loss": 2.253, + "epoch": 0.13, + "grad_norm": 0.11328125, + "learning_rate": 0.0001791044776119403, + "loss": 2.2604, "step": 180 }, { - "epoch": 0.28, - "grad_norm": 0.080078125, - "learning_rate": 5.7686720200127084e-05, - "loss": 2.2528, + "epoch": 0.14, + "grad_norm": 0.1142578125, + "learning_rate": 0.00018407960199004977, + "loss": 2.2437, "step": 185 }, { - "epoch": 0.28, - "grad_norm": 0.0791015625, - "learning_rate": 5.7200938553174043e-05, - "loss": 2.253, + "epoch": 0.14, + "grad_norm": 0.1171875, + "learning_rate": 0.00018905472636815922, + "loss": 2.2489, "step": 190 }, { - "epoch": 0.29, - "grad_norm": 0.078125, - "learning_rate": 5.669788722292595e-05, - "loss": 2.2571, + "epoch": 0.15, + "grad_norm": 0.11181640625, + "learning_rate": 0.00019402985074626867, + "loss": 2.2788, "step": 195 }, { - "epoch": 0.3, - "grad_norm": 0.0791015625, - "learning_rate": 5.617790755385372e-05, - "loss": 2.2666, + "epoch": 0.15, + "grad_norm": 0.10986328125, + "learning_rate": 0.00019900497512437813, + "loss": 2.2438, "step": 200 }, { - "epoch": 0.31, - "grad_norm": 0.07958984375, - "learning_rate": 5.5641352377118605e-05, - "loss": 2.2601, + "epoch": 0.15, + "grad_norm": 0.11181640625, + "learning_rate": 0.00020398009950248756, + "loss": 2.2603, "step": 205 }, { - "epoch": 0.31, - "grad_norm": 0.07861328125, - "learning_rate": 5.508858577115933e-05, - "loss": 2.2471, + "epoch": 0.16, + "grad_norm": 0.11181640625, + "learning_rate": 0.000208955223880597, + "loss": 2.242, "step": 210 }, { - "epoch": 0.32, - "grad_norm": 0.080078125, - "learning_rate": 5.451998281464741e-05, - "loss": 2.2523, + "epoch": 0.16, + "grad_norm": 0.10986328125, + "learning_rate": 0.00021393034825870647, + "loss": 2.2205, "step": 215 }, { - "epoch": 0.33, - "grad_norm": 0.07958984375, - "learning_rate": 5.393592933197822e-05, - "loss": 2.244, + "epoch": 0.16, + "grad_norm": 0.11669921875, + "learning_rate": 0.00021890547263681592, + "loss": 2.2329, "step": 220 }, { - "epoch": 0.34, - "grad_norm": 0.0791015625, - "learning_rate": 5.333682163147071e-05, - "loss": 2.242, + "epoch": 0.17, + "grad_norm": 0.115234375, + "learning_rate": 0.00022388059701492538, + "loss": 2.2511, "step": 225 }, { - "epoch": 0.34, - "grad_norm": 0.0771484375, - "learning_rate": 5.2723066236453086e-05, - "loss": 2.2519, + "epoch": 0.17, + "grad_norm": 0.1103515625, + "learning_rate": 0.00022885572139303483, + "loss": 2.2369, "step": 230 }, { - "epoch": 0.35, - "grad_norm": 0.08349609375, - "learning_rate": 5.209507960941733e-05, - "loss": 2.2521, + "epoch": 0.18, + "grad_norm": 0.1123046875, + "learning_rate": 0.00023383084577114426, + "loss": 2.2116, "step": 235 }, { - "epoch": 0.36, - "grad_norm": 0.0830078125, - "learning_rate": 5.145328786942933e-05, - "loss": 2.2525, + "epoch": 0.18, + "grad_norm": 0.11572265625, + "learning_rate": 0.0002388059701492537, + "loss": 2.2502, "step": 240 }, { - "epoch": 0.37, - "grad_norm": 0.07958984375, - "learning_rate": 5.0798126502986685e-05, - "loss": 2.2423, + "epoch": 0.18, + "grad_norm": 0.10986328125, + "learning_rate": 0.00024378109452736317, + "loss": 2.2344, "step": 245 }, { - "epoch": 0.37, - "grad_norm": 0.080078125, - "learning_rate": 5.013004006852019e-05, - "loss": 2.2444, + "epoch": 0.19, + "grad_norm": 0.1142578125, + "learning_rate": 0.0002487562189054726, + "loss": 2.2336, "step": 250 }, { - "epoch": 0.38, - "grad_norm": 0.08154296875, - "learning_rate": 4.944948189473962e-05, - "loss": 2.2562, + "epoch": 0.19, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002537313432835821, + "loss": 2.2274, "step": 255 }, { - "epoch": 0.39, - "grad_norm": 0.08203125, - "learning_rate": 4.875691377302846e-05, - "loss": 2.2443, + "epoch": 0.19, + "grad_norm": 0.109375, + "learning_rate": 0.0002587064676616916, + "loss": 2.2365, "step": 260 }, { - "epoch": 0.4, - "grad_norm": 0.0859375, - "learning_rate": 4.805280564409623e-05, - "loss": 2.2492, + "epoch": 0.2, + "grad_norm": 0.1044921875, + "learning_rate": 0.000263681592039801, + "loss": 2.2357, "step": 265 }, { - "epoch": 0.4, - "grad_norm": 0.08154296875, - "learning_rate": 4.7337635279101233e-05, - "loss": 2.2402, + "epoch": 0.2, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002686567164179105, + "loss": 2.2314, "step": 270 }, { - "epoch": 0.41, - "grad_norm": 0.08203125, - "learning_rate": 4.661188795545985e-05, - "loss": 2.2402, + "epoch": 0.21, + "grad_norm": 0.10791015625, + "learning_rate": 0.0002736318407960199, + "loss": 2.2381, "step": 275 }, { - "epoch": 0.42, - "grad_norm": 0.0869140625, - "learning_rate": 4.5876056127562524e-05, - "loss": 2.2464, + "epoch": 0.21, + "grad_norm": 0.10888671875, + "learning_rate": 0.0002786069651741294, + "loss": 2.2481, "step": 280 }, { - "epoch": 0.43, - "grad_norm": 0.0859375, - "learning_rate": 4.5130639092619825e-05, - "loss": 2.2518, + "epoch": 0.21, + "grad_norm": 0.1142578125, + "learning_rate": 0.00028358208955223883, + "loss": 2.2104, "step": 285 }, { - "epoch": 0.43, - "grad_norm": 0.083984375, - "learning_rate": 4.437614265186536e-05, - "loss": 2.2491, + "epoch": 0.22, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002885572139303483, + "loss": 2.2203, "step": 290 }, { - "epoch": 0.44, - "grad_norm": 0.08154296875, - "learning_rate": 4.361307876734529e-05, - "loss": 2.2474, + "epoch": 0.22, + "grad_norm": 0.10986328125, + "learning_rate": 0.00029353233830845774, + "loss": 2.2426, "step": 295 }, { - "epoch": 0.45, - "grad_norm": 0.083984375, - "learning_rate": 4.2841965214527606e-05, - "loss": 2.2634, + "epoch": 0.22, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002985074626865672, + "loss": 2.2137, "step": 300 }, { - "epoch": 0.46, - "grad_norm": 0.0849609375, - "learning_rate": 4.206332523096655e-05, - "loss": 2.2433, + "epoch": 0.23, + "grad_norm": 0.1064453125, + "learning_rate": 0.00030348258706467665, + "loss": 2.2393, "step": 305 }, { - "epoch": 0.46, - "grad_norm": 0.083984375, - "learning_rate": 4.127768716126082e-05, - "loss": 2.2468, + "epoch": 0.23, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003084577114427861, + "loss": 2.2369, "step": 310 }, { - "epoch": 0.47, - "grad_norm": 0.0869140625, - "learning_rate": 4.0485584098546456e-05, - "loss": 2.2399, + "epoch": 0.23, + "grad_norm": 0.10595703125, + "learning_rate": 0.00031343283582089556, + "loss": 2.2205, "step": 315 }, { - "epoch": 0.48, - "grad_norm": 0.08544921875, - "learning_rate": 3.968755352276755e-05, - "loss": 2.2431, + "epoch": 0.24, + "grad_norm": 0.10400390625, + "learning_rate": 0.000318407960199005, + "loss": 2.2607, "step": 320 }, { - "epoch": 0.48, - "grad_norm": 0.08349609375, - "learning_rate": 3.888413693597025e-05, - "loss": 2.2309, + "epoch": 0.24, + "grad_norm": 0.10400390625, + "learning_rate": 0.00032338308457711447, + "loss": 2.201, "step": 325 }, { - "epoch": 0.49, - "grad_norm": 0.08740234375, - "learning_rate": 3.8075879494867705e-05, - "loss": 2.2298, + "epoch": 0.25, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003283582089552239, + "loss": 2.2304, "step": 330 }, { - "epoch": 0.5, - "grad_norm": 0.08349609375, - "learning_rate": 3.726332964092504e-05, - "loss": 2.2443, + "epoch": 0.25, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003333333333333334, + "loss": 2.214, "step": 335 }, { - "epoch": 0.51, - "grad_norm": 0.09033203125, - "learning_rate": 3.644703872821547e-05, - "loss": 2.2481, + "epoch": 0.25, + "grad_norm": 0.1015625, + "learning_rate": 0.0003383084577114428, + "loss": 2.2251, "step": 340 }, { - "epoch": 0.51, - "grad_norm": 0.08642578125, - "learning_rate": 3.5627560649300175e-05, - "loss": 2.2375, + "epoch": 0.26, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003432835820895523, + "loss": 2.2159, "step": 345 }, { - "epoch": 0.52, - "grad_norm": 0.087890625, - "learning_rate": 3.4805451459385544e-05, - "loss": 2.2368, + "epoch": 0.26, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003482587064676617, + "loss": 2.2085, "step": 350 }, { - "epoch": 0.53, - "grad_norm": 0.0830078125, - "learning_rate": 3.398126899901305e-05, - "loss": 2.2359, + "epoch": 0.26, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003532338308457712, + "loss": 2.2165, "step": 355 }, { - "epoch": 0.54, - "grad_norm": 0.0830078125, - "learning_rate": 3.31555725155377e-05, - "loss": 2.2472, + "epoch": 0.27, + "grad_norm": 0.109375, + "learning_rate": 0.0003582089552238806, + "loss": 2.2196, "step": 360 }, { - "epoch": 0.54, - "grad_norm": 0.0849609375, - "learning_rate": 3.232892228365181e-05, - "loss": 2.2511, + "epoch": 0.27, + "grad_norm": 0.103515625, + "learning_rate": 0.0003631840796019901, + "loss": 2.212, "step": 365 }, { - "epoch": 0.55, - "grad_norm": 0.0849609375, - "learning_rate": 3.15018792252118e-05, - "loss": 2.2273, + "epoch": 0.28, + "grad_norm": 0.1044921875, + "learning_rate": 0.00036815920398009953, + "loss": 2.2246, "step": 370 }, { - "epoch": 0.56, - "grad_norm": 0.08740234375, - "learning_rate": 3.067500452862575e-05, - "loss": 2.2391, + "epoch": 0.28, + "grad_norm": 0.10009765625, + "learning_rate": 0.00037313432835820896, + "loss": 2.2166, "step": 375 }, { - "epoch": 0.57, - "grad_norm": 0.08447265625, - "learning_rate": 2.984885926806012e-05, - "loss": 2.2424, + "epoch": 0.28, + "grad_norm": 0.10498046875, + "learning_rate": 0.00037810945273631844, + "loss": 2.2207, "step": 380 }, { - "epoch": 0.57, - "grad_norm": 0.0830078125, - "learning_rate": 2.9024004022724027e-05, - "loss": 2.2435, + "epoch": 0.29, + "grad_norm": 0.10693359375, + "learning_rate": 0.00038308457711442787, + "loss": 2.2283, "step": 385 }, { - "epoch": 0.58, - "grad_norm": 0.087890625, - "learning_rate": 2.8200998496489373e-05, - "loss": 2.223, + "epoch": 0.29, + "grad_norm": 0.09765625, + "learning_rate": 0.00038805970149253735, + "loss": 2.2168, "step": 390 }, { - "epoch": 0.59, - "grad_norm": 0.08447265625, - "learning_rate": 2.7380401138104845e-05, - "loss": 2.2235, + "epoch": 0.29, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003930348258706468, + "loss": 2.2358, "step": 395 }, { - "epoch": 0.6, - "grad_norm": 0.08984375, - "learning_rate": 2.656276876226166e-05, - "loss": 2.2379, + "epoch": 0.3, + "grad_norm": 0.1015625, + "learning_rate": 0.00039800995024875626, + "loss": 2.2289, "step": 400 }, { - "epoch": 0.6, - "grad_norm": 0.0869140625, - "learning_rate": 2.5748656171768127e-05, - "loss": 2.2232, + "epoch": 0.3, + "grad_norm": 0.099609375, + "learning_rate": 0.00039999932141401753, + "loss": 2.2386, "step": 405 }, { - "epoch": 0.61, - "grad_norm": 0.0888671875, - "learning_rate": 2.4938615781089216e-05, - "loss": 2.2492, + "epoch": 0.31, + "grad_norm": 0.1015625, + "learning_rate": 0.000399995174516356, + "loss": 2.2128, "step": 410 }, { - "epoch": 0.62, - "grad_norm": 0.08447265625, - "learning_rate": 2.413319724150689e-05, - "loss": 2.2429, + "epoch": 0.31, + "grad_norm": 0.09765625, + "learning_rate": 0.00039998725779131805, + "loss": 2.2145, "step": 415 }, { - "epoch": 0.63, - "grad_norm": 0.0849609375, - "learning_rate": 2.3332947068155364e-05, - "loss": 2.2486, + "epoch": 0.31, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003999755713881306, + "loss": 2.2056, "step": 420 }, { - "epoch": 0.63, - "grad_norm": 0.08349609375, - "learning_rate": 2.2538408269184268e-05, - "loss": 2.2415, + "epoch": 0.32, + "grad_norm": 0.099609375, + "learning_rate": 0.0003999601155270777, + "loss": 2.2149, "step": 425 }, { - "epoch": 0.64, - "grad_norm": 0.0869140625, - "learning_rate": 2.1750119977301616e-05, - "loss": 2.2568, + "epoch": 0.32, + "grad_norm": 0.099609375, + "learning_rate": 0.00039994089049949597, + "loss": 2.2165, "step": 430 }, { - "epoch": 0.65, - "grad_norm": 0.0859375, - "learning_rate": 2.096861708394641e-05, - "loss": 2.235, + "epoch": 0.32, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003999178966677693, + "loss": 2.2031, "step": 435 }, { - "epoch": 0.66, - "grad_norm": 0.0849609375, - "learning_rate": 2.0194429876339054e-05, - "loss": 2.2364, + "epoch": 0.33, + "grad_norm": 0.1025390625, + "learning_rate": 0.00039989113446532205, + "loss": 2.2134, "step": 440 }, { - "epoch": 0.66, - "grad_norm": 0.0859375, - "learning_rate": 1.9428083677656066e-05, - "loss": 2.2439, + "epoch": 0.33, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003998606043966108, + "loss": 2.1896, "step": 445 }, { - "epoch": 0.67, - "grad_norm": 0.0859375, - "learning_rate": 1.8670098490573132e-05, - "loss": 2.238, + "epoch": 0.34, + "grad_norm": 0.10400390625, + "learning_rate": 0.00039982630703711496, + "loss": 2.2205, "step": 450 }, { - "epoch": 0.68, - "grad_norm": 0.0869140625, - "learning_rate": 1.792098864441825e-05, - "loss": 2.2325, + "epoch": 0.34, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003997882430333256, + "loss": 2.2203, "step": 455 }, { - "epoch": 0.69, - "grad_norm": 0.08544921875, - "learning_rate": 1.7181262446174615e-05, - "loss": 2.2342, + "epoch": 0.34, + "grad_norm": 0.09912109375, + "learning_rate": 0.00039974641310273386, + "loss": 2.2134, "step": 460 }, { - "epoch": 0.69, - "grad_norm": 0.08740234375, - "learning_rate": 1.6451421835570044e-05, - "loss": 2.2262, + "epoch": 0.35, + "grad_norm": 0.099609375, + "learning_rate": 0.0003997008180338166, + "loss": 2.2344, "step": 465 }, { - "epoch": 0.7, - "grad_norm": 0.08642578125, - "learning_rate": 1.5731962044486608e-05, - "loss": 2.2577, + "epoch": 0.35, + "grad_norm": 0.10302734375, + "learning_rate": 0.00039965145868602243, + "loss": 2.1976, "step": 470 }, { - "epoch": 0.71, - "grad_norm": 0.08544921875, - "learning_rate": 1.5023371260922157e-05, - "loss": 2.2481, + "epoch": 0.35, + "grad_norm": 0.09716796875, + "learning_rate": 0.0003995983359897548, + "loss": 2.2175, "step": 475 }, { - "epoch": 0.72, - "grad_norm": 0.083984375, - "learning_rate": 1.4326130297731294e-05, - "loss": 2.229, + "epoch": 0.36, + "grad_norm": 0.0947265625, + "learning_rate": 0.000399541450946355, + "loss": 2.2145, "step": 480 }, { - "epoch": 0.72, - "grad_norm": 0.08251953125, - "learning_rate": 1.3640712266370778e-05, - "loss": 2.2365, + "epoch": 0.36, + "grad_norm": 0.09619140625, + "learning_rate": 0.00039948080462808266, + "loss": 2.2186, "step": 485 }, { - "epoch": 0.73, - "grad_norm": 0.08349609375, - "learning_rate": 1.2967582255870662e-05, - "loss": 2.2276, + "epoch": 0.37, + "grad_norm": 0.10009765625, + "learning_rate": 0.0003994163981780963, + "loss": 2.1911, "step": 490 }, { - "epoch": 0.74, - "grad_norm": 0.08203125, - "learning_rate": 1.2307197017249163e-05, - "loss": 2.245, + "epoch": 0.37, + "grad_norm": 0.09716796875, + "learning_rate": 0.00039934823281043103, + "loss": 2.2123, "step": 495 }, { - "epoch": 0.75, - "grad_norm": 0.08544921875, - "learning_rate": 1.166000465358504e-05, - "loss": 2.2327, + "epoch": 0.37, + "grad_norm": 0.09716796875, + "learning_rate": 0.00039927630980997634, + "loss": 2.2019, "step": 500 }, { - "epoch": 0.75, - "grad_norm": 0.08349609375, - "learning_rate": 1.1026444315958248e-05, - "loss": 2.2257, + "epoch": 0.38, + "grad_norm": 0.09814453125, + "learning_rate": 0.00039920063053245145, + "loss": 2.2251, "step": 505 }, { - "epoch": 0.76, - "grad_norm": 0.08642578125, - "learning_rate": 1.0406945905464832e-05, - "loss": 2.2355, + "epoch": 0.38, + "grad_norm": 0.10009765625, + "learning_rate": 0.00039912119640437963, + "loss": 2.2116, "step": 510 }, { - "epoch": 0.77, - "grad_norm": 0.08349609375, - "learning_rate": 9.801929781508377e-06, - "loss": 2.2366, + "epoch": 0.38, + "grad_norm": 0.1005859375, + "learning_rate": 0.00039903800892306194, + "loss": 2.2142, "step": 515 }, { - "epoch": 0.78, - "grad_norm": 0.0849609375, - "learning_rate": 9.211806476565995e-06, - "loss": 2.2307, + "epoch": 0.39, + "grad_norm": 0.0986328125, + "learning_rate": 0.00039895106965654836, + "loss": 2.2006, "step": 520 }, { - "epoch": 0.78, - "grad_norm": 0.0869140625, - "learning_rate": 8.63697641762235e-06, - "loss": 2.2298, + "epoch": 0.39, + "grad_norm": 0.09765625, + "learning_rate": 0.0003988603802436086, + "loss": 2.2168, "step": 525 }, { - "epoch": 0.79, - "grad_norm": 0.0830078125, - "learning_rate": 8.077829654460684e-06, - "loss": 2.24, + "epoch": 0.4, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003987659423937011, + "loss": 2.2054, "step": 530 }, { - "epoch": 0.8, - "grad_norm": 0.0869140625, - "learning_rate": 7.534745594995376e-06, - "loss": 2.2349, + "epoch": 0.4, + "grad_norm": 0.1044921875, + "learning_rate": 0.0003986677578869407, + "loss": 2.2115, "step": 535 }, { - "epoch": 0.81, - "grad_norm": 0.08544921875, - "learning_rate": 7.00809274782543e-06, - "loss": 2.2522, + "epoch": 0.4, + "grad_norm": 0.0986328125, + "learning_rate": 0.00039856582857406524, + "loss": 2.1906, "step": 540 }, { - "epoch": 0.81, - "grad_norm": 0.087890625, - "learning_rate": 6.498228472183709e-06, - "loss": 2.2253, + "epoch": 0.41, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003984601563764007, + "loss": 2.2139, "step": 545 }, { - "epoch": 0.82, - "grad_norm": 0.08544921875, - "learning_rate": 6.00549873545155e-06, - "loss": 2.2319, + "epoch": 0.41, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003983507432858249, + "loss": 2.1912, "step": 550 }, { - "epoch": 0.83, - "grad_norm": 0.08544921875, - "learning_rate": 5.530237878403297e-06, - "loss": 2.23, + "epoch": 0.41, + "grad_norm": 0.09765625, + "learning_rate": 0.0003982375913647298, + "loss": 2.2201, "step": 555 }, { - "epoch": 0.84, - "grad_norm": 0.08203125, - "learning_rate": 5.0727683883399965e-06, - "loss": 2.2366, + "epoch": 0.42, + "grad_norm": 0.0966796875, + "learning_rate": 0.000398120702745983, + "loss": 2.1941, "step": 560 }, { - "epoch": 0.84, - "grad_norm": 0.08642578125, - "learning_rate": 4.63340068026638e-06, - "loss": 2.238, + "epoch": 0.42, + "grad_norm": 0.09716796875, + "learning_rate": 0.0003980000796328872, + "loss": 2.2206, "step": 565 }, { - "epoch": 0.85, - "grad_norm": 0.087890625, - "learning_rate": 4.212432886259269e-06, - "loss": 2.2238, + "epoch": 0.43, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003978757242991389, + "loss": 2.2062, "step": 570 }, { - "epoch": 0.86, - "grad_norm": 0.0859375, - "learning_rate": 3.8101506531707373e-06, - "loss": 2.2308, + "epoch": 0.43, + "grad_norm": 0.0986328125, + "learning_rate": 0.00039774763908878525, + "loss": 2.2098, "step": 575 }, { - "epoch": 0.87, - "grad_norm": 0.0859375, - "learning_rate": 3.426826948802979e-06, - "loss": 2.2305, + "epoch": 0.43, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003976158264161802, + "loss": 2.2109, "step": 580 }, { - "epoch": 0.87, - "grad_norm": 0.0849609375, - "learning_rate": 3.0627218766865635e-06, - "loss": 2.2364, + "epoch": 0.44, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003974802887659389, + "loss": 2.2148, "step": 585 }, { - "epoch": 0.88, - "grad_norm": 0.08642578125, - "learning_rate": 2.7180824995877015e-06, - "loss": 2.2435, + "epoch": 0.44, + "grad_norm": 0.09765625, + "learning_rate": 0.0003973410286928906, + "loss": 2.199, "step": 590 }, { - "epoch": 0.89, - "grad_norm": 0.08642578125, - "learning_rate": 2.3931426718643005e-06, - "loss": 2.2347, + "epoch": 0.44, + "grad_norm": 0.09765625, + "learning_rate": 0.0003971980488220308, + "loss": 2.2271, "step": 595 }, { - "epoch": 0.9, - "grad_norm": 0.08251953125, - "learning_rate": 2.0881228807845587e-06, - "loss": 2.2589, + "epoch": 0.45, + "grad_norm": 0.103515625, + "learning_rate": 0.0003970513518484718, + "loss": 2.2221, "step": 600 }, { - "epoch": 0.9, - "grad_norm": 0.0849609375, - "learning_rate": 1.8032300969157738e-06, - "loss": 2.2323, + "epoch": 0.45, + "grad_norm": 0.09912109375, + "learning_rate": 0.00039690094053739157, + "loss": 2.1961, "step": 605 }, { - "epoch": 0.91, - "grad_norm": 0.08642578125, - "learning_rate": 1.5386576336848828e-06, - "loss": 2.2285, + "epoch": 0.46, + "grad_norm": 0.1015625, + "learning_rate": 0.0003967468177239819, + "loss": 2.2078, "step": 610 }, { - "epoch": 0.92, - "grad_norm": 0.0849609375, - "learning_rate": 1.2945850162060271e-06, - "loss": 2.2368, + "epoch": 0.46, + "grad_norm": 0.10205078125, + "learning_rate": 0.00039658898631339496, + "loss": 2.2026, "step": 615 }, { - "epoch": 0.93, - "grad_norm": 0.08251953125, - "learning_rate": 1.0711778594641267e-06, - "loss": 2.2417, + "epoch": 0.46, + "grad_norm": 0.099609375, + "learning_rate": 0.0003964274492806883, + "loss": 2.2094, "step": 620 }, { - "epoch": 0.93, - "grad_norm": 0.08642578125, - "learning_rate": 8.685877559371869e-07, - "loss": 2.2355, + "epoch": 0.47, + "grad_norm": 0.09814453125, + "learning_rate": 0.00039626220967076917, + "loss": 2.2022, "step": 625 }, { - "epoch": 0.94, - "grad_norm": 0.08544921875, - "learning_rate": 6.869521727335145e-07, - "loss": 2.2285, + "epoch": 0.47, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003960932705983365, + "loss": 2.1984, "step": 630 }, { - "epoch": 0.95, - "grad_norm": 0.08447265625, - "learning_rate": 5.26394358313689e-07, - "loss": 2.2262, + "epoch": 0.47, + "grad_norm": 0.10009765625, + "learning_rate": 0.00039592063524782306, + "loss": 2.1981, "step": 635 }, { - "epoch": 0.95, - "grad_norm": 0.08154296875, - "learning_rate": 3.8702325886057816e-07, - "loss": 2.2289, + "epoch": 0.48, + "grad_norm": 0.1044921875, + "learning_rate": 0.00039574430687333464, + "loss": 2.2084, "step": 640 }, { - "epoch": 0.96, - "grad_norm": 0.08642578125, - "learning_rate": 2.6893344435409695e-07, - "loss": 2.2428, + "epoch": 0.48, + "grad_norm": 0.10546875, + "learning_rate": 0.00039556428879858904, + "loss": 2.1912, "step": 645 }, { - "epoch": 0.97, - "grad_norm": 0.08740234375, - "learning_rate": 1.722050444009702e-07, - "loss": 2.2248, + "epoch": 0.48, + "grad_norm": 0.1005859375, + "learning_rate": 0.00039538058441685353, + "loss": 2.1871, "step": 650 }, { - "epoch": 0.98, - "grad_norm": 0.08349609375, - "learning_rate": 9.690369386293054e-08, - "loss": 2.2297, + "epoch": 0.49, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003951931971908807, + "loss": 2.187, "step": 655 }, { - "epoch": 0.98, - "grad_norm": 0.0849609375, - "learning_rate": 4.308048832030403e-08, - "loss": 2.2344, + "epoch": 0.49, + "grad_norm": 0.0966796875, + "learning_rate": 0.0003950021306528432, + "loss": 2.1915, "step": 660 }, { - "epoch": 0.99, - "grad_norm": 0.0859375, - "learning_rate": 1.0771949401241265e-08, - "loss": 2.2296, + "epoch": 0.5, + "grad_norm": 0.09814453125, + "learning_rate": 0.0003948073884042673, + "loss": 2.1892, "step": 665 }, { - "epoch": 1.0, - "grad_norm": 0.087890625, - "learning_rate": 0.0, - "loss": 2.2321, + "epoch": 0.5, + "grad_norm": 0.09912109375, + "learning_rate": 0.00039460897411596477, + "loss": 2.2194, "step": 670 }, { - "epoch": 1.0, - "eval_loss": 2.2356441020965576, - "eval_runtime": 186.994, - "eval_samples_per_second": 25.787, - "eval_steps_per_second": 3.225, - "step": 670 + "epoch": 0.5, + "grad_norm": 0.10107421875, + "learning_rate": 0.00039440689152796406, + "loss": 2.2103, + "step": 675 }, { - "epoch": 1.0, - "step": 670, - "total_flos": 3.038889647317975e+17, - "train_loss": 2.275535515884855, - "train_runtime": 10859.5951, - "train_samples_per_second": 7.898, - "train_steps_per_second": 0.062 + "epoch": 0.51, + "grad_norm": 0.10107421875, + "learning_rate": 0.00039420114444943934, + "loss": 2.2032, + "step": 680 + }, + { + "epoch": 0.51, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003939917367586391, + "loss": 2.1989, + "step": 685 + }, + { + "epoch": 0.51, + "grad_norm": 0.1005859375, + "learning_rate": 0.00039377867240281275, + "loss": 2.1929, + "step": 690 + }, + { + "epoch": 0.52, + "grad_norm": 0.1015625, + "learning_rate": 0.0003935619553981364, + "loss": 2.1961, + "step": 695 + }, + { + "epoch": 0.52, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003933415898296372, + "loss": 2.1941, + "step": 700 + }, + { + "epoch": 0.53, + "grad_norm": 0.10009765625, + "learning_rate": 0.000393117579851116, + "loss": 2.1983, + "step": 705 + }, + { + "epoch": 0.53, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003928899296850695, + "loss": 2.1912, + "step": 710 + }, + { + "epoch": 0.53, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003926586436226103, + "loss": 2.2096, + "step": 715 + }, + { + "epoch": 0.54, + "grad_norm": 0.10400390625, + "learning_rate": 0.0003924237260233863, + "loss": 2.2007, + "step": 720 + }, + { + "epoch": 0.54, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003921851813154983, + "loss": 2.2171, + "step": 725 + }, + { + "epoch": 0.54, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003919430139954167, + "loss": 2.2002, + "step": 730 + }, + { + "epoch": 0.55, + "grad_norm": 0.10009765625, + "learning_rate": 0.00039169722862789644, + "loss": 2.1913, + "step": 735 + }, + { + "epoch": 0.55, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003914478298458916, + "loss": 2.1765, + "step": 740 + }, + { + "epoch": 0.56, + "grad_norm": 0.099609375, + "learning_rate": 0.00039119482235046716, + "loss": 2.1971, + "step": 745 + }, + { + "epoch": 0.56, + "grad_norm": 0.10205078125, + "learning_rate": 0.00039093821091071117, + "loss": 2.1978, + "step": 750 + }, + { + "epoch": 0.56, + "grad_norm": 0.1005859375, + "learning_rate": 0.00039067800036364443, + "loss": 2.1954, + "step": 755 + }, + { + "epoch": 0.57, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003904141956141297, + "loss": 2.2025, + "step": 760 + }, + { + "epoch": 0.57, + "grad_norm": 0.1015625, + "learning_rate": 0.0003901468016347786, + "loss": 2.2045, + "step": 765 + }, + { + "epoch": 0.57, + "grad_norm": 0.103515625, + "learning_rate": 0.00038987582346585847, + "loss": 2.1939, + "step": 770 + }, + { + "epoch": 0.58, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003896012662151972, + "loss": 2.175, + "step": 775 + }, + { + "epoch": 0.58, + "grad_norm": 0.099609375, + "learning_rate": 0.00038932313505808685, + "loss": 2.185, + "step": 780 + }, + { + "epoch": 0.59, + "grad_norm": 0.10107421875, + "learning_rate": 0.00038904143523718615, + "loss": 2.1939, + "step": 785 + }, + { + "epoch": 0.59, + "grad_norm": 0.099609375, + "learning_rate": 0.00038875617206242174, + "loss": 2.1634, + "step": 790 + }, + { + "epoch": 0.59, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003884673509108879, + "loss": 2.1947, + "step": 795 + }, + { + "epoch": 0.6, + "grad_norm": 0.10107421875, + "learning_rate": 0.00038817497722674546, + "loss": 2.1925, + "step": 800 + }, + { + "epoch": 0.6, + "grad_norm": 0.1015625, + "learning_rate": 0.000387879056521119, + "loss": 2.1809, + "step": 805 + }, + { + "epoch": 0.6, + "grad_norm": 0.099609375, + "learning_rate": 0.0003875795943719929, + "loss": 2.1763, + "step": 810 + }, + { + "epoch": 0.61, + "grad_norm": 0.1025390625, + "learning_rate": 0.00038727659642410654, + "loss": 2.2132, + "step": 815 + }, + { + "epoch": 0.61, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003869700683888474, + "loss": 2.194, + "step": 820 + }, + { + "epoch": 0.62, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003866600160441438, + "loss": 2.1861, + "step": 825 + }, + { + "epoch": 0.62, + "grad_norm": 0.1005859375, + "learning_rate": 0.00038634644523435587, + "loss": 2.2093, + "step": 830 + }, + { + "epoch": 0.62, + "grad_norm": 0.1005859375, + "learning_rate": 0.0003860293618701653, + "loss": 2.1894, + "step": 835 + }, + { + "epoch": 0.63, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003857087719284641, + "loss": 2.2219, + "step": 840 + }, + { + "epoch": 0.63, + "grad_norm": 0.10009765625, + "learning_rate": 0.00038538468145224165, + "loss": 2.1982, + "step": 845 + }, + { + "epoch": 0.63, + "grad_norm": 0.10205078125, + "learning_rate": 0.00038505709655047113, + "loss": 2.1954, + "step": 850 + }, + { + "epoch": 0.64, + "grad_norm": 0.1005859375, + "learning_rate": 0.00038472602339799427, + "loss": 2.2064, + "step": 855 + }, + { + "epoch": 0.64, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003843914682354047, + "loss": 2.2166, + "step": 860 + }, + { + "epoch": 0.65, + "grad_norm": 0.09912109375, + "learning_rate": 0.00038405343736893065, + "loss": 2.2001, + "step": 865 + }, + { + "epoch": 0.65, + "grad_norm": 0.1015625, + "learning_rate": 0.000383711937170316, + "loss": 2.1765, + "step": 870 + }, + { + "epoch": 0.65, + "grad_norm": 0.10302734375, + "learning_rate": 0.00038336697407669994, + "loss": 2.1826, + "step": 875 + }, + { + "epoch": 0.66, + "grad_norm": 0.10302734375, + "learning_rate": 0.000383018554590496, + "loss": 2.1964, + "step": 880 + }, + { + "epoch": 0.66, + "grad_norm": 0.09912109375, + "learning_rate": 0.0003826666852792692, + "loss": 2.1954, + "step": 885 + }, + { + "epoch": 0.66, + "grad_norm": 0.1044921875, + "learning_rate": 0.00038231137277561244, + "loss": 2.2015, + "step": 890 + }, + { + "epoch": 0.67, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003819526237770212, + "loss": 2.1932, + "step": 895 + }, + { + "epoch": 0.67, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003815904450457677, + "loss": 2.1906, + "step": 900 + }, + { + "epoch": 0.68, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003812248434087732, + "loss": 2.1776, + "step": 905 + }, + { + "epoch": 0.68, + "grad_norm": 0.10107421875, + "learning_rate": 0.00038085582575747914, + "loss": 2.1936, + "step": 910 + }, + { + "epoch": 0.68, + "grad_norm": 0.1015625, + "learning_rate": 0.0003804833990477177, + "loss": 2.1819, + "step": 915 + }, + { + "epoch": 0.69, + "grad_norm": 0.10107421875, + "learning_rate": 0.00038010757029958016, + "loss": 2.1918, + "step": 920 + }, + { + "epoch": 0.69, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003797283465972851, + "loss": 2.1776, + "step": 925 + }, + { + "epoch": 0.69, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003793457350890443, + "loss": 2.1786, + "step": 930 + }, + { + "epoch": 0.7, + "grad_norm": 0.1015625, + "learning_rate": 0.0003789597429869286, + "loss": 2.2086, + "step": 935 + }, + { + "epoch": 0.7, + "grad_norm": 0.10400390625, + "learning_rate": 0.0003785703775667314, + "loss": 2.2138, + "step": 940 + }, + { + "epoch": 0.7, + "grad_norm": 0.099609375, + "learning_rate": 0.00037817764616783196, + "loss": 2.2123, + "step": 945 + }, + { + "epoch": 0.71, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003777815561930568, + "loss": 2.1889, + "step": 950 + }, + { + "epoch": 0.71, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003773821151085401, + "loss": 2.1946, + "step": 955 + }, + { + "epoch": 0.72, + "grad_norm": 0.1025390625, + "learning_rate": 0.00037697933044358335, + "loss": 2.1688, + "step": 960 + }, + { + "epoch": 0.72, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003765732097905129, + "loss": 2.1795, + "step": 965 + }, + { + "epoch": 0.72, + "grad_norm": 0.1005859375, + "learning_rate": 0.00037616376080453737, + "loss": 2.1966, + "step": 970 + }, + { + "epoch": 0.73, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003757509912036028, + "loss": 2.1901, + "step": 975 + }, + { + "epoch": 0.73, + "grad_norm": 0.103515625, + "learning_rate": 0.0003753349087682477, + "loss": 2.1685, + "step": 980 + }, + { + "epoch": 0.73, + "grad_norm": 0.1015625, + "learning_rate": 0.000374915521341456, + "loss": 2.1923, + "step": 985 + }, + { + "epoch": 0.74, + "grad_norm": 0.10205078125, + "learning_rate": 0.00037449283682850957, + "loss": 2.1995, + "step": 990 + }, + { + "epoch": 0.74, + "grad_norm": 0.10400390625, + "learning_rate": 0.00037406686319683887, + "loss": 2.1921, + "step": 995 + }, + { + "epoch": 0.75, + "grad_norm": 0.10546875, + "learning_rate": 0.00037363760847587284, + "loss": 2.178, + "step": 1000 + }, + { + "epoch": 0.75, + "grad_norm": 0.10400390625, + "learning_rate": 0.00037320508075688776, + "loss": 2.1711, + "step": 1005 + }, + { + "epoch": 0.75, + "grad_norm": 0.10205078125, + "learning_rate": 0.00037276928819285446, + "loss": 2.1825, + "step": 1010 + }, + { + "epoch": 0.76, + "grad_norm": 0.1015625, + "learning_rate": 0.0003723302389982849, + "loss": 2.1925, + "step": 1015 + }, + { + "epoch": 0.76, + "grad_norm": 0.1044921875, + "learning_rate": 0.0003718879414490771, + "loss": 2.1758, + "step": 1020 + }, + { + "epoch": 0.76, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003714424038823592, + "loss": 2.1837, + "step": 1025 + }, + { + "epoch": 0.77, + "grad_norm": 0.10302734375, + "learning_rate": 0.00037099363469633245, + "loss": 2.1914, + "step": 1030 + }, + { + "epoch": 0.77, + "grad_norm": 0.1044921875, + "learning_rate": 0.0003705416423501128, + "loss": 2.1667, + "step": 1035 + }, + { + "epoch": 0.78, + "grad_norm": 0.1015625, + "learning_rate": 0.0003700864353635714, + "loss": 2.1911, + "step": 1040 + }, + { + "epoch": 0.78, + "grad_norm": 0.10205078125, + "learning_rate": 0.00036962802231717403, + "loss": 2.1867, + "step": 1045 + }, + { + "epoch": 0.78, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003691664118518195, + "loss": 2.1717, + "step": 1050 + }, + { + "epoch": 0.79, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003687016126686765, + "loss": 2.1794, + "step": 1055 + }, + { + "epoch": 0.79, + "grad_norm": 0.1044921875, + "learning_rate": 0.00036823363352901997, + "loss": 2.2012, + "step": 1060 + }, + { + "epoch": 0.79, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003677624832540655, + "loss": 2.1957, + "step": 1065 + }, + { + "epoch": 0.8, + "grad_norm": 0.10888671875, + "learning_rate": 0.0003672881707248034, + "loss": 2.1762, + "step": 1070 + }, + { + "epoch": 0.8, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003668107048818312, + "loss": 2.2005, + "step": 1075 + }, + { + "epoch": 0.81, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003663300947251851, + "loss": 2.2066, + "step": 1080 + }, + { + "epoch": 0.81, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003658463493141703, + "loss": 2.1813, + "step": 1085 + }, + { + "epoch": 0.81, + "grad_norm": 0.10498046875, + "learning_rate": 0.00036535947776719017, + "loss": 2.1659, + "step": 1090 + }, + { + "epoch": 0.82, + "grad_norm": 0.103515625, + "learning_rate": 0.0003648694892615747, + "loss": 2.191, + "step": 1095 + }, + { + "epoch": 0.82, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003643763930334071, + "loss": 2.1706, + "step": 1100 + }, + { + "epoch": 0.82, + "grad_norm": 0.10546875, + "learning_rate": 0.00036388019837734994, + "loss": 2.1885, + "step": 1105 + }, + { + "epoch": 0.83, + "grad_norm": 0.1015625, + "learning_rate": 0.00036338091464646984, + "loss": 2.1711, + "step": 1110 + }, + { + "epoch": 0.83, + "grad_norm": 0.10400390625, + "learning_rate": 0.0003628785512520613, + "loss": 2.1687, + "step": 1115 + }, + { + "epoch": 0.84, + "grad_norm": 0.1015625, + "learning_rate": 0.0003623731176634691, + "loss": 2.2004, + "step": 1120 + }, + { + "epoch": 0.84, + "grad_norm": 0.1025390625, + "learning_rate": 0.00036186462340791014, + "loss": 2.188, + "step": 1125 + }, + { + "epoch": 0.84, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003613530780702934, + "loss": 2.1822, + "step": 1130 + }, + { + "epoch": 0.85, + "grad_norm": 0.10400390625, + "learning_rate": 0.00036083849129303966, + "loss": 2.1848, + "step": 1135 + }, + { + "epoch": 0.85, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003603208727758995, + "loss": 2.1607, + "step": 1140 + }, + { + "epoch": 0.85, + "grad_norm": 0.1025390625, + "learning_rate": 0.00035980023227577063, + "loss": 2.1863, + "step": 1145 + }, + { + "epoch": 0.86, + "grad_norm": 0.10302734375, + "learning_rate": 0.00035927657960651394, + "loss": 2.1711, + "step": 1150 + }, + { + "epoch": 0.86, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003587499246387684, + "loss": 2.1806, + "step": 1155 + }, + { + "epoch": 0.87, + "grad_norm": 0.10693359375, + "learning_rate": 0.00035822027729976504, + "loss": 2.1735, + "step": 1160 + }, + { + "epoch": 0.87, + "grad_norm": 0.1044921875, + "learning_rate": 0.00035768764757314, + "loss": 2.1989, + "step": 1165 + }, + { + "epoch": 0.87, + "grad_norm": 0.103515625, + "learning_rate": 0.00035715204549874617, + "loss": 2.1728, + "step": 1170 + }, + { + "epoch": 0.88, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003566134811724639, + "loss": 2.1933, + "step": 1175 + }, + { + "epoch": 0.88, + "grad_norm": 0.10302734375, + "learning_rate": 0.00035607196474601074, + "loss": 2.1886, + "step": 1180 + }, + { + "epoch": 0.88, + "grad_norm": 0.10205078125, + "learning_rate": 0.00035552750642675043, + "loss": 2.1829, + "step": 1185 + }, + { + "epoch": 0.89, + "grad_norm": 0.1044921875, + "learning_rate": 0.00035498011647749976, + "loss": 2.1755, + "step": 1190 + }, + { + "epoch": 0.89, + "grad_norm": 0.10302734375, + "learning_rate": 0.00035442980521633595, + "loss": 2.2011, + "step": 1195 + }, + { + "epoch": 0.9, + "grad_norm": 0.1015625, + "learning_rate": 0.00035387658301640136, + "loss": 2.2043, + "step": 1200 + }, + { + "epoch": 0.9, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003533204603057088, + "loss": 2.1782, + "step": 1205 + }, + { + "epoch": 0.9, + "grad_norm": 0.10498046875, + "learning_rate": 0.00035276144756694406, + "loss": 2.179, + "step": 1210 + }, + { + "epoch": 0.91, + "grad_norm": 0.10205078125, + "learning_rate": 0.00035219955533726915, + "loss": 2.1841, + "step": 1215 + }, + { + "epoch": 0.91, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003516347942081232, + "loss": 2.1646, + "step": 1220 + }, + { + "epoch": 0.91, + "grad_norm": 0.10205078125, + "learning_rate": 0.00035106717482502267, + "loss": 2.1878, + "step": 1225 + }, + { + "epoch": 0.92, + "grad_norm": 0.10546875, + "learning_rate": 0.0003504967078873613, + "loss": 2.1753, + "step": 1230 + }, + { + "epoch": 0.92, + "grad_norm": 0.103515625, + "learning_rate": 0.000349923404148208, + "loss": 2.1854, + "step": 1235 + }, + { + "epoch": 0.93, + "grad_norm": 0.1044921875, + "learning_rate": 0.0003493472744141041, + "loss": 2.1878, + "step": 1240 + }, + { + "epoch": 0.93, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003487683295448598, + "loss": 2.1675, + "step": 1245 + }, + { + "epoch": 0.93, + "grad_norm": 0.103515625, + "learning_rate": 0.0003481865804533494, + "loss": 2.1902, + "step": 1250 + }, + { + "epoch": 0.94, + "grad_norm": 0.10546875, + "learning_rate": 0.00034760203810530594, + "loss": 2.1848, + "step": 1255 + }, + { + "epoch": 0.94, + "grad_norm": 0.103515625, + "learning_rate": 0.00034701471351911395, + "loss": 2.1638, + "step": 1260 + }, + { + "epoch": 0.94, + "grad_norm": 0.10546875, + "learning_rate": 0.000346424617765602, + "loss": 2.1763, + "step": 1265 + }, + { + "epoch": 0.95, + "grad_norm": 0.1025390625, + "learning_rate": 0.000345831761967834, + "loss": 2.165, + "step": 1270 + }, + { + "epoch": 0.95, + "grad_norm": 0.1044921875, + "learning_rate": 0.00034523615730089986, + "loss": 2.1875, + "step": 1275 + }, + { + "epoch": 0.95, + "grad_norm": 0.10205078125, + "learning_rate": 0.0003446378149917042, + "loss": 2.1595, + "step": 1280 + }, + { + "epoch": 0.96, + "grad_norm": 0.107421875, + "learning_rate": 0.0003440367463187553, + "loss": 2.1841, + "step": 1285 + }, + { + "epoch": 0.96, + "grad_norm": 0.10546875, + "learning_rate": 0.00034343296261195224, + "loss": 2.1882, + "step": 1290 + }, + { + "epoch": 0.97, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003428264752523712, + "loss": 2.1671, + "step": 1295 + }, + { + "epoch": 0.97, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003422172956720514, + "loss": 2.1671, + "step": 1300 + }, + { + "epoch": 0.97, + "grad_norm": 0.103515625, + "learning_rate": 0.00034160543535377926, + "loss": 2.1607, + "step": 1305 + }, + { + "epoch": 0.98, + "grad_norm": 0.1083984375, + "learning_rate": 0.0003409909058308718, + "loss": 2.1827, + "step": 1310 + }, + { + "epoch": 0.98, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003403737186869596, + "loss": 2.183, + "step": 1315 + }, + { + "epoch": 0.98, + "grad_norm": 0.10205078125, + "learning_rate": 0.00033975388555576835, + "loss": 2.1692, + "step": 1320 + }, + { + "epoch": 0.99, + "grad_norm": 0.1064453125, + "learning_rate": 0.0003391314181208995, + "loss": 2.1786, + "step": 1325 + }, + { + "epoch": 0.99, + "grad_norm": 0.10546875, + "learning_rate": 0.00033850632811561, + "loss": 2.166, + "step": 1330 + }, + { + "epoch": 1.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.00033787862732259123, + "loss": 2.1767, + "step": 1335 + }, + { + "epoch": 1.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003372483275737468, + "loss": 2.1729, + "step": 1340 + }, + { + "epoch": 1.0, + "eval_loss": 2.1778452396392822, + "eval_runtime": 189.5201, + "eval_samples_per_second": 25.443, + "eval_steps_per_second": 3.182, + "step": 1340 + }, + { + "epoch": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0003366154407499695, + "loss": 2.1415, + "step": 1345 + }, + { + "epoch": 1.01, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003359799787809179, + "loss": 2.1377, + "step": 1350 + }, + { + "epoch": 1.01, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003353419536447902, + "loss": 2.1349, + "step": 1355 + }, + { + "epoch": 1.01, + "grad_norm": 0.1123046875, + "learning_rate": 0.00033470137736809995, + "loss": 2.1339, + "step": 1360 + }, + { + "epoch": 1.02, + "grad_norm": 0.11328125, + "learning_rate": 0.0003340582620254484, + "loss": 2.1364, + "step": 1365 + }, + { + "epoch": 1.02, + "grad_norm": 0.107421875, + "learning_rate": 0.000333412619739297, + "loss": 2.1484, + "step": 1370 + }, + { + "epoch": 1.03, + "grad_norm": 0.10888671875, + "learning_rate": 0.0003327644626797394, + "loss": 2.1753, + "step": 1375 + }, + { + "epoch": 1.03, + "grad_norm": 0.107421875, + "learning_rate": 0.0003321138030642714, + "loss": 2.165, + "step": 1380 + }, + { + "epoch": 1.03, + "grad_norm": 0.1103515625, + "learning_rate": 0.00033146065315756113, + "loss": 2.1622, + "step": 1385 + }, + { + "epoch": 1.04, + "grad_norm": 0.1123046875, + "learning_rate": 0.00033080502527121756, + "loss": 2.1704, + "step": 1390 + }, + { + "epoch": 1.04, + "grad_norm": 0.111328125, + "learning_rate": 0.0003301469317635587, + "loss": 2.162, + "step": 1395 + }, + { + "epoch": 1.04, + "grad_norm": 0.11181640625, + "learning_rate": 0.00032948638503937846, + "loss": 2.1461, + "step": 1400 + }, + { + "epoch": 1.05, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003288233975497128, + "loss": 2.1515, + "step": 1405 + }, + { + "epoch": 1.05, + "grad_norm": 0.10986328125, + "learning_rate": 0.00032815798179160524, + "loss": 2.1306, + "step": 1410 + }, + { + "epoch": 1.06, + "grad_norm": 0.11083984375, + "learning_rate": 0.0003274901503078711, + "loss": 2.1343, + "step": 1415 + }, + { + "epoch": 1.06, + "grad_norm": 0.11328125, + "learning_rate": 0.0003268199156868612, + "loss": 2.1235, + "step": 1420 + }, + { + "epoch": 1.06, + "grad_norm": 0.11083984375, + "learning_rate": 0.0003261472905622244, + "loss": 2.1496, + "step": 1425 + }, + { + "epoch": 1.07, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003254722876126697, + "loss": 2.1503, + "step": 1430 + }, + { + "epoch": 1.07, + "grad_norm": 0.11572265625, + "learning_rate": 0.00032479491956172705, + "loss": 2.1566, + "step": 1435 + }, + { + "epoch": 1.07, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003241151991775076, + "loss": 2.1638, + "step": 1440 + }, + { + "epoch": 1.08, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003234331392724631, + "loss": 2.1497, + "step": 1445 + }, + { + "epoch": 1.08, + "grad_norm": 0.11376953125, + "learning_rate": 0.00032274875270314426, + "loss": 2.1478, + "step": 1450 + }, + { + "epoch": 1.09, + "grad_norm": 0.1103515625, + "learning_rate": 0.00032206205236995843, + "loss": 2.149, + "step": 1455 + }, + { + "epoch": 1.09, + "grad_norm": 0.11181640625, + "learning_rate": 0.00032137305121692655, + "loss": 2.1514, + "step": 1460 + }, + { + "epoch": 1.09, + "grad_norm": 0.1123046875, + "learning_rate": 0.00032068176223143884, + "loss": 2.1817, + "step": 1465 + }, + { + "epoch": 1.1, + "grad_norm": 0.11083984375, + "learning_rate": 0.0003199881984440106, + "loss": 2.1721, + "step": 1470 + }, + { + "epoch": 1.1, + "grad_norm": 0.111328125, + "learning_rate": 0.000319292372928036, + "loss": 2.1319, + "step": 1475 + }, + { + "epoch": 1.1, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003185942987995418, + "loss": 2.15, + "step": 1480 + }, + { + "epoch": 1.11, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003178939892169403, + "loss": 2.1696, + "step": 1485 + }, + { + "epoch": 1.11, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003171914573807813, + "loss": 2.1567, + "step": 1490 + }, + { + "epoch": 1.12, + "grad_norm": 0.115234375, + "learning_rate": 0.0003164867165335029, + "loss": 2.1454, + "step": 1495 + }, + { + "epoch": 1.12, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003157797799591823, + "loss": 2.1482, + "step": 1500 + }, + { + "epoch": 1.12, + "grad_norm": 0.11328125, + "learning_rate": 0.0003150706609832854, + "loss": 2.1297, + "step": 1505 + }, + { + "epoch": 1.13, + "grad_norm": 0.11474609375, + "learning_rate": 0.00031435937297241527, + "loss": 2.1142, + "step": 1510 + }, + { + "epoch": 1.13, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003136459293340605, + "loss": 2.1509, + "step": 1515 + }, + { + "epoch": 1.13, + "grad_norm": 0.11376953125, + "learning_rate": 0.00031293034351634227, + "loss": 2.1382, + "step": 1520 + }, + { + "epoch": 1.14, + "grad_norm": 0.115234375, + "learning_rate": 0.00031221262900776116, + "loss": 2.1552, + "step": 1525 + }, + { + "epoch": 1.14, + "grad_norm": 0.11328125, + "learning_rate": 0.0003114927993369425, + "loss": 2.1434, + "step": 1530 + }, + { + "epoch": 1.15, + "grad_norm": 0.1123046875, + "learning_rate": 0.00031077086807238175, + "loss": 2.1364, + "step": 1535 + }, + { + "epoch": 1.15, + "grad_norm": 0.11279296875, + "learning_rate": 0.00031004684882218845, + "loss": 2.1569, + "step": 1540 + }, + { + "epoch": 1.15, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003093207552338298, + "loss": 2.1756, + "step": 1545 + }, + { + "epoch": 1.16, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003085926009938735, + "loss": 2.1448, + "step": 1550 + }, + { + "epoch": 1.16, + "grad_norm": 0.115234375, + "learning_rate": 0.0003078623998277296, + "loss": 2.1403, + "step": 1555 + }, + { + "epoch": 1.16, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003071301654993919, + "loss": 2.1391, + "step": 1560 + }, + { + "epoch": 1.17, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003063959118111785, + "loss": 2.154, + "step": 1565 + }, + { + "epoch": 1.17, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003056596526034717, + "loss": 2.1494, + "step": 1570 + }, + { + "epoch": 1.17, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003049214017544569, + "loss": 2.1319, + "step": 1575 + }, + { + "epoch": 1.18, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003041811731798611, + "loss": 2.1384, + "step": 1580 + }, + { + "epoch": 1.18, + "grad_norm": 0.115234375, + "learning_rate": 0.0003034389808326907, + "loss": 2.1642, + "step": 1585 + }, + { + "epoch": 1.19, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003026948387029684, + "loss": 2.1557, + "step": 1590 + }, + { + "epoch": 1.19, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003019487608174695, + "loss": 2.1392, + "step": 1595 + }, + { + "epoch": 1.19, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003012007612394575, + "loss": 2.1431, + "step": 1600 + }, + { + "epoch": 1.2, + "grad_norm": 0.11474609375, + "learning_rate": 0.000300450854068419, + "loss": 2.1507, + "step": 1605 + }, + { + "epoch": 1.2, + "grad_norm": 0.11474609375, + "learning_rate": 0.000299699053439798, + "loss": 2.147, + "step": 1610 + }, + { + "epoch": 1.2, + "grad_norm": 0.11474609375, + "learning_rate": 0.00029894537352472927, + "loss": 2.1361, + "step": 1615 + }, + { + "epoch": 1.21, + "grad_norm": 0.11572265625, + "learning_rate": 0.00029818982852977157, + "loss": 2.1514, + "step": 1620 + }, + { + "epoch": 1.21, + "grad_norm": 0.12451171875, + "learning_rate": 0.00029743243269663957, + "loss": 2.1597, + "step": 1625 + }, + { + "epoch": 1.22, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002966732003019353, + "loss": 2.1449, + "step": 1630 + }, + { + "epoch": 1.22, + "grad_norm": 0.1142578125, + "learning_rate": 0.0002959121456568796, + "loss": 2.1392, + "step": 1635 + }, + { + "epoch": 1.22, + "grad_norm": 0.11474609375, + "learning_rate": 0.00029514928310704164, + "loss": 2.1396, + "step": 1640 + }, + { + "epoch": 1.23, + "grad_norm": 0.11328125, + "learning_rate": 0.000294384627032069, + "loss": 2.1509, + "step": 1645 + }, + { + "epoch": 1.23, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002936181918454164, + "loss": 2.1538, + "step": 1650 + }, + { + "epoch": 1.23, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002928499919940743, + "loss": 2.1337, + "step": 1655 + }, + { + "epoch": 1.24, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002920800419582961, + "loss": 2.1475, + "step": 1660 + }, + { + "epoch": 1.24, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002913083562513257, + "loss": 2.1293, + "step": 1665 + }, + { + "epoch": 1.25, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002905349494191235, + "loss": 2.1539, + "step": 1670 + }, + { + "epoch": 1.25, + "grad_norm": 0.11669921875, + "learning_rate": 0.00028975983604009246, + "loss": 2.1215, + "step": 1675 + }, + { + "epoch": 1.25, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002889830307248033, + "loss": 2.1578, + "step": 1680 + }, + { + "epoch": 1.26, + "grad_norm": 0.11474609375, + "learning_rate": 0.00028820454811571907, + "loss": 2.1492, + "step": 1685 + }, + { + "epoch": 1.26, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002874244028869191, + "loss": 2.1536, + "step": 1690 + }, + { + "epoch": 1.26, + "grad_norm": 0.11328125, + "learning_rate": 0.0002866426097438222, + "loss": 2.1584, + "step": 1695 + }, + { + "epoch": 1.27, + "grad_norm": 0.1142578125, + "learning_rate": 0.0002858591834229102, + "loss": 2.163, + "step": 1700 + }, + { + "epoch": 1.27, + "grad_norm": 0.1123046875, + "learning_rate": 0.000285074138691449, + "loss": 2.1423, + "step": 1705 + }, + { + "epoch": 1.28, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002842874903472115, + "loss": 2.1499, + "step": 1710 + }, + { + "epoch": 1.28, + "grad_norm": 0.1162109375, + "learning_rate": 0.00028349925321819776, + "loss": 2.1552, + "step": 1715 + }, + { + "epoch": 1.28, + "grad_norm": 0.11572265625, + "learning_rate": 0.00028270944216235574, + "loss": 2.1471, + "step": 1720 + }, + { + "epoch": 1.29, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002819180720673013, + "loss": 2.1362, + "step": 1725 + }, + { + "epoch": 1.29, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002811251578500377, + "loss": 2.173, + "step": 1730 + }, + { + "epoch": 1.29, + "grad_norm": 0.11572265625, + "learning_rate": 0.0002803307144566741, + "loss": 2.1381, + "step": 1735 + }, + { + "epoch": 1.3, + "grad_norm": 0.11376953125, + "learning_rate": 0.00027953475686214404, + "loss": 2.1409, + "step": 1740 + }, + { + "epoch": 1.3, + "grad_norm": 0.115234375, + "learning_rate": 0.0002787373000699232, + "loss": 2.1486, + "step": 1745 + }, + { + "epoch": 1.31, + "grad_norm": 0.11767578125, + "learning_rate": 0.00027793835911174656, + "loss": 2.1659, + "step": 1750 + }, + { + "epoch": 1.31, + "grad_norm": 0.11474609375, + "learning_rate": 0.00027713794904732483, + "loss": 2.1639, + "step": 1755 + }, + { + "epoch": 1.31, + "grad_norm": 0.1171875, + "learning_rate": 0.00027633608496406103, + "loss": 2.1667, + "step": 1760 + }, + { + "epoch": 1.32, + "grad_norm": 0.1142578125, + "learning_rate": 0.00027553278197676567, + "loss": 2.1442, + "step": 1765 + }, + { + "epoch": 1.32, + "grad_norm": 0.115234375, + "learning_rate": 0.00027472805522737195, + "loss": 2.117, + "step": 1770 + }, + { + "epoch": 1.32, + "grad_norm": 0.11669921875, + "learning_rate": 0.00027392191988465065, + "loss": 2.1499, + "step": 1775 + }, + { + "epoch": 1.33, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002731143911439237, + "loss": 2.15, + "step": 1780 + }, + { + "epoch": 1.33, + "grad_norm": 0.11328125, + "learning_rate": 0.00027230548422677817, + "loss": 2.1542, + "step": 1785 + }, + { + "epoch": 1.34, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002714952143807792, + "loss": 2.1437, + "step": 1790 + }, + { + "epoch": 1.34, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002706835968791824, + "loss": 2.1627, + "step": 1795 + }, + { + "epoch": 1.34, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002698706470206464, + "loss": 2.1453, + "step": 1800 + }, + { + "epoch": 1.35, + "grad_norm": 0.11474609375, + "learning_rate": 0.00026905638012894405, + "loss": 2.1482, + "step": 1805 + }, + { + "epoch": 1.35, + "grad_norm": 0.115234375, + "learning_rate": 0.00026824081155267374, + "loss": 2.1707, + "step": 1810 + }, + { + "epoch": 1.35, + "grad_norm": 0.11669921875, + "learning_rate": 0.00026742395666497015, + "loss": 2.1583, + "step": 1815 + }, + { + "epoch": 1.36, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002666058308632144, + "loss": 2.1528, + "step": 1820 + }, + { + "epoch": 1.36, + "grad_norm": 0.1171875, + "learning_rate": 0.0002657864495687437, + "loss": 2.1546, + "step": 1825 + }, + { + "epoch": 1.37, + "grad_norm": 0.1162109375, + "learning_rate": 0.00026496582822656094, + "loss": 2.145, + "step": 1830 + }, + { + "epoch": 1.37, + "grad_norm": 0.1162109375, + "learning_rate": 0.00026414398230504335, + "loss": 2.1581, + "step": 1835 + }, + { + "epoch": 1.37, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002633209272956509, + "loss": 2.1352, + "step": 1840 + }, + { + "epoch": 1.38, + "grad_norm": 0.11572265625, + "learning_rate": 0.0002624966787126345, + "loss": 2.1661, + "step": 1845 + }, + { + "epoch": 1.38, + "grad_norm": 0.11962890625, + "learning_rate": 0.0002616712520927434, + "loss": 2.15, + "step": 1850 + }, + { + "epoch": 1.38, + "grad_norm": 0.11865234375, + "learning_rate": 0.00026084466299493227, + "loss": 2.1389, + "step": 1855 + }, + { + "epoch": 1.39, + "grad_norm": 0.12109375, + "learning_rate": 0.0002600169270000682, + "loss": 2.1566, + "step": 1860 + }, + { + "epoch": 1.39, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002591880597106365, + "loss": 2.155, + "step": 1865 + }, + { + "epoch": 1.4, + "grad_norm": 0.1171875, + "learning_rate": 0.0002583580767504474, + "loss": 2.1491, + "step": 1870 + }, + { + "epoch": 1.4, + "grad_norm": 0.1171875, + "learning_rate": 0.0002575269937643406, + "loss": 2.1399, + "step": 1875 + }, + { + "epoch": 1.4, + "grad_norm": 0.1162109375, + "learning_rate": 0.00025669482641789106, + "loss": 2.1316, + "step": 1880 + }, + { + "epoch": 1.41, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002558615903971135, + "loss": 2.1265, + "step": 1885 + }, + { + "epoch": 1.41, + "grad_norm": 0.1181640625, + "learning_rate": 0.00025502730140816666, + "loss": 2.1384, + "step": 1890 + }, + { + "epoch": 1.41, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002541919751770574, + "loss": 2.1535, + "step": 1895 + }, + { + "epoch": 1.42, + "grad_norm": 0.1201171875, + "learning_rate": 0.00025335562744934403, + "loss": 2.1292, + "step": 1900 + }, + { + "epoch": 1.42, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002525182739898397, + "loss": 2.1489, + "step": 1905 + }, + { + "epoch": 1.42, + "grad_norm": 0.1171875, + "learning_rate": 0.00025167993058231524, + "loss": 2.1454, + "step": 1910 + }, + { + "epoch": 1.43, + "grad_norm": 0.1181640625, + "learning_rate": 0.00025084061302920146, + "loss": 2.1436, + "step": 1915 + }, + { + "epoch": 1.43, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002500003371512917, + "loss": 2.1461, + "step": 1920 + }, + { + "epoch": 1.44, + "grad_norm": 0.11572265625, + "learning_rate": 0.000249159118787443, + "loss": 2.1515, + "step": 1925 + }, + { + "epoch": 1.44, + "grad_norm": 0.1142578125, + "learning_rate": 0.00024831697379427807, + "loss": 2.1204, + "step": 1930 + }, + { + "epoch": 1.44, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002474739180458863, + "loss": 2.1579, + "step": 1935 + }, + { + "epoch": 1.45, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002466299674335241, + "loss": 2.1382, + "step": 1940 + }, + { + "epoch": 1.45, + "grad_norm": 0.12060546875, + "learning_rate": 0.00024578513786531605, + "loss": 2.1551, + "step": 1945 + }, + { + "epoch": 1.45, + "grad_norm": 0.1171875, + "learning_rate": 0.0002449394452659544, + "loss": 2.1509, + "step": 1950 + }, + { + "epoch": 1.46, + "grad_norm": 0.1171875, + "learning_rate": 0.00024409290557639947, + "loss": 2.1462, + "step": 1955 + }, + { + "epoch": 1.46, + "grad_norm": 0.1162109375, + "learning_rate": 0.00024324553475357866, + "loss": 2.1404, + "step": 1960 + }, + { + "epoch": 1.47, + "grad_norm": 0.11669921875, + "learning_rate": 0.00024239734877008604, + "loss": 2.1677, + "step": 1965 + }, + { + "epoch": 1.47, + "grad_norm": 0.11962890625, + "learning_rate": 0.000241548363613881, + "loss": 2.1602, + "step": 1970 + }, + { + "epoch": 1.47, + "grad_norm": 0.11767578125, + "learning_rate": 0.00024069859528798714, + "loss": 2.1534, + "step": 1975 + }, + { + "epoch": 1.48, + "grad_norm": 0.115234375, + "learning_rate": 0.0002398480598101903, + "loss": 2.1448, + "step": 1980 + }, + { + "epoch": 1.48, + "grad_norm": 0.11474609375, + "learning_rate": 0.00023899677321273714, + "loss": 2.1356, + "step": 1985 + }, + { + "epoch": 1.48, + "grad_norm": 0.115234375, + "learning_rate": 0.00023814475154203222, + "loss": 2.154, + "step": 1990 + }, + { + "epoch": 1.49, + "grad_norm": 0.11865234375, + "learning_rate": 0.00023729201085833626, + "loss": 2.1383, + "step": 1995 + }, + { + "epoch": 1.49, + "grad_norm": 0.1181640625, + "learning_rate": 0.00023643856723546295, + "loss": 2.1611, + "step": 2000 + }, + { + "epoch": 1.5, + "grad_norm": 0.1201171875, + "learning_rate": 0.00023558443676047596, + "loss": 2.1302, + "step": 2005 + }, + { + "epoch": 1.5, + "grad_norm": 0.1171875, + "learning_rate": 0.00023472963553338613, + "loss": 2.1535, + "step": 2010 + }, + { + "epoch": 1.5, + "grad_norm": 0.11962890625, + "learning_rate": 0.00023387417966684742, + "loss": 2.1414, + "step": 2015 + }, + { + "epoch": 1.51, + "grad_norm": 0.11767578125, + "learning_rate": 0.00023301808528585375, + "loss": 2.1352, + "step": 2020 + }, + { + "epoch": 1.51, + "grad_norm": 0.11962890625, + "learning_rate": 0.0002321613685274346, + "loss": 2.152, + "step": 2025 + }, + { + "epoch": 1.51, + "grad_norm": 0.11865234375, + "learning_rate": 0.00023130404554035102, + "loss": 2.142, + "step": 2030 + }, + { + "epoch": 1.52, + "grad_norm": 0.12158203125, + "learning_rate": 0.00023044613248479116, + "loss": 2.1588, + "step": 2035 + }, + { + "epoch": 1.52, + "grad_norm": 0.1171875, + "learning_rate": 0.000229587645532066, + "loss": 2.1475, + "step": 2040 + }, + { + "epoch": 1.53, + "grad_norm": 0.119140625, + "learning_rate": 0.00022872860086430393, + "loss": 2.1593, + "step": 2045 + }, + { + "epoch": 1.53, + "grad_norm": 0.1181640625, + "learning_rate": 0.00022786901467414619, + "loss": 2.1467, + "step": 2050 + }, + { + "epoch": 1.53, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002270089031644415, + "loss": 2.1364, + "step": 2055 + }, + { + "epoch": 1.54, + "grad_norm": 0.119140625, + "learning_rate": 0.00022614828254794055, + "loss": 2.1384, + "step": 2060 + }, + { + "epoch": 1.54, + "grad_norm": 0.11669921875, + "learning_rate": 0.00022528716904699056, + "loss": 2.1428, + "step": 2065 + }, + { + "epoch": 1.54, + "grad_norm": 0.1181640625, + "learning_rate": 0.00022442557889322946, + "loss": 2.1517, + "step": 2070 + }, + { + "epoch": 1.55, + "grad_norm": 0.12060546875, + "learning_rate": 0.00022356352832727985, + "loss": 2.1474, + "step": 2075 + }, + { + "epoch": 1.55, + "grad_norm": 0.1201171875, + "learning_rate": 0.00022270103359844283, + "loss": 2.1684, + "step": 2080 + }, + { + "epoch": 1.56, + "grad_norm": 0.11669921875, + "learning_rate": 0.00022183811096439194, + "loss": 2.1616, + "step": 2085 + }, + { + "epoch": 1.56, + "grad_norm": 0.12109375, + "learning_rate": 0.00022097477669086638, + "loss": 2.1468, + "step": 2090 + }, + { + "epoch": 1.56, + "grad_norm": 0.1162109375, + "learning_rate": 0.00022011104705136475, + "loss": 2.1374, + "step": 2095 + }, + { + "epoch": 1.57, + "grad_norm": 0.11962890625, + "learning_rate": 0.00021924693832683806, + "loss": 2.1539, + "step": 2100 + }, + { + "epoch": 1.57, + "grad_norm": 0.1171875, + "learning_rate": 0.00021838246680538293, + "loss": 2.1514, + "step": 2105 + }, + { + "epoch": 1.57, + "grad_norm": 0.12158203125, + "learning_rate": 0.00021751764878193459, + "loss": 2.1407, + "step": 2110 + }, + { + "epoch": 1.58, + "grad_norm": 0.1181640625, + "learning_rate": 0.00021665250055795957, + "loss": 2.1485, + "step": 2115 + }, + { + "epoch": 1.58, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002157870384411487, + "loss": 2.1496, + "step": 2120 + }, + { + "epoch": 1.59, + "grad_norm": 0.1181640625, + "learning_rate": 0.00021492127874510946, + "loss": 2.143, + "step": 2125 + }, + { + "epoch": 1.59, + "grad_norm": 0.119140625, + "learning_rate": 0.0002140552377890586, + "loss": 2.1498, + "step": 2130 + }, + { + "epoch": 1.59, + "grad_norm": 0.1201171875, + "learning_rate": 0.00021318893189751457, + "loss": 2.1586, + "step": 2135 + }, + { + "epoch": 1.6, + "grad_norm": 0.119140625, + "learning_rate": 0.00021232237739998965, + "loss": 2.139, + "step": 2140 + }, + { + "epoch": 1.6, + "grad_norm": 0.11865234375, + "learning_rate": 0.00021145559063068223, + "loss": 2.1481, + "step": 2145 + }, + { + "epoch": 1.6, + "grad_norm": 0.12060546875, + "learning_rate": 0.00021058858792816904, + "loss": 2.1449, + "step": 2150 + }, + { + "epoch": 1.61, + "grad_norm": 0.11865234375, + "learning_rate": 0.00020972138563509708, + "loss": 2.1629, + "step": 2155 + }, + { + "epoch": 1.61, + "grad_norm": 0.1181640625, + "learning_rate": 0.00020885400009787528, + "loss": 2.1458, + "step": 2160 + }, + { + "epoch": 1.62, + "grad_norm": 0.119140625, + "learning_rate": 0.000207986447666367, + "loss": 2.1531, + "step": 2165 + }, + { + "epoch": 1.62, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002071187446935813, + "loss": 2.1339, + "step": 2170 + }, + { + "epoch": 1.62, + "grad_norm": 0.11767578125, + "learning_rate": 0.00020625090753536492, + "loss": 2.1525, + "step": 2175 + }, + { + "epoch": 1.63, + "grad_norm": 0.11962890625, + "learning_rate": 0.00020538295255009384, + "loss": 2.1522, + "step": 2180 + }, + { + "epoch": 1.63, + "grad_norm": 0.1171875, + "learning_rate": 0.0002045148960983652, + "loss": 2.1358, + "step": 2185 + }, + { + "epoch": 1.63, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002036467545426886, + "loss": 2.149, + "step": 2190 + }, + { + "epoch": 1.64, + "grad_norm": 0.12109375, + "learning_rate": 0.00020277854424717803, + "loss": 2.1394, + "step": 2195 + }, + { + "epoch": 1.64, + "grad_norm": 0.1171875, + "learning_rate": 0.00020191028157724294, + "loss": 2.1424, + "step": 2200 + }, + { + "epoch": 1.64, + "grad_norm": 0.125, + "learning_rate": 0.0002010419828992801, + "loss": 2.1615, + "step": 2205 + }, + { + "epoch": 1.65, + "grad_norm": 0.11572265625, + "learning_rate": 0.00020017366458036513, + "loss": 2.1549, + "step": 2210 + }, + { + "epoch": 1.65, + "grad_norm": 0.11767578125, + "learning_rate": 0.00019930534298794365, + "loss": 2.1115, + "step": 2215 + }, + { + "epoch": 1.66, + "grad_norm": 0.12109375, + "learning_rate": 0.0001984370344895232, + "loss": 2.1267, + "step": 2220 + }, + { + "epoch": 1.66, + "grad_norm": 0.11962890625, + "learning_rate": 0.00019756875545236453, + "loss": 2.1387, + "step": 2225 + }, + { + "epoch": 1.66, + "grad_norm": 0.11962890625, + "learning_rate": 0.00019670052224317274, + "loss": 2.1365, + "step": 2230 + }, + { + "epoch": 1.67, + "grad_norm": 0.12109375, + "learning_rate": 0.0001958323512277895, + "loss": 2.1511, + "step": 2235 + }, + { + "epoch": 1.67, + "grad_norm": 0.11865234375, + "learning_rate": 0.0001949642587708838, + "loss": 2.1503, + "step": 2240 + }, + { + "epoch": 1.67, + "grad_norm": 0.119140625, + "learning_rate": 0.00019409626123564403, + "loss": 2.1469, + "step": 2245 + }, + { + "epoch": 1.68, + "grad_norm": 0.119140625, + "learning_rate": 0.00019322837498346934, + "loss": 2.125, + "step": 2250 + }, + { + "epoch": 1.68, + "grad_norm": 0.11865234375, + "learning_rate": 0.00019236061637366124, + "loss": 2.1331, + "step": 2255 + }, + { + "epoch": 1.69, + "grad_norm": 0.11865234375, + "learning_rate": 0.00019149300176311504, + "loss": 2.1354, + "step": 2260 + }, + { + "epoch": 1.69, + "grad_norm": 0.119140625, + "learning_rate": 0.00019062554750601198, + "loss": 2.1512, + "step": 2265 + }, + { + "epoch": 1.69, + "grad_norm": 0.11767578125, + "learning_rate": 0.0001897582699535107, + "loss": 2.1151, + "step": 2270 + }, + { + "epoch": 1.7, + "grad_norm": 0.12255859375, + "learning_rate": 0.00018889118545343877, + "loss": 2.1239, + "step": 2275 + }, + { + "epoch": 1.7, + "grad_norm": 0.1181640625, + "learning_rate": 0.000188024310349985, + "loss": 2.1381, + "step": 2280 + }, + { + "epoch": 1.7, + "grad_norm": 0.11669921875, + "learning_rate": 0.00018715766098339117, + "loss": 2.1306, + "step": 2285 + }, + { + "epoch": 1.71, + "grad_norm": 0.11669921875, + "learning_rate": 0.00018629125368964405, + "loss": 2.1489, + "step": 2290 + }, + { + "epoch": 1.71, + "grad_norm": 0.1220703125, + "learning_rate": 0.00018542510480016713, + "loss": 2.1547, + "step": 2295 + }, + { + "epoch": 1.72, + "grad_norm": 0.119140625, + "learning_rate": 0.00018455923064151342, + "loss": 2.1343, + "step": 2300 + }, + { + "epoch": 1.72, + "grad_norm": 0.12353515625, + "learning_rate": 0.00018369364753505728, + "loss": 2.144, + "step": 2305 + }, + { + "epoch": 1.72, + "grad_norm": 0.11962890625, + "learning_rate": 0.00018282837179668679, + "loss": 2.1494, + "step": 2310 + }, + { + "epoch": 1.73, + "grad_norm": 0.1220703125, + "learning_rate": 0.00018196341973649637, + "loss": 2.1511, + "step": 2315 + }, + { + "epoch": 1.73, + "grad_norm": 0.11962890625, + "learning_rate": 0.00018109880765847906, + "loss": 2.148, + "step": 2320 + }, + { + "epoch": 1.73, + "grad_norm": 0.12109375, + "learning_rate": 0.00018023455186021961, + "loss": 2.142, + "step": 2325 + }, + { + "epoch": 1.74, + "grad_norm": 0.1201171875, + "learning_rate": 0.0001793706686325868, + "loss": 2.1445, + "step": 2330 + }, + { + "epoch": 1.74, + "grad_norm": 0.12158203125, + "learning_rate": 0.0001785071742594268, + "loss": 2.1344, + "step": 2335 + }, + { + "epoch": 1.75, + "grad_norm": 0.11669921875, + "learning_rate": 0.00017764408501725593, + "loss": 2.1214, + "step": 2340 + }, + { + "epoch": 1.75, + "grad_norm": 0.11865234375, + "learning_rate": 0.00017678141717495394, + "loss": 2.1232, + "step": 2345 + }, + { + "epoch": 1.75, + "grad_norm": 0.11962890625, + "learning_rate": 0.00017591918699345755, + "loss": 2.129, + "step": 2350 + }, + { + "epoch": 1.76, + "grad_norm": 0.119140625, + "learning_rate": 0.00017505741072545346, + "loss": 2.1462, + "step": 2355 + }, + { + "epoch": 1.76, + "grad_norm": 0.119140625, + "learning_rate": 0.00017419610461507254, + "loss": 2.1401, + "step": 2360 + }, + { + "epoch": 1.76, + "grad_norm": 0.119140625, + "learning_rate": 0.0001733352848975832, + "loss": 2.1497, + "step": 2365 + }, + { + "epoch": 1.77, + "grad_norm": 0.11767578125, + "learning_rate": 0.00017247496779908565, + "loss": 2.1356, + "step": 2370 + }, + { + "epoch": 1.77, + "grad_norm": 0.1201171875, + "learning_rate": 0.0001716151695362059, + "loss": 2.1436, + "step": 2375 + }, + { + "epoch": 1.78, + "grad_norm": 0.12060546875, + "learning_rate": 0.00017075590631579019, + "loss": 2.1538, + "step": 2380 + }, + { + "epoch": 1.78, + "grad_norm": 0.11865234375, + "learning_rate": 0.00016989719433459924, + "loss": 2.1497, + "step": 2385 + }, + { + "epoch": 1.78, + "grad_norm": 0.1171875, + "learning_rate": 0.00016903904977900333, + "loss": 2.1333, + "step": 2390 + }, + { + "epoch": 1.79, + "grad_norm": 0.1220703125, + "learning_rate": 0.000168181488824677, + "loss": 2.1542, + "step": 2395 + }, + { + "epoch": 1.79, + "grad_norm": 0.1171875, + "learning_rate": 0.00016732452763629395, + "loss": 2.1197, + "step": 2400 + }, + { + "epoch": 1.79, + "grad_norm": 0.123046875, + "learning_rate": 0.00016646818236722282, + "loss": 2.1151, + "step": 2405 + }, + { + "epoch": 1.8, + "grad_norm": 0.12060546875, + "learning_rate": 0.00016561246915922204, + "loss": 2.1505, + "step": 2410 + }, + { + "epoch": 1.8, + "grad_norm": 0.11865234375, + "learning_rate": 0.00016475740414213642, + "loss": 2.1501, + "step": 2415 + }, + { + "epoch": 1.81, + "grad_norm": 0.1201171875, + "learning_rate": 0.00016390300343359216, + "loss": 2.1556, + "step": 2420 + }, + { + "epoch": 1.81, + "grad_norm": 0.119140625, + "learning_rate": 0.0001630492831386939, + "loss": 2.133, + "step": 2425 + }, + { + "epoch": 1.81, + "grad_norm": 0.123046875, + "learning_rate": 0.0001621962593497205, + "loss": 2.162, + "step": 2430 + }, + { + "epoch": 1.82, + "grad_norm": 0.12060546875, + "learning_rate": 0.0001613439481458221, + "loss": 2.1333, + "step": 2435 + }, + { + "epoch": 1.82, + "grad_norm": 0.12060546875, + "learning_rate": 0.000160492365592717, + "loss": 2.1566, + "step": 2440 + }, + { + "epoch": 1.82, + "grad_norm": 0.11865234375, + "learning_rate": 0.00015964152774238842, + "loss": 2.1692, + "step": 2445 + }, + { + "epoch": 1.83, + "grad_norm": 0.119140625, + "learning_rate": 0.00015879145063278256, + "loss": 2.1413, + "step": 2450 + }, + { + "epoch": 1.83, + "grad_norm": 0.12255859375, + "learning_rate": 0.00015794215028750567, + "loss": 2.1564, + "step": 2455 + }, + { + "epoch": 1.84, + "grad_norm": 0.11962890625, + "learning_rate": 0.00015709364271552262, + "loss": 2.1305, + "step": 2460 + }, + { + "epoch": 1.84, + "grad_norm": 0.12060546875, + "learning_rate": 0.00015624594391085457, + "loss": 2.1526, + "step": 2465 + }, + { + "epoch": 1.84, + "grad_norm": 0.12353515625, + "learning_rate": 0.00015539906985227798, + "loss": 2.138, + "step": 2470 + }, + { + "epoch": 1.85, + "grad_norm": 0.12353515625, + "learning_rate": 0.0001545530365030229, + "loss": 2.1432, + "step": 2475 + }, + { + "epoch": 1.85, + "grad_norm": 0.1201171875, + "learning_rate": 0.00015370785981047252, + "loss": 2.1508, + "step": 2480 + }, + { + "epoch": 1.85, + "grad_norm": 0.1201171875, + "learning_rate": 0.00015286355570586255, + "loss": 2.1347, + "step": 2485 + }, + { + "epoch": 1.86, + "grad_norm": 0.12060546875, + "learning_rate": 0.00015202014010398042, + "loss": 2.1497, + "step": 2490 + }, + { + "epoch": 1.86, + "grad_norm": 0.12060546875, + "learning_rate": 0.00015117762890286602, + "loss": 2.1377, + "step": 2495 + }, + { + "epoch": 1.86, + "grad_norm": 0.1171875, + "learning_rate": 0.0001503360379835113, + "loss": 2.1337, + "step": 2500 + }, + { + "epoch": 1.87, + "grad_norm": 0.11865234375, + "learning_rate": 0.00014949538320956158, + "loss": 2.156, + "step": 2505 + }, + { + "epoch": 1.87, + "grad_norm": 0.11962890625, + "learning_rate": 0.00014865568042701592, + "loss": 2.1386, + "step": 2510 + }, + { + "epoch": 1.88, + "grad_norm": 0.119140625, + "learning_rate": 0.0001478169454639291, + "loss": 2.1468, + "step": 2515 + }, + { + "epoch": 1.88, + "grad_norm": 0.119140625, + "learning_rate": 0.00014697919413011253, + "loss": 2.1566, + "step": 2520 + }, + { + "epoch": 1.88, + "grad_norm": 0.123046875, + "learning_rate": 0.00014614244221683686, + "loss": 2.1441, + "step": 2525 + }, + { + "epoch": 1.89, + "grad_norm": 0.12060546875, + "learning_rate": 0.00014530670549653407, + "loss": 2.1473, + "step": 2530 + }, + { + "epoch": 1.89, + "grad_norm": 0.12060546875, + "learning_rate": 0.00014447199972249987, + "loss": 2.1481, + "step": 2535 + }, + { + "epoch": 1.89, + "grad_norm": 0.1201171875, + "learning_rate": 0.00014363834062859748, + "loss": 2.1546, + "step": 2540 + }, + { + "epoch": 1.9, + "grad_norm": 0.12060546875, + "learning_rate": 0.00014280574392896032, + "loss": 2.1314, + "step": 2545 + }, + { + "epoch": 1.9, + "grad_norm": 0.11767578125, + "learning_rate": 0.0001419742253176962, + "loss": 2.1129, + "step": 2550 + }, + { + "epoch": 1.91, + "grad_norm": 0.119140625, + "learning_rate": 0.00014114380046859138, + "loss": 2.1353, + "step": 2555 + }, + { + "epoch": 1.91, + "grad_norm": 0.1201171875, + "learning_rate": 0.00014031448503481532, + "loss": 2.1423, + "step": 2560 + }, + { + "epoch": 1.91, + "grad_norm": 0.1201171875, + "learning_rate": 0.00013948629464862516, + "loss": 2.1173, + "step": 2565 + }, + { + "epoch": 1.92, + "grad_norm": 0.1201171875, + "learning_rate": 0.00013865924492107153, + "loss": 2.1468, + "step": 2570 + }, + { + "epoch": 1.92, + "grad_norm": 0.12158203125, + "learning_rate": 0.00013783335144170418, + "loss": 2.1517, + "step": 2575 + }, + { + "epoch": 1.92, + "grad_norm": 0.119140625, + "learning_rate": 0.0001370086297782779, + "loss": 2.153, + "step": 2580 + }, + { + "epoch": 1.93, + "grad_norm": 0.12109375, + "learning_rate": 0.0001361850954764594, + "loss": 2.1427, + "step": 2585 + }, + { + "epoch": 1.93, + "grad_norm": 0.1181640625, + "learning_rate": 0.0001353627640595338, + "loss": 2.1477, + "step": 2590 + }, + { + "epoch": 1.94, + "grad_norm": 0.12353515625, + "learning_rate": 0.00013454165102811272, + "loss": 2.1414, + "step": 2595 + }, + { + "epoch": 1.94, + "grad_norm": 0.1220703125, + "learning_rate": 0.00013372177185984134, + "loss": 2.1579, + "step": 2600 + }, + { + "epoch": 1.94, + "grad_norm": 0.11865234375, + "learning_rate": 0.00013290314200910735, + "loss": 2.123, + "step": 2605 + }, + { + "epoch": 1.95, + "grad_norm": 0.119140625, + "learning_rate": 0.00013208577690674905, + "loss": 2.1311, + "step": 2610 + }, + { + "epoch": 1.95, + "grad_norm": 0.119140625, + "learning_rate": 0.00013126969195976495, + "loss": 2.1314, + "step": 2615 + }, + { + "epoch": 1.95, + "grad_norm": 0.11865234375, + "learning_rate": 0.00013045490255102316, + "loss": 2.1374, + "step": 2620 + }, + { + "epoch": 1.96, + "grad_norm": 0.12060546875, + "learning_rate": 0.00012964142403897112, + "loss": 2.1489, + "step": 2625 + }, + { + "epoch": 1.96, + "grad_norm": 0.1181640625, + "learning_rate": 0.0001288292717573468, + "loss": 2.145, + "step": 2630 + }, + { + "epoch": 1.97, + "grad_norm": 0.1181640625, + "learning_rate": 0.00012801846101488898, + "loss": 2.1288, + "step": 2635 + }, + { + "epoch": 1.97, + "grad_norm": 0.119140625, + "learning_rate": 0.00012720900709504917, + "loss": 2.1468, + "step": 2640 + }, + { + "epoch": 1.97, + "grad_norm": 0.12109375, + "learning_rate": 0.00012640092525570312, + "loss": 2.1201, + "step": 2645 + }, + { + "epoch": 1.98, + "grad_norm": 0.1220703125, + "learning_rate": 0.0001255942307288637, + "loss": 2.1523, + "step": 2650 + }, + { + "epoch": 1.98, + "grad_norm": 0.1201171875, + "learning_rate": 0.00012478893872039314, + "loss": 2.146, + "step": 2655 + }, + { + "epoch": 1.98, + "grad_norm": 0.11962890625, + "learning_rate": 0.00012398506440971713, + "loss": 2.1387, + "step": 2660 + }, + { + "epoch": 1.99, + "grad_norm": 0.11767578125, + "learning_rate": 0.00012318262294953815, + "loss": 2.1272, + "step": 2665 + }, + { + "epoch": 1.99, + "grad_norm": 0.119140625, + "learning_rate": 0.00012238162946555002, + "loss": 2.1488, + "step": 2670 + }, + { + "epoch": 2.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.00012158209905615301, + "loss": 2.1445, + "step": 2675 + }, + { + "epoch": 2.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.00012078404679216864, + "loss": 2.1327, + "step": 2680 + }, + { + "epoch": 2.0, + "eval_loss": 2.1632144451141357, + "eval_runtime": 186.9566, + "eval_samples_per_second": 25.792, + "eval_steps_per_second": 3.225, + "step": 2681 + }, + { + "epoch": 2.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0001199874877165564, + "loss": 2.1156, + "step": 2685 + }, + { + "epoch": 2.01, + "grad_norm": 0.1181640625, + "learning_rate": 0.00011919243684412948, + "loss": 2.115, + "step": 2690 + }, + { + "epoch": 2.01, + "grad_norm": 0.11962890625, + "learning_rate": 0.00011839890916127228, + "loss": 2.0971, + "step": 2695 + }, + { + "epoch": 2.01, + "grad_norm": 0.12109375, + "learning_rate": 0.00011760691962565752, + "loss": 2.119, + "step": 2700 + }, + { + "epoch": 2.02, + "grad_norm": 0.1220703125, + "learning_rate": 0.00011681648316596461, + "loss": 2.1282, + "step": 2705 + }, + { + "epoch": 2.02, + "grad_norm": 0.12255859375, + "learning_rate": 0.00011602761468159813, + "loss": 2.1151, + "step": 2710 + }, + { + "epoch": 2.03, + "grad_norm": 0.1201171875, + "learning_rate": 0.00011524032904240671, + "loss": 2.101, + "step": 2715 + }, + { + "epoch": 2.03, + "grad_norm": 0.12255859375, + "learning_rate": 0.00011445464108840345, + "loss": 2.113, + "step": 2720 + }, + { + "epoch": 2.03, + "grad_norm": 0.123046875, + "learning_rate": 0.0001136705656294851, + "loss": 2.118, + "step": 2725 + }, + { + "epoch": 2.04, + "grad_norm": 0.126953125, + "learning_rate": 0.00011288811744515433, + "loss": 2.1045, + "step": 2730 + }, + { + "epoch": 2.04, + "grad_norm": 0.123046875, + "learning_rate": 0.0001121073112842395, + "loss": 2.108, + "step": 2735 + }, + { + "epoch": 2.04, + "grad_norm": 0.12109375, + "learning_rate": 0.00011132816186461821, + "loss": 2.0919, + "step": 2740 + }, + { + "epoch": 2.05, + "grad_norm": 0.1240234375, + "learning_rate": 0.00011055068387293879, + "loss": 2.1063, + "step": 2745 + }, + { + "epoch": 2.05, + "grad_norm": 0.123046875, + "learning_rate": 0.00010977489196434381, + "loss": 2.1175, + "step": 2750 + }, + { + "epoch": 2.06, + "grad_norm": 0.123046875, + "learning_rate": 0.00010900080076219426, + "loss": 2.1103, + "step": 2755 + }, + { + "epoch": 2.06, + "grad_norm": 0.1220703125, + "learning_rate": 0.00010822842485779285, + "loss": 2.1111, + "step": 2760 + }, + { + "epoch": 2.06, + "grad_norm": 0.12158203125, + "learning_rate": 0.00010745777881011027, + "loss": 2.0899, + "step": 2765 + }, + { + "epoch": 2.07, + "grad_norm": 0.123046875, + "learning_rate": 0.00010668887714550974, + "loss": 2.0935, + "step": 2770 + }, + { + "epoch": 2.07, + "grad_norm": 0.1201171875, + "learning_rate": 0.0001059217343574737, + "loss": 2.0919, + "step": 2775 + }, + { + "epoch": 2.07, + "grad_norm": 0.123046875, + "learning_rate": 0.00010515636490633043, + "loss": 2.1157, + "step": 2780 + }, + { + "epoch": 2.08, + "grad_norm": 0.12255859375, + "learning_rate": 0.00010439278321898153, + "loss": 2.1024, + "step": 2785 + }, + { + "epoch": 2.08, + "grad_norm": 0.1240234375, + "learning_rate": 0.00010363100368863021, + "loss": 2.1038, + "step": 2790 + }, + { + "epoch": 2.09, + "grad_norm": 0.12255859375, + "learning_rate": 0.00010287104067450928, + "loss": 2.1088, + "step": 2795 + }, + { + "epoch": 2.09, + "grad_norm": 0.12353515625, + "learning_rate": 0.00010211290850161144, + "loss": 2.0861, + "step": 2800 + }, + { + "epoch": 2.09, + "grad_norm": 0.12451171875, + "learning_rate": 0.00010135662146041855, + "loss": 2.1215, + "step": 2805 + }, + { + "epoch": 2.1, + "grad_norm": 0.123046875, + "learning_rate": 0.0001006021938066325, + "loss": 2.1062, + "step": 2810 + }, + { + "epoch": 2.1, + "grad_norm": 0.12451171875, + "learning_rate": 9.984963976090651e-05, + "loss": 2.1014, + "step": 2815 + }, + { + "epoch": 2.1, + "grad_norm": 0.123046875, + "learning_rate": 9.909897350857706e-05, + "loss": 2.1023, + "step": 2820 + }, + { + "epoch": 2.11, + "grad_norm": 0.12158203125, + "learning_rate": 9.83502091993965e-05, + "loss": 2.1025, + "step": 2825 + }, + { + "epoch": 2.11, + "grad_norm": 0.1279296875, + "learning_rate": 9.760336094726624e-05, + "loss": 2.1241, + "step": 2830 + }, + { + "epoch": 2.11, + "grad_norm": 0.12451171875, + "learning_rate": 9.6858442829971e-05, + "loss": 2.1155, + "step": 2835 + }, + { + "epoch": 2.12, + "grad_norm": 0.123046875, + "learning_rate": 9.611546888891307e-05, + "loss": 2.1012, + "step": 2840 + }, + { + "epoch": 2.12, + "grad_norm": 0.12451171875, + "learning_rate": 9.537445312884788e-05, + "loss": 2.1058, + "step": 2845 + }, + { + "epoch": 2.13, + "grad_norm": 0.12353515625, + "learning_rate": 9.463540951761989e-05, + "loss": 2.0876, + "step": 2850 + }, + { + "epoch": 2.13, + "grad_norm": 0.12451171875, + "learning_rate": 9.389835198589944e-05, + "loss": 2.1222, + "step": 2855 + }, + { + "epoch": 2.13, + "grad_norm": 0.126953125, + "learning_rate": 9.316329442691995e-05, + "loss": 2.1107, + "step": 2860 + }, + { + "epoch": 2.14, + "grad_norm": 0.1240234375, + "learning_rate": 9.243025069621649e-05, + "loss": 2.1065, + "step": 2865 + }, + { + "epoch": 2.14, + "grad_norm": 0.12451171875, + "learning_rate": 9.169923461136376e-05, + "loss": 2.1193, + "step": 2870 + }, + { + "epoch": 2.14, + "grad_norm": 0.125, + "learning_rate": 9.097025995171669e-05, + "loss": 2.1154, + "step": 2875 + }, + { + "epoch": 2.15, + "grad_norm": 0.12255859375, + "learning_rate": 9.024334045814988e-05, + "loss": 2.1055, + "step": 2880 + }, + { + "epoch": 2.15, + "grad_norm": 0.12255859375, + "learning_rate": 8.951848983279898e-05, + "loss": 2.1039, + "step": 2885 + }, + { + "epoch": 2.16, + "grad_norm": 0.123046875, + "learning_rate": 8.87957217388023e-05, + "loss": 2.1249, + "step": 2890 + }, + { + "epoch": 2.16, + "grad_norm": 0.125, + "learning_rate": 8.80750498000432e-05, + "loss": 2.1231, + "step": 2895 + }, + { + "epoch": 2.16, + "grad_norm": 0.1240234375, + "learning_rate": 8.735648760089367e-05, + "loss": 2.1346, + "step": 2900 + }, + { + "epoch": 2.17, + "grad_norm": 0.1259765625, + "learning_rate": 8.66400486859575e-05, + "loss": 2.107, + "step": 2905 + }, + { + "epoch": 2.17, + "grad_norm": 0.12255859375, + "learning_rate": 8.592574655981594e-05, + "loss": 2.1189, + "step": 2910 + }, + { + "epoch": 2.17, + "grad_norm": 0.1259765625, + "learning_rate": 8.521359468677214e-05, + "loss": 2.1061, + "step": 2915 + }, + { + "epoch": 2.18, + "grad_norm": 0.1259765625, + "learning_rate": 8.450360649059834e-05, + "loss": 2.1297, + "step": 2920 + }, + { + "epoch": 2.18, + "grad_norm": 0.1279296875, + "learning_rate": 8.379579535428203e-05, + "loss": 2.1119, + "step": 2925 + }, + { + "epoch": 2.19, + "grad_norm": 0.12353515625, + "learning_rate": 8.309017461977409e-05, + "loss": 2.0947, + "step": 2930 + }, + { + "epoch": 2.19, + "grad_norm": 0.1259765625, + "learning_rate": 8.23867575877374e-05, + "loss": 2.1072, + "step": 2935 + }, + { + "epoch": 2.19, + "grad_norm": 0.123046875, + "learning_rate": 8.168555751729551e-05, + "loss": 2.106, + "step": 2940 + }, + { + "epoch": 2.2, + "grad_norm": 0.1259765625, + "learning_rate": 8.098658762578369e-05, + "loss": 2.1183, + "step": 2945 + }, + { + "epoch": 2.2, + "grad_norm": 0.125, + "learning_rate": 8.028986108849887e-05, + "loss": 2.1103, + "step": 2950 + }, + { + "epoch": 2.2, + "grad_norm": 0.1279296875, + "learning_rate": 7.959539103845184e-05, + "loss": 2.1414, + "step": 2955 + }, + { + "epoch": 2.21, + "grad_norm": 0.125, + "learning_rate": 7.890319056611942e-05, + "loss": 2.11, + "step": 2960 + }, + { + "epoch": 2.21, + "grad_norm": 0.12255859375, + "learning_rate": 7.82132727191978e-05, + "loss": 2.1178, + "step": 2965 + }, + { + "epoch": 2.22, + "grad_norm": 0.1240234375, + "learning_rate": 7.752565050235694e-05, + "loss": 2.1018, + "step": 2970 + }, + { + "epoch": 2.22, + "grad_norm": 0.1259765625, + "learning_rate": 7.684033687699455e-05, + "loss": 2.1184, + "step": 2975 + }, + { + "epoch": 2.22, + "grad_norm": 0.1259765625, + "learning_rate": 7.615734476099284e-05, + "loss": 2.1208, + "step": 2980 + }, + { + "epoch": 2.23, + "grad_norm": 0.1279296875, + "learning_rate": 7.547668702847421e-05, + "loss": 2.1201, + "step": 2985 + }, + { + "epoch": 2.23, + "grad_norm": 0.12451171875, + "learning_rate": 7.479837650955906e-05, + "loss": 2.123, + "step": 2990 + }, + { + "epoch": 2.23, + "grad_norm": 0.1220703125, + "learning_rate": 7.412242599012366e-05, + "loss": 2.1324, + "step": 2995 + }, + { + "epoch": 2.24, + "grad_norm": 0.123046875, + "learning_rate": 7.34488482115593e-05, + "loss": 2.1275, + "step": 3000 + }, + { + "epoch": 2.24, + "grad_norm": 0.12353515625, + "learning_rate": 7.277765587053206e-05, + "loss": 2.1193, + "step": 3005 + }, + { + "epoch": 2.25, + "grad_norm": 0.12451171875, + "learning_rate": 7.210886161874344e-05, + "loss": 2.1165, + "step": 3010 + }, + { + "epoch": 2.25, + "grad_norm": 0.12353515625, + "learning_rate": 7.144247806269213e-05, + "loss": 2.1136, + "step": 3015 + }, + { + "epoch": 2.25, + "grad_norm": 0.12353515625, + "learning_rate": 7.0778517763436e-05, + "loss": 2.0813, + "step": 3020 + }, + { + "epoch": 2.26, + "grad_norm": 0.12451171875, + "learning_rate": 7.011699323635559e-05, + "loss": 2.0982, + "step": 3025 + }, + { + "epoch": 2.26, + "grad_norm": 0.12451171875, + "learning_rate": 6.94579169509181e-05, + "loss": 2.1135, + "step": 3030 + }, + { + "epoch": 2.26, + "grad_norm": 0.12255859375, + "learning_rate": 6.88013013304424e-05, + "loss": 2.1057, + "step": 3035 + }, + { + "epoch": 2.27, + "grad_norm": 0.12353515625, + "learning_rate": 6.814715875186475e-05, + "loss": 2.1319, + "step": 3040 + }, + { + "epoch": 2.27, + "grad_norm": 0.12353515625, + "learning_rate": 6.749550154550585e-05, + "loss": 2.1206, + "step": 3045 + }, + { + "epoch": 2.28, + "grad_norm": 0.126953125, + "learning_rate": 6.684634199483773e-05, + "loss": 2.123, + "step": 3050 + }, + { + "epoch": 2.28, + "grad_norm": 0.126953125, + "learning_rate": 6.619969233625298e-05, + "loss": 2.1197, + "step": 3055 + }, + { + "epoch": 2.28, + "grad_norm": 0.1259765625, + "learning_rate": 6.55555647588336e-05, + "loss": 2.1075, + "step": 3060 + }, + { + "epoch": 2.29, + "grad_norm": 0.125, + "learning_rate": 6.491397140412139e-05, + "loss": 2.1185, + "step": 3065 + }, + { + "epoch": 2.29, + "grad_norm": 0.1259765625, + "learning_rate": 6.42749243658891e-05, + "loss": 2.1114, + "step": 3070 + }, + { + "epoch": 2.29, + "grad_norm": 0.1259765625, + "learning_rate": 6.363843568991243e-05, + "loss": 2.0937, + "step": 3075 + }, + { + "epoch": 2.3, + "grad_norm": 0.123046875, + "learning_rate": 6.300451737374322e-05, + "loss": 2.0954, + "step": 3080 + }, + { + "epoch": 2.3, + "grad_norm": 0.125, + "learning_rate": 6.237318136648258e-05, + "loss": 2.1127, + "step": 3085 + }, + { + "epoch": 2.31, + "grad_norm": 0.12451171875, + "learning_rate": 6.174443956855671e-05, + "loss": 2.1174, + "step": 3090 + }, + { + "epoch": 2.31, + "grad_norm": 0.1328125, + "learning_rate": 6.111830383149164e-05, + "loss": 2.1148, + "step": 3095 + }, + { + "epoch": 2.31, + "grad_norm": 0.1259765625, + "learning_rate": 6.04947859576904e-05, + "loss": 2.1382, + "step": 3100 + }, + { + "epoch": 2.32, + "grad_norm": 0.126953125, + "learning_rate": 5.9873897700210304e-05, + "loss": 2.1125, + "step": 3105 + }, + { + "epoch": 2.32, + "grad_norm": 0.125, + "learning_rate": 5.92556507625414e-05, + "loss": 2.1068, + "step": 3110 + }, + { + "epoch": 2.32, + "grad_norm": 0.12451171875, + "learning_rate": 5.86400567983862e-05, + "loss": 2.1204, + "step": 3115 + }, + { + "epoch": 2.33, + "grad_norm": 0.1240234375, + "learning_rate": 5.802712741143934e-05, + "loss": 2.1046, + "step": 3120 + }, + { + "epoch": 2.33, + "grad_norm": 0.126953125, + "learning_rate": 5.741687415516968e-05, + "loss": 2.1179, + "step": 3125 + }, + { + "epoch": 2.33, + "grad_norm": 0.125, + "learning_rate": 5.680930853260182e-05, + "loss": 2.1184, + "step": 3130 + }, + { + "epoch": 2.34, + "grad_norm": 0.12451171875, + "learning_rate": 5.6204441996099686e-05, + "loss": 2.1204, + "step": 3135 + }, + { + "epoch": 2.34, + "grad_norm": 0.125, + "learning_rate": 5.560228594715049e-05, + "loss": 2.1097, + "step": 3140 + }, + { + "epoch": 2.35, + "grad_norm": 0.1328125, + "learning_rate": 5.500285173614985e-05, + "loss": 2.1141, + "step": 3145 + }, + { + "epoch": 2.35, + "grad_norm": 0.125, + "learning_rate": 5.4406150662188035e-05, + "loss": 2.1096, + "step": 3150 + }, + { + "epoch": 2.35, + "grad_norm": 0.1259765625, + "learning_rate": 5.3812193972836436e-05, + "loss": 2.1134, + "step": 3155 + }, + { + "epoch": 2.36, + "grad_norm": 0.126953125, + "learning_rate": 5.322099286393625e-05, + "loss": 2.1132, + "step": 3160 + }, + { + "epoch": 2.36, + "grad_norm": 0.126953125, + "learning_rate": 5.263255847938693e-05, + "loss": 2.1083, + "step": 3165 + }, + { + "epoch": 2.36, + "grad_norm": 0.126953125, + "learning_rate": 5.204690191093635e-05, + "loss": 2.1093, + "step": 3170 + }, + { + "epoch": 2.37, + "grad_norm": 0.1318359375, + "learning_rate": 5.1464034197971726e-05, + "loss": 2.1123, + "step": 3175 + }, + { + "epoch": 2.37, + "grad_norm": 0.1240234375, + "learning_rate": 5.08839663273114e-05, + "loss": 2.1115, + "step": 3180 + }, + { + "epoch": 2.38, + "grad_norm": 0.12451171875, + "learning_rate": 5.030670923299785e-05, + "loss": 2.1129, + "step": 3185 + }, + { + "epoch": 2.38, + "grad_norm": 0.125, + "learning_rate": 4.9732273796091685e-05, + "loss": 2.1259, + "step": 3190 + }, + { + "epoch": 2.38, + "grad_norm": 0.12451171875, + "learning_rate": 4.916067084446632e-05, + "loss": 2.1305, + "step": 3195 + }, + { + "epoch": 2.39, + "grad_norm": 0.1259765625, + "learning_rate": 4.859191115260393e-05, + "loss": 2.098, + "step": 3200 + }, + { + "epoch": 2.39, + "grad_norm": 0.126953125, + "learning_rate": 4.8026005441392505e-05, + "loss": 2.1109, + "step": 3205 + }, + { + "epoch": 2.39, + "grad_norm": 0.1259765625, + "learning_rate": 4.7462964377923635e-05, + "loss": 2.1043, + "step": 3210 + }, + { + "epoch": 2.4, + "grad_norm": 0.1240234375, + "learning_rate": 4.690279857529145e-05, + "loss": 2.0896, + "step": 3215 + }, + { + "epoch": 2.4, + "grad_norm": 0.125, + "learning_rate": 4.634551859239254e-05, + "loss": 2.1266, + "step": 3220 + }, + { + "epoch": 2.41, + "grad_norm": 0.125, + "learning_rate": 4.57911349337272e-05, + "loss": 2.0982, + "step": 3225 + }, + { + "epoch": 2.41, + "grad_norm": 0.1279296875, + "learning_rate": 4.523965804920078e-05, + "loss": 2.1275, + "step": 3230 + }, + { + "epoch": 2.41, + "grad_norm": 0.1279296875, + "learning_rate": 4.469109833392759e-05, + "loss": 2.1323, + "step": 3235 + }, + { + "epoch": 2.42, + "grad_norm": 0.12451171875, + "learning_rate": 4.414546612803421e-05, + "loss": 2.1026, + "step": 3240 + }, + { + "epoch": 2.42, + "grad_norm": 0.12353515625, + "learning_rate": 4.3602771716464874e-05, + "loss": 2.085, + "step": 3245 + }, + { + "epoch": 2.42, + "grad_norm": 0.126953125, + "learning_rate": 4.3063025328787676e-05, + "loss": 2.1026, + "step": 3250 + }, + { + "epoch": 2.43, + "grad_norm": 0.12451171875, + "learning_rate": 4.252623713900159e-05, + "loss": 2.1093, + "step": 3255 + }, + { + "epoch": 2.43, + "grad_norm": 0.125, + "learning_rate": 4.199241726534495e-05, + "loss": 2.1026, + "step": 3260 + }, + { + "epoch": 2.44, + "grad_norm": 0.1279296875, + "learning_rate": 4.146157577010421e-05, + "loss": 2.1192, + "step": 3265 + }, + { + "epoch": 2.44, + "grad_norm": 0.12451171875, + "learning_rate": 4.0933722659424945e-05, + "loss": 2.1114, + "step": 3270 + }, + { + "epoch": 2.44, + "grad_norm": 0.1259765625, + "learning_rate": 4.040886788312268e-05, + "loss": 2.1239, + "step": 3275 + }, + { + "epoch": 2.45, + "grad_norm": 0.1259765625, + "learning_rate": 3.9887021334495625e-05, + "loss": 2.0952, + "step": 3280 + }, + { + "epoch": 2.45, + "grad_norm": 0.1259765625, + "learning_rate": 3.936819285013826e-05, + "loss": 2.114, + "step": 3285 + }, + { + "epoch": 2.45, + "grad_norm": 0.12890625, + "learning_rate": 3.885239220975547e-05, + "loss": 2.1189, + "step": 3290 + }, + { + "epoch": 2.46, + "grad_norm": 0.125, + "learning_rate": 3.833962913597893e-05, + "loss": 2.0974, + "step": 3295 + }, + { + "epoch": 2.46, + "grad_norm": 0.1259765625, + "learning_rate": 3.7829913294183e-05, + "loss": 2.1085, + "step": 3300 + }, + { + "epoch": 2.47, + "grad_norm": 0.1240234375, + "learning_rate": 3.73232542923033e-05, + "loss": 2.1023, + "step": 3305 + }, + { + "epoch": 2.47, + "grad_norm": 0.125, + "learning_rate": 3.681966168065509e-05, + "loss": 2.1213, + "step": 3310 + }, + { + "epoch": 2.47, + "grad_norm": 0.12451171875, + "learning_rate": 3.6319144951753436e-05, + "loss": 2.1312, + "step": 3315 + }, + { + "epoch": 2.48, + "grad_norm": 0.1259765625, + "learning_rate": 3.582171354013444e-05, + "loss": 2.1268, + "step": 3320 + }, + { + "epoch": 2.48, + "grad_norm": 0.126953125, + "learning_rate": 3.5327376822176885e-05, + "loss": 2.1068, + "step": 3325 + }, + { + "epoch": 2.48, + "grad_norm": 0.126953125, + "learning_rate": 3.483614411592628e-05, + "loss": 2.1057, + "step": 3330 + }, + { + "epoch": 2.49, + "grad_norm": 0.12890625, + "learning_rate": 3.434802468091836e-05, + "loss": 2.103, + "step": 3335 + }, + { + "epoch": 2.49, + "grad_norm": 0.1259765625, + "learning_rate": 3.386302771800527e-05, + "loss": 2.1166, + "step": 3340 + }, + { + "epoch": 2.5, + "grad_norm": 0.1240234375, + "learning_rate": 3.3381162369181717e-05, + "loss": 2.1176, + "step": 3345 + }, + { + "epoch": 2.5, + "grad_norm": 0.125, + "learning_rate": 3.290243771741275e-05, + "loss": 2.1137, + "step": 3350 + }, + { + "epoch": 2.5, + "grad_norm": 0.1259765625, + "learning_rate": 3.2426862786462565e-05, + "loss": 2.1017, + "step": 3355 + }, + { + "epoch": 2.51, + "grad_norm": 0.1259765625, + "learning_rate": 3.195444654072439e-05, + "loss": 2.1049, + "step": 3360 + }, + { + "epoch": 2.51, + "grad_norm": 0.125, + "learning_rate": 3.148519788505166e-05, + "loss": 2.1144, + "step": 3365 + }, + { + "epoch": 2.51, + "grad_norm": 0.125, + "learning_rate": 3.101912566458989e-05, + "loss": 2.0956, + "step": 3370 + }, + { + "epoch": 2.52, + "grad_norm": 0.1259765625, + "learning_rate": 3.0556238664610105e-05, + "loss": 2.1077, + "step": 3375 + }, + { + "epoch": 2.52, + "grad_norm": 0.126953125, + "learning_rate": 3.009654561034323e-05, + "loss": 2.1178, + "step": 3380 + }, + { + "epoch": 2.53, + "grad_norm": 0.12451171875, + "learning_rate": 2.9640055166815673e-05, + "loss": 2.105, + "step": 3385 + }, + { + "epoch": 2.53, + "grad_norm": 0.12451171875, + "learning_rate": 2.918677593868586e-05, + "loss": 2.1051, + "step": 3390 + }, + { + "epoch": 2.53, + "grad_norm": 0.12451171875, + "learning_rate": 2.8736716470082204e-05, + "loss": 2.0967, + "step": 3395 + }, + { + "epoch": 2.54, + "grad_norm": 0.12451171875, + "learning_rate": 2.8289885244441803e-05, + "loss": 2.1174, + "step": 3400 + }, + { + "epoch": 2.54, + "grad_norm": 0.12890625, + "learning_rate": 2.7846290684350963e-05, + "loss": 2.1216, + "step": 3405 + }, + { + "epoch": 2.54, + "grad_norm": 0.1259765625, + "learning_rate": 2.740594115138595e-05, + "loss": 2.1199, + "step": 3410 + }, + { + "epoch": 2.55, + "grad_norm": 0.1240234375, + "learning_rate": 2.6968844945955617e-05, + "loss": 2.1112, + "step": 3415 + }, + { + "epoch": 2.55, + "grad_norm": 0.1259765625, + "learning_rate": 2.6535010307145002e-05, + "loss": 2.1374, + "step": 3420 + }, + { + "epoch": 2.56, + "grad_norm": 0.1259765625, + "learning_rate": 2.6104445412559876e-05, + "loss": 2.1233, + "step": 3425 + }, + { + "epoch": 2.56, + "grad_norm": 0.1259765625, + "learning_rate": 2.5677158378172707e-05, + "loss": 2.1049, + "step": 3430 + }, + { + "epoch": 2.56, + "grad_norm": 0.1279296875, + "learning_rate": 2.5253157258169567e-05, + "loss": 2.1058, + "step": 3435 + }, + { + "epoch": 2.57, + "grad_norm": 0.125, + "learning_rate": 2.4832450044798573e-05, + "loss": 2.1154, + "step": 3440 + }, + { + "epoch": 2.57, + "grad_norm": 0.126953125, + "learning_rate": 2.4415044668218735e-05, + "loss": 2.1126, + "step": 3445 + }, + { + "epoch": 2.57, + "grad_norm": 0.1259765625, + "learning_rate": 2.4000948996351104e-05, + "loss": 2.132, + "step": 3450 + }, + { + "epoch": 2.58, + "grad_norm": 0.1279296875, + "learning_rate": 2.359017083472994e-05, + "loss": 2.1093, + "step": 3455 + }, + { + "epoch": 2.58, + "grad_norm": 0.12353515625, + "learning_rate": 2.3182717926355845e-05, + "loss": 2.0929, + "step": 3460 + }, + { + "epoch": 2.58, + "grad_norm": 0.1259765625, + "learning_rate": 2.277859795154986e-05, + "loss": 2.1068, + "step": 3465 + }, + { + "epoch": 2.59, + "grad_norm": 0.1259765625, + "learning_rate": 2.237781852780838e-05, + "loss": 2.1095, + "step": 3470 + }, + { + "epoch": 2.59, + "grad_norm": 0.125, + "learning_rate": 2.1980387209660026e-05, + "loss": 2.1148, + "step": 3475 + }, + { + "epoch": 2.6, + "grad_norm": 0.126953125, + "learning_rate": 2.1586311488522702e-05, + "loss": 2.1104, + "step": 3480 + }, + { + "epoch": 2.6, + "grad_norm": 0.1259765625, + "learning_rate": 2.1195598792562964e-05, + "loss": 2.1175, + "step": 3485 + }, + { + "epoch": 2.6, + "grad_norm": 0.1259765625, + "learning_rate": 2.0808256486555554e-05, + "loss": 2.1094, + "step": 3490 + }, + { + "epoch": 2.61, + "grad_norm": 0.126953125, + "learning_rate": 2.042429187174475e-05, + "loss": 2.121, + "step": 3495 + }, + { + "epoch": 2.61, + "grad_norm": 0.1240234375, + "learning_rate": 2.0043712185706863e-05, + "loss": 2.1047, + "step": 3500 + }, + { + "epoch": 2.61, + "grad_norm": 0.126953125, + "learning_rate": 1.966652460221341e-05, + "loss": 2.1098, + "step": 3505 + }, + { + "epoch": 2.62, + "grad_norm": 0.1279296875, + "learning_rate": 1.9292736231096464e-05, + "loss": 2.1114, + "step": 3510 + }, + { + "epoch": 2.62, + "grad_norm": 0.12890625, + "learning_rate": 1.8922354118114138e-05, + "loss": 2.1267, + "step": 3515 + }, + { + "epoch": 2.63, + "grad_norm": 0.125, + "learning_rate": 1.8555385244818035e-05, + "loss": 2.0916, + "step": 3520 + }, + { + "epoch": 2.63, + "grad_norm": 0.126953125, + "learning_rate": 1.8191836528421558e-05, + "loss": 2.0985, + "step": 3525 + }, + { + "epoch": 2.63, + "grad_norm": 0.123046875, + "learning_rate": 1.7831714821669588e-05, + "loss": 2.1129, + "step": 3530 + }, + { + "epoch": 2.64, + "grad_norm": 0.1279296875, + "learning_rate": 1.7475026912709235e-05, + "loss": 2.0889, + "step": 3535 + }, + { + "epoch": 2.64, + "grad_norm": 0.12890625, + "learning_rate": 1.71217795249619e-05, + "loss": 2.1067, + "step": 3540 + }, + { + "epoch": 2.64, + "grad_norm": 0.1240234375, + "learning_rate": 1.6771979316996677e-05, + "loss": 2.0987, + "step": 3545 + }, + { + "epoch": 2.65, + "grad_norm": 0.1259765625, + "learning_rate": 1.6425632882404618e-05, + "loss": 2.099, + "step": 3550 + }, + { + "epoch": 2.65, + "grad_norm": 0.1279296875, + "learning_rate": 1.6082746749674604e-05, + "loss": 2.1211, + "step": 3555 + }, + { + "epoch": 2.66, + "grad_norm": 0.12451171875, + "learning_rate": 1.5743327382070206e-05, + "loss": 2.1099, + "step": 3560 + }, + { + "epoch": 2.66, + "grad_norm": 0.125, + "learning_rate": 1.540738117750793e-05, + "loss": 2.1109, + "step": 3565 + }, + { + "epoch": 2.66, + "grad_norm": 0.126953125, + "learning_rate": 1.507491446843654e-05, + "loss": 2.1064, + "step": 3570 + }, + { + "epoch": 2.67, + "grad_norm": 0.1259765625, + "learning_rate": 1.4745933521717781e-05, + "loss": 2.0908, + "step": 3575 + }, + { + "epoch": 2.67, + "grad_norm": 0.1259765625, + "learning_rate": 1.4420444538508083e-05, + "loss": 2.119, + "step": 3580 + }, + { + "epoch": 2.67, + "grad_norm": 0.125, + "learning_rate": 1.4098453654141975e-05, + "loss": 2.1008, + "step": 3585 + }, + { + "epoch": 2.68, + "grad_norm": 0.130859375, + "learning_rate": 1.377996693801611e-05, + "loss": 2.1134, + "step": 3590 + }, + { + "epoch": 2.68, + "grad_norm": 0.126953125, + "learning_rate": 1.346499039347504e-05, + "loss": 2.1147, + "step": 3595 + }, + { + "epoch": 2.69, + "grad_norm": 0.1259765625, + "learning_rate": 1.3153529957698008e-05, + "loss": 2.1141, + "step": 3600 + }, + { + "epoch": 2.69, + "grad_norm": 0.1259765625, + "learning_rate": 1.2845591501587017e-05, + "loss": 2.0835, + "step": 3605 + }, + { + "epoch": 2.69, + "grad_norm": 0.126953125, + "learning_rate": 1.254118082965634e-05, + "loss": 2.1109, + "step": 3610 + }, + { + "epoch": 2.7, + "grad_norm": 0.1259765625, + "learning_rate": 1.2240303679922727e-05, + "loss": 2.1165, + "step": 3615 + }, + { + "epoch": 2.7, + "grad_norm": 0.1240234375, + "learning_rate": 1.1942965723797671e-05, + "loss": 2.1035, + "step": 3620 + }, + { + "epoch": 2.7, + "grad_norm": 0.1259765625, + "learning_rate": 1.164917256598017e-05, + "loss": 2.1112, + "step": 3625 + }, + { + "epoch": 2.71, + "grad_norm": 0.12451171875, + "learning_rate": 1.1358929744351332e-05, + "loss": 2.1051, + "step": 3630 + }, + { + "epoch": 2.71, + "grad_norm": 0.1279296875, + "learning_rate": 1.1072242729869819e-05, + "loss": 2.1133, + "step": 3635 + }, + { + "epoch": 2.72, + "grad_norm": 0.123046875, + "learning_rate": 1.0789116926468756e-05, + "loss": 2.1097, + "step": 3640 + }, + { + "epoch": 2.72, + "grad_norm": 0.125, + "learning_rate": 1.050955767095403e-05, + "loss": 2.1082, + "step": 3645 + }, + { + "epoch": 2.72, + "grad_norm": 0.1259765625, + "learning_rate": 1.0233570232903323e-05, + "loss": 2.1208, + "step": 3650 + }, + { + "epoch": 2.73, + "grad_norm": 0.1240234375, + "learning_rate": 9.961159814567267e-06, + "loss": 2.097, + "step": 3655 + }, + { + "epoch": 2.73, + "grad_norm": 0.1259765625, + "learning_rate": 9.692331550770918e-06, + "loss": 2.1102, + "step": 3660 + }, + { + "epoch": 2.73, + "grad_norm": 0.126953125, + "learning_rate": 9.42709050881736e-06, + "loss": 2.0946, + "step": 3665 + }, + { + "epoch": 2.74, + "grad_norm": 0.125, + "learning_rate": 9.165441688391885e-06, + "loss": 2.1129, + "step": 3670 + }, + { + "epoch": 2.74, + "grad_norm": 0.1259765625, + "learning_rate": 8.907390021467921e-06, + "loss": 2.1016, + "step": 3675 + }, + { + "epoch": 2.75, + "grad_norm": 0.12451171875, + "learning_rate": 8.652940372214069e-06, + "loss": 2.0817, + "step": 3680 + }, + { + "epoch": 2.75, + "grad_norm": 0.125, + "learning_rate": 8.40209753690222e-06, + "loss": 2.1327, + "step": 3685 + }, + { + "epoch": 2.75, + "grad_norm": 0.125, + "learning_rate": 8.154866243817494e-06, + "loss": 2.1231, + "step": 3690 + }, + { + "epoch": 2.76, + "grad_norm": 0.125, + "learning_rate": 7.911251153168752e-06, + "loss": 2.1175, + "step": 3695 + }, + { + "epoch": 2.76, + "grad_norm": 0.1259765625, + "learning_rate": 7.67125685700103e-06, + "loss": 2.113, + "step": 3700 + }, + { + "epoch": 2.76, + "grad_norm": 0.1240234375, + "learning_rate": 7.434887879108776e-06, + "loss": 2.0957, + "step": 3705 + }, + { + "epoch": 2.77, + "grad_norm": 0.126953125, + "learning_rate": 7.202148674950704e-06, + "loss": 2.1117, + "step": 3710 + }, + { + "epoch": 2.77, + "grad_norm": 0.12451171875, + "learning_rate": 6.97304363156579e-06, + "loss": 2.1021, + "step": 3715 + }, + { + "epoch": 2.78, + "grad_norm": 0.12451171875, + "learning_rate": 6.747577067490563e-06, + "loss": 2.1233, + "step": 3720 + }, + { + "epoch": 2.78, + "grad_norm": 0.126953125, + "learning_rate": 6.525753232677678e-06, + "loss": 2.1289, + "step": 3725 + }, + { + "epoch": 2.78, + "grad_norm": 0.125, + "learning_rate": 6.307576308415852e-06, + "loss": 2.1155, + "step": 3730 + }, + { + "epoch": 2.79, + "grad_norm": 0.12451171875, + "learning_rate": 6.093050407251033e-06, + "loss": 2.108, + "step": 3735 + }, + { + "epoch": 2.79, + "grad_norm": 0.130859375, + "learning_rate": 5.882179572908841e-06, + "loss": 2.1112, + "step": 3740 + }, + { + "epoch": 2.79, + "grad_norm": 0.125, + "learning_rate": 5.6749677802184095e-06, + "loss": 2.118, + "step": 3745 + }, + { + "epoch": 2.8, + "grad_norm": 0.12353515625, + "learning_rate": 5.471418935037398e-06, + "loss": 2.096, + "step": 3750 + }, + { + "epoch": 2.8, + "grad_norm": 0.1279296875, + "learning_rate": 5.271536874178451e-06, + "loss": 2.1087, + "step": 3755 + }, + { + "epoch": 2.8, + "grad_norm": 0.1259765625, + "learning_rate": 5.075325365336791e-06, + "loss": 2.1044, + "step": 3760 + }, + { + "epoch": 2.81, + "grad_norm": 0.1279296875, + "learning_rate": 4.882788107019231e-06, + "loss": 2.127, + "step": 3765 + }, + { + "epoch": 2.81, + "grad_norm": 0.12451171875, + "learning_rate": 4.693928728474517e-06, + "loss": 2.0874, + "step": 3770 + }, + { + "epoch": 2.82, + "grad_norm": 0.1259765625, + "learning_rate": 4.5087507896247605e-06, + "loss": 2.1114, + "step": 3775 + }, + { + "epoch": 2.82, + "grad_norm": 0.12890625, + "learning_rate": 4.327257780998517e-06, + "loss": 2.1198, + "step": 3780 + }, + { + "epoch": 2.82, + "grad_norm": 0.1259765625, + "learning_rate": 4.149453123664881e-06, + "loss": 2.1174, + "step": 3785 + }, + { + "epoch": 2.83, + "grad_norm": 0.1259765625, + "learning_rate": 3.975340169169095e-06, + "loss": 2.0976, + "step": 3790 + }, + { + "epoch": 2.83, + "grad_norm": 0.125, + "learning_rate": 3.804922199469174e-06, + "loss": 2.0964, + "step": 3795 + }, + { + "epoch": 2.83, + "grad_norm": 0.1259765625, + "learning_rate": 3.6382024268743153e-06, + "loss": 2.1217, + "step": 3800 + }, + { + "epoch": 2.84, + "grad_norm": 0.126953125, + "learning_rate": 3.4751839939841435e-06, + "loss": 2.1284, + "step": 3805 + }, + { + "epoch": 2.84, + "grad_norm": 0.12451171875, + "learning_rate": 3.3158699736295375e-06, + "loss": 2.1031, + "step": 3810 + }, + { + "epoch": 2.85, + "grad_norm": 0.1240234375, + "learning_rate": 3.160263368814764e-06, + "loss": 2.1215, + "step": 3815 + }, + { + "epoch": 2.85, + "grad_norm": 0.1298828125, + "learning_rate": 3.0083671126607484e-06, + "loss": 2.1315, + "step": 3820 + }, + { + "epoch": 2.85, + "grad_norm": 0.126953125, + "learning_rate": 2.860184068349958e-06, + "loss": 2.1137, + "step": 3825 + }, + { + "epoch": 2.86, + "grad_norm": 0.1259765625, + "learning_rate": 2.7157170290721625e-06, + "loss": 2.1069, + "step": 3830 + }, + { + "epoch": 2.86, + "grad_norm": 0.12451171875, + "learning_rate": 2.5749687179721815e-06, + "loss": 2.1111, + "step": 3835 + }, + { + "epoch": 2.86, + "grad_norm": 0.12451171875, + "learning_rate": 2.4379417880981304e-06, + "loss": 2.0955, + "step": 3840 + }, + { + "epoch": 2.87, + "grad_norm": 0.12451171875, + "learning_rate": 2.304638822351701e-06, + "loss": 2.1026, + "step": 3845 + }, + { + "epoch": 2.87, + "grad_norm": 0.125, + "learning_rate": 2.1750623334393816e-06, + "loss": 2.0882, + "step": 3850 + }, + { + "epoch": 2.88, + "grad_norm": 0.125, + "learning_rate": 2.049214763825069e-06, + "loss": 2.1075, + "step": 3855 + }, + { + "epoch": 2.88, + "grad_norm": 0.126953125, + "learning_rate": 1.9270984856840867e-06, + "loss": 2.1132, + "step": 3860 + }, + { + "epoch": 2.88, + "grad_norm": 0.125, + "learning_rate": 1.8087158008583515e-06, + "loss": 2.1085, + "step": 3865 + }, + { + "epoch": 2.89, + "grad_norm": 0.125, + "learning_rate": 1.6940689408132092e-06, + "loss": 2.0978, + "step": 3870 + }, + { + "epoch": 2.89, + "grad_norm": 0.126953125, + "learning_rate": 1.583160066595113e-06, + "loss": 2.1095, + "step": 3875 + }, + { + "epoch": 2.89, + "grad_norm": 0.1240234375, + "learning_rate": 1.4759912687910771e-06, + "loss": 2.1079, + "step": 3880 + }, + { + "epoch": 2.9, + "grad_norm": 0.1279296875, + "learning_rate": 1.3725645674891762e-06, + "loss": 2.1186, + "step": 3885 + }, + { + "epoch": 2.9, + "grad_norm": 0.12890625, + "learning_rate": 1.2728819122404646e-06, + "loss": 2.1165, + "step": 3890 + }, + { + "epoch": 2.91, + "grad_norm": 0.12353515625, + "learning_rate": 1.1769451820223376e-06, + "loss": 2.1125, + "step": 3895 + }, + { + "epoch": 2.91, + "grad_norm": 0.126953125, + "learning_rate": 1.084756185202962e-06, + "loss": 2.1132, + "step": 3900 + }, + { + "epoch": 2.91, + "grad_norm": 0.126953125, + "learning_rate": 9.963166595073014e-07, + "loss": 2.1062, + "step": 3905 + }, + { + "epoch": 2.92, + "grad_norm": 0.1240234375, + "learning_rate": 9.116282719842772e-07, + "loss": 2.1068, + "step": 3910 + }, + { + "epoch": 2.92, + "grad_norm": 0.12451171875, + "learning_rate": 8.306926189754372e-07, + "loss": 2.1134, + "step": 3915 + }, + { + "epoch": 2.92, + "grad_norm": 0.1279296875, + "learning_rate": 7.535112260847799e-07, + "loss": 2.1227, + "step": 3920 + }, + { + "epoch": 2.93, + "grad_norm": 0.1240234375, + "learning_rate": 6.800855481500445e-07, + "loss": 2.1204, + "step": 3925 + }, + { + "epoch": 2.93, + "grad_norm": 0.126953125, + "learning_rate": 6.104169692153105e-07, + "loss": 2.1101, + "step": 3930 + }, + { + "epoch": 2.94, + "grad_norm": 0.1279296875, + "learning_rate": 5.44506802504774e-07, + "loss": 2.1111, + "step": 3935 + }, + { + "epoch": 2.94, + "grad_norm": 0.125, + "learning_rate": 4.823562903982337e-07, + "loss": 2.1068, + "step": 3940 + }, + { + "epoch": 2.94, + "grad_norm": 0.1298828125, + "learning_rate": 4.239666044074442e-07, + "loss": 2.1416, + "step": 3945 + }, + { + "epoch": 2.95, + "grad_norm": 0.12451171875, + "learning_rate": 3.693388451541102e-07, + "loss": 2.1217, + "step": 3950 + }, + { + "epoch": 2.95, + "grad_norm": 0.1240234375, + "learning_rate": 3.1847404234923715e-07, + "loss": 2.1201, + "step": 3955 + }, + { + "epoch": 2.95, + "grad_norm": 0.1259765625, + "learning_rate": 2.713731547735687e-07, + "loss": 2.1135, + "step": 3960 + }, + { + "epoch": 2.96, + "grad_norm": 0.125, + "learning_rate": 2.280370702596013e-07, + "loss": 2.0992, + "step": 3965 + }, + { + "epoch": 2.96, + "grad_norm": 0.12353515625, + "learning_rate": 1.8846660567484186e-07, + "loss": 2.1167, + "step": 3970 + }, + { + "epoch": 2.97, + "grad_norm": 0.12451171875, + "learning_rate": 1.5266250690635363e-07, + "loss": 2.0971, + "step": 3975 + }, + { + "epoch": 2.97, + "grad_norm": 0.1240234375, + "learning_rate": 1.2062544884683391e-07, + "loss": 2.1277, + "step": 3980 + }, + { + "epoch": 2.97, + "grad_norm": 0.125, + "learning_rate": 9.235603538171322e-08, + "loss": 2.1004, + "step": 3985 + }, + { + "epoch": 2.98, + "grad_norm": 0.12451171875, + "learning_rate": 6.785479937789773e-08, + "loss": 2.0967, + "step": 3990 + }, + { + "epoch": 2.98, + "grad_norm": 0.1259765625, + "learning_rate": 4.712220267366618e-08, + "loss": 2.1211, + "step": 3995 + }, + { + "epoch": 2.98, + "grad_norm": 0.126953125, + "learning_rate": 3.015863607003233e-08, + "loss": 2.1347, + "step": 4000 + }, + { + "epoch": 2.99, + "grad_norm": 0.125, + "learning_rate": 1.69644193232843e-08, + "loss": 2.1306, + "step": 4005 + }, + { + "epoch": 2.99, + "grad_norm": 0.1240234375, + "learning_rate": 7.539801139011538e-09, + "loss": 2.1013, + "step": 4010 + }, + { + "epoch": 3.0, + "grad_norm": 0.1240234375, + "learning_rate": 1.884959167419709e-09, + "loss": 2.099, + "step": 4015 + }, + { + "epoch": 3.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0, + "loss": 2.0989, + "step": 4020 + }, + { + "epoch": 3.0, + "eval_loss": 2.1637070178985596, + "eval_runtime": 187.0083, + "eval_samples_per_second": 25.785, + "eval_steps_per_second": 3.224, + "step": 4020 + }, + { + "epoch": 3.0, + "step": 4020, + "total_flos": 9.114826330351862e+17, + "train_loss": 2.1598364234563725, + "train_runtime": 32657.7671, + "train_samples_per_second": 7.879, + "train_steps_per_second": 0.123 } ], "logging_steps": 5, - "max_steps": 670, + "max_steps": 4020, "num_input_tokens_seen": 0, - "num_train_epochs": 1, + "num_train_epochs": 3, "save_steps": 100, - "total_flos": 3.038889647317975e+17, + "total_flos": 9.114826330351862e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null