{ "best_metric": null, "best_model_checkpoint": null, "epoch": 69.23076923076923, "eval_steps": 100, "global_step": 13500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5128205128205128, "grad_norm": 92.9238510131836, "learning_rate": 9.9907e-06, "loss": 2.4016, "step": 100 }, { "epoch": 0.5128205128205128, "eval_loss": 2.1255013942718506, "eval_runtime": 34.4318, "eval_samples_per_second": 11.414, "eval_steps_per_second": 1.452, "step": 100 }, { "epoch": 1.0256410256410255, "grad_norm": 47.019439697265625, "learning_rate": 9.980700000000001e-06, "loss": 2.0696, "step": 200 }, { "epoch": 1.0256410256410255, "eval_loss": 1.9405728578567505, "eval_runtime": 34.0713, "eval_samples_per_second": 11.535, "eval_steps_per_second": 1.468, "step": 200 }, { "epoch": 1.5384615384615383, "grad_norm": 52.230751037597656, "learning_rate": 9.970700000000001e-06, "loss": 1.9983, "step": 300 }, { "epoch": 1.5384615384615383, "eval_loss": 1.900020718574524, "eval_runtime": 34.1694, "eval_samples_per_second": 11.502, "eval_steps_per_second": 1.463, "step": 300 }, { "epoch": 2.051282051282051, "grad_norm": 39.62374496459961, "learning_rate": 9.960800000000001e-06, "loss": 1.8888, "step": 400 }, { "epoch": 2.051282051282051, "eval_loss": 1.8164124488830566, "eval_runtime": 34.1213, "eval_samples_per_second": 11.518, "eval_steps_per_second": 1.465, "step": 400 }, { "epoch": 2.564102564102564, "grad_norm": 39.697731018066406, "learning_rate": 9.9508e-06, "loss": 1.8456, "step": 500 }, { "epoch": 2.564102564102564, "eval_loss": 1.7753618955612183, "eval_runtime": 34.149, "eval_samples_per_second": 11.508, "eval_steps_per_second": 1.464, "step": 500 }, { "epoch": 3.076923076923077, "grad_norm": 52.823184967041016, "learning_rate": 9.9408e-06, "loss": 1.7839, "step": 600 }, { "epoch": 3.076923076923077, "eval_loss": 1.7344136238098145, "eval_runtime": 34.121, "eval_samples_per_second": 11.518, "eval_steps_per_second": 1.465, "step": 600 }, { "epoch": 3.58974358974359, "grad_norm": 107.05223083496094, "learning_rate": 9.9308e-06, "loss": 1.7544, "step": 700 }, { "epoch": 3.58974358974359, "eval_loss": 1.6865615844726562, "eval_runtime": 34.0842, "eval_samples_per_second": 11.53, "eval_steps_per_second": 1.467, "step": 700 }, { "epoch": 4.102564102564102, "grad_norm": 53.641353607177734, "learning_rate": 9.9208e-06, "loss": 1.6812, "step": 800 }, { "epoch": 4.102564102564102, "eval_loss": 1.6568766832351685, "eval_runtime": 34.432, "eval_samples_per_second": 11.414, "eval_steps_per_second": 1.452, "step": 800 }, { "epoch": 4.615384615384615, "grad_norm": 40.92328643798828, "learning_rate": 9.9109e-06, "loss": 1.6501, "step": 900 }, { "epoch": 4.615384615384615, "eval_loss": 1.6080188751220703, "eval_runtime": 34.1751, "eval_samples_per_second": 11.5, "eval_steps_per_second": 1.463, "step": 900 }, { "epoch": 5.128205128205128, "grad_norm": 28.52039909362793, "learning_rate": 9.9009e-06, "loss": 1.6579, "step": 1000 }, { "epoch": 5.128205128205128, "eval_loss": 1.6059809923171997, "eval_runtime": 34.1634, "eval_samples_per_second": 11.504, "eval_steps_per_second": 1.464, "step": 1000 }, { "epoch": 5.641025641025641, "grad_norm": 73.21099090576172, "learning_rate": 9.8909e-06, "loss": 1.6286, "step": 1100 }, { "epoch": 5.641025641025641, "eval_loss": 1.5779187679290771, "eval_runtime": 34.1489, "eval_samples_per_second": 11.508, "eval_steps_per_second": 1.464, "step": 1100 }, { "epoch": 6.153846153846154, "grad_norm": 36.768428802490234, "learning_rate": 9.8809e-06, "loss": 1.5871, "step": 1200 }, { "epoch": 6.153846153846154, "eval_loss": 1.5641562938690186, "eval_runtime": 34.1081, "eval_samples_per_second": 11.522, "eval_steps_per_second": 1.466, "step": 1200 }, { "epoch": 6.666666666666667, "grad_norm": 28.098352432250977, "learning_rate": 9.8709e-06, "loss": 1.6231, "step": 1300 }, { "epoch": 6.666666666666667, "eval_loss": 1.530659556388855, "eval_runtime": 34.0717, "eval_samples_per_second": 11.534, "eval_steps_per_second": 1.467, "step": 1300 }, { "epoch": 7.17948717948718, "grad_norm": 48.131195068359375, "learning_rate": 9.8609e-06, "loss": 1.5178, "step": 1400 }, { "epoch": 7.17948717948718, "eval_loss": 1.5207794904708862, "eval_runtime": 34.0224, "eval_samples_per_second": 11.551, "eval_steps_per_second": 1.47, "step": 1400 }, { "epoch": 7.6923076923076925, "grad_norm": 15.9362211227417, "learning_rate": 9.8509e-06, "loss": 1.5434, "step": 1500 }, { "epoch": 7.6923076923076925, "eval_loss": 1.4978805780410767, "eval_runtime": 34.0269, "eval_samples_per_second": 11.55, "eval_steps_per_second": 1.469, "step": 1500 }, { "epoch": 8.205128205128204, "grad_norm": 21.21210479736328, "learning_rate": 9.840900000000001e-06, "loss": 1.5368, "step": 1600 }, { "epoch": 8.205128205128204, "eval_loss": 1.4791795015335083, "eval_runtime": 34.5402, "eval_samples_per_second": 11.378, "eval_steps_per_second": 1.448, "step": 1600 }, { "epoch": 8.717948717948717, "grad_norm": 39.53807830810547, "learning_rate": 9.830900000000001e-06, "loss": 1.5163, "step": 1700 }, { "epoch": 8.717948717948717, "eval_loss": 1.4630368947982788, "eval_runtime": 34.1482, "eval_samples_per_second": 11.509, "eval_steps_per_second": 1.464, "step": 1700 }, { "epoch": 9.23076923076923, "grad_norm": 42.564842224121094, "learning_rate": 9.820900000000001e-06, "loss": 1.483, "step": 1800 }, { "epoch": 9.23076923076923, "eval_loss": 1.4529582262039185, "eval_runtime": 34.2374, "eval_samples_per_second": 11.479, "eval_steps_per_second": 1.46, "step": 1800 }, { "epoch": 9.743589743589745, "grad_norm": 55.969017028808594, "learning_rate": 9.810900000000001e-06, "loss": 1.4795, "step": 1900 }, { "epoch": 9.743589743589745, "eval_loss": 1.4357120990753174, "eval_runtime": 34.2522, "eval_samples_per_second": 11.474, "eval_steps_per_second": 1.46, "step": 1900 }, { "epoch": 10.256410256410255, "grad_norm": 25.308517456054688, "learning_rate": 9.800900000000001e-06, "loss": 1.4151, "step": 2000 }, { "epoch": 10.256410256410255, "eval_loss": 1.432659387588501, "eval_runtime": 34.2016, "eval_samples_per_second": 11.491, "eval_steps_per_second": 1.462, "step": 2000 }, { "epoch": 10.76923076923077, "grad_norm": 54.97860336303711, "learning_rate": 9.790900000000001e-06, "loss": 1.4665, "step": 2100 }, { "epoch": 10.76923076923077, "eval_loss": 1.4322772026062012, "eval_runtime": 34.2513, "eval_samples_per_second": 11.474, "eval_steps_per_second": 1.46, "step": 2100 }, { "epoch": 11.282051282051283, "grad_norm": 26.281103134155273, "learning_rate": 9.780900000000002e-06, "loss": 1.4336, "step": 2200 }, { "epoch": 11.282051282051283, "eval_loss": 1.41593337059021, "eval_runtime": 33.815, "eval_samples_per_second": 11.622, "eval_steps_per_second": 1.479, "step": 2200 }, { "epoch": 11.794871794871796, "grad_norm": 37.469024658203125, "learning_rate": 9.770900000000002e-06, "loss": 1.4149, "step": 2300 }, { "epoch": 11.794871794871796, "eval_loss": 1.4032217264175415, "eval_runtime": 33.8113, "eval_samples_per_second": 11.623, "eval_steps_per_second": 1.479, "step": 2300 }, { "epoch": 12.307692307692308, "grad_norm": 17.248014450073242, "learning_rate": 9.760900000000002e-06, "loss": 1.3918, "step": 2400 }, { "epoch": 12.307692307692308, "eval_loss": 1.3832417726516724, "eval_runtime": 34.1456, "eval_samples_per_second": 11.51, "eval_steps_per_second": 1.464, "step": 2400 }, { "epoch": 12.820512820512821, "grad_norm": 36.33113479614258, "learning_rate": 9.7509e-06, "loss": 1.4319, "step": 2500 }, { "epoch": 12.820512820512821, "eval_loss": 1.384070634841919, "eval_runtime": 33.7658, "eval_samples_per_second": 11.639, "eval_steps_per_second": 1.481, "step": 2500 }, { "epoch": 13.333333333333334, "grad_norm": 28.724761962890625, "learning_rate": 9.7409e-06, "loss": 1.3936, "step": 2600 }, { "epoch": 13.333333333333334, "eval_loss": 1.3891230821609497, "eval_runtime": 33.7884, "eval_samples_per_second": 11.631, "eval_steps_per_second": 1.48, "step": 2600 }, { "epoch": 13.846153846153847, "grad_norm": 25.300365447998047, "learning_rate": 9.7309e-06, "loss": 1.386, "step": 2700 }, { "epoch": 13.846153846153847, "eval_loss": 1.373278260231018, "eval_runtime": 33.8109, "eval_samples_per_second": 11.623, "eval_steps_per_second": 1.479, "step": 2700 }, { "epoch": 14.35897435897436, "grad_norm": 165.74114990234375, "learning_rate": 9.7209e-06, "loss": 1.3514, "step": 2800 }, { "epoch": 14.35897435897436, "eval_loss": 1.376756191253662, "eval_runtime": 33.8844, "eval_samples_per_second": 11.598, "eval_steps_per_second": 1.476, "step": 2800 }, { "epoch": 14.871794871794872, "grad_norm": 37.75138854980469, "learning_rate": 9.7109e-06, "loss": 1.3542, "step": 2900 }, { "epoch": 14.871794871794872, "eval_loss": 1.3550430536270142, "eval_runtime": 34.0038, "eval_samples_per_second": 11.558, "eval_steps_per_second": 1.47, "step": 2900 }, { "epoch": 15.384615384615385, "grad_norm": 41.38801956176758, "learning_rate": 9.7009e-06, "loss": 1.3653, "step": 3000 }, { "epoch": 15.384615384615385, "eval_loss": 1.3493261337280273, "eval_runtime": 33.7831, "eval_samples_per_second": 11.633, "eval_steps_per_second": 1.48, "step": 3000 }, { "epoch": 15.897435897435898, "grad_norm": 54.01289749145508, "learning_rate": 9.6909e-06, "loss": 1.3451, "step": 3100 }, { "epoch": 15.897435897435898, "eval_loss": 1.3563708066940308, "eval_runtime": 33.7662, "eval_samples_per_second": 11.639, "eval_steps_per_second": 1.481, "step": 3100 }, { "epoch": 16.41025641025641, "grad_norm": 47.22911834716797, "learning_rate": 9.6809e-06, "loss": 1.323, "step": 3200 }, { "epoch": 16.41025641025641, "eval_loss": 1.343112587928772, "eval_runtime": 34.3174, "eval_samples_per_second": 11.452, "eval_steps_per_second": 1.457, "step": 3200 }, { "epoch": 16.923076923076923, "grad_norm": 254.61305236816406, "learning_rate": 9.670900000000001e-06, "loss": 1.3348, "step": 3300 }, { "epoch": 16.923076923076923, "eval_loss": 1.3341063261032104, "eval_runtime": 33.9905, "eval_samples_per_second": 11.562, "eval_steps_per_second": 1.471, "step": 3300 }, { "epoch": 17.435897435897434, "grad_norm": 25.823589324951172, "learning_rate": 9.660900000000001e-06, "loss": 1.3185, "step": 3400 }, { "epoch": 17.435897435897434, "eval_loss": 1.3256839513778687, "eval_runtime": 33.9185, "eval_samples_per_second": 11.587, "eval_steps_per_second": 1.474, "step": 3400 }, { "epoch": 17.94871794871795, "grad_norm": 44.86510467529297, "learning_rate": 9.650900000000001e-06, "loss": 1.328, "step": 3500 }, { "epoch": 17.94871794871795, "eval_loss": 1.3411694765090942, "eval_runtime": 33.8769, "eval_samples_per_second": 11.601, "eval_steps_per_second": 1.476, "step": 3500 }, { "epoch": 18.46153846153846, "grad_norm": 20.39423942565918, "learning_rate": 9.640900000000001e-06, "loss": 1.3306, "step": 3600 }, { "epoch": 18.46153846153846, "eval_loss": 1.318002462387085, "eval_runtime": 33.7275, "eval_samples_per_second": 11.652, "eval_steps_per_second": 1.482, "step": 3600 }, { "epoch": 18.974358974358974, "grad_norm": 42.16006088256836, "learning_rate": 9.630900000000001e-06, "loss": 1.301, "step": 3700 }, { "epoch": 18.974358974358974, "eval_loss": 1.3143377304077148, "eval_runtime": 33.8038, "eval_samples_per_second": 11.626, "eval_steps_per_second": 1.479, "step": 3700 }, { "epoch": 19.487179487179485, "grad_norm": 36.292999267578125, "learning_rate": 9.620900000000001e-06, "loss": 1.3036, "step": 3800 }, { "epoch": 19.487179487179485, "eval_loss": 1.3157984018325806, "eval_runtime": 33.7447, "eval_samples_per_second": 11.646, "eval_steps_per_second": 1.482, "step": 3800 }, { "epoch": 20.0, "grad_norm": 64.44886016845703, "learning_rate": 9.610900000000001e-06, "loss": 1.3077, "step": 3900 }, { "epoch": 20.0, "eval_loss": 1.31955885887146, "eval_runtime": 33.8296, "eval_samples_per_second": 11.617, "eval_steps_per_second": 1.478, "step": 3900 }, { "epoch": 20.51282051282051, "grad_norm": 37.88364028930664, "learning_rate": 9.600900000000002e-06, "loss": 1.3092, "step": 4000 }, { "epoch": 20.51282051282051, "eval_loss": 1.3180863857269287, "eval_runtime": 34.1642, "eval_samples_per_second": 11.503, "eval_steps_per_second": 1.464, "step": 4000 }, { "epoch": 21.025641025641026, "grad_norm": 27.300247192382812, "learning_rate": 9.5909e-06, "loss": 1.27, "step": 4100 }, { "epoch": 21.025641025641026, "eval_loss": 1.3031089305877686, "eval_runtime": 34.0151, "eval_samples_per_second": 11.554, "eval_steps_per_second": 1.47, "step": 4100 }, { "epoch": 21.53846153846154, "grad_norm": 27.94341278076172, "learning_rate": 9.5809e-06, "loss": 1.2983, "step": 4200 }, { "epoch": 21.53846153846154, "eval_loss": 1.3018323183059692, "eval_runtime": 34.1294, "eval_samples_per_second": 11.515, "eval_steps_per_second": 1.465, "step": 4200 }, { "epoch": 22.05128205128205, "grad_norm": 35.468135833740234, "learning_rate": 9.5709e-06, "loss": 1.2568, "step": 4300 }, { "epoch": 22.05128205128205, "eval_loss": 1.3130985498428345, "eval_runtime": 33.9979, "eval_samples_per_second": 11.56, "eval_steps_per_second": 1.471, "step": 4300 }, { "epoch": 22.564102564102566, "grad_norm": 21.431076049804688, "learning_rate": 9.5609e-06, "loss": 1.2812, "step": 4400 }, { "epoch": 22.564102564102566, "eval_loss": 1.293820858001709, "eval_runtime": 33.9036, "eval_samples_per_second": 11.592, "eval_steps_per_second": 1.475, "step": 4400 }, { "epoch": 23.076923076923077, "grad_norm": 59.38164520263672, "learning_rate": 9.5509e-06, "loss": 1.2776, "step": 4500 }, { "epoch": 23.076923076923077, "eval_loss": 1.2872756719589233, "eval_runtime": 33.8513, "eval_samples_per_second": 11.61, "eval_steps_per_second": 1.477, "step": 4500 }, { "epoch": 23.58974358974359, "grad_norm": 48.05171585083008, "learning_rate": 9.5409e-06, "loss": 1.2488, "step": 4600 }, { "epoch": 23.58974358974359, "eval_loss": 1.2994896173477173, "eval_runtime": 33.9738, "eval_samples_per_second": 11.568, "eval_steps_per_second": 1.472, "step": 4600 }, { "epoch": 24.102564102564102, "grad_norm": 32.86454391479492, "learning_rate": 9.5309e-06, "loss": 1.3085, "step": 4700 }, { "epoch": 24.102564102564102, "eval_loss": 1.2995264530181885, "eval_runtime": 33.8598, "eval_samples_per_second": 11.607, "eval_steps_per_second": 1.477, "step": 4700 }, { "epoch": 24.615384615384617, "grad_norm": 36.235042572021484, "learning_rate": 9.5209e-06, "loss": 1.2439, "step": 4800 }, { "epoch": 24.615384615384617, "eval_loss": 1.285250186920166, "eval_runtime": 34.2054, "eval_samples_per_second": 11.489, "eval_steps_per_second": 1.462, "step": 4800 }, { "epoch": 25.128205128205128, "grad_norm": 17.993911743164062, "learning_rate": 9.5109e-06, "loss": 1.2427, "step": 4900 }, { "epoch": 25.128205128205128, "eval_loss": 1.2752227783203125, "eval_runtime": 33.8959, "eval_samples_per_second": 11.594, "eval_steps_per_second": 1.475, "step": 4900 }, { "epoch": 25.641025641025642, "grad_norm": 55.71548080444336, "learning_rate": 9.501000000000001e-06, "loss": 1.2774, "step": 5000 }, { "epoch": 25.641025641025642, "eval_loss": 1.28258216381073, "eval_runtime": 33.7988, "eval_samples_per_second": 11.628, "eval_steps_per_second": 1.479, "step": 5000 }, { "epoch": 26.153846153846153, "grad_norm": 30.139249801635742, "learning_rate": 9.491000000000001e-06, "loss": 1.2224, "step": 5100 }, { "epoch": 26.153846153846153, "eval_loss": 1.2681365013122559, "eval_runtime": 33.759, "eval_samples_per_second": 11.641, "eval_steps_per_second": 1.481, "step": 5100 }, { "epoch": 26.666666666666668, "grad_norm": 39.02808380126953, "learning_rate": 9.481000000000001e-06, "loss": 1.2226, "step": 5200 }, { "epoch": 26.666666666666668, "eval_loss": 1.2783496379852295, "eval_runtime": 33.8934, "eval_samples_per_second": 11.595, "eval_steps_per_second": 1.475, "step": 5200 }, { "epoch": 27.17948717948718, "grad_norm": 27.232601165771484, "learning_rate": 9.471000000000001e-06, "loss": 1.2542, "step": 5300 }, { "epoch": 27.17948717948718, "eval_loss": 1.2665411233901978, "eval_runtime": 33.8407, "eval_samples_per_second": 11.613, "eval_steps_per_second": 1.478, "step": 5300 }, { "epoch": 27.692307692307693, "grad_norm": 25.21843719482422, "learning_rate": 9.461000000000001e-06, "loss": 1.2193, "step": 5400 }, { "epoch": 27.692307692307693, "eval_loss": 1.2602121829986572, "eval_runtime": 33.8192, "eval_samples_per_second": 11.621, "eval_steps_per_second": 1.478, "step": 5400 }, { "epoch": 28.205128205128204, "grad_norm": 27.98350715637207, "learning_rate": 9.451000000000002e-06, "loss": 1.2602, "step": 5500 }, { "epoch": 28.205128205128204, "eval_loss": 1.257614016532898, "eval_runtime": 33.8642, "eval_samples_per_second": 11.605, "eval_steps_per_second": 1.476, "step": 5500 }, { "epoch": 28.71794871794872, "grad_norm": 80.89112854003906, "learning_rate": 9.441000000000002e-06, "loss": 1.2233, "step": 5600 }, { "epoch": 28.71794871794872, "eval_loss": 1.2567039728164673, "eval_runtime": 33.9431, "eval_samples_per_second": 11.578, "eval_steps_per_second": 1.473, "step": 5600 }, { "epoch": 29.23076923076923, "grad_norm": 89.17146301269531, "learning_rate": 9.431000000000002e-06, "loss": 1.2349, "step": 5700 }, { "epoch": 29.23076923076923, "eval_loss": 1.2528877258300781, "eval_runtime": 33.691, "eval_samples_per_second": 11.665, "eval_steps_per_second": 1.484, "step": 5700 }, { "epoch": 29.743589743589745, "grad_norm": 27.014436721801758, "learning_rate": 9.421000000000002e-06, "loss": 1.1977, "step": 5800 }, { "epoch": 29.743589743589745, "eval_loss": 1.2418824434280396, "eval_runtime": 33.786, "eval_samples_per_second": 11.632, "eval_steps_per_second": 1.48, "step": 5800 }, { "epoch": 30.256410256410255, "grad_norm": 34.459503173828125, "learning_rate": 9.411000000000002e-06, "loss": 1.2017, "step": 5900 }, { "epoch": 30.256410256410255, "eval_loss": 1.2447059154510498, "eval_runtime": 33.7138, "eval_samples_per_second": 11.657, "eval_steps_per_second": 1.483, "step": 5900 }, { "epoch": 30.76923076923077, "grad_norm": 33.78609848022461, "learning_rate": 9.401000000000002e-06, "loss": 1.1899, "step": 6000 }, { "epoch": 30.76923076923077, "eval_loss": 1.2605268955230713, "eval_runtime": 34.1716, "eval_samples_per_second": 11.501, "eval_steps_per_second": 1.463, "step": 6000 }, { "epoch": 31.28205128205128, "grad_norm": 15.015114784240723, "learning_rate": 9.391e-06, "loss": 1.2086, "step": 6100 }, { "epoch": 31.28205128205128, "eval_loss": 1.2350114583969116, "eval_runtime": 33.8206, "eval_samples_per_second": 11.62, "eval_steps_per_second": 1.478, "step": 6100 }, { "epoch": 31.794871794871796, "grad_norm": 34.10783386230469, "learning_rate": 9.381e-06, "loss": 1.1763, "step": 6200 }, { "epoch": 31.794871794871796, "eval_loss": 1.2277166843414307, "eval_runtime": 33.7042, "eval_samples_per_second": 11.66, "eval_steps_per_second": 1.483, "step": 6200 }, { "epoch": 32.30769230769231, "grad_norm": 42.146934509277344, "learning_rate": 9.371e-06, "loss": 1.1821, "step": 6300 }, { "epoch": 32.30769230769231, "eval_loss": 1.2275365591049194, "eval_runtime": 33.8943, "eval_samples_per_second": 11.595, "eval_steps_per_second": 1.475, "step": 6300 }, { "epoch": 32.82051282051282, "grad_norm": 27.311311721801758, "learning_rate": 9.361e-06, "loss": 1.1891, "step": 6400 }, { "epoch": 32.82051282051282, "eval_loss": 1.2212082147598267, "eval_runtime": 34.1213, "eval_samples_per_second": 11.518, "eval_steps_per_second": 1.465, "step": 6400 }, { "epoch": 33.333333333333336, "grad_norm": 25.96141242980957, "learning_rate": 9.351e-06, "loss": 1.1981, "step": 6500 }, { "epoch": 33.333333333333336, "eval_loss": 1.234484076499939, "eval_runtime": 33.8881, "eval_samples_per_second": 11.597, "eval_steps_per_second": 1.475, "step": 6500 }, { "epoch": 33.84615384615385, "grad_norm": 43.51643753051758, "learning_rate": 9.341000000000001e-06, "loss": 1.1933, "step": 6600 }, { "epoch": 33.84615384615385, "eval_loss": 1.2187552452087402, "eval_runtime": 33.7579, "eval_samples_per_second": 11.642, "eval_steps_per_second": 1.481, "step": 6600 }, { "epoch": 34.35897435897436, "grad_norm": 28.205900192260742, "learning_rate": 9.331000000000001e-06, "loss": 1.1809, "step": 6700 }, { "epoch": 34.35897435897436, "eval_loss": 1.227611780166626, "eval_runtime": 33.8658, "eval_samples_per_second": 11.605, "eval_steps_per_second": 1.476, "step": 6700 }, { "epoch": 34.87179487179487, "grad_norm": 39.5003662109375, "learning_rate": 9.321000000000001e-06, "loss": 1.191, "step": 6800 }, { "epoch": 34.87179487179487, "eval_loss": 1.21685791015625, "eval_runtime": 33.9275, "eval_samples_per_second": 11.584, "eval_steps_per_second": 1.474, "step": 6800 }, { "epoch": 35.38461538461539, "grad_norm": 49.183170318603516, "learning_rate": 9.311000000000001e-06, "loss": 1.1955, "step": 6900 }, { "epoch": 35.38461538461539, "eval_loss": 1.2113089561462402, "eval_runtime": 33.874, "eval_samples_per_second": 11.602, "eval_steps_per_second": 1.476, "step": 6900 }, { "epoch": 35.8974358974359, "grad_norm": 24.414306640625, "learning_rate": 9.301000000000001e-06, "loss": 1.1529, "step": 7000 }, { "epoch": 35.8974358974359, "eval_loss": 1.204311728477478, "eval_runtime": 33.8158, "eval_samples_per_second": 11.622, "eval_steps_per_second": 1.479, "step": 7000 }, { "epoch": 36.41025641025641, "grad_norm": 41.432579040527344, "learning_rate": 9.291000000000001e-06, "loss": 1.1701, "step": 7100 }, { "epoch": 36.41025641025641, "eval_loss": 1.203251600265503, "eval_runtime": 33.9027, "eval_samples_per_second": 11.592, "eval_steps_per_second": 1.475, "step": 7100 }, { "epoch": 36.92307692307692, "grad_norm": 33.03053665161133, "learning_rate": 9.281000000000001e-06, "loss": 1.1413, "step": 7200 }, { "epoch": 36.92307692307692, "eval_loss": 1.1928728818893433, "eval_runtime": 34.3378, "eval_samples_per_second": 11.445, "eval_steps_per_second": 1.456, "step": 7200 }, { "epoch": 37.43589743589744, "grad_norm": 18.388671875, "learning_rate": 9.271000000000002e-06, "loss": 1.1929, "step": 7300 }, { "epoch": 37.43589743589744, "eval_loss": 1.2051817178726196, "eval_runtime": 33.7433, "eval_samples_per_second": 11.647, "eval_steps_per_second": 1.482, "step": 7300 }, { "epoch": 37.94871794871795, "grad_norm": 36.36753463745117, "learning_rate": 9.261000000000002e-06, "loss": 1.1365, "step": 7400 }, { "epoch": 37.94871794871795, "eval_loss": 1.1993858814239502, "eval_runtime": 33.7824, "eval_samples_per_second": 11.633, "eval_steps_per_second": 1.48, "step": 7400 }, { "epoch": 38.46153846153846, "grad_norm": 31.875085830688477, "learning_rate": 9.251000000000002e-06, "loss": 1.1455, "step": 7500 }, { "epoch": 38.46153846153846, "eval_loss": 1.209811806678772, "eval_runtime": 33.938, "eval_samples_per_second": 11.58, "eval_steps_per_second": 1.473, "step": 7500 }, { "epoch": 38.97435897435897, "grad_norm": 30.268564224243164, "learning_rate": 9.241000000000002e-06, "loss": 1.1583, "step": 7600 }, { "epoch": 38.97435897435897, "eval_loss": 1.1939027309417725, "eval_runtime": 33.8431, "eval_samples_per_second": 11.612, "eval_steps_per_second": 1.477, "step": 7600 }, { "epoch": 39.48717948717949, "grad_norm": 126.68383026123047, "learning_rate": 9.231000000000002e-06, "loss": 1.1443, "step": 7700 }, { "epoch": 39.48717948717949, "eval_loss": 1.2021498680114746, "eval_runtime": 33.8224, "eval_samples_per_second": 11.62, "eval_steps_per_second": 1.478, "step": 7700 }, { "epoch": 40.0, "grad_norm": 30.38014793395996, "learning_rate": 9.221e-06, "loss": 1.1509, "step": 7800 }, { "epoch": 40.0, "eval_loss": 1.2058591842651367, "eval_runtime": 33.7036, "eval_samples_per_second": 11.66, "eval_steps_per_second": 1.484, "step": 7800 }, { "epoch": 40.51282051282051, "grad_norm": 42.52033996582031, "learning_rate": 9.211e-06, "loss": 1.1635, "step": 7900 }, { "epoch": 40.51282051282051, "eval_loss": 1.2037510871887207, "eval_runtime": 33.7516, "eval_samples_per_second": 11.644, "eval_steps_per_second": 1.481, "step": 7900 }, { "epoch": 41.02564102564103, "grad_norm": 32.66926193237305, "learning_rate": 9.201e-06, "loss": 1.132, "step": 8000 }, { "epoch": 41.02564102564103, "eval_loss": 1.2147732973098755, "eval_runtime": 33.9388, "eval_samples_per_second": 11.58, "eval_steps_per_second": 1.473, "step": 8000 }, { "epoch": 41.53846153846154, "grad_norm": 42.2666130065918, "learning_rate": 9.191e-06, "loss": 1.1688, "step": 8100 }, { "epoch": 41.53846153846154, "eval_loss": 1.214206337928772, "eval_runtime": 33.8017, "eval_samples_per_second": 11.627, "eval_steps_per_second": 1.479, "step": 8100 }, { "epoch": 42.05128205128205, "grad_norm": 61.25041198730469, "learning_rate": 9.181e-06, "loss": 1.1035, "step": 8200 }, { "epoch": 42.05128205128205, "eval_loss": 1.1902282238006592, "eval_runtime": 33.7773, "eval_samples_per_second": 11.635, "eval_steps_per_second": 1.48, "step": 8200 }, { "epoch": 42.56410256410256, "grad_norm": 42.63145065307617, "learning_rate": 9.171e-06, "loss": 1.1429, "step": 8300 }, { "epoch": 42.56410256410256, "eval_loss": 1.2035140991210938, "eval_runtime": 33.7617, "eval_samples_per_second": 11.64, "eval_steps_per_second": 1.481, "step": 8300 }, { "epoch": 43.07692307692308, "grad_norm": 20.476646423339844, "learning_rate": 9.161000000000001e-06, "loss": 1.1663, "step": 8400 }, { "epoch": 43.07692307692308, "eval_loss": 1.1914900541305542, "eval_runtime": 33.9898, "eval_samples_per_second": 11.562, "eval_steps_per_second": 1.471, "step": 8400 }, { "epoch": 43.58974358974359, "grad_norm": 36.54648971557617, "learning_rate": 9.151000000000001e-06, "loss": 1.1096, "step": 8500 }, { "epoch": 43.58974358974359, "eval_loss": 1.183597207069397, "eval_runtime": 33.897, "eval_samples_per_second": 11.594, "eval_steps_per_second": 1.475, "step": 8500 }, { "epoch": 44.1025641025641, "grad_norm": 44.22875213623047, "learning_rate": 9.141000000000001e-06, "loss": 1.1497, "step": 8600 }, { "epoch": 44.1025641025641, "eval_loss": 1.1925466060638428, "eval_runtime": 33.8189, "eval_samples_per_second": 11.621, "eval_steps_per_second": 1.478, "step": 8600 }, { "epoch": 44.61538461538461, "grad_norm": 42.943180084228516, "learning_rate": 9.131000000000001e-06, "loss": 1.104, "step": 8700 }, { "epoch": 44.61538461538461, "eval_loss": 1.1945114135742188, "eval_runtime": 33.8565, "eval_samples_per_second": 11.608, "eval_steps_per_second": 1.477, "step": 8700 }, { "epoch": 45.12820512820513, "grad_norm": 21.709800720214844, "learning_rate": 9.121000000000001e-06, "loss": 1.1303, "step": 8800 }, { "epoch": 45.12820512820513, "eval_loss": 1.1883279085159302, "eval_runtime": 33.8145, "eval_samples_per_second": 11.622, "eval_steps_per_second": 1.479, "step": 8800 }, { "epoch": 45.64102564102564, "grad_norm": 34.65962219238281, "learning_rate": 9.1111e-06, "loss": 1.1347, "step": 8900 }, { "epoch": 45.64102564102564, "eval_loss": 1.1927034854888916, "eval_runtime": 34.2072, "eval_samples_per_second": 11.489, "eval_steps_per_second": 1.462, "step": 8900 }, { "epoch": 46.15384615384615, "grad_norm": 31.94856071472168, "learning_rate": 9.1011e-06, "loss": 1.1077, "step": 9000 }, { "epoch": 46.15384615384615, "eval_loss": 1.185064673423767, "eval_runtime": 33.9549, "eval_samples_per_second": 11.574, "eval_steps_per_second": 1.473, "step": 9000 }, { "epoch": 46.666666666666664, "grad_norm": 42.5874137878418, "learning_rate": 9.0911e-06, "loss": 1.124, "step": 9100 }, { "epoch": 46.666666666666664, "eval_loss": 1.186991572380066, "eval_runtime": 33.9527, "eval_samples_per_second": 11.575, "eval_steps_per_second": 1.473, "step": 9100 }, { "epoch": 47.17948717948718, "grad_norm": 28.342052459716797, "learning_rate": 9.0811e-06, "loss": 1.12, "step": 9200 }, { "epoch": 47.17948717948718, "eval_loss": 1.1799968481063843, "eval_runtime": 34.1077, "eval_samples_per_second": 11.522, "eval_steps_per_second": 1.466, "step": 9200 }, { "epoch": 47.69230769230769, "grad_norm": 18.034202575683594, "learning_rate": 9.0711e-06, "loss": 1.1023, "step": 9300 }, { "epoch": 47.69230769230769, "eval_loss": 1.1863212585449219, "eval_runtime": 33.8209, "eval_samples_per_second": 11.62, "eval_steps_per_second": 1.478, "step": 9300 }, { "epoch": 48.205128205128204, "grad_norm": 18.521087646484375, "learning_rate": 9.0611e-06, "loss": 1.1776, "step": 9400 }, { "epoch": 48.205128205128204, "eval_loss": 1.1852102279663086, "eval_runtime": 33.8724, "eval_samples_per_second": 11.602, "eval_steps_per_second": 1.476, "step": 9400 }, { "epoch": 48.717948717948715, "grad_norm": 54.735145568847656, "learning_rate": 9.0511e-06, "loss": 1.1108, "step": 9500 }, { "epoch": 48.717948717948715, "eval_loss": 1.1690032482147217, "eval_runtime": 33.7943, "eval_samples_per_second": 11.629, "eval_steps_per_second": 1.48, "step": 9500 }, { "epoch": 49.23076923076923, "grad_norm": 24.619985580444336, "learning_rate": 9.0411e-06, "loss": 1.1173, "step": 9600 }, { "epoch": 49.23076923076923, "eval_loss": 1.1739821434020996, "eval_runtime": 33.8187, "eval_samples_per_second": 11.621, "eval_steps_per_second": 1.478, "step": 9600 }, { "epoch": 49.743589743589745, "grad_norm": 27.19590950012207, "learning_rate": 9.0311e-06, "loss": 1.0959, "step": 9700 }, { "epoch": 49.743589743589745, "eval_loss": 1.1569674015045166, "eval_runtime": 33.8966, "eval_samples_per_second": 11.594, "eval_steps_per_second": 1.475, "step": 9700 }, { "epoch": 50.256410256410255, "grad_norm": 66.81871032714844, "learning_rate": 9.0211e-06, "loss": 1.1213, "step": 9800 }, { "epoch": 50.256410256410255, "eval_loss": 1.1669386625289917, "eval_runtime": 33.9461, "eval_samples_per_second": 11.577, "eval_steps_per_second": 1.473, "step": 9800 }, { "epoch": 50.76923076923077, "grad_norm": 20.71648406982422, "learning_rate": 9.011100000000001e-06, "loss": 1.0801, "step": 9900 }, { "epoch": 50.76923076923077, "eval_loss": 1.1752002239227295, "eval_runtime": 33.6927, "eval_samples_per_second": 11.664, "eval_steps_per_second": 1.484, "step": 9900 }, { "epoch": 51.282051282051285, "grad_norm": 27.14691925048828, "learning_rate": 9.001100000000001e-06, "loss": 1.1269, "step": 10000 }, { "epoch": 51.282051282051285, "eval_loss": 1.1753243207931519, "eval_runtime": 33.7905, "eval_samples_per_second": 11.63, "eval_steps_per_second": 1.48, "step": 10000 }, { "epoch": 51.794871794871796, "grad_norm": 16.967599868774414, "learning_rate": 8.991100000000001e-06, "loss": 1.0956, "step": 10100 }, { "epoch": 51.794871794871796, "eval_loss": 1.1728534698486328, "eval_runtime": 33.9831, "eval_samples_per_second": 11.565, "eval_steps_per_second": 1.471, "step": 10100 }, { "epoch": 52.30769230769231, "grad_norm": 25.84881019592285, "learning_rate": 8.981100000000001e-06, "loss": 1.1034, "step": 10200 }, { "epoch": 52.30769230769231, "eval_loss": 1.1675057411193848, "eval_runtime": 33.8047, "eval_samples_per_second": 11.626, "eval_steps_per_second": 1.479, "step": 10200 }, { "epoch": 52.82051282051282, "grad_norm": 27.911279678344727, "learning_rate": 8.9711e-06, "loss": 1.1109, "step": 10300 }, { "epoch": 52.82051282051282, "eval_loss": 1.1695001125335693, "eval_runtime": 33.9411, "eval_samples_per_second": 11.579, "eval_steps_per_second": 1.473, "step": 10300 }, { "epoch": 53.333333333333336, "grad_norm": 30.883014678955078, "learning_rate": 8.9611e-06, "loss": 1.0748, "step": 10400 }, { "epoch": 53.333333333333336, "eval_loss": 1.166171908378601, "eval_runtime": 33.7307, "eval_samples_per_second": 11.651, "eval_steps_per_second": 1.482, "step": 10400 }, { "epoch": 53.84615384615385, "grad_norm": 30.53699493408203, "learning_rate": 8.9511e-06, "loss": 1.1039, "step": 10500 }, { "epoch": 53.84615384615385, "eval_loss": 1.1673123836517334, "eval_runtime": 33.8942, "eval_samples_per_second": 11.595, "eval_steps_per_second": 1.475, "step": 10500 }, { "epoch": 54.35897435897436, "grad_norm": 32.961483001708984, "learning_rate": 8.9411e-06, "loss": 1.1203, "step": 10600 }, { "epoch": 54.35897435897436, "eval_loss": 1.1615757942199707, "eval_runtime": 33.9176, "eval_samples_per_second": 11.587, "eval_steps_per_second": 1.474, "step": 10600 }, { "epoch": 54.87179487179487, "grad_norm": 26.77215003967285, "learning_rate": 8.9311e-06, "loss": 1.1109, "step": 10700 }, { "epoch": 54.87179487179487, "eval_loss": 1.1798686981201172, "eval_runtime": 34.1898, "eval_samples_per_second": 11.495, "eval_steps_per_second": 1.462, "step": 10700 }, { "epoch": 55.38461538461539, "grad_norm": 42.99930191040039, "learning_rate": 8.9211e-06, "loss": 1.0651, "step": 10800 }, { "epoch": 55.38461538461539, "eval_loss": 1.1585968732833862, "eval_runtime": 33.9129, "eval_samples_per_second": 11.589, "eval_steps_per_second": 1.474, "step": 10800 }, { "epoch": 55.8974358974359, "grad_norm": 22.420515060424805, "learning_rate": 8.9111e-06, "loss": 1.0901, "step": 10900 }, { "epoch": 55.8974358974359, "eval_loss": 1.182327151298523, "eval_runtime": 34.0546, "eval_samples_per_second": 11.54, "eval_steps_per_second": 1.468, "step": 10900 }, { "epoch": 56.41025641025641, "grad_norm": 34.271324157714844, "learning_rate": 8.9011e-06, "loss": 1.0969, "step": 11000 }, { "epoch": 56.41025641025641, "eval_loss": 1.1700419187545776, "eval_runtime": 34.0913, "eval_samples_per_second": 11.528, "eval_steps_per_second": 1.467, "step": 11000 }, { "epoch": 56.92307692307692, "grad_norm": 25.083974838256836, "learning_rate": 8.8911e-06, "loss": 1.0825, "step": 11100 }, { "epoch": 56.92307692307692, "eval_loss": 1.1617279052734375, "eval_runtime": 33.9494, "eval_samples_per_second": 11.576, "eval_steps_per_second": 1.473, "step": 11100 }, { "epoch": 57.43589743589744, "grad_norm": 45.67741012573242, "learning_rate": 8.8811e-06, "loss": 1.1105, "step": 11200 }, { "epoch": 57.43589743589744, "eval_loss": 1.1558475494384766, "eval_runtime": 33.8739, "eval_samples_per_second": 11.602, "eval_steps_per_second": 1.476, "step": 11200 }, { "epoch": 57.94871794871795, "grad_norm": 40.9770622253418, "learning_rate": 8.8711e-06, "loss": 1.0714, "step": 11300 }, { "epoch": 57.94871794871795, "eval_loss": 1.158663272857666, "eval_runtime": 33.8651, "eval_samples_per_second": 11.605, "eval_steps_per_second": 1.476, "step": 11300 }, { "epoch": 58.46153846153846, "grad_norm": 26.5457763671875, "learning_rate": 8.8611e-06, "loss": 1.053, "step": 11400 }, { "epoch": 58.46153846153846, "eval_loss": 1.1581999063491821, "eval_runtime": 33.9299, "eval_samples_per_second": 11.583, "eval_steps_per_second": 1.474, "step": 11400 }, { "epoch": 58.97435897435897, "grad_norm": 21.226743698120117, "learning_rate": 8.8511e-06, "loss": 1.1005, "step": 11500 }, { "epoch": 58.97435897435897, "eval_loss": 1.1427479982376099, "eval_runtime": 33.8652, "eval_samples_per_second": 11.605, "eval_steps_per_second": 1.476, "step": 11500 }, { "epoch": 59.48717948717949, "grad_norm": 35.32261657714844, "learning_rate": 8.8411e-06, "loss": 1.1142, "step": 11600 }, { "epoch": 59.48717948717949, "eval_loss": 1.1504665613174438, "eval_runtime": 33.9719, "eval_samples_per_second": 11.568, "eval_steps_per_second": 1.472, "step": 11600 }, { "epoch": 60.0, "grad_norm": 62.73653793334961, "learning_rate": 8.831200000000001e-06, "loss": 1.0317, "step": 11700 }, { "epoch": 60.0, "eval_loss": 1.166619896888733, "eval_runtime": 33.7919, "eval_samples_per_second": 11.63, "eval_steps_per_second": 1.48, "step": 11700 }, { "epoch": 60.51282051282051, "grad_norm": 32.88017654418945, "learning_rate": 8.821200000000001e-06, "loss": 1.053, "step": 11800 }, { "epoch": 60.51282051282051, "eval_loss": 1.151050090789795, "eval_runtime": 33.8088, "eval_samples_per_second": 11.624, "eval_steps_per_second": 1.479, "step": 11800 }, { "epoch": 61.02564102564103, "grad_norm": 25.249664306640625, "learning_rate": 8.811200000000001e-06, "loss": 1.083, "step": 11900 }, { "epoch": 61.02564102564103, "eval_loss": 1.1466352939605713, "eval_runtime": 33.7084, "eval_samples_per_second": 11.659, "eval_steps_per_second": 1.483, "step": 11900 }, { "epoch": 61.53846153846154, "grad_norm": 25.186697006225586, "learning_rate": 8.801200000000001e-06, "loss": 1.023, "step": 12000 }, { "epoch": 61.53846153846154, "eval_loss": 1.1523329019546509, "eval_runtime": 33.8963, "eval_samples_per_second": 11.594, "eval_steps_per_second": 1.475, "step": 12000 }, { "epoch": 62.05128205128205, "grad_norm": 65.74333953857422, "learning_rate": 8.791200000000001e-06, "loss": 1.1023, "step": 12100 }, { "epoch": 62.05128205128205, "eval_loss": 1.1539132595062256, "eval_runtime": 33.8501, "eval_samples_per_second": 11.61, "eval_steps_per_second": 1.477, "step": 12100 }, { "epoch": 62.56410256410256, "grad_norm": 37.643035888671875, "learning_rate": 8.781200000000002e-06, "loss": 1.096, "step": 12200 }, { "epoch": 62.56410256410256, "eval_loss": 1.1323292255401611, "eval_runtime": 33.9504, "eval_samples_per_second": 11.576, "eval_steps_per_second": 1.473, "step": 12200 }, { "epoch": 63.07692307692308, "grad_norm": 41.54033279418945, "learning_rate": 8.7712e-06, "loss": 1.04, "step": 12300 }, { "epoch": 63.07692307692308, "eval_loss": 1.1349562406539917, "eval_runtime": 33.8242, "eval_samples_per_second": 11.619, "eval_steps_per_second": 1.478, "step": 12300 }, { "epoch": 63.58974358974359, "grad_norm": 32.337890625, "learning_rate": 8.7612e-06, "loss": 1.0729, "step": 12400 }, { "epoch": 63.58974358974359, "eval_loss": 1.1326929330825806, "eval_runtime": 33.7768, "eval_samples_per_second": 11.635, "eval_steps_per_second": 1.48, "step": 12400 }, { "epoch": 64.1025641025641, "grad_norm": 32.026912689208984, "learning_rate": 8.7512e-06, "loss": 1.0467, "step": 12500 }, { "epoch": 64.1025641025641, "eval_loss": 1.13288414478302, "eval_runtime": 34.2235, "eval_samples_per_second": 11.483, "eval_steps_per_second": 1.461, "step": 12500 }, { "epoch": 64.61538461538461, "grad_norm": 17.785661697387695, "learning_rate": 8.7412e-06, "loss": 1.0339, "step": 12600 }, { "epoch": 64.61538461538461, "eval_loss": 1.1395562887191772, "eval_runtime": 33.752, "eval_samples_per_second": 11.644, "eval_steps_per_second": 1.481, "step": 12600 }, { "epoch": 65.12820512820512, "grad_norm": 30.277780532836914, "learning_rate": 8.7312e-06, "loss": 1.0828, "step": 12700 }, { "epoch": 65.12820512820512, "eval_loss": 1.1387863159179688, "eval_runtime": 33.8766, "eval_samples_per_second": 11.601, "eval_steps_per_second": 1.476, "step": 12700 }, { "epoch": 65.64102564102564, "grad_norm": 41.21147918701172, "learning_rate": 8.7212e-06, "loss": 1.0502, "step": 12800 }, { "epoch": 65.64102564102564, "eval_loss": 1.143326997756958, "eval_runtime": 33.9279, "eval_samples_per_second": 11.583, "eval_steps_per_second": 1.474, "step": 12800 }, { "epoch": 66.15384615384616, "grad_norm": 12.04747486114502, "learning_rate": 8.7112e-06, "loss": 1.0099, "step": 12900 }, { "epoch": 66.15384615384616, "eval_loss": 1.130028247833252, "eval_runtime": 33.9548, "eval_samples_per_second": 11.574, "eval_steps_per_second": 1.473, "step": 12900 }, { "epoch": 66.66666666666667, "grad_norm": 46.18159866333008, "learning_rate": 8.7012e-06, "loss": 1.026, "step": 13000 }, { "epoch": 66.66666666666667, "eval_loss": 1.135296106338501, "eval_runtime": 33.8939, "eval_samples_per_second": 11.595, "eval_steps_per_second": 1.475, "step": 13000 }, { "epoch": 67.17948717948718, "grad_norm": 34.04996109008789, "learning_rate": 8.6912e-06, "loss": 1.0969, "step": 13100 }, { "epoch": 67.17948717948718, "eval_loss": 1.1394113302230835, "eval_runtime": 33.7863, "eval_samples_per_second": 11.632, "eval_steps_per_second": 1.48, "step": 13100 }, { "epoch": 67.6923076923077, "grad_norm": 57.76072692871094, "learning_rate": 8.6812e-06, "loss": 1.0345, "step": 13200 }, { "epoch": 67.6923076923077, "eval_loss": 1.1415585279464722, "eval_runtime": 33.8137, "eval_samples_per_second": 11.622, "eval_steps_per_second": 1.479, "step": 13200 }, { "epoch": 68.2051282051282, "grad_norm": 26.72646713256836, "learning_rate": 8.671200000000001e-06, "loss": 1.0679, "step": 13300 }, { "epoch": 68.2051282051282, "eval_loss": 1.140710473060608, "eval_runtime": 34.6101, "eval_samples_per_second": 11.355, "eval_steps_per_second": 1.445, "step": 13300 }, { "epoch": 68.71794871794872, "grad_norm": 48.05488586425781, "learning_rate": 8.661200000000001e-06, "loss": 1.0037, "step": 13400 }, { "epoch": 68.71794871794872, "eval_loss": 1.1428287029266357, "eval_runtime": 34.2239, "eval_samples_per_second": 11.483, "eval_steps_per_second": 1.461, "step": 13400 }, { "epoch": 69.23076923076923, "grad_norm": 15.574548721313477, "learning_rate": 8.651200000000001e-06, "loss": 1.0348, "step": 13500 }, { "epoch": 69.23076923076923, "eval_loss": 1.1325479745864868, "eval_runtime": 34.131, "eval_samples_per_second": 11.514, "eval_steps_per_second": 1.465, "step": 13500 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 513, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.160282743808e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }