|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 128.2051282051282, |
|
"eval_steps": 100, |
|
"global_step": 25000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 92.9238510131836, |
|
"learning_rate": 9.9907e-06, |
|
"loss": 2.4016, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"eval_loss": 2.1255013942718506, |
|
"eval_runtime": 34.4318, |
|
"eval_samples_per_second": 11.414, |
|
"eval_steps_per_second": 1.452, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 47.019439697265625, |
|
"learning_rate": 9.980700000000001e-06, |
|
"loss": 2.0696, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"eval_loss": 1.9405728578567505, |
|
"eval_runtime": 34.0713, |
|
"eval_samples_per_second": 11.535, |
|
"eval_steps_per_second": 1.468, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 52.230751037597656, |
|
"learning_rate": 9.970700000000001e-06, |
|
"loss": 1.9983, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 1.900020718574524, |
|
"eval_runtime": 34.1694, |
|
"eval_samples_per_second": 11.502, |
|
"eval_steps_per_second": 1.463, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"grad_norm": 39.62374496459961, |
|
"learning_rate": 9.960800000000001e-06, |
|
"loss": 1.8888, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"eval_loss": 1.8164124488830566, |
|
"eval_runtime": 34.1213, |
|
"eval_samples_per_second": 11.518, |
|
"eval_steps_per_second": 1.465, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"grad_norm": 39.697731018066406, |
|
"learning_rate": 9.9508e-06, |
|
"loss": 1.8456, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"eval_loss": 1.7753618955612183, |
|
"eval_runtime": 34.149, |
|
"eval_samples_per_second": 11.508, |
|
"eval_steps_per_second": 1.464, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 52.823184967041016, |
|
"learning_rate": 9.9408e-06, |
|
"loss": 1.7839, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 1.7344136238098145, |
|
"eval_runtime": 34.121, |
|
"eval_samples_per_second": 11.518, |
|
"eval_steps_per_second": 1.465, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.58974358974359, |
|
"grad_norm": 107.05223083496094, |
|
"learning_rate": 9.9308e-06, |
|
"loss": 1.7544, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.58974358974359, |
|
"eval_loss": 1.6865615844726562, |
|
"eval_runtime": 34.0842, |
|
"eval_samples_per_second": 11.53, |
|
"eval_steps_per_second": 1.467, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 53.641353607177734, |
|
"learning_rate": 9.9208e-06, |
|
"loss": 1.6812, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"eval_loss": 1.6568766832351685, |
|
"eval_runtime": 34.432, |
|
"eval_samples_per_second": 11.414, |
|
"eval_steps_per_second": 1.452, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 40.92328643798828, |
|
"learning_rate": 9.9109e-06, |
|
"loss": 1.6501, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"eval_loss": 1.6080188751220703, |
|
"eval_runtime": 34.1751, |
|
"eval_samples_per_second": 11.5, |
|
"eval_steps_per_second": 1.463, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"grad_norm": 28.52039909362793, |
|
"learning_rate": 9.9009e-06, |
|
"loss": 1.6579, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"eval_loss": 1.6059809923171997, |
|
"eval_runtime": 34.1634, |
|
"eval_samples_per_second": 11.504, |
|
"eval_steps_per_second": 1.464, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.641025641025641, |
|
"grad_norm": 73.21099090576172, |
|
"learning_rate": 9.8909e-06, |
|
"loss": 1.6286, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.641025641025641, |
|
"eval_loss": 1.5779187679290771, |
|
"eval_runtime": 34.1489, |
|
"eval_samples_per_second": 11.508, |
|
"eval_steps_per_second": 1.464, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 36.768428802490234, |
|
"learning_rate": 9.8809e-06, |
|
"loss": 1.5871, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"eval_loss": 1.5641562938690186, |
|
"eval_runtime": 34.1081, |
|
"eval_samples_per_second": 11.522, |
|
"eval_steps_per_second": 1.466, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 28.098352432250977, |
|
"learning_rate": 9.8709e-06, |
|
"loss": 1.6231, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"eval_loss": 1.530659556388855, |
|
"eval_runtime": 34.0717, |
|
"eval_samples_per_second": 11.534, |
|
"eval_steps_per_second": 1.467, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 7.17948717948718, |
|
"grad_norm": 48.131195068359375, |
|
"learning_rate": 9.8609e-06, |
|
"loss": 1.5178, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.17948717948718, |
|
"eval_loss": 1.5207794904708862, |
|
"eval_runtime": 34.0224, |
|
"eval_samples_per_second": 11.551, |
|
"eval_steps_per_second": 1.47, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 15.9362211227417, |
|
"learning_rate": 9.8509e-06, |
|
"loss": 1.5434, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"eval_loss": 1.4978805780410767, |
|
"eval_runtime": 34.0269, |
|
"eval_samples_per_second": 11.55, |
|
"eval_steps_per_second": 1.469, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 21.21210479736328, |
|
"learning_rate": 9.840900000000001e-06, |
|
"loss": 1.5368, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"eval_loss": 1.4791795015335083, |
|
"eval_runtime": 34.5402, |
|
"eval_samples_per_second": 11.378, |
|
"eval_steps_per_second": 1.448, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.717948717948717, |
|
"grad_norm": 39.53807830810547, |
|
"learning_rate": 9.830900000000001e-06, |
|
"loss": 1.5163, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 8.717948717948717, |
|
"eval_loss": 1.4630368947982788, |
|
"eval_runtime": 34.1482, |
|
"eval_samples_per_second": 11.509, |
|
"eval_steps_per_second": 1.464, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 42.564842224121094, |
|
"learning_rate": 9.820900000000001e-06, |
|
"loss": 1.483, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"eval_loss": 1.4529582262039185, |
|
"eval_runtime": 34.2374, |
|
"eval_samples_per_second": 11.479, |
|
"eval_steps_per_second": 1.46, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 9.743589743589745, |
|
"grad_norm": 55.969017028808594, |
|
"learning_rate": 9.810900000000001e-06, |
|
"loss": 1.4795, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 9.743589743589745, |
|
"eval_loss": 1.4357120990753174, |
|
"eval_runtime": 34.2522, |
|
"eval_samples_per_second": 11.474, |
|
"eval_steps_per_second": 1.46, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 10.256410256410255, |
|
"grad_norm": 25.308517456054688, |
|
"learning_rate": 9.800900000000001e-06, |
|
"loss": 1.4151, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.256410256410255, |
|
"eval_loss": 1.432659387588501, |
|
"eval_runtime": 34.2016, |
|
"eval_samples_per_second": 11.491, |
|
"eval_steps_per_second": 1.462, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"grad_norm": 54.97860336303711, |
|
"learning_rate": 9.790900000000001e-06, |
|
"loss": 1.4665, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"eval_loss": 1.4322772026062012, |
|
"eval_runtime": 34.2513, |
|
"eval_samples_per_second": 11.474, |
|
"eval_steps_per_second": 1.46, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 11.282051282051283, |
|
"grad_norm": 26.281103134155273, |
|
"learning_rate": 9.780900000000002e-06, |
|
"loss": 1.4336, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 11.282051282051283, |
|
"eval_loss": 1.41593337059021, |
|
"eval_runtime": 33.815, |
|
"eval_samples_per_second": 11.622, |
|
"eval_steps_per_second": 1.479, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 11.794871794871796, |
|
"grad_norm": 37.469024658203125, |
|
"learning_rate": 9.770900000000002e-06, |
|
"loss": 1.4149, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 11.794871794871796, |
|
"eval_loss": 1.4032217264175415, |
|
"eval_runtime": 33.8113, |
|
"eval_samples_per_second": 11.623, |
|
"eval_steps_per_second": 1.479, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"grad_norm": 17.248014450073242, |
|
"learning_rate": 9.760900000000002e-06, |
|
"loss": 1.3918, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"eval_loss": 1.3832417726516724, |
|
"eval_runtime": 34.1456, |
|
"eval_samples_per_second": 11.51, |
|
"eval_steps_per_second": 1.464, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 12.820512820512821, |
|
"grad_norm": 36.33113479614258, |
|
"learning_rate": 9.7509e-06, |
|
"loss": 1.4319, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 12.820512820512821, |
|
"eval_loss": 1.384070634841919, |
|
"eval_runtime": 33.7658, |
|
"eval_samples_per_second": 11.639, |
|
"eval_steps_per_second": 1.481, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 28.724761962890625, |
|
"learning_rate": 9.7409e-06, |
|
"loss": 1.3936, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"eval_loss": 1.3891230821609497, |
|
"eval_runtime": 33.7884, |
|
"eval_samples_per_second": 11.631, |
|
"eval_steps_per_second": 1.48, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"grad_norm": 25.300365447998047, |
|
"learning_rate": 9.7309e-06, |
|
"loss": 1.386, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"eval_loss": 1.373278260231018, |
|
"eval_runtime": 33.8109, |
|
"eval_samples_per_second": 11.623, |
|
"eval_steps_per_second": 1.479, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 14.35897435897436, |
|
"grad_norm": 165.74114990234375, |
|
"learning_rate": 9.7209e-06, |
|
"loss": 1.3514, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 14.35897435897436, |
|
"eval_loss": 1.376756191253662, |
|
"eval_runtime": 33.8844, |
|
"eval_samples_per_second": 11.598, |
|
"eval_steps_per_second": 1.476, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 14.871794871794872, |
|
"grad_norm": 37.75138854980469, |
|
"learning_rate": 9.7109e-06, |
|
"loss": 1.3542, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 14.871794871794872, |
|
"eval_loss": 1.3550430536270142, |
|
"eval_runtime": 34.0038, |
|
"eval_samples_per_second": 11.558, |
|
"eval_steps_per_second": 1.47, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"grad_norm": 41.38801956176758, |
|
"learning_rate": 9.7009e-06, |
|
"loss": 1.3653, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"eval_loss": 1.3493261337280273, |
|
"eval_runtime": 33.7831, |
|
"eval_samples_per_second": 11.633, |
|
"eval_steps_per_second": 1.48, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 15.897435897435898, |
|
"grad_norm": 54.01289749145508, |
|
"learning_rate": 9.6909e-06, |
|
"loss": 1.3451, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 15.897435897435898, |
|
"eval_loss": 1.3563708066940308, |
|
"eval_runtime": 33.7662, |
|
"eval_samples_per_second": 11.639, |
|
"eval_steps_per_second": 1.481, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 16.41025641025641, |
|
"grad_norm": 47.22911834716797, |
|
"learning_rate": 9.6809e-06, |
|
"loss": 1.323, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 16.41025641025641, |
|
"eval_loss": 1.343112587928772, |
|
"eval_runtime": 34.3174, |
|
"eval_samples_per_second": 11.452, |
|
"eval_steps_per_second": 1.457, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"grad_norm": 254.61305236816406, |
|
"learning_rate": 9.670900000000001e-06, |
|
"loss": 1.3348, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"eval_loss": 1.3341063261032104, |
|
"eval_runtime": 33.9905, |
|
"eval_samples_per_second": 11.562, |
|
"eval_steps_per_second": 1.471, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 17.435897435897434, |
|
"grad_norm": 25.823589324951172, |
|
"learning_rate": 9.660900000000001e-06, |
|
"loss": 1.3185, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 17.435897435897434, |
|
"eval_loss": 1.3256839513778687, |
|
"eval_runtime": 33.9185, |
|
"eval_samples_per_second": 11.587, |
|
"eval_steps_per_second": 1.474, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 17.94871794871795, |
|
"grad_norm": 44.86510467529297, |
|
"learning_rate": 9.650900000000001e-06, |
|
"loss": 1.328, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 17.94871794871795, |
|
"eval_loss": 1.3411694765090942, |
|
"eval_runtime": 33.8769, |
|
"eval_samples_per_second": 11.601, |
|
"eval_steps_per_second": 1.476, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"grad_norm": 20.39423942565918, |
|
"learning_rate": 9.640900000000001e-06, |
|
"loss": 1.3306, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"eval_loss": 1.318002462387085, |
|
"eval_runtime": 33.7275, |
|
"eval_samples_per_second": 11.652, |
|
"eval_steps_per_second": 1.482, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 18.974358974358974, |
|
"grad_norm": 42.16006088256836, |
|
"learning_rate": 9.630900000000001e-06, |
|
"loss": 1.301, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 18.974358974358974, |
|
"eval_loss": 1.3143377304077148, |
|
"eval_runtime": 33.8038, |
|
"eval_samples_per_second": 11.626, |
|
"eval_steps_per_second": 1.479, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 19.487179487179485, |
|
"grad_norm": 36.292999267578125, |
|
"learning_rate": 9.620900000000001e-06, |
|
"loss": 1.3036, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 19.487179487179485, |
|
"eval_loss": 1.3157984018325806, |
|
"eval_runtime": 33.7447, |
|
"eval_samples_per_second": 11.646, |
|
"eval_steps_per_second": 1.482, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 64.44886016845703, |
|
"learning_rate": 9.610900000000001e-06, |
|
"loss": 1.3077, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 1.31955885887146, |
|
"eval_runtime": 33.8296, |
|
"eval_samples_per_second": 11.617, |
|
"eval_steps_per_second": 1.478, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 20.51282051282051, |
|
"grad_norm": 37.88364028930664, |
|
"learning_rate": 9.600900000000002e-06, |
|
"loss": 1.3092, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 20.51282051282051, |
|
"eval_loss": 1.3180863857269287, |
|
"eval_runtime": 34.1642, |
|
"eval_samples_per_second": 11.503, |
|
"eval_steps_per_second": 1.464, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 21.025641025641026, |
|
"grad_norm": 27.300247192382812, |
|
"learning_rate": 9.5909e-06, |
|
"loss": 1.27, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 21.025641025641026, |
|
"eval_loss": 1.3031089305877686, |
|
"eval_runtime": 34.0151, |
|
"eval_samples_per_second": 11.554, |
|
"eval_steps_per_second": 1.47, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"grad_norm": 27.94341278076172, |
|
"learning_rate": 9.5809e-06, |
|
"loss": 1.2983, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"eval_loss": 1.3018323183059692, |
|
"eval_runtime": 34.1294, |
|
"eval_samples_per_second": 11.515, |
|
"eval_steps_per_second": 1.465, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 22.05128205128205, |
|
"grad_norm": 35.468135833740234, |
|
"learning_rate": 9.5709e-06, |
|
"loss": 1.2568, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 22.05128205128205, |
|
"eval_loss": 1.3130985498428345, |
|
"eval_runtime": 33.9979, |
|
"eval_samples_per_second": 11.56, |
|
"eval_steps_per_second": 1.471, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 22.564102564102566, |
|
"grad_norm": 21.431076049804688, |
|
"learning_rate": 9.5609e-06, |
|
"loss": 1.2812, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 22.564102564102566, |
|
"eval_loss": 1.293820858001709, |
|
"eval_runtime": 33.9036, |
|
"eval_samples_per_second": 11.592, |
|
"eval_steps_per_second": 1.475, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"grad_norm": 59.38164520263672, |
|
"learning_rate": 9.5509e-06, |
|
"loss": 1.2776, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"eval_loss": 1.2872756719589233, |
|
"eval_runtime": 33.8513, |
|
"eval_samples_per_second": 11.61, |
|
"eval_steps_per_second": 1.477, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 23.58974358974359, |
|
"grad_norm": 48.05171585083008, |
|
"learning_rate": 9.5409e-06, |
|
"loss": 1.2488, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 23.58974358974359, |
|
"eval_loss": 1.2994896173477173, |
|
"eval_runtime": 33.9738, |
|
"eval_samples_per_second": 11.568, |
|
"eval_steps_per_second": 1.472, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 24.102564102564102, |
|
"grad_norm": 32.86454391479492, |
|
"learning_rate": 9.5309e-06, |
|
"loss": 1.3085, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 24.102564102564102, |
|
"eval_loss": 1.2995264530181885, |
|
"eval_runtime": 33.8598, |
|
"eval_samples_per_second": 11.607, |
|
"eval_steps_per_second": 1.477, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"grad_norm": 36.235042572021484, |
|
"learning_rate": 9.5209e-06, |
|
"loss": 1.2439, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"eval_loss": 1.285250186920166, |
|
"eval_runtime": 34.2054, |
|
"eval_samples_per_second": 11.489, |
|
"eval_steps_per_second": 1.462, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 25.128205128205128, |
|
"grad_norm": 17.993911743164062, |
|
"learning_rate": 9.5109e-06, |
|
"loss": 1.2427, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 25.128205128205128, |
|
"eval_loss": 1.2752227783203125, |
|
"eval_runtime": 33.8959, |
|
"eval_samples_per_second": 11.594, |
|
"eval_steps_per_second": 1.475, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 25.641025641025642, |
|
"grad_norm": 55.71548080444336, |
|
"learning_rate": 9.501000000000001e-06, |
|
"loss": 1.2774, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 25.641025641025642, |
|
"eval_loss": 1.28258216381073, |
|
"eval_runtime": 33.7988, |
|
"eval_samples_per_second": 11.628, |
|
"eval_steps_per_second": 1.479, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"grad_norm": 30.139249801635742, |
|
"learning_rate": 9.491000000000001e-06, |
|
"loss": 1.2224, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"eval_loss": 1.2681365013122559, |
|
"eval_runtime": 33.759, |
|
"eval_samples_per_second": 11.641, |
|
"eval_steps_per_second": 1.481, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"grad_norm": 39.02808380126953, |
|
"learning_rate": 9.481000000000001e-06, |
|
"loss": 1.2226, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"eval_loss": 1.2783496379852295, |
|
"eval_runtime": 33.8934, |
|
"eval_samples_per_second": 11.595, |
|
"eval_steps_per_second": 1.475, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 27.17948717948718, |
|
"grad_norm": 27.232601165771484, |
|
"learning_rate": 9.471000000000001e-06, |
|
"loss": 1.2542, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 27.17948717948718, |
|
"eval_loss": 1.2665411233901978, |
|
"eval_runtime": 33.8407, |
|
"eval_samples_per_second": 11.613, |
|
"eval_steps_per_second": 1.478, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"grad_norm": 25.21843719482422, |
|
"learning_rate": 9.461000000000001e-06, |
|
"loss": 1.2193, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"eval_loss": 1.2602121829986572, |
|
"eval_runtime": 33.8192, |
|
"eval_samples_per_second": 11.621, |
|
"eval_steps_per_second": 1.478, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 28.205128205128204, |
|
"grad_norm": 27.98350715637207, |
|
"learning_rate": 9.451000000000002e-06, |
|
"loss": 1.2602, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 28.205128205128204, |
|
"eval_loss": 1.257614016532898, |
|
"eval_runtime": 33.8642, |
|
"eval_samples_per_second": 11.605, |
|
"eval_steps_per_second": 1.476, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 28.71794871794872, |
|
"grad_norm": 80.89112854003906, |
|
"learning_rate": 9.441000000000002e-06, |
|
"loss": 1.2233, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 28.71794871794872, |
|
"eval_loss": 1.2567039728164673, |
|
"eval_runtime": 33.9431, |
|
"eval_samples_per_second": 11.578, |
|
"eval_steps_per_second": 1.473, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"grad_norm": 89.17146301269531, |
|
"learning_rate": 9.431000000000002e-06, |
|
"loss": 1.2349, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"eval_loss": 1.2528877258300781, |
|
"eval_runtime": 33.691, |
|
"eval_samples_per_second": 11.665, |
|
"eval_steps_per_second": 1.484, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 29.743589743589745, |
|
"grad_norm": 27.014436721801758, |
|
"learning_rate": 9.421000000000002e-06, |
|
"loss": 1.1977, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 29.743589743589745, |
|
"eval_loss": 1.2418824434280396, |
|
"eval_runtime": 33.786, |
|
"eval_samples_per_second": 11.632, |
|
"eval_steps_per_second": 1.48, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 30.256410256410255, |
|
"grad_norm": 34.459503173828125, |
|
"learning_rate": 9.411000000000002e-06, |
|
"loss": 1.2017, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 30.256410256410255, |
|
"eval_loss": 1.2447059154510498, |
|
"eval_runtime": 33.7138, |
|
"eval_samples_per_second": 11.657, |
|
"eval_steps_per_second": 1.483, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"grad_norm": 33.78609848022461, |
|
"learning_rate": 9.401000000000002e-06, |
|
"loss": 1.1899, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"eval_loss": 1.2605268955230713, |
|
"eval_runtime": 34.1716, |
|
"eval_samples_per_second": 11.501, |
|
"eval_steps_per_second": 1.463, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 31.28205128205128, |
|
"grad_norm": 15.015114784240723, |
|
"learning_rate": 9.391e-06, |
|
"loss": 1.2086, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 31.28205128205128, |
|
"eval_loss": 1.2350114583969116, |
|
"eval_runtime": 33.8206, |
|
"eval_samples_per_second": 11.62, |
|
"eval_steps_per_second": 1.478, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 31.794871794871796, |
|
"grad_norm": 34.10783386230469, |
|
"learning_rate": 9.381e-06, |
|
"loss": 1.1763, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 31.794871794871796, |
|
"eval_loss": 1.2277166843414307, |
|
"eval_runtime": 33.7042, |
|
"eval_samples_per_second": 11.66, |
|
"eval_steps_per_second": 1.483, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"grad_norm": 42.146934509277344, |
|
"learning_rate": 9.371e-06, |
|
"loss": 1.1821, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"eval_loss": 1.2275365591049194, |
|
"eval_runtime": 33.8943, |
|
"eval_samples_per_second": 11.595, |
|
"eval_steps_per_second": 1.475, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 32.82051282051282, |
|
"grad_norm": 27.311311721801758, |
|
"learning_rate": 9.361e-06, |
|
"loss": 1.1891, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 32.82051282051282, |
|
"eval_loss": 1.2212082147598267, |
|
"eval_runtime": 34.1213, |
|
"eval_samples_per_second": 11.518, |
|
"eval_steps_per_second": 1.465, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"grad_norm": 25.96141242980957, |
|
"learning_rate": 9.351e-06, |
|
"loss": 1.1981, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"eval_loss": 1.234484076499939, |
|
"eval_runtime": 33.8881, |
|
"eval_samples_per_second": 11.597, |
|
"eval_steps_per_second": 1.475, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"grad_norm": 43.51643753051758, |
|
"learning_rate": 9.341000000000001e-06, |
|
"loss": 1.1933, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"eval_loss": 1.2187552452087402, |
|
"eval_runtime": 33.7579, |
|
"eval_samples_per_second": 11.642, |
|
"eval_steps_per_second": 1.481, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 34.35897435897436, |
|
"grad_norm": 28.205900192260742, |
|
"learning_rate": 9.331000000000001e-06, |
|
"loss": 1.1809, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 34.35897435897436, |
|
"eval_loss": 1.227611780166626, |
|
"eval_runtime": 33.8658, |
|
"eval_samples_per_second": 11.605, |
|
"eval_steps_per_second": 1.476, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 34.87179487179487, |
|
"grad_norm": 39.5003662109375, |
|
"learning_rate": 9.321000000000001e-06, |
|
"loss": 1.191, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 34.87179487179487, |
|
"eval_loss": 1.21685791015625, |
|
"eval_runtime": 33.9275, |
|
"eval_samples_per_second": 11.584, |
|
"eval_steps_per_second": 1.474, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"grad_norm": 49.183170318603516, |
|
"learning_rate": 9.311000000000001e-06, |
|
"loss": 1.1955, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"eval_loss": 1.2113089561462402, |
|
"eval_runtime": 33.874, |
|
"eval_samples_per_second": 11.602, |
|
"eval_steps_per_second": 1.476, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 35.8974358974359, |
|
"grad_norm": 24.414306640625, |
|
"learning_rate": 9.301000000000001e-06, |
|
"loss": 1.1529, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 35.8974358974359, |
|
"eval_loss": 1.204311728477478, |
|
"eval_runtime": 33.8158, |
|
"eval_samples_per_second": 11.622, |
|
"eval_steps_per_second": 1.479, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 36.41025641025641, |
|
"grad_norm": 41.432579040527344, |
|
"learning_rate": 9.291000000000001e-06, |
|
"loss": 1.1701, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 36.41025641025641, |
|
"eval_loss": 1.203251600265503, |
|
"eval_runtime": 33.9027, |
|
"eval_samples_per_second": 11.592, |
|
"eval_steps_per_second": 1.475, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"grad_norm": 33.03053665161133, |
|
"learning_rate": 9.281000000000001e-06, |
|
"loss": 1.1413, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"eval_loss": 1.1928728818893433, |
|
"eval_runtime": 34.3378, |
|
"eval_samples_per_second": 11.445, |
|
"eval_steps_per_second": 1.456, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 37.43589743589744, |
|
"grad_norm": 18.388671875, |
|
"learning_rate": 9.271000000000002e-06, |
|
"loss": 1.1929, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 37.43589743589744, |
|
"eval_loss": 1.2051817178726196, |
|
"eval_runtime": 33.7433, |
|
"eval_samples_per_second": 11.647, |
|
"eval_steps_per_second": 1.482, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 37.94871794871795, |
|
"grad_norm": 36.36753463745117, |
|
"learning_rate": 9.261000000000002e-06, |
|
"loss": 1.1365, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 37.94871794871795, |
|
"eval_loss": 1.1993858814239502, |
|
"eval_runtime": 33.7824, |
|
"eval_samples_per_second": 11.633, |
|
"eval_steps_per_second": 1.48, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"grad_norm": 31.875085830688477, |
|
"learning_rate": 9.251000000000002e-06, |
|
"loss": 1.1455, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"eval_loss": 1.209811806678772, |
|
"eval_runtime": 33.938, |
|
"eval_samples_per_second": 11.58, |
|
"eval_steps_per_second": 1.473, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 38.97435897435897, |
|
"grad_norm": 30.268564224243164, |
|
"learning_rate": 9.241000000000002e-06, |
|
"loss": 1.1583, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 38.97435897435897, |
|
"eval_loss": 1.1939027309417725, |
|
"eval_runtime": 33.8431, |
|
"eval_samples_per_second": 11.612, |
|
"eval_steps_per_second": 1.477, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 39.48717948717949, |
|
"grad_norm": 126.68383026123047, |
|
"learning_rate": 9.231000000000002e-06, |
|
"loss": 1.1443, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 39.48717948717949, |
|
"eval_loss": 1.2021498680114746, |
|
"eval_runtime": 33.8224, |
|
"eval_samples_per_second": 11.62, |
|
"eval_steps_per_second": 1.478, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 30.38014793395996, |
|
"learning_rate": 9.221e-06, |
|
"loss": 1.1509, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 1.2058591842651367, |
|
"eval_runtime": 33.7036, |
|
"eval_samples_per_second": 11.66, |
|
"eval_steps_per_second": 1.484, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 40.51282051282051, |
|
"grad_norm": 42.52033996582031, |
|
"learning_rate": 9.211e-06, |
|
"loss": 1.1635, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 40.51282051282051, |
|
"eval_loss": 1.2037510871887207, |
|
"eval_runtime": 33.7516, |
|
"eval_samples_per_second": 11.644, |
|
"eval_steps_per_second": 1.481, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 41.02564102564103, |
|
"grad_norm": 32.66926193237305, |
|
"learning_rate": 9.201e-06, |
|
"loss": 1.132, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 41.02564102564103, |
|
"eval_loss": 1.2147732973098755, |
|
"eval_runtime": 33.9388, |
|
"eval_samples_per_second": 11.58, |
|
"eval_steps_per_second": 1.473, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"grad_norm": 42.2666130065918, |
|
"learning_rate": 9.191e-06, |
|
"loss": 1.1688, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"eval_loss": 1.214206337928772, |
|
"eval_runtime": 33.8017, |
|
"eval_samples_per_second": 11.627, |
|
"eval_steps_per_second": 1.479, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 42.05128205128205, |
|
"grad_norm": 61.25041198730469, |
|
"learning_rate": 9.181e-06, |
|
"loss": 1.1035, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 42.05128205128205, |
|
"eval_loss": 1.1902282238006592, |
|
"eval_runtime": 33.7773, |
|
"eval_samples_per_second": 11.635, |
|
"eval_steps_per_second": 1.48, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 42.56410256410256, |
|
"grad_norm": 42.63145065307617, |
|
"learning_rate": 9.171e-06, |
|
"loss": 1.1429, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 42.56410256410256, |
|
"eval_loss": 1.2035140991210938, |
|
"eval_runtime": 33.7617, |
|
"eval_samples_per_second": 11.64, |
|
"eval_steps_per_second": 1.481, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"grad_norm": 20.476646423339844, |
|
"learning_rate": 9.161000000000001e-06, |
|
"loss": 1.1663, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"eval_loss": 1.1914900541305542, |
|
"eval_runtime": 33.9898, |
|
"eval_samples_per_second": 11.562, |
|
"eval_steps_per_second": 1.471, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 43.58974358974359, |
|
"grad_norm": 36.54648971557617, |
|
"learning_rate": 9.151000000000001e-06, |
|
"loss": 1.1096, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 43.58974358974359, |
|
"eval_loss": 1.183597207069397, |
|
"eval_runtime": 33.897, |
|
"eval_samples_per_second": 11.594, |
|
"eval_steps_per_second": 1.475, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 44.1025641025641, |
|
"grad_norm": 44.22875213623047, |
|
"learning_rate": 9.141000000000001e-06, |
|
"loss": 1.1497, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 44.1025641025641, |
|
"eval_loss": 1.1925466060638428, |
|
"eval_runtime": 33.8189, |
|
"eval_samples_per_second": 11.621, |
|
"eval_steps_per_second": 1.478, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"grad_norm": 42.943180084228516, |
|
"learning_rate": 9.131000000000001e-06, |
|
"loss": 1.104, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"eval_loss": 1.1945114135742188, |
|
"eval_runtime": 33.8565, |
|
"eval_samples_per_second": 11.608, |
|
"eval_steps_per_second": 1.477, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 45.12820512820513, |
|
"grad_norm": 21.709800720214844, |
|
"learning_rate": 9.121000000000001e-06, |
|
"loss": 1.1303, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 45.12820512820513, |
|
"eval_loss": 1.1883279085159302, |
|
"eval_runtime": 33.8145, |
|
"eval_samples_per_second": 11.622, |
|
"eval_steps_per_second": 1.479, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 45.64102564102564, |
|
"grad_norm": 34.65962219238281, |
|
"learning_rate": 9.1111e-06, |
|
"loss": 1.1347, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 45.64102564102564, |
|
"eval_loss": 1.1927034854888916, |
|
"eval_runtime": 34.2072, |
|
"eval_samples_per_second": 11.489, |
|
"eval_steps_per_second": 1.462, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"grad_norm": 31.94856071472168, |
|
"learning_rate": 9.1011e-06, |
|
"loss": 1.1077, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"eval_loss": 1.185064673423767, |
|
"eval_runtime": 33.9549, |
|
"eval_samples_per_second": 11.574, |
|
"eval_steps_per_second": 1.473, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 46.666666666666664, |
|
"grad_norm": 42.5874137878418, |
|
"learning_rate": 9.0911e-06, |
|
"loss": 1.124, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 46.666666666666664, |
|
"eval_loss": 1.186991572380066, |
|
"eval_runtime": 33.9527, |
|
"eval_samples_per_second": 11.575, |
|
"eval_steps_per_second": 1.473, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 47.17948717948718, |
|
"grad_norm": 28.342052459716797, |
|
"learning_rate": 9.0811e-06, |
|
"loss": 1.12, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 47.17948717948718, |
|
"eval_loss": 1.1799968481063843, |
|
"eval_runtime": 34.1077, |
|
"eval_samples_per_second": 11.522, |
|
"eval_steps_per_second": 1.466, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"grad_norm": 18.034202575683594, |
|
"learning_rate": 9.0711e-06, |
|
"loss": 1.1023, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"eval_loss": 1.1863212585449219, |
|
"eval_runtime": 33.8209, |
|
"eval_samples_per_second": 11.62, |
|
"eval_steps_per_second": 1.478, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 48.205128205128204, |
|
"grad_norm": 18.521087646484375, |
|
"learning_rate": 9.0611e-06, |
|
"loss": 1.1776, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 48.205128205128204, |
|
"eval_loss": 1.1852102279663086, |
|
"eval_runtime": 33.8724, |
|
"eval_samples_per_second": 11.602, |
|
"eval_steps_per_second": 1.476, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 48.717948717948715, |
|
"grad_norm": 54.735145568847656, |
|
"learning_rate": 9.0511e-06, |
|
"loss": 1.1108, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 48.717948717948715, |
|
"eval_loss": 1.1690032482147217, |
|
"eval_runtime": 33.7943, |
|
"eval_samples_per_second": 11.629, |
|
"eval_steps_per_second": 1.48, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"grad_norm": 24.619985580444336, |
|
"learning_rate": 9.0411e-06, |
|
"loss": 1.1173, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"eval_loss": 1.1739821434020996, |
|
"eval_runtime": 33.8187, |
|
"eval_samples_per_second": 11.621, |
|
"eval_steps_per_second": 1.478, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 49.743589743589745, |
|
"grad_norm": 27.19590950012207, |
|
"learning_rate": 9.0311e-06, |
|
"loss": 1.0959, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 49.743589743589745, |
|
"eval_loss": 1.1569674015045166, |
|
"eval_runtime": 33.8966, |
|
"eval_samples_per_second": 11.594, |
|
"eval_steps_per_second": 1.475, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 50.256410256410255, |
|
"grad_norm": 66.81871032714844, |
|
"learning_rate": 9.0211e-06, |
|
"loss": 1.1213, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 50.256410256410255, |
|
"eval_loss": 1.1669386625289917, |
|
"eval_runtime": 33.9461, |
|
"eval_samples_per_second": 11.577, |
|
"eval_steps_per_second": 1.473, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 50.76923076923077, |
|
"grad_norm": 20.71648406982422, |
|
"learning_rate": 9.011100000000001e-06, |
|
"loss": 1.0801, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 50.76923076923077, |
|
"eval_loss": 1.1752002239227295, |
|
"eval_runtime": 33.6927, |
|
"eval_samples_per_second": 11.664, |
|
"eval_steps_per_second": 1.484, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 51.282051282051285, |
|
"grad_norm": 27.14691925048828, |
|
"learning_rate": 9.001100000000001e-06, |
|
"loss": 1.1269, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 51.282051282051285, |
|
"eval_loss": 1.1753243207931519, |
|
"eval_runtime": 33.7905, |
|
"eval_samples_per_second": 11.63, |
|
"eval_steps_per_second": 1.48, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 51.794871794871796, |
|
"grad_norm": 16.967599868774414, |
|
"learning_rate": 8.991100000000001e-06, |
|
"loss": 1.0956, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 51.794871794871796, |
|
"eval_loss": 1.1728534698486328, |
|
"eval_runtime": 33.9831, |
|
"eval_samples_per_second": 11.565, |
|
"eval_steps_per_second": 1.471, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 52.30769230769231, |
|
"grad_norm": 25.84881019592285, |
|
"learning_rate": 8.981100000000001e-06, |
|
"loss": 1.1034, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 52.30769230769231, |
|
"eval_loss": 1.1675057411193848, |
|
"eval_runtime": 33.8047, |
|
"eval_samples_per_second": 11.626, |
|
"eval_steps_per_second": 1.479, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 52.82051282051282, |
|
"grad_norm": 27.911279678344727, |
|
"learning_rate": 8.9711e-06, |
|
"loss": 1.1109, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 52.82051282051282, |
|
"eval_loss": 1.1695001125335693, |
|
"eval_runtime": 33.9411, |
|
"eval_samples_per_second": 11.579, |
|
"eval_steps_per_second": 1.473, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"grad_norm": 30.883014678955078, |
|
"learning_rate": 8.9611e-06, |
|
"loss": 1.0748, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"eval_loss": 1.166171908378601, |
|
"eval_runtime": 33.7307, |
|
"eval_samples_per_second": 11.651, |
|
"eval_steps_per_second": 1.482, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 53.84615384615385, |
|
"grad_norm": 30.53699493408203, |
|
"learning_rate": 8.9511e-06, |
|
"loss": 1.1039, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 53.84615384615385, |
|
"eval_loss": 1.1673123836517334, |
|
"eval_runtime": 33.8942, |
|
"eval_samples_per_second": 11.595, |
|
"eval_steps_per_second": 1.475, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 54.35897435897436, |
|
"grad_norm": 32.961483001708984, |
|
"learning_rate": 8.9411e-06, |
|
"loss": 1.1203, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 54.35897435897436, |
|
"eval_loss": 1.1615757942199707, |
|
"eval_runtime": 33.9176, |
|
"eval_samples_per_second": 11.587, |
|
"eval_steps_per_second": 1.474, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 54.87179487179487, |
|
"grad_norm": 26.77215003967285, |
|
"learning_rate": 8.9311e-06, |
|
"loss": 1.1109, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 54.87179487179487, |
|
"eval_loss": 1.1798686981201172, |
|
"eval_runtime": 34.1898, |
|
"eval_samples_per_second": 11.495, |
|
"eval_steps_per_second": 1.462, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 55.38461538461539, |
|
"grad_norm": 42.99930191040039, |
|
"learning_rate": 8.9211e-06, |
|
"loss": 1.0651, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 55.38461538461539, |
|
"eval_loss": 1.1585968732833862, |
|
"eval_runtime": 33.9129, |
|
"eval_samples_per_second": 11.589, |
|
"eval_steps_per_second": 1.474, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 55.8974358974359, |
|
"grad_norm": 22.420515060424805, |
|
"learning_rate": 8.9111e-06, |
|
"loss": 1.0901, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 55.8974358974359, |
|
"eval_loss": 1.182327151298523, |
|
"eval_runtime": 34.0546, |
|
"eval_samples_per_second": 11.54, |
|
"eval_steps_per_second": 1.468, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 56.41025641025641, |
|
"grad_norm": 34.271324157714844, |
|
"learning_rate": 8.9011e-06, |
|
"loss": 1.0969, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 56.41025641025641, |
|
"eval_loss": 1.1700419187545776, |
|
"eval_runtime": 34.0913, |
|
"eval_samples_per_second": 11.528, |
|
"eval_steps_per_second": 1.467, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 56.92307692307692, |
|
"grad_norm": 25.083974838256836, |
|
"learning_rate": 8.8911e-06, |
|
"loss": 1.0825, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 56.92307692307692, |
|
"eval_loss": 1.1617279052734375, |
|
"eval_runtime": 33.9494, |
|
"eval_samples_per_second": 11.576, |
|
"eval_steps_per_second": 1.473, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 57.43589743589744, |
|
"grad_norm": 45.67741012573242, |
|
"learning_rate": 8.8811e-06, |
|
"loss": 1.1105, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 57.43589743589744, |
|
"eval_loss": 1.1558475494384766, |
|
"eval_runtime": 33.8739, |
|
"eval_samples_per_second": 11.602, |
|
"eval_steps_per_second": 1.476, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 57.94871794871795, |
|
"grad_norm": 40.9770622253418, |
|
"learning_rate": 8.8711e-06, |
|
"loss": 1.0714, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 57.94871794871795, |
|
"eval_loss": 1.158663272857666, |
|
"eval_runtime": 33.8651, |
|
"eval_samples_per_second": 11.605, |
|
"eval_steps_per_second": 1.476, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 58.46153846153846, |
|
"grad_norm": 26.5457763671875, |
|
"learning_rate": 8.8611e-06, |
|
"loss": 1.053, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 58.46153846153846, |
|
"eval_loss": 1.1581999063491821, |
|
"eval_runtime": 33.9299, |
|
"eval_samples_per_second": 11.583, |
|
"eval_steps_per_second": 1.474, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 58.97435897435897, |
|
"grad_norm": 21.226743698120117, |
|
"learning_rate": 8.8511e-06, |
|
"loss": 1.1005, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 58.97435897435897, |
|
"eval_loss": 1.1427479982376099, |
|
"eval_runtime": 33.8652, |
|
"eval_samples_per_second": 11.605, |
|
"eval_steps_per_second": 1.476, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 59.48717948717949, |
|
"grad_norm": 35.32261657714844, |
|
"learning_rate": 8.8411e-06, |
|
"loss": 1.1142, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 59.48717948717949, |
|
"eval_loss": 1.1504665613174438, |
|
"eval_runtime": 33.9719, |
|
"eval_samples_per_second": 11.568, |
|
"eval_steps_per_second": 1.472, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 62.73653793334961, |
|
"learning_rate": 8.831200000000001e-06, |
|
"loss": 1.0317, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_loss": 1.166619896888733, |
|
"eval_runtime": 33.7919, |
|
"eval_samples_per_second": 11.63, |
|
"eval_steps_per_second": 1.48, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 60.51282051282051, |
|
"grad_norm": 32.88017654418945, |
|
"learning_rate": 8.821200000000001e-06, |
|
"loss": 1.053, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 60.51282051282051, |
|
"eval_loss": 1.151050090789795, |
|
"eval_runtime": 33.8088, |
|
"eval_samples_per_second": 11.624, |
|
"eval_steps_per_second": 1.479, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 61.02564102564103, |
|
"grad_norm": 25.249664306640625, |
|
"learning_rate": 8.811200000000001e-06, |
|
"loss": 1.083, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 61.02564102564103, |
|
"eval_loss": 1.1466352939605713, |
|
"eval_runtime": 33.7084, |
|
"eval_samples_per_second": 11.659, |
|
"eval_steps_per_second": 1.483, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 61.53846153846154, |
|
"grad_norm": 25.186697006225586, |
|
"learning_rate": 8.801200000000001e-06, |
|
"loss": 1.023, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 61.53846153846154, |
|
"eval_loss": 1.1523329019546509, |
|
"eval_runtime": 33.8963, |
|
"eval_samples_per_second": 11.594, |
|
"eval_steps_per_second": 1.475, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 62.05128205128205, |
|
"grad_norm": 65.74333953857422, |
|
"learning_rate": 8.791200000000001e-06, |
|
"loss": 1.1023, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 62.05128205128205, |
|
"eval_loss": 1.1539132595062256, |
|
"eval_runtime": 33.8501, |
|
"eval_samples_per_second": 11.61, |
|
"eval_steps_per_second": 1.477, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 62.56410256410256, |
|
"grad_norm": 37.643035888671875, |
|
"learning_rate": 8.781200000000002e-06, |
|
"loss": 1.096, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 62.56410256410256, |
|
"eval_loss": 1.1323292255401611, |
|
"eval_runtime": 33.9504, |
|
"eval_samples_per_second": 11.576, |
|
"eval_steps_per_second": 1.473, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 63.07692307692308, |
|
"grad_norm": 41.54033279418945, |
|
"learning_rate": 8.7712e-06, |
|
"loss": 1.04, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 63.07692307692308, |
|
"eval_loss": 1.1349562406539917, |
|
"eval_runtime": 33.8242, |
|
"eval_samples_per_second": 11.619, |
|
"eval_steps_per_second": 1.478, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 63.58974358974359, |
|
"grad_norm": 32.337890625, |
|
"learning_rate": 8.7612e-06, |
|
"loss": 1.0729, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 63.58974358974359, |
|
"eval_loss": 1.1326929330825806, |
|
"eval_runtime": 33.7768, |
|
"eval_samples_per_second": 11.635, |
|
"eval_steps_per_second": 1.48, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 64.1025641025641, |
|
"grad_norm": 32.026912689208984, |
|
"learning_rate": 8.7512e-06, |
|
"loss": 1.0467, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 64.1025641025641, |
|
"eval_loss": 1.13288414478302, |
|
"eval_runtime": 34.2235, |
|
"eval_samples_per_second": 11.483, |
|
"eval_steps_per_second": 1.461, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 64.61538461538461, |
|
"grad_norm": 17.785661697387695, |
|
"learning_rate": 8.7412e-06, |
|
"loss": 1.0339, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 64.61538461538461, |
|
"eval_loss": 1.1395562887191772, |
|
"eval_runtime": 33.752, |
|
"eval_samples_per_second": 11.644, |
|
"eval_steps_per_second": 1.481, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 65.12820512820512, |
|
"grad_norm": 30.277780532836914, |
|
"learning_rate": 8.7312e-06, |
|
"loss": 1.0828, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 65.12820512820512, |
|
"eval_loss": 1.1387863159179688, |
|
"eval_runtime": 33.8766, |
|
"eval_samples_per_second": 11.601, |
|
"eval_steps_per_second": 1.476, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 65.64102564102564, |
|
"grad_norm": 41.21147918701172, |
|
"learning_rate": 8.7212e-06, |
|
"loss": 1.0502, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 65.64102564102564, |
|
"eval_loss": 1.143326997756958, |
|
"eval_runtime": 33.9279, |
|
"eval_samples_per_second": 11.583, |
|
"eval_steps_per_second": 1.474, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 66.15384615384616, |
|
"grad_norm": 12.04747486114502, |
|
"learning_rate": 8.7112e-06, |
|
"loss": 1.0099, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 66.15384615384616, |
|
"eval_loss": 1.130028247833252, |
|
"eval_runtime": 33.9548, |
|
"eval_samples_per_second": 11.574, |
|
"eval_steps_per_second": 1.473, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 46.18159866333008, |
|
"learning_rate": 8.7012e-06, |
|
"loss": 1.026, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"eval_loss": 1.135296106338501, |
|
"eval_runtime": 33.8939, |
|
"eval_samples_per_second": 11.595, |
|
"eval_steps_per_second": 1.475, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 67.17948717948718, |
|
"grad_norm": 34.04996109008789, |
|
"learning_rate": 8.6912e-06, |
|
"loss": 1.0969, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 67.17948717948718, |
|
"eval_loss": 1.1394113302230835, |
|
"eval_runtime": 33.7863, |
|
"eval_samples_per_second": 11.632, |
|
"eval_steps_per_second": 1.48, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 67.6923076923077, |
|
"grad_norm": 57.76072692871094, |
|
"learning_rate": 8.6812e-06, |
|
"loss": 1.0345, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 67.6923076923077, |
|
"eval_loss": 1.1415585279464722, |
|
"eval_runtime": 33.8137, |
|
"eval_samples_per_second": 11.622, |
|
"eval_steps_per_second": 1.479, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 68.2051282051282, |
|
"grad_norm": 26.72646713256836, |
|
"learning_rate": 8.671200000000001e-06, |
|
"loss": 1.0679, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 68.2051282051282, |
|
"eval_loss": 1.140710473060608, |
|
"eval_runtime": 34.6101, |
|
"eval_samples_per_second": 11.355, |
|
"eval_steps_per_second": 1.445, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 68.71794871794872, |
|
"grad_norm": 48.05488586425781, |
|
"learning_rate": 8.661200000000001e-06, |
|
"loss": 1.0037, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 68.71794871794872, |
|
"eval_loss": 1.1428287029266357, |
|
"eval_runtime": 34.2239, |
|
"eval_samples_per_second": 11.483, |
|
"eval_steps_per_second": 1.461, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 69.23076923076923, |
|
"grad_norm": 15.574548721313477, |
|
"learning_rate": 8.651200000000001e-06, |
|
"loss": 1.0348, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 69.23076923076923, |
|
"eval_loss": 1.1325479745864868, |
|
"eval_runtime": 34.131, |
|
"eval_samples_per_second": 11.514, |
|
"eval_steps_per_second": 1.465, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 69.74358974358974, |
|
"grad_norm": 52.864234924316406, |
|
"learning_rate": 8.641200000000001e-06, |
|
"loss": 1.0529, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 69.74358974358974, |
|
"eval_loss": 1.126291036605835, |
|
"eval_runtime": 34.0945, |
|
"eval_samples_per_second": 11.527, |
|
"eval_steps_per_second": 1.467, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 70.25641025641026, |
|
"grad_norm": 43.43352127075195, |
|
"learning_rate": 8.631200000000001e-06, |
|
"loss": 1.0605, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 70.25641025641026, |
|
"eval_loss": 1.129713773727417, |
|
"eval_runtime": 33.9866, |
|
"eval_samples_per_second": 11.563, |
|
"eval_steps_per_second": 1.471, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 70.76923076923077, |
|
"grad_norm": 26.118932723999023, |
|
"learning_rate": 8.621200000000001e-06, |
|
"loss": 1.0422, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 70.76923076923077, |
|
"eval_loss": 1.130542278289795, |
|
"eval_runtime": 33.9346, |
|
"eval_samples_per_second": 11.581, |
|
"eval_steps_per_second": 1.473, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 71.28205128205128, |
|
"grad_norm": 37.925621032714844, |
|
"learning_rate": 8.611200000000002e-06, |
|
"loss": 1.0397, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 71.28205128205128, |
|
"eval_loss": 1.141406536102295, |
|
"eval_runtime": 34.0654, |
|
"eval_samples_per_second": 11.537, |
|
"eval_steps_per_second": 1.468, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 71.7948717948718, |
|
"grad_norm": 62.41912078857422, |
|
"learning_rate": 8.6012e-06, |
|
"loss": 1.0287, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 71.7948717948718, |
|
"eval_loss": 1.131140112876892, |
|
"eval_runtime": 33.9622, |
|
"eval_samples_per_second": 11.572, |
|
"eval_steps_per_second": 1.472, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 72.3076923076923, |
|
"grad_norm": 23.217283248901367, |
|
"learning_rate": 8.5912e-06, |
|
"loss": 1.0324, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 72.3076923076923, |
|
"eval_loss": 1.1330175399780273, |
|
"eval_runtime": 33.8419, |
|
"eval_samples_per_second": 11.613, |
|
"eval_steps_per_second": 1.477, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 72.82051282051282, |
|
"grad_norm": 32.80116271972656, |
|
"learning_rate": 8.5812e-06, |
|
"loss": 1.0388, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 72.82051282051282, |
|
"eval_loss": 1.1305664777755737, |
|
"eval_runtime": 33.8519, |
|
"eval_samples_per_second": 11.609, |
|
"eval_steps_per_second": 1.477, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 73.33333333333333, |
|
"grad_norm": 82.27204132080078, |
|
"learning_rate": 8.5712e-06, |
|
"loss": 1.0258, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 73.33333333333333, |
|
"eval_loss": 1.131137728691101, |
|
"eval_runtime": 33.915, |
|
"eval_samples_per_second": 11.588, |
|
"eval_steps_per_second": 1.474, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 73.84615384615384, |
|
"grad_norm": 81.36099243164062, |
|
"learning_rate": 8.5612e-06, |
|
"loss": 1.0176, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 73.84615384615384, |
|
"eval_loss": 1.1416622400283813, |
|
"eval_runtime": 34.0376, |
|
"eval_samples_per_second": 11.546, |
|
"eval_steps_per_second": 1.469, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 74.35897435897436, |
|
"grad_norm": 72.8280258178711, |
|
"learning_rate": 8.5512e-06, |
|
"loss": 1.0269, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 74.35897435897436, |
|
"eval_loss": 1.1399762630462646, |
|
"eval_runtime": 33.8029, |
|
"eval_samples_per_second": 11.626, |
|
"eval_steps_per_second": 1.479, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 74.87179487179488, |
|
"grad_norm": 31.938396453857422, |
|
"learning_rate": 8.5412e-06, |
|
"loss": 1.0038, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 74.87179487179488, |
|
"eval_loss": 1.1206576824188232, |
|
"eval_runtime": 33.7774, |
|
"eval_samples_per_second": 11.635, |
|
"eval_steps_per_second": 1.48, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 75.38461538461539, |
|
"grad_norm": 41.431663513183594, |
|
"learning_rate": 8.5312e-06, |
|
"loss": 1.028, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 75.38461538461539, |
|
"eval_loss": 1.1250663995742798, |
|
"eval_runtime": 33.8079, |
|
"eval_samples_per_second": 11.624, |
|
"eval_steps_per_second": 1.479, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 75.8974358974359, |
|
"grad_norm": 32.541805267333984, |
|
"learning_rate": 8.5212e-06, |
|
"loss": 1.0191, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 75.8974358974359, |
|
"eval_loss": 1.1266955137252808, |
|
"eval_runtime": 33.9427, |
|
"eval_samples_per_second": 11.578, |
|
"eval_steps_per_second": 1.473, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 76.41025641025641, |
|
"grad_norm": 28.85361671447754, |
|
"learning_rate": 8.5112e-06, |
|
"loss": 1.045, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 76.41025641025641, |
|
"eval_loss": 1.1317839622497559, |
|
"eval_runtime": 33.8219, |
|
"eval_samples_per_second": 11.62, |
|
"eval_steps_per_second": 1.478, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 76.92307692307692, |
|
"grad_norm": 22.870845794677734, |
|
"learning_rate": 8.501200000000001e-06, |
|
"loss": 0.9832, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 76.92307692307692, |
|
"eval_loss": 1.1070276498794556, |
|
"eval_runtime": 33.8987, |
|
"eval_samples_per_second": 11.593, |
|
"eval_steps_per_second": 1.475, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 77.43589743589743, |
|
"grad_norm": 18.483102798461914, |
|
"learning_rate": 8.491200000000001e-06, |
|
"loss": 1.0155, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 77.43589743589743, |
|
"eval_loss": 1.10912024974823, |
|
"eval_runtime": 33.8107, |
|
"eval_samples_per_second": 11.624, |
|
"eval_steps_per_second": 1.479, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 77.94871794871794, |
|
"grad_norm": 19.445619583129883, |
|
"learning_rate": 8.481200000000001e-06, |
|
"loss": 1.0398, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 77.94871794871794, |
|
"eval_loss": 1.1232614517211914, |
|
"eval_runtime": 33.8148, |
|
"eval_samples_per_second": 11.622, |
|
"eval_steps_per_second": 1.479, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 78.46153846153847, |
|
"grad_norm": 18.695117950439453, |
|
"learning_rate": 8.471300000000001e-06, |
|
"loss": 1.0403, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 78.46153846153847, |
|
"eval_loss": 1.1138592958450317, |
|
"eval_runtime": 33.6752, |
|
"eval_samples_per_second": 11.67, |
|
"eval_steps_per_second": 1.485, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 78.97435897435898, |
|
"grad_norm": 44.09070587158203, |
|
"learning_rate": 8.461300000000001e-06, |
|
"loss": 0.9847, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 78.97435897435898, |
|
"eval_loss": 1.1114585399627686, |
|
"eval_runtime": 33.7907, |
|
"eval_samples_per_second": 11.63, |
|
"eval_steps_per_second": 1.48, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 79.48717948717949, |
|
"grad_norm": 49.75135040283203, |
|
"learning_rate": 8.451300000000002e-06, |
|
"loss": 1.0351, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 79.48717948717949, |
|
"eval_loss": 1.1093136072158813, |
|
"eval_runtime": 33.8604, |
|
"eval_samples_per_second": 11.606, |
|
"eval_steps_per_second": 1.477, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 32.891754150390625, |
|
"learning_rate": 8.441300000000002e-06, |
|
"loss": 0.9776, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_loss": 1.1244064569473267, |
|
"eval_runtime": 33.8414, |
|
"eval_samples_per_second": 11.613, |
|
"eval_steps_per_second": 1.477, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 80.51282051282051, |
|
"grad_norm": 19.692750930786133, |
|
"learning_rate": 8.431300000000002e-06, |
|
"loss": 0.9815, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 80.51282051282051, |
|
"eval_loss": 1.1285768747329712, |
|
"eval_runtime": 33.7284, |
|
"eval_samples_per_second": 11.652, |
|
"eval_steps_per_second": 1.482, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 81.02564102564102, |
|
"grad_norm": 46.99306869506836, |
|
"learning_rate": 8.421300000000002e-06, |
|
"loss": 1.0426, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 81.02564102564102, |
|
"eval_loss": 1.1372685432434082, |
|
"eval_runtime": 33.7726, |
|
"eval_samples_per_second": 11.637, |
|
"eval_steps_per_second": 1.48, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 81.53846153846153, |
|
"grad_norm": 19.263620376586914, |
|
"learning_rate": 8.411300000000002e-06, |
|
"loss": 0.9906, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 81.53846153846153, |
|
"eval_loss": 1.1213167905807495, |
|
"eval_runtime": 33.8885, |
|
"eval_samples_per_second": 11.597, |
|
"eval_steps_per_second": 1.475, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 82.05128205128206, |
|
"grad_norm": 19.580486297607422, |
|
"learning_rate": 8.4013e-06, |
|
"loss": 1.0321, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 82.05128205128206, |
|
"eval_loss": 1.1251707077026367, |
|
"eval_runtime": 33.7214, |
|
"eval_samples_per_second": 11.654, |
|
"eval_steps_per_second": 1.483, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 82.56410256410257, |
|
"grad_norm": 18.83529281616211, |
|
"learning_rate": 8.3913e-06, |
|
"loss": 1.0043, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 82.56410256410257, |
|
"eval_loss": 1.1074498891830444, |
|
"eval_runtime": 34.0109, |
|
"eval_samples_per_second": 11.555, |
|
"eval_steps_per_second": 1.47, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 83.07692307692308, |
|
"grad_norm": 16.603994369506836, |
|
"learning_rate": 8.3813e-06, |
|
"loss": 0.9935, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 83.07692307692308, |
|
"eval_loss": 1.1357744932174683, |
|
"eval_runtime": 33.9144, |
|
"eval_samples_per_second": 11.588, |
|
"eval_steps_per_second": 1.474, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 83.58974358974359, |
|
"grad_norm": 33.002960205078125, |
|
"learning_rate": 8.3713e-06, |
|
"loss": 0.9659, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 83.58974358974359, |
|
"eval_loss": 1.1160190105438232, |
|
"eval_runtime": 34.0015, |
|
"eval_samples_per_second": 11.558, |
|
"eval_steps_per_second": 1.471, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 84.1025641025641, |
|
"grad_norm": 37.87334060668945, |
|
"learning_rate": 8.3613e-06, |
|
"loss": 0.987, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 84.1025641025641, |
|
"eval_loss": 1.1338489055633545, |
|
"eval_runtime": 33.8043, |
|
"eval_samples_per_second": 11.626, |
|
"eval_steps_per_second": 1.479, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 84.61538461538461, |
|
"grad_norm": 29.907007217407227, |
|
"learning_rate": 8.3513e-06, |
|
"loss": 1.0051, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 84.61538461538461, |
|
"eval_loss": 1.1136332750320435, |
|
"eval_runtime": 33.6613, |
|
"eval_samples_per_second": 11.675, |
|
"eval_steps_per_second": 1.485, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 85.12820512820512, |
|
"grad_norm": 29.1041259765625, |
|
"learning_rate": 8.341300000000001e-06, |
|
"loss": 1.0379, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 85.12820512820512, |
|
"eval_loss": 1.1357380151748657, |
|
"eval_runtime": 33.7783, |
|
"eval_samples_per_second": 11.635, |
|
"eval_steps_per_second": 1.48, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 85.64102564102564, |
|
"grad_norm": 36.248313903808594, |
|
"learning_rate": 8.331400000000001e-06, |
|
"loss": 1.0121, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 85.64102564102564, |
|
"eval_loss": 1.1128472089767456, |
|
"eval_runtime": 33.7439, |
|
"eval_samples_per_second": 11.647, |
|
"eval_steps_per_second": 1.482, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 86.15384615384616, |
|
"grad_norm": 25.37560272216797, |
|
"learning_rate": 8.321400000000001e-06, |
|
"loss": 0.9787, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 86.15384615384616, |
|
"eval_loss": 1.1043667793273926, |
|
"eval_runtime": 34.15, |
|
"eval_samples_per_second": 11.508, |
|
"eval_steps_per_second": 1.464, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 86.66666666666667, |
|
"grad_norm": 16.18814468383789, |
|
"learning_rate": 8.3114e-06, |
|
"loss": 0.9772, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 86.66666666666667, |
|
"eval_loss": 1.1302591562271118, |
|
"eval_runtime": 33.755, |
|
"eval_samples_per_second": 11.643, |
|
"eval_steps_per_second": 1.481, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 87.17948717948718, |
|
"grad_norm": 31.982431411743164, |
|
"learning_rate": 8.3014e-06, |
|
"loss": 0.9904, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 87.17948717948718, |
|
"eval_loss": 1.1226387023925781, |
|
"eval_runtime": 33.8382, |
|
"eval_samples_per_second": 11.614, |
|
"eval_steps_per_second": 1.478, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 87.6923076923077, |
|
"grad_norm": 37.292362213134766, |
|
"learning_rate": 8.2914e-06, |
|
"loss": 0.9413, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 87.6923076923077, |
|
"eval_loss": 1.1049525737762451, |
|
"eval_runtime": 33.7667, |
|
"eval_samples_per_second": 11.639, |
|
"eval_steps_per_second": 1.481, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 88.2051282051282, |
|
"grad_norm": 28.432649612426758, |
|
"learning_rate": 8.2814e-06, |
|
"loss": 1.0127, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 88.2051282051282, |
|
"eval_loss": 1.1197983026504517, |
|
"eval_runtime": 33.7266, |
|
"eval_samples_per_second": 11.653, |
|
"eval_steps_per_second": 1.483, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 88.71794871794872, |
|
"grad_norm": 34.415977478027344, |
|
"learning_rate": 8.2714e-06, |
|
"loss": 0.9891, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 88.71794871794872, |
|
"eval_loss": 1.1233532428741455, |
|
"eval_runtime": 33.9939, |
|
"eval_samples_per_second": 11.561, |
|
"eval_steps_per_second": 1.471, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 89.23076923076923, |
|
"grad_norm": 22.9134521484375, |
|
"learning_rate": 8.2614e-06, |
|
"loss": 0.9868, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 89.23076923076923, |
|
"eval_loss": 1.1397521495819092, |
|
"eval_runtime": 33.9535, |
|
"eval_samples_per_second": 11.575, |
|
"eval_steps_per_second": 1.473, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 89.74358974358974, |
|
"grad_norm": 26.470407485961914, |
|
"learning_rate": 8.2514e-06, |
|
"loss": 1.0048, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 89.74358974358974, |
|
"eval_loss": 1.103281855583191, |
|
"eval_runtime": 33.932, |
|
"eval_samples_per_second": 11.582, |
|
"eval_steps_per_second": 1.474, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 90.25641025641026, |
|
"grad_norm": 13.81521987915039, |
|
"learning_rate": 8.2414e-06, |
|
"loss": 0.9952, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 90.25641025641026, |
|
"eval_loss": 1.1103590726852417, |
|
"eval_runtime": 33.8805, |
|
"eval_samples_per_second": 11.6, |
|
"eval_steps_per_second": 1.476, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 90.76923076923077, |
|
"grad_norm": 39.196781158447266, |
|
"learning_rate": 8.2314e-06, |
|
"loss": 0.9787, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 90.76923076923077, |
|
"eval_loss": 1.1147035360336304, |
|
"eval_runtime": 34.0993, |
|
"eval_samples_per_second": 11.525, |
|
"eval_steps_per_second": 1.466, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 91.28205128205128, |
|
"grad_norm": 36.378990173339844, |
|
"learning_rate": 8.2214e-06, |
|
"loss": 0.985, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 91.28205128205128, |
|
"eval_loss": 1.120380163192749, |
|
"eval_runtime": 33.8597, |
|
"eval_samples_per_second": 11.607, |
|
"eval_steps_per_second": 1.477, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 91.7948717948718, |
|
"grad_norm": 29.002111434936523, |
|
"learning_rate": 8.2114e-06, |
|
"loss": 0.9828, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 91.7948717948718, |
|
"eval_loss": 1.1089298725128174, |
|
"eval_runtime": 33.8537, |
|
"eval_samples_per_second": 11.609, |
|
"eval_steps_per_second": 1.477, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 92.3076923076923, |
|
"grad_norm": 20.288036346435547, |
|
"learning_rate": 8.2014e-06, |
|
"loss": 0.966, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 92.3076923076923, |
|
"eval_loss": 1.104116678237915, |
|
"eval_runtime": 33.8513, |
|
"eval_samples_per_second": 11.61, |
|
"eval_steps_per_second": 1.477, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 92.82051282051282, |
|
"grad_norm": 32.15245819091797, |
|
"learning_rate": 8.191400000000001e-06, |
|
"loss": 0.9607, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 92.82051282051282, |
|
"eval_loss": 1.1346195936203003, |
|
"eval_runtime": 33.8192, |
|
"eval_samples_per_second": 11.621, |
|
"eval_steps_per_second": 1.478, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 93.33333333333333, |
|
"grad_norm": 30.35234832763672, |
|
"learning_rate": 8.181400000000001e-06, |
|
"loss": 0.9816, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 93.33333333333333, |
|
"eval_loss": 1.1086933612823486, |
|
"eval_runtime": 33.866, |
|
"eval_samples_per_second": 11.605, |
|
"eval_steps_per_second": 1.476, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 93.84615384615384, |
|
"grad_norm": 84.19772338867188, |
|
"learning_rate": 8.171400000000001e-06, |
|
"loss": 0.9945, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 93.84615384615384, |
|
"eval_loss": 1.1133761405944824, |
|
"eval_runtime": 33.8302, |
|
"eval_samples_per_second": 11.617, |
|
"eval_steps_per_second": 1.478, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 94.35897435897436, |
|
"grad_norm": 93.01074981689453, |
|
"learning_rate": 8.161400000000001e-06, |
|
"loss": 0.9509, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 94.35897435897436, |
|
"eval_loss": 1.0900119543075562, |
|
"eval_runtime": 33.8482, |
|
"eval_samples_per_second": 11.611, |
|
"eval_steps_per_second": 1.477, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 94.87179487179488, |
|
"grad_norm": 33.34095001220703, |
|
"learning_rate": 8.1514e-06, |
|
"loss": 0.9695, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 94.87179487179488, |
|
"eval_loss": 1.0987663269042969, |
|
"eval_runtime": 34.0189, |
|
"eval_samples_per_second": 11.552, |
|
"eval_steps_per_second": 1.47, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 95.38461538461539, |
|
"grad_norm": 40.531585693359375, |
|
"learning_rate": 8.1414e-06, |
|
"loss": 0.9567, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 95.38461538461539, |
|
"eval_loss": 1.1120296716690063, |
|
"eval_runtime": 33.8468, |
|
"eval_samples_per_second": 11.611, |
|
"eval_steps_per_second": 1.477, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 95.8974358974359, |
|
"grad_norm": 20.810407638549805, |
|
"learning_rate": 8.1314e-06, |
|
"loss": 1.0048, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 95.8974358974359, |
|
"eval_loss": 1.1099683046340942, |
|
"eval_runtime": 33.8925, |
|
"eval_samples_per_second": 11.595, |
|
"eval_steps_per_second": 1.475, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 96.41025641025641, |
|
"grad_norm": 20.797433853149414, |
|
"learning_rate": 8.1214e-06, |
|
"loss": 0.9339, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 96.41025641025641, |
|
"eval_loss": 1.1000967025756836, |
|
"eval_runtime": 33.8669, |
|
"eval_samples_per_second": 11.604, |
|
"eval_steps_per_second": 1.476, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 96.92307692307692, |
|
"grad_norm": 36.17203140258789, |
|
"learning_rate": 8.1114e-06, |
|
"loss": 0.9502, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 96.92307692307692, |
|
"eval_loss": 1.1125061511993408, |
|
"eval_runtime": 33.8939, |
|
"eval_samples_per_second": 11.595, |
|
"eval_steps_per_second": 1.475, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 97.43589743589743, |
|
"grad_norm": 34.18137741088867, |
|
"learning_rate": 8.1014e-06, |
|
"loss": 0.9115, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 97.43589743589743, |
|
"eval_loss": 1.1053694486618042, |
|
"eval_runtime": 34.1495, |
|
"eval_samples_per_second": 11.508, |
|
"eval_steps_per_second": 1.464, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 97.94871794871794, |
|
"grad_norm": 27.006271362304688, |
|
"learning_rate": 8.0914e-06, |
|
"loss": 0.9906, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 97.94871794871794, |
|
"eval_loss": 1.0994688272476196, |
|
"eval_runtime": 33.8004, |
|
"eval_samples_per_second": 11.627, |
|
"eval_steps_per_second": 1.479, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 98.46153846153847, |
|
"grad_norm": 35.52959442138672, |
|
"learning_rate": 8.0814e-06, |
|
"loss": 0.9616, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 98.46153846153847, |
|
"eval_loss": 1.12260103225708, |
|
"eval_runtime": 33.7133, |
|
"eval_samples_per_second": 11.657, |
|
"eval_steps_per_second": 1.483, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 98.97435897435898, |
|
"grad_norm": 31.87555694580078, |
|
"learning_rate": 8.0714e-06, |
|
"loss": 0.96, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 98.97435897435898, |
|
"eval_loss": 1.1023892164230347, |
|
"eval_runtime": 33.8183, |
|
"eval_samples_per_second": 11.621, |
|
"eval_steps_per_second": 1.478, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 99.48717948717949, |
|
"grad_norm": 39.26782989501953, |
|
"learning_rate": 8.0614e-06, |
|
"loss": 0.9458, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 99.48717948717949, |
|
"eval_loss": 1.1115421056747437, |
|
"eval_runtime": 34.199, |
|
"eval_samples_per_second": 11.492, |
|
"eval_steps_per_second": 1.462, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 52.86429214477539, |
|
"learning_rate": 8.0514e-06, |
|
"loss": 0.9764, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_loss": 1.096418857574463, |
|
"eval_runtime": 33.8472, |
|
"eval_samples_per_second": 11.611, |
|
"eval_steps_per_second": 1.477, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 100.51282051282051, |
|
"grad_norm": 25.798358917236328, |
|
"learning_rate": 8.0414e-06, |
|
"loss": 0.9651, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 100.51282051282051, |
|
"eval_loss": 1.0968595743179321, |
|
"eval_runtime": 33.8602, |
|
"eval_samples_per_second": 11.607, |
|
"eval_steps_per_second": 1.477, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 101.02564102564102, |
|
"grad_norm": 25.59754180908203, |
|
"learning_rate": 8.0314e-06, |
|
"loss": 0.9414, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 101.02564102564102, |
|
"eval_loss": 1.1089657545089722, |
|
"eval_runtime": 33.6692, |
|
"eval_samples_per_second": 11.672, |
|
"eval_steps_per_second": 1.485, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 101.53846153846153, |
|
"grad_norm": 38.994361877441406, |
|
"learning_rate": 8.0214e-06, |
|
"loss": 0.9628, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 101.53846153846153, |
|
"eval_loss": 1.085176706314087, |
|
"eval_runtime": 33.7721, |
|
"eval_samples_per_second": 11.637, |
|
"eval_steps_per_second": 1.481, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 102.05128205128206, |
|
"grad_norm": 28.150468826293945, |
|
"learning_rate": 8.011400000000001e-06, |
|
"loss": 0.9321, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 102.05128205128206, |
|
"eval_loss": 1.0919770002365112, |
|
"eval_runtime": 34.0271, |
|
"eval_samples_per_second": 11.55, |
|
"eval_steps_per_second": 1.469, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 102.56410256410257, |
|
"grad_norm": 22.67235565185547, |
|
"learning_rate": 8.001400000000001e-06, |
|
"loss": 0.987, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 102.56410256410257, |
|
"eval_loss": 1.084847092628479, |
|
"eval_runtime": 33.8029, |
|
"eval_samples_per_second": 11.626, |
|
"eval_steps_per_second": 1.479, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 103.07692307692308, |
|
"grad_norm": 23.50147819519043, |
|
"learning_rate": 7.991400000000001e-06, |
|
"loss": 0.9442, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 103.07692307692308, |
|
"eval_loss": 1.0791277885437012, |
|
"eval_runtime": 33.9754, |
|
"eval_samples_per_second": 11.567, |
|
"eval_steps_per_second": 1.472, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 103.58974358974359, |
|
"grad_norm": 28.83344078063965, |
|
"learning_rate": 7.9814e-06, |
|
"loss": 0.9624, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 103.58974358974359, |
|
"eval_loss": 1.0788371562957764, |
|
"eval_runtime": 33.8139, |
|
"eval_samples_per_second": 11.622, |
|
"eval_steps_per_second": 1.479, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 104.1025641025641, |
|
"grad_norm": 23.166961669921875, |
|
"learning_rate": 7.9714e-06, |
|
"loss": 0.9365, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 104.1025641025641, |
|
"eval_loss": 1.0833864212036133, |
|
"eval_runtime": 34.4656, |
|
"eval_samples_per_second": 11.403, |
|
"eval_steps_per_second": 1.451, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 104.61538461538461, |
|
"grad_norm": 43.671165466308594, |
|
"learning_rate": 7.9614e-06, |
|
"loss": 0.9429, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 104.61538461538461, |
|
"eval_loss": 1.0890021324157715, |
|
"eval_runtime": 34.1248, |
|
"eval_samples_per_second": 11.517, |
|
"eval_steps_per_second": 1.465, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 105.12820512820512, |
|
"grad_norm": 19.534181594848633, |
|
"learning_rate": 7.9514e-06, |
|
"loss": 0.9771, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 105.12820512820512, |
|
"eval_loss": 1.0997905731201172, |
|
"eval_runtime": 34.0699, |
|
"eval_samples_per_second": 11.535, |
|
"eval_steps_per_second": 1.468, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 105.64102564102564, |
|
"grad_norm": 15.605002403259277, |
|
"learning_rate": 7.9414e-06, |
|
"loss": 0.947, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 105.64102564102564, |
|
"eval_loss": 1.0916997194290161, |
|
"eval_runtime": 34.057, |
|
"eval_samples_per_second": 11.539, |
|
"eval_steps_per_second": 1.468, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 106.15384615384616, |
|
"grad_norm": 33.188629150390625, |
|
"learning_rate": 7.9314e-06, |
|
"loss": 0.9413, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 106.15384615384616, |
|
"eval_loss": 1.1046770811080933, |
|
"eval_runtime": 33.9382, |
|
"eval_samples_per_second": 11.58, |
|
"eval_steps_per_second": 1.473, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 106.66666666666667, |
|
"grad_norm": 27.891300201416016, |
|
"learning_rate": 7.9214e-06, |
|
"loss": 0.953, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 106.66666666666667, |
|
"eval_loss": 1.1009467840194702, |
|
"eval_runtime": 34.0607, |
|
"eval_samples_per_second": 11.538, |
|
"eval_steps_per_second": 1.468, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 107.17948717948718, |
|
"grad_norm": 27.26763916015625, |
|
"learning_rate": 7.9114e-06, |
|
"loss": 0.9506, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 107.17948717948718, |
|
"eval_loss": 1.0977036952972412, |
|
"eval_runtime": 33.9327, |
|
"eval_samples_per_second": 11.582, |
|
"eval_steps_per_second": 1.474, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 107.6923076923077, |
|
"grad_norm": 21.317838668823242, |
|
"learning_rate": 7.9014e-06, |
|
"loss": 0.9407, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 107.6923076923077, |
|
"eval_loss": 1.1164356470108032, |
|
"eval_runtime": 34.1907, |
|
"eval_samples_per_second": 11.494, |
|
"eval_steps_per_second": 1.462, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 108.2051282051282, |
|
"grad_norm": 24.29534339904785, |
|
"learning_rate": 7.8914e-06, |
|
"loss": 0.9317, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 108.2051282051282, |
|
"eval_loss": 1.0796505212783813, |
|
"eval_runtime": 34.0823, |
|
"eval_samples_per_second": 11.531, |
|
"eval_steps_per_second": 1.467, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 108.71794871794872, |
|
"grad_norm": 23.155261993408203, |
|
"learning_rate": 7.8814e-06, |
|
"loss": 0.9172, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 108.71794871794872, |
|
"eval_loss": 1.1064153909683228, |
|
"eval_runtime": 34.0712, |
|
"eval_samples_per_second": 11.535, |
|
"eval_steps_per_second": 1.468, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 109.23076923076923, |
|
"grad_norm": 102.01934051513672, |
|
"learning_rate": 7.8715e-06, |
|
"loss": 1.006, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 109.23076923076923, |
|
"eval_loss": 1.0971482992172241, |
|
"eval_runtime": 34.0629, |
|
"eval_samples_per_second": 11.537, |
|
"eval_steps_per_second": 1.468, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 109.74358974358974, |
|
"grad_norm": 90.6104965209961, |
|
"learning_rate": 7.861500000000001e-06, |
|
"loss": 0.9273, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 109.74358974358974, |
|
"eval_loss": 1.0784090757369995, |
|
"eval_runtime": 33.9381, |
|
"eval_samples_per_second": 11.58, |
|
"eval_steps_per_second": 1.473, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 110.25641025641026, |
|
"grad_norm": 15.22960090637207, |
|
"learning_rate": 7.851500000000001e-06, |
|
"loss": 0.9306, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 110.25641025641026, |
|
"eval_loss": 1.0971401929855347, |
|
"eval_runtime": 34.1723, |
|
"eval_samples_per_second": 11.501, |
|
"eval_steps_per_second": 1.463, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 110.76923076923077, |
|
"grad_norm": 40.347999572753906, |
|
"learning_rate": 7.841500000000001e-06, |
|
"loss": 0.9607, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 110.76923076923077, |
|
"eval_loss": 1.086938500404358, |
|
"eval_runtime": 33.9506, |
|
"eval_samples_per_second": 11.576, |
|
"eval_steps_per_second": 1.473, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 111.28205128205128, |
|
"grad_norm": 23.18324089050293, |
|
"learning_rate": 7.831500000000001e-06, |
|
"loss": 0.9431, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 111.28205128205128, |
|
"eval_loss": 1.0866228342056274, |
|
"eval_runtime": 34.128, |
|
"eval_samples_per_second": 11.515, |
|
"eval_steps_per_second": 1.465, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 111.7948717948718, |
|
"grad_norm": 49.34690475463867, |
|
"learning_rate": 7.821500000000001e-06, |
|
"loss": 0.9316, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 111.7948717948718, |
|
"eval_loss": 1.0955440998077393, |
|
"eval_runtime": 34.0714, |
|
"eval_samples_per_second": 11.535, |
|
"eval_steps_per_second": 1.468, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 112.3076923076923, |
|
"grad_norm": 45.29327392578125, |
|
"learning_rate": 7.811500000000001e-06, |
|
"loss": 0.9248, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 112.3076923076923, |
|
"eval_loss": 1.0963120460510254, |
|
"eval_runtime": 34.0052, |
|
"eval_samples_per_second": 11.557, |
|
"eval_steps_per_second": 1.47, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 112.82051282051282, |
|
"grad_norm": 29.918447494506836, |
|
"learning_rate": 7.801500000000001e-06, |
|
"loss": 0.929, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 112.82051282051282, |
|
"eval_loss": 1.0954854488372803, |
|
"eval_runtime": 34.4653, |
|
"eval_samples_per_second": 11.403, |
|
"eval_steps_per_second": 1.451, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 113.33333333333333, |
|
"grad_norm": 24.213356018066406, |
|
"learning_rate": 7.791500000000002e-06, |
|
"loss": 0.9472, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 113.33333333333333, |
|
"eval_loss": 1.1072837114334106, |
|
"eval_runtime": 34.0842, |
|
"eval_samples_per_second": 11.53, |
|
"eval_steps_per_second": 1.467, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 113.84615384615384, |
|
"grad_norm": 31.872329711914062, |
|
"learning_rate": 7.7815e-06, |
|
"loss": 0.9149, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 113.84615384615384, |
|
"eval_loss": 1.1297913789749146, |
|
"eval_runtime": 33.9499, |
|
"eval_samples_per_second": 11.576, |
|
"eval_steps_per_second": 1.473, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 114.35897435897436, |
|
"grad_norm": 29.122095108032227, |
|
"learning_rate": 7.7715e-06, |
|
"loss": 0.9373, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 114.35897435897436, |
|
"eval_loss": 1.1010404825210571, |
|
"eval_runtime": 34.038, |
|
"eval_samples_per_second": 11.546, |
|
"eval_steps_per_second": 1.469, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 114.87179487179488, |
|
"grad_norm": 61.661808013916016, |
|
"learning_rate": 7.7615e-06, |
|
"loss": 0.942, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 114.87179487179488, |
|
"eval_loss": 1.1000746488571167, |
|
"eval_runtime": 34.0776, |
|
"eval_samples_per_second": 11.533, |
|
"eval_steps_per_second": 1.467, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 115.38461538461539, |
|
"grad_norm": 32.940059661865234, |
|
"learning_rate": 7.7515e-06, |
|
"loss": 0.9259, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 115.38461538461539, |
|
"eval_loss": 1.0878592729568481, |
|
"eval_runtime": 34.0411, |
|
"eval_samples_per_second": 11.545, |
|
"eval_steps_per_second": 1.469, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 115.8974358974359, |
|
"grad_norm": 24.981050491333008, |
|
"learning_rate": 7.7415e-06, |
|
"loss": 0.9315, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 115.8974358974359, |
|
"eval_loss": 1.071781039237976, |
|
"eval_runtime": 33.9996, |
|
"eval_samples_per_second": 11.559, |
|
"eval_steps_per_second": 1.471, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 116.41025641025641, |
|
"grad_norm": 29.54717254638672, |
|
"learning_rate": 7.7315e-06, |
|
"loss": 0.9141, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 116.41025641025641, |
|
"eval_loss": 1.0897409915924072, |
|
"eval_runtime": 34.0038, |
|
"eval_samples_per_second": 11.558, |
|
"eval_steps_per_second": 1.47, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 116.92307692307692, |
|
"grad_norm": 55.15362548828125, |
|
"learning_rate": 7.7215e-06, |
|
"loss": 0.9504, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 116.92307692307692, |
|
"eval_loss": 1.0998215675354004, |
|
"eval_runtime": 34.1575, |
|
"eval_samples_per_second": 11.506, |
|
"eval_steps_per_second": 1.464, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 117.43589743589743, |
|
"grad_norm": 28.481815338134766, |
|
"learning_rate": 7.7115e-06, |
|
"loss": 0.9227, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 117.43589743589743, |
|
"eval_loss": 1.0956928730010986, |
|
"eval_runtime": 34.0805, |
|
"eval_samples_per_second": 11.532, |
|
"eval_steps_per_second": 1.467, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 117.94871794871794, |
|
"grad_norm": 21.423744201660156, |
|
"learning_rate": 7.7015e-06, |
|
"loss": 0.9235, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 117.94871794871794, |
|
"eval_loss": 1.0744383335113525, |
|
"eval_runtime": 34.0591, |
|
"eval_samples_per_second": 11.539, |
|
"eval_steps_per_second": 1.468, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 118.46153846153847, |
|
"grad_norm": 41.02705383300781, |
|
"learning_rate": 7.6915e-06, |
|
"loss": 0.8931, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 118.46153846153847, |
|
"eval_loss": 1.0844883918762207, |
|
"eval_runtime": 33.9091, |
|
"eval_samples_per_second": 11.59, |
|
"eval_steps_per_second": 1.475, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 118.97435897435898, |
|
"grad_norm": 63.89754104614258, |
|
"learning_rate": 7.681500000000001e-06, |
|
"loss": 0.9388, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 118.97435897435898, |
|
"eval_loss": 1.10041344165802, |
|
"eval_runtime": 33.9999, |
|
"eval_samples_per_second": 11.559, |
|
"eval_steps_per_second": 1.471, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 119.48717948717949, |
|
"grad_norm": 46.03657531738281, |
|
"learning_rate": 7.671500000000001e-06, |
|
"loss": 0.9, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 119.48717948717949, |
|
"eval_loss": 1.1028199195861816, |
|
"eval_runtime": 34.2808, |
|
"eval_samples_per_second": 11.464, |
|
"eval_steps_per_second": 1.459, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"grad_norm": 23.22905158996582, |
|
"learning_rate": 7.661500000000001e-06, |
|
"loss": 0.9223, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"eval_loss": 1.0903513431549072, |
|
"eval_runtime": 34.0346, |
|
"eval_samples_per_second": 11.547, |
|
"eval_steps_per_second": 1.469, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 120.51282051282051, |
|
"grad_norm": 40.20142364501953, |
|
"learning_rate": 7.651500000000001e-06, |
|
"loss": 0.918, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 120.51282051282051, |
|
"eval_loss": 1.0990569591522217, |
|
"eval_runtime": 34.0066, |
|
"eval_samples_per_second": 11.557, |
|
"eval_steps_per_second": 1.47, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 121.02564102564102, |
|
"grad_norm": 39.85820388793945, |
|
"learning_rate": 7.641500000000001e-06, |
|
"loss": 0.9272, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 121.02564102564102, |
|
"eval_loss": 1.078169584274292, |
|
"eval_runtime": 33.9594, |
|
"eval_samples_per_second": 11.573, |
|
"eval_steps_per_second": 1.472, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 121.53846153846153, |
|
"grad_norm": 49.25630569458008, |
|
"learning_rate": 7.631500000000001e-06, |
|
"loss": 0.9339, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 121.53846153846153, |
|
"eval_loss": 1.0832933187484741, |
|
"eval_runtime": 33.8556, |
|
"eval_samples_per_second": 11.608, |
|
"eval_steps_per_second": 1.477, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 122.05128205128206, |
|
"grad_norm": 18.731901168823242, |
|
"learning_rate": 7.621500000000001e-06, |
|
"loss": 0.8934, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 122.05128205128206, |
|
"eval_loss": 1.0746291875839233, |
|
"eval_runtime": 33.8765, |
|
"eval_samples_per_second": 11.601, |
|
"eval_steps_per_second": 1.476, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 122.56410256410257, |
|
"grad_norm": 45.756378173828125, |
|
"learning_rate": 7.611500000000001e-06, |
|
"loss": 0.9111, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 122.56410256410257, |
|
"eval_loss": 1.078890323638916, |
|
"eval_runtime": 33.8516, |
|
"eval_samples_per_second": 11.61, |
|
"eval_steps_per_second": 1.477, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 123.07692307692308, |
|
"grad_norm": 31.239049911499023, |
|
"learning_rate": 7.601500000000001e-06, |
|
"loss": 0.8878, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 123.07692307692308, |
|
"eval_loss": 1.0913454294204712, |
|
"eval_runtime": 33.798, |
|
"eval_samples_per_second": 11.628, |
|
"eval_steps_per_second": 1.479, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 123.58974358974359, |
|
"grad_norm": 41.38566589355469, |
|
"learning_rate": 7.591500000000001e-06, |
|
"loss": 0.9315, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 123.58974358974359, |
|
"eval_loss": 1.0924988985061646, |
|
"eval_runtime": 33.9883, |
|
"eval_samples_per_second": 11.563, |
|
"eval_steps_per_second": 1.471, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 124.1025641025641, |
|
"grad_norm": 20.533891677856445, |
|
"learning_rate": 7.5815e-06, |
|
"loss": 0.896, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 124.1025641025641, |
|
"eval_loss": 1.0829992294311523, |
|
"eval_runtime": 33.9318, |
|
"eval_samples_per_second": 11.582, |
|
"eval_steps_per_second": 1.474, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 124.61538461538461, |
|
"grad_norm": 40.294097900390625, |
|
"learning_rate": 7.5715e-06, |
|
"loss": 0.8953, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 124.61538461538461, |
|
"eval_loss": 1.0874361991882324, |
|
"eval_runtime": 33.9986, |
|
"eval_samples_per_second": 11.559, |
|
"eval_steps_per_second": 1.471, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 125.12820512820512, |
|
"grad_norm": 24.189720153808594, |
|
"learning_rate": 7.5615e-06, |
|
"loss": 0.9274, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 125.12820512820512, |
|
"eval_loss": 1.07763671875, |
|
"eval_runtime": 34.0188, |
|
"eval_samples_per_second": 11.552, |
|
"eval_steps_per_second": 1.47, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 125.64102564102564, |
|
"grad_norm": 47.872535705566406, |
|
"learning_rate": 7.5515000000000005e-06, |
|
"loss": 0.9042, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 125.64102564102564, |
|
"eval_loss": 1.0780576467514038, |
|
"eval_runtime": 34.0494, |
|
"eval_samples_per_second": 11.542, |
|
"eval_steps_per_second": 1.468, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 126.15384615384616, |
|
"grad_norm": 25.210033416748047, |
|
"learning_rate": 7.5415000000000006e-06, |
|
"loss": 0.9105, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 126.15384615384616, |
|
"eval_loss": 1.0951448678970337, |
|
"eval_runtime": 34.0972, |
|
"eval_samples_per_second": 11.526, |
|
"eval_steps_per_second": 1.466, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 126.66666666666667, |
|
"grad_norm": 24.063753128051758, |
|
"learning_rate": 7.531500000000001e-06, |
|
"loss": 0.8896, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 126.66666666666667, |
|
"eval_loss": 1.070051908493042, |
|
"eval_runtime": 34.046, |
|
"eval_samples_per_second": 11.543, |
|
"eval_steps_per_second": 1.469, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 127.17948717948718, |
|
"grad_norm": 28.127609252929688, |
|
"learning_rate": 7.521500000000001e-06, |
|
"loss": 0.9472, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 127.17948717948718, |
|
"eval_loss": 1.0773255825042725, |
|
"eval_runtime": 34.2002, |
|
"eval_samples_per_second": 11.491, |
|
"eval_steps_per_second": 1.462, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 127.6923076923077, |
|
"grad_norm": 37.02296447753906, |
|
"learning_rate": 7.511500000000001e-06, |
|
"loss": 0.9003, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 127.6923076923077, |
|
"eval_loss": 1.0599360466003418, |
|
"eval_runtime": 34.0678, |
|
"eval_samples_per_second": 11.536, |
|
"eval_steps_per_second": 1.468, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 128.2051282051282, |
|
"grad_norm": 51.526878356933594, |
|
"learning_rate": 7.5015e-06, |
|
"loss": 0.8857, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 128.2051282051282, |
|
"eval_loss": 1.0875056982040405, |
|
"eval_runtime": 34.0747, |
|
"eval_samples_per_second": 11.533, |
|
"eval_steps_per_second": 1.467, |
|
"step": 25000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 513, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.5560791552e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|