{
  "best_metric": 0.3917270302772522,
  "best_model_checkpoint": "mikhail-panzo/ceb_b32_le5_s8000/checkpoint-6500",
  "epoch": 158.41584158415841,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.9900990099009901,
      "grad_norm": 3.180250644683838,
      "learning_rate": 2.4500000000000004e-07,
      "loss": 0.8005,
      "step": 50
    },
    {
      "epoch": 1.9801980198019802,
      "grad_norm": 3.2645788192749023,
      "learning_rate": 4.95e-07,
      "loss": 0.7901,
      "step": 100
    },
    {
      "epoch": 2.9702970297029703,
      "grad_norm": 2.118293523788452,
      "learning_rate": 7.450000000000001e-07,
      "loss": 0.7371,
      "step": 150
    },
    {
      "epoch": 3.9603960396039604,
      "grad_norm": 1.8010900020599365,
      "learning_rate": 9.950000000000002e-07,
      "loss": 0.7469,
      "step": 200
    },
    {
      "epoch": 4.9504950495049505,
      "grad_norm": 1.6043368577957153,
      "learning_rate": 1.2450000000000002e-06,
      "loss": 0.7138,
      "step": 250
    },
    {
      "epoch": 5.9405940594059405,
      "grad_norm": 2.1476693153381348,
      "learning_rate": 1.495e-06,
      "loss": 0.7011,
      "step": 300
    },
    {
      "epoch": 6.930693069306931,
      "grad_norm": 1.7792506217956543,
      "learning_rate": 1.745e-06,
      "loss": 0.6728,
      "step": 350
    },
    {
      "epoch": 7.920792079207921,
      "grad_norm": 2.7465362548828125,
      "learning_rate": 1.9950000000000004e-06,
      "loss": 0.6505,
      "step": 400
    },
    {
      "epoch": 8.910891089108912,
      "grad_norm": 1.8483725786209106,
      "learning_rate": 2.245e-06,
      "loss": 0.617,
      "step": 450
    },
    {
      "epoch": 9.900990099009901,
      "grad_norm": 2.8589906692504883,
      "learning_rate": 2.4950000000000003e-06,
      "loss": 0.5667,
      "step": 500
    },
    {
      "epoch": 9.900990099009901,
      "eval_loss": 0.47756776213645935,
      "eval_runtime": 8.0921,
      "eval_samples_per_second": 22.244,
      "eval_steps_per_second": 2.842,
      "step": 500
    },
    {
      "epoch": 10.891089108910892,
      "grad_norm": 2.1117241382598877,
      "learning_rate": 2.7450000000000004e-06,
      "loss": 0.5433,
      "step": 550
    },
    {
      "epoch": 11.881188118811881,
      "grad_norm": 1.311270833015442,
      "learning_rate": 2.995e-06,
      "loss": 0.5224,
      "step": 600
    },
    {
      "epoch": 12.871287128712872,
      "grad_norm": 2.1834664344787598,
      "learning_rate": 3.2450000000000003e-06,
      "loss": 0.5134,
      "step": 650
    },
    {
      "epoch": 13.861386138613861,
      "grad_norm": 1.2540502548217773,
      "learning_rate": 3.495e-06,
      "loss": 0.4977,
      "step": 700
    },
    {
      "epoch": 14.851485148514852,
      "grad_norm": 1.5720808506011963,
      "learning_rate": 3.745e-06,
      "loss": 0.4992,
      "step": 750
    },
    {
      "epoch": 15.841584158415841,
      "grad_norm": 1.3779664039611816,
      "learning_rate": 3.995000000000001e-06,
      "loss": 0.5041,
      "step": 800
    },
    {
      "epoch": 16.831683168316832,
      "grad_norm": 1.2948031425476074,
      "learning_rate": 4.245e-06,
      "loss": 0.4998,
      "step": 850
    },
    {
      "epoch": 17.821782178217823,
      "grad_norm": 1.5043249130249023,
      "learning_rate": 4.495e-06,
      "loss": 0.4908,
      "step": 900
    },
    {
      "epoch": 18.81188118811881,
      "grad_norm": 1.9952532052993774,
      "learning_rate": 4.745e-06,
      "loss": 0.4828,
      "step": 950
    },
    {
      "epoch": 19.801980198019802,
      "grad_norm": 1.1994812488555908,
      "learning_rate": 4.9950000000000005e-06,
      "loss": 0.4838,
      "step": 1000
    },
    {
      "epoch": 19.801980198019802,
      "eval_loss": 0.4320996403694153,
      "eval_runtime": 7.9832,
      "eval_samples_per_second": 22.547,
      "eval_steps_per_second": 2.881,
      "step": 1000
    },
    {
      "epoch": 20.792079207920793,
      "grad_norm": 2.356138229370117,
      "learning_rate": 5.245e-06,
      "loss": 0.4784,
      "step": 1050
    },
    {
      "epoch": 21.782178217821784,
      "grad_norm": 1.6155955791473389,
      "learning_rate": 5.495000000000001e-06,
      "loss": 0.4696,
      "step": 1100
    },
    {
      "epoch": 22.77227722772277,
      "grad_norm": 1.3177858591079712,
      "learning_rate": 5.745000000000001e-06,
      "loss": 0.4694,
      "step": 1150
    },
    {
      "epoch": 23.762376237623762,
      "grad_norm": 1.5806872844696045,
      "learning_rate": 5.995000000000001e-06,
      "loss": 0.4746,
      "step": 1200
    },
    {
      "epoch": 24.752475247524753,
      "grad_norm": 2.288752794265747,
      "learning_rate": 6.245000000000001e-06,
      "loss": 0.4676,
      "step": 1250
    },
    {
      "epoch": 25.742574257425744,
      "grad_norm": 1.542547583580017,
      "learning_rate": 6.4950000000000005e-06,
      "loss": 0.4773,
      "step": 1300
    },
    {
      "epoch": 26.73267326732673,
      "grad_norm": 1.0209100246429443,
      "learning_rate": 6.745000000000001e-06,
      "loss": 0.4683,
      "step": 1350
    },
    {
      "epoch": 27.722772277227723,
      "grad_norm": 1.2771400213241577,
      "learning_rate": 6.995000000000001e-06,
      "loss": 0.4577,
      "step": 1400
    },
    {
      "epoch": 28.712871287128714,
      "grad_norm": 1.348565936088562,
      "learning_rate": 7.245000000000001e-06,
      "loss": 0.4563,
      "step": 1450
    },
    {
      "epoch": 29.702970297029704,
      "grad_norm": 1.5620756149291992,
      "learning_rate": 7.495000000000001e-06,
      "loss": 0.4604,
      "step": 1500
    },
    {
      "epoch": 29.702970297029704,
      "eval_loss": 0.41570472717285156,
      "eval_runtime": 7.3746,
      "eval_samples_per_second": 24.408,
      "eval_steps_per_second": 3.119,
      "step": 1500
    },
    {
      "epoch": 30.693069306930692,
      "grad_norm": 1.3051259517669678,
      "learning_rate": 7.745e-06,
      "loss": 0.4551,
      "step": 1550
    },
    {
      "epoch": 31.683168316831683,
      "grad_norm": 1.6832257509231567,
      "learning_rate": 7.995e-06,
      "loss": 0.4517,
      "step": 1600
    },
    {
      "epoch": 32.67326732673267,
      "grad_norm": 2.1702144145965576,
      "learning_rate": 8.245000000000002e-06,
      "loss": 0.4546,
      "step": 1650
    },
    {
      "epoch": 33.663366336633665,
      "grad_norm": 1.5217938423156738,
      "learning_rate": 8.495e-06,
      "loss": 0.4549,
      "step": 1700
    },
    {
      "epoch": 34.65346534653465,
      "grad_norm": 1.8007372617721558,
      "learning_rate": 8.745000000000002e-06,
      "loss": 0.4453,
      "step": 1750
    },
    {
      "epoch": 35.64356435643565,
      "grad_norm": 1.490868091583252,
      "learning_rate": 8.995000000000001e-06,
      "loss": 0.4522,
      "step": 1800
    },
    {
      "epoch": 36.633663366336634,
      "grad_norm": 1.3354467153549194,
      "learning_rate": 9.245e-06,
      "loss": 0.4533,
      "step": 1850
    },
    {
      "epoch": 37.62376237623762,
      "grad_norm": 1.661230206489563,
      "learning_rate": 9.495000000000001e-06,
      "loss": 0.4467,
      "step": 1900
    },
    {
      "epoch": 38.613861386138616,
      "grad_norm": 1.4943360090255737,
      "learning_rate": 9.745e-06,
      "loss": 0.4451,
      "step": 1950
    },
    {
      "epoch": 39.603960396039604,
      "grad_norm": 2.01084566116333,
      "learning_rate": 9.995000000000002e-06,
      "loss": 0.4373,
      "step": 2000
    },
    {
      "epoch": 39.603960396039604,
      "eval_loss": 0.40344154834747314,
      "eval_runtime": 8.1054,
      "eval_samples_per_second": 22.207,
      "eval_steps_per_second": 2.838,
      "step": 2000
    },
    {
      "epoch": 40.59405940594059,
      "grad_norm": 2.1528162956237793,
      "learning_rate": 9.918333333333335e-06,
      "loss": 0.4446,
      "step": 2050
    },
    {
      "epoch": 41.584158415841586,
      "grad_norm": 1.692417025566101,
      "learning_rate": 9.835000000000002e-06,
      "loss": 0.4363,
      "step": 2100
    },
    {
      "epoch": 42.57425742574257,
      "grad_norm": 1.190176248550415,
      "learning_rate": 9.751666666666667e-06,
      "loss": 0.4425,
      "step": 2150
    },
    {
      "epoch": 43.56435643564357,
      "grad_norm": 1.027122974395752,
      "learning_rate": 9.668333333333334e-06,
      "loss": 0.4318,
      "step": 2200
    },
    {
      "epoch": 44.554455445544555,
      "grad_norm": 1.3497254848480225,
      "learning_rate": 9.585e-06,
      "loss": 0.4377,
      "step": 2250
    },
    {
      "epoch": 45.54455445544554,
      "grad_norm": 1.483748197555542,
      "learning_rate": 9.501666666666667e-06,
      "loss": 0.4358,
      "step": 2300
    },
    {
      "epoch": 46.53465346534654,
      "grad_norm": 1.2555333375930786,
      "learning_rate": 9.418333333333334e-06,
      "loss": 0.4397,
      "step": 2350
    },
    {
      "epoch": 47.524752475247524,
      "grad_norm": 1.345694899559021,
      "learning_rate": 9.335000000000001e-06,
      "loss": 0.4341,
      "step": 2400
    },
    {
      "epoch": 48.51485148514851,
      "grad_norm": 1.3507237434387207,
      "learning_rate": 9.251666666666668e-06,
      "loss": 0.4363,
      "step": 2450
    },
    {
      "epoch": 49.504950495049506,
      "grad_norm": 2.3417470455169678,
      "learning_rate": 9.168333333333333e-06,
      "loss": 0.4359,
      "step": 2500
    },
    {
      "epoch": 49.504950495049506,
      "eval_loss": 0.40056198835372925,
      "eval_runtime": 7.9369,
      "eval_samples_per_second": 22.679,
      "eval_steps_per_second": 2.898,
      "step": 2500
    },
    {
      "epoch": 50.495049504950494,
      "grad_norm": 1.1603953838348389,
      "learning_rate": 9.085e-06,
      "loss": 0.4316,
      "step": 2550
    },
    {
      "epoch": 51.48514851485149,
      "grad_norm": 1.2483686208724976,
      "learning_rate": 9.001666666666667e-06,
      "loss": 0.4264,
      "step": 2600
    },
    {
      "epoch": 52.475247524752476,
      "grad_norm": 1.3609150648117065,
      "learning_rate": 8.918333333333334e-06,
      "loss": 0.4375,
      "step": 2650
    },
    {
      "epoch": 53.46534653465346,
      "grad_norm": 1.287948727607727,
      "learning_rate": 8.835000000000001e-06,
      "loss": 0.4296,
      "step": 2700
    },
    {
      "epoch": 54.45544554455446,
      "grad_norm": 1.1371320486068726,
      "learning_rate": 8.751666666666668e-06,
      "loss": 0.4293,
      "step": 2750
    },
    {
      "epoch": 55.445544554455445,
      "grad_norm": 1.2330107688903809,
      "learning_rate": 8.668333333333335e-06,
      "loss": 0.4211,
      "step": 2800
    },
    {
      "epoch": 56.43564356435643,
      "grad_norm": 1.494728446006775,
      "learning_rate": 8.585000000000002e-06,
      "loss": 0.4265,
      "step": 2850
    },
    {
      "epoch": 57.42574257425743,
      "grad_norm": 1.195456862449646,
      "learning_rate": 8.501666666666667e-06,
      "loss": 0.4255,
      "step": 2900
    },
    {
      "epoch": 58.415841584158414,
      "grad_norm": 2.2788732051849365,
      "learning_rate": 8.418333333333334e-06,
      "loss": 0.4262,
      "step": 2950
    },
    {
      "epoch": 59.40594059405941,
      "grad_norm": 1.3329004049301147,
      "learning_rate": 8.335e-06,
      "loss": 0.4236,
      "step": 3000
    },
    {
      "epoch": 59.40594059405941,
      "eval_loss": 0.39749667048454285,
      "eval_runtime": 7.652,
      "eval_samples_per_second": 23.523,
      "eval_steps_per_second": 3.006,
      "step": 3000
    },
    {
      "epoch": 60.396039603960396,
      "grad_norm": 1.0736905336380005,
      "learning_rate": 8.251666666666668e-06,
      "loss": 0.4324,
      "step": 3050
    },
    {
      "epoch": 61.386138613861384,
      "grad_norm": 1.4117296934127808,
      "learning_rate": 8.168333333333334e-06,
      "loss": 0.421,
      "step": 3100
    },
    {
      "epoch": 62.37623762376238,
      "grad_norm": 1.2299913167953491,
      "learning_rate": 8.085000000000001e-06,
      "loss": 0.426,
      "step": 3150
    },
    {
      "epoch": 63.366336633663366,
      "grad_norm": 1.780414342880249,
      "learning_rate": 8.001666666666668e-06,
      "loss": 0.4217,
      "step": 3200
    },
    {
      "epoch": 64.35643564356435,
      "grad_norm": 0.953036367893219,
      "learning_rate": 7.918333333333333e-06,
      "loss": 0.4275,
      "step": 3250
    },
    {
      "epoch": 65.34653465346534,
      "grad_norm": 1.4100078344345093,
      "learning_rate": 7.835e-06,
      "loss": 0.4207,
      "step": 3300
    },
    {
      "epoch": 66.33663366336634,
      "grad_norm": 1.2050296068191528,
      "learning_rate": 7.751666666666667e-06,
      "loss": 0.4194,
      "step": 3350
    },
    {
      "epoch": 67.32673267326733,
      "grad_norm": 1.2712194919586182,
      "learning_rate": 7.668333333333334e-06,
      "loss": 0.4185,
      "step": 3400
    },
    {
      "epoch": 68.31683168316832,
      "grad_norm": 1.4407145977020264,
      "learning_rate": 7.585e-06,
      "loss": 0.42,
      "step": 3450
    },
    {
      "epoch": 69.3069306930693,
      "grad_norm": 0.9915832281112671,
      "learning_rate": 7.501666666666667e-06,
      "loss": 0.4196,
      "step": 3500
    },
    {
      "epoch": 69.3069306930693,
      "eval_loss": 0.3955594301223755,
      "eval_runtime": 8.0095,
      "eval_samples_per_second": 22.473,
      "eval_steps_per_second": 2.872,
      "step": 3500
    },
    {
      "epoch": 70.29702970297029,
      "grad_norm": 1.399967074394226,
      "learning_rate": 7.418333333333334e-06,
      "loss": 0.413,
      "step": 3550
    },
    {
      "epoch": 71.2871287128713,
      "grad_norm": 1.4354674816131592,
      "learning_rate": 7.335000000000001e-06,
      "loss": 0.4166,
      "step": 3600
    },
    {
      "epoch": 72.27722772277228,
      "grad_norm": 1.5903120040893555,
      "learning_rate": 7.251666666666667e-06,
      "loss": 0.4206,
      "step": 3650
    },
    {
      "epoch": 73.26732673267327,
      "grad_norm": 1.184046983718872,
      "learning_rate": 7.168333333333334e-06,
      "loss": 0.4179,
      "step": 3700
    },
    {
      "epoch": 74.25742574257426,
      "grad_norm": 1.1453702449798584,
      "learning_rate": 7.085000000000001e-06,
      "loss": 0.4145,
      "step": 3750
    },
    {
      "epoch": 75.24752475247524,
      "grad_norm": 2.3300318717956543,
      "learning_rate": 7.001666666666668e-06,
      "loss": 0.4255,
      "step": 3800
    },
    {
      "epoch": 76.23762376237623,
      "grad_norm": 1.671020746231079,
      "learning_rate": 6.918333333333334e-06,
      "loss": 0.4213,
      "step": 3850
    },
    {
      "epoch": 77.22772277227723,
      "grad_norm": 1.1744346618652344,
      "learning_rate": 6.835000000000001e-06,
      "loss": 0.4181,
      "step": 3900
    },
    {
      "epoch": 78.21782178217822,
      "grad_norm": 1.3221676349639893,
      "learning_rate": 6.7516666666666675e-06,
      "loss": 0.4163,
      "step": 3950
    },
    {
      "epoch": 79.20792079207921,
      "grad_norm": 1.0609053373336792,
      "learning_rate": 6.668333333333334e-06,
      "loss": 0.4183,
      "step": 4000
    },
    {
      "epoch": 79.20792079207921,
      "eval_loss": 0.39377516508102417,
      "eval_runtime": 7.9303,
      "eval_samples_per_second": 22.698,
      "eval_steps_per_second": 2.9,
      "step": 4000
    },
    {
      "epoch": 80.1980198019802,
      "grad_norm": 1.0227816104888916,
      "learning_rate": 6.5850000000000005e-06,
      "loss": 0.4164,
      "step": 4050
    },
    {
      "epoch": 81.18811881188118,
      "grad_norm": 1.0951721668243408,
      "learning_rate": 6.501666666666667e-06,
      "loss": 0.4093,
      "step": 4100
    },
    {
      "epoch": 82.17821782178218,
      "grad_norm": 1.0732530355453491,
      "learning_rate": 6.418333333333334e-06,
      "loss": 0.4161,
      "step": 4150
    },
    {
      "epoch": 83.16831683168317,
      "grad_norm": 1.13483464717865,
      "learning_rate": 6.336666666666667e-06,
      "loss": 0.4113,
      "step": 4200
    },
    {
      "epoch": 84.15841584158416,
      "grad_norm": 0.9911147356033325,
      "learning_rate": 6.253333333333333e-06,
      "loss": 0.4123,
      "step": 4250
    },
    {
      "epoch": 85.14851485148515,
      "grad_norm": 1.4925992488861084,
      "learning_rate": 6.17e-06,
      "loss": 0.4122,
      "step": 4300
    },
    {
      "epoch": 86.13861386138613,
      "grad_norm": 1.4414916038513184,
      "learning_rate": 6.086666666666667e-06,
      "loss": 0.4103,
      "step": 4350
    },
    {
      "epoch": 87.12871287128714,
      "grad_norm": 1.1858835220336914,
      "learning_rate": 6.003333333333334e-06,
      "loss": 0.4137,
      "step": 4400
    },
    {
      "epoch": 88.11881188118812,
      "grad_norm": 1.104232907295227,
      "learning_rate": 5.92e-06,
      "loss": 0.4096,
      "step": 4450
    },
    {
      "epoch": 89.10891089108911,
      "grad_norm": 1.7566862106323242,
      "learning_rate": 5.836666666666667e-06,
      "loss": 0.4148,
      "step": 4500
    },
    {
      "epoch": 89.10891089108911,
      "eval_loss": 0.3940875828266144,
      "eval_runtime": 7.934,
      "eval_samples_per_second": 22.687,
      "eval_steps_per_second": 2.899,
      "step": 4500
    },
    {
      "epoch": 90.0990099009901,
      "grad_norm": 1.0610746145248413,
      "learning_rate": 5.753333333333334e-06,
      "loss": 0.414,
      "step": 4550
    },
    {
      "epoch": 91.08910891089108,
      "grad_norm": 1.1602790355682373,
      "learning_rate": 5.67e-06,
      "loss": 0.415,
      "step": 4600
    },
    {
      "epoch": 92.07920792079207,
      "grad_norm": 1.1488662958145142,
      "learning_rate": 5.586666666666667e-06,
      "loss": 0.4162,
      "step": 4650
    },
    {
      "epoch": 93.06930693069307,
      "grad_norm": 1.279823899269104,
      "learning_rate": 5.503333333333334e-06,
      "loss": 0.4086,
      "step": 4700
    },
    {
      "epoch": 94.05940594059406,
      "grad_norm": 1.4046456813812256,
      "learning_rate": 5.420000000000001e-06,
      "loss": 0.4094,
      "step": 4750
    },
    {
      "epoch": 95.04950495049505,
      "grad_norm": 1.0302339792251587,
      "learning_rate": 5.336666666666667e-06,
      "loss": 0.4126,
      "step": 4800
    },
    {
      "epoch": 96.03960396039604,
      "grad_norm": 1.2423168420791626,
      "learning_rate": 5.2533333333333336e-06,
      "loss": 0.4115,
      "step": 4850
    },
    {
      "epoch": 97.02970297029702,
      "grad_norm": 1.0046170949935913,
      "learning_rate": 5.1700000000000005e-06,
      "loss": 0.4027,
      "step": 4900
    },
    {
      "epoch": 98.01980198019803,
      "grad_norm": 1.3599224090576172,
      "learning_rate": 5.086666666666667e-06,
      "loss": 0.4124,
      "step": 4950
    },
    {
      "epoch": 99.00990099009901,
      "grad_norm": 1.408510446548462,
      "learning_rate": 5.0033333333333334e-06,
      "loss": 0.4034,
      "step": 5000
    },
    {
      "epoch": 99.00990099009901,
      "eval_loss": 0.3930128216743469,
      "eval_runtime": 7.3645,
      "eval_samples_per_second": 24.442,
      "eval_steps_per_second": 3.123,
      "step": 5000
    },
    {
      "epoch": 100.0,
      "grad_norm": 1.2026501893997192,
      "learning_rate": 4.92e-06,
      "loss": 0.4097,
      "step": 5050
    },
    {
      "epoch": 100.99009900990099,
      "grad_norm": 1.1910372972488403,
      "learning_rate": 4.836666666666667e-06,
      "loss": 0.4126,
      "step": 5100
    },
    {
      "epoch": 101.98019801980197,
      "grad_norm": 1.084506630897522,
      "learning_rate": 4.753333333333333e-06,
      "loss": 0.4075,
      "step": 5150
    },
    {
      "epoch": 102.97029702970298,
      "grad_norm": 1.115693211555481,
      "learning_rate": 4.670000000000001e-06,
      "loss": 0.4069,
      "step": 5200
    },
    {
      "epoch": 103.96039603960396,
      "grad_norm": 1.1751248836517334,
      "learning_rate": 4.586666666666667e-06,
      "loss": 0.4098,
      "step": 5250
    },
    {
      "epoch": 104.95049504950495,
      "grad_norm": 1.0256706476211548,
      "learning_rate": 4.503333333333333e-06,
      "loss": 0.4044,
      "step": 5300
    },
    {
      "epoch": 105.94059405940594,
      "grad_norm": 1.200976848602295,
      "learning_rate": 4.42e-06,
      "loss": 0.4064,
      "step": 5350
    },
    {
      "epoch": 106.93069306930693,
      "grad_norm": 1.0933268070220947,
      "learning_rate": 4.336666666666667e-06,
      "loss": 0.4073,
      "step": 5400
    },
    {
      "epoch": 107.92079207920793,
      "grad_norm": 1.3021538257598877,
      "learning_rate": 4.253333333333334e-06,
      "loss": 0.4118,
      "step": 5450
    },
    {
      "epoch": 108.91089108910892,
      "grad_norm": 1.7753965854644775,
      "learning_rate": 4.17e-06,
      "loss": 0.4137,
      "step": 5500
    },
    {
      "epoch": 108.91089108910892,
      "eval_loss": 0.3955301344394684,
      "eval_runtime": 7.5702,
      "eval_samples_per_second": 23.777,
      "eval_steps_per_second": 3.038,
      "step": 5500
    },
    {
      "epoch": 109.9009900990099,
      "grad_norm": 1.4470231533050537,
      "learning_rate": 4.086666666666667e-06,
      "loss": 0.4053,
      "step": 5550
    },
    {
      "epoch": 110.89108910891089,
      "grad_norm": 1.7531636953353882,
      "learning_rate": 4.003333333333334e-06,
      "loss": 0.4061,
      "step": 5600
    },
    {
      "epoch": 111.88118811881188,
      "grad_norm": 0.9751603603363037,
      "learning_rate": 3.920000000000001e-06,
      "loss": 0.4086,
      "step": 5650
    },
    {
      "epoch": 112.87128712871286,
      "grad_norm": 1.1000280380249023,
      "learning_rate": 3.836666666666667e-06,
      "loss": 0.4031,
      "step": 5700
    },
    {
      "epoch": 113.86138613861387,
      "grad_norm": 1.2386143207550049,
      "learning_rate": 3.753333333333334e-06,
      "loss": 0.4059,
      "step": 5750
    },
    {
      "epoch": 114.85148514851485,
      "grad_norm": 1.4081382751464844,
      "learning_rate": 3.6700000000000004e-06,
      "loss": 0.4084,
      "step": 5800
    },
    {
      "epoch": 115.84158415841584,
      "grad_norm": 1.3724069595336914,
      "learning_rate": 3.5866666666666673e-06,
      "loss": 0.4046,
      "step": 5850
    },
    {
      "epoch": 116.83168316831683,
      "grad_norm": 1.188818097114563,
      "learning_rate": 3.5033333333333334e-06,
      "loss": 0.4109,
      "step": 5900
    },
    {
      "epoch": 117.82178217821782,
      "grad_norm": 1.3779815435409546,
      "learning_rate": 3.4200000000000007e-06,
      "loss": 0.4042,
      "step": 5950
    },
    {
      "epoch": 118.81188118811882,
      "grad_norm": 1.3800166845321655,
      "learning_rate": 3.3366666666666668e-06,
      "loss": 0.4094,
      "step": 6000
    },
    {
      "epoch": 118.81188118811882,
      "eval_loss": 0.39243730902671814,
      "eval_runtime": 7.911,
      "eval_samples_per_second": 22.753,
      "eval_steps_per_second": 2.907,
      "step": 6000
    },
    {
      "epoch": 119.8019801980198,
      "grad_norm": 1.2765828371047974,
      "learning_rate": 3.2533333333333332e-06,
      "loss": 0.4022,
      "step": 6050
    },
    {
      "epoch": 120.79207920792079,
      "grad_norm": 1.3544330596923828,
      "learning_rate": 3.17e-06,
      "loss": 0.4062,
      "step": 6100
    },
    {
      "epoch": 121.78217821782178,
      "grad_norm": 1.9719560146331787,
      "learning_rate": 3.0866666666666666e-06,
      "loss": 0.4002,
      "step": 6150
    },
    {
      "epoch": 122.77227722772277,
      "grad_norm": 1.2542638778686523,
      "learning_rate": 3.0033333333333335e-06,
      "loss": 0.4037,
      "step": 6200
    },
    {
      "epoch": 123.76237623762377,
      "grad_norm": 0.9950863122940063,
      "learning_rate": 2.92e-06,
      "loss": 0.4034,
      "step": 6250
    },
    {
      "epoch": 124.75247524752476,
      "grad_norm": 1.2220611572265625,
      "learning_rate": 2.836666666666667e-06,
      "loss": 0.405,
      "step": 6300
    },
    {
      "epoch": 125.74257425742574,
      "grad_norm": 1.3050748109817505,
      "learning_rate": 2.7533333333333334e-06,
      "loss": 0.4019,
      "step": 6350
    },
    {
      "epoch": 126.73267326732673,
      "grad_norm": 1.2218639850616455,
      "learning_rate": 2.6700000000000003e-06,
      "loss": 0.4077,
      "step": 6400
    },
    {
      "epoch": 127.72277227722772,
      "grad_norm": 1.0454591512680054,
      "learning_rate": 2.5866666666666667e-06,
      "loss": 0.4009,
      "step": 6450
    },
    {
      "epoch": 128.7128712871287,
      "grad_norm": 1.211366057395935,
      "learning_rate": 2.505e-06,
      "loss": 0.4112,
      "step": 6500
    },
    {
      "epoch": 128.7128712871287,
      "eval_loss": 0.3917270302772522,
      "eval_runtime": 7.8896,
      "eval_samples_per_second": 22.815,
      "eval_steps_per_second": 2.915,
      "step": 6500
    },
    {
      "epoch": 129.7029702970297,
      "grad_norm": 1.1961348056793213,
      "learning_rate": 2.421666666666667e-06,
      "loss": 0.4009,
      "step": 6550
    },
    {
      "epoch": 130.69306930693068,
      "grad_norm": 1.6463865041732788,
      "learning_rate": 2.3383333333333335e-06,
      "loss": 0.403,
      "step": 6600
    },
    {
      "epoch": 131.68316831683168,
      "grad_norm": 1.1030319929122925,
      "learning_rate": 2.2550000000000004e-06,
      "loss": 0.4009,
      "step": 6650
    },
    {
      "epoch": 132.67326732673268,
      "grad_norm": 1.0525782108306885,
      "learning_rate": 2.171666666666667e-06,
      "loss": 0.4038,
      "step": 6700
    },
    {
      "epoch": 133.66336633663366,
      "grad_norm": 1.0361270904541016,
      "learning_rate": 2.088333333333334e-06,
      "loss": 0.4046,
      "step": 6750
    },
    {
      "epoch": 134.65346534653466,
      "grad_norm": 1.3003863096237183,
      "learning_rate": 2.0050000000000003e-06,
      "loss": 0.404,
      "step": 6800
    },
    {
      "epoch": 135.64356435643563,
      "grad_norm": 1.6495325565338135,
      "learning_rate": 1.9216666666666668e-06,
      "loss": 0.401,
      "step": 6850
    },
    {
      "epoch": 136.63366336633663,
      "grad_norm": 1.070357084274292,
      "learning_rate": 1.8383333333333334e-06,
      "loss": 0.4033,
      "step": 6900
    },
    {
      "epoch": 137.62376237623764,
      "grad_norm": 0.9179391860961914,
      "learning_rate": 1.7550000000000001e-06,
      "loss": 0.4018,
      "step": 6950
    },
    {
      "epoch": 138.6138613861386,
      "grad_norm": 1.398165225982666,
      "learning_rate": 1.6716666666666666e-06,
      "loss": 0.4041,
      "step": 7000
    },
    {
      "epoch": 138.6138613861386,
      "eval_loss": 0.39232540130615234,
      "eval_runtime": 7.5562,
      "eval_samples_per_second": 23.822,
      "eval_steps_per_second": 3.044,
      "step": 7000
    },
    {
      "epoch": 139.6039603960396,
      "grad_norm": 1.1762559413909912,
      "learning_rate": 1.5900000000000002e-06,
      "loss": 0.4117,
      "step": 7050
    },
    {
      "epoch": 140.59405940594058,
      "grad_norm": 1.515325665473938,
      "learning_rate": 1.506666666666667e-06,
      "loss": 0.4014,
      "step": 7100
    },
    {
      "epoch": 141.58415841584159,
      "grad_norm": 1.0724811553955078,
      "learning_rate": 1.4233333333333336e-06,
      "loss": 0.4028,
      "step": 7150
    },
    {
      "epoch": 142.5742574257426,
      "grad_norm": 1.0034635066986084,
      "learning_rate": 1.34e-06,
      "loss": 0.3977,
      "step": 7200
    },
    {
      "epoch": 143.56435643564356,
      "grad_norm": 1.1974977254867554,
      "learning_rate": 1.2566666666666668e-06,
      "loss": 0.4013,
      "step": 7250
    },
    {
      "epoch": 144.55445544554456,
      "grad_norm": 1.167351484298706,
      "learning_rate": 1.1733333333333335e-06,
      "loss": 0.3992,
      "step": 7300
    },
    {
      "epoch": 145.54455445544554,
      "grad_norm": 1.1566020250320435,
      "learning_rate": 1.0900000000000002e-06,
      "loss": 0.4,
      "step": 7350
    },
    {
      "epoch": 146.53465346534654,
      "grad_norm": 0.8720223307609558,
      "learning_rate": 1.0066666666666668e-06,
      "loss": 0.4006,
      "step": 7400
    },
    {
      "epoch": 147.52475247524754,
      "grad_norm": 1.1982648372650146,
      "learning_rate": 9.233333333333334e-07,
      "loss": 0.3999,
      "step": 7450
    },
    {
      "epoch": 148.5148514851485,
      "grad_norm": 1.1742349863052368,
      "learning_rate": 8.400000000000001e-07,
      "loss": 0.3989,
      "step": 7500
    },
    {
      "epoch": 148.5148514851485,
      "eval_loss": 0.3927270770072937,
      "eval_runtime": 8.9179,
      "eval_samples_per_second": 20.184,
      "eval_steps_per_second": 2.579,
      "step": 7500
    },
    {
      "epoch": 149.5049504950495,
      "grad_norm": 1.1068042516708374,
      "learning_rate": 7.566666666666667e-07,
      "loss": 0.4029,
      "step": 7550
    },
    {
      "epoch": 150.4950495049505,
      "grad_norm": 1.2785730361938477,
      "learning_rate": 6.733333333333334e-07,
      "loss": 0.4001,
      "step": 7600
    },
    {
      "epoch": 151.4851485148515,
      "grad_norm": 1.0063281059265137,
      "learning_rate": 5.900000000000001e-07,
      "loss": 0.3977,
      "step": 7650
    },
    {
      "epoch": 152.47524752475246,
      "grad_norm": 1.1545789241790771,
      "learning_rate": 5.066666666666667e-07,
      "loss": 0.4015,
      "step": 7700
    },
    {
      "epoch": 153.46534653465346,
      "grad_norm": 1.0624966621398926,
      "learning_rate": 4.233333333333334e-07,
      "loss": 0.4044,
      "step": 7750
    },
    {
      "epoch": 154.45544554455446,
      "grad_norm": 1.3898247480392456,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 0.4005,
      "step": 7800
    },
    {
      "epoch": 155.44554455445544,
      "grad_norm": 1.2618342638015747,
      "learning_rate": 2.566666666666667e-07,
      "loss": 0.4025,
      "step": 7850
    },
    {
      "epoch": 156.43564356435644,
      "grad_norm": 1.0798155069351196,
      "learning_rate": 1.7333333333333335e-07,
      "loss": 0.4015,
      "step": 7900
    },
    {
      "epoch": 157.4257425742574,
      "grad_norm": 1.0550168752670288,
      "learning_rate": 9e-08,
      "loss": 0.3952,
      "step": 7950
    },
    {
      "epoch": 158.41584158415841,
      "grad_norm": 1.0714497566223145,
      "learning_rate": 6.666666666666667e-09,
      "loss": 0.3989,
      "step": 8000
    },
    {
      "epoch": 158.41584158415841,
      "eval_loss": 0.39282524585723877,
      "eval_runtime": 8.2483,
      "eval_samples_per_second": 21.823,
      "eval_steps_per_second": 2.788,
      "step": 8000
    }
  ],
  "logging_steps": 50,
  "max_steps": 8000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 160,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.322114324975938e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}