|
{ |
|
"best_metric": 0.4256468117237091, |
|
"best_model_checkpoint": "guilhermebastos96/speecht5_allvoices\\checkpoint-5000", |
|
"epoch": 4.851425106124924, |
|
"eval_steps": 1000, |
|
"global_step": 7000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.8868106603622437, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 0.4682, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.702238082885742, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.4709, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.156214714050293, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.4661, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.8122159242630005, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.4658, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.852905511856079, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.4688, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.4833136796951294, |
|
"learning_rate": 3e-06, |
|
"loss": 0.4658, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.180389881134033, |
|
"learning_rate": 3.5e-06, |
|
"loss": 0.4766, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.425173044204712, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.4625, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.724726438522339, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.4695, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.528960943222046, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4734, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.560817003250122, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.469, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.050774097442627, |
|
"learning_rate": 6e-06, |
|
"loss": 0.4735, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.356326103210449, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.4711, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.561927318572998, |
|
"learning_rate": 7e-06, |
|
"loss": 0.4709, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.2564566135406494, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.464, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.8264591693878174, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.4728, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.646620035171509, |
|
"learning_rate": 8.5e-06, |
|
"loss": 0.4672, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.601816177368164, |
|
"learning_rate": 9e-06, |
|
"loss": 0.4644, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 3.687861919403076, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.4687, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.237541675567627, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4721, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.4831230640411377, |
|
"learning_rate": 9.973684210526316e-06, |
|
"loss": 0.4674, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.717695951461792, |
|
"learning_rate": 9.947368421052632e-06, |
|
"loss": 0.4669, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.5196590423583984, |
|
"learning_rate": 9.921052631578947e-06, |
|
"loss": 0.4664, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.3250813484191895, |
|
"learning_rate": 9.894736842105264e-06, |
|
"loss": 0.4741, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.8072372674942017, |
|
"learning_rate": 9.868421052631579e-06, |
|
"loss": 0.4696, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.5572052001953125, |
|
"learning_rate": 9.842105263157896e-06, |
|
"loss": 0.4742, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.830040454864502, |
|
"learning_rate": 9.815789473684212e-06, |
|
"loss": 0.4679, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.8328497409820557, |
|
"learning_rate": 9.789473684210527e-06, |
|
"loss": 0.4736, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.366887331008911, |
|
"learning_rate": 9.763157894736844e-06, |
|
"loss": 0.4767, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.468636512756348, |
|
"learning_rate": 9.736842105263159e-06, |
|
"loss": 0.4699, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.1232364177703857, |
|
"learning_rate": 9.710526315789474e-06, |
|
"loss": 0.4687, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.8201797008514404, |
|
"learning_rate": 9.68421052631579e-06, |
|
"loss": 0.474, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.4909231662750244, |
|
"learning_rate": 9.657894736842106e-06, |
|
"loss": 0.4697, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.9799981117248535, |
|
"learning_rate": 9.631578947368422e-06, |
|
"loss": 0.4766, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.508791446685791, |
|
"learning_rate": 9.605263157894737e-06, |
|
"loss": 0.4632, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.9218318462371826, |
|
"learning_rate": 9.578947368421054e-06, |
|
"loss": 0.4687, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.421876907348633, |
|
"learning_rate": 9.552631578947369e-06, |
|
"loss": 0.4595, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.8650059700012207, |
|
"learning_rate": 9.526315789473684e-06, |
|
"loss": 0.4606, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.76725697517395, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.4683, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.538356304168701, |
|
"learning_rate": 9.473684210526315e-06, |
|
"loss": 0.4711, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 0.4389101266860962, |
|
"eval_runtime": 131.3338, |
|
"eval_samples_per_second": 39.068, |
|
"eval_steps_per_second": 19.538, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.725700855255127, |
|
"learning_rate": 9.447368421052632e-06, |
|
"loss": 0.4671, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.324768304824829, |
|
"learning_rate": 9.421052631578949e-06, |
|
"loss": 0.4746, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.6872613430023193, |
|
"learning_rate": 9.394736842105264e-06, |
|
"loss": 0.4724, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.945366382598877, |
|
"learning_rate": 9.36842105263158e-06, |
|
"loss": 0.4755, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.513762950897217, |
|
"learning_rate": 9.342105263157895e-06, |
|
"loss": 0.4661, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 5.170629024505615, |
|
"learning_rate": 9.315789473684212e-06, |
|
"loss": 0.473, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.1704061031341553, |
|
"learning_rate": 9.289473684210527e-06, |
|
"loss": 0.4641, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.7679084539413452, |
|
"learning_rate": 9.263157894736842e-06, |
|
"loss": 0.4638, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.245815753936768, |
|
"learning_rate": 9.236842105263159e-06, |
|
"loss": 0.4736, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.433870315551758, |
|
"learning_rate": 9.210526315789474e-06, |
|
"loss": 0.4634, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.9975013732910156, |
|
"learning_rate": 9.18421052631579e-06, |
|
"loss": 0.4715, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.1186320781707764, |
|
"learning_rate": 9.157894736842105e-06, |
|
"loss": 0.4669, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.856799840927124, |
|
"learning_rate": 9.131578947368422e-06, |
|
"loss": 0.4707, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.04708194732666, |
|
"learning_rate": 9.105263157894739e-06, |
|
"loss": 0.4685, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.939920425415039, |
|
"learning_rate": 9.078947368421054e-06, |
|
"loss": 0.4616, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.5178582668304443, |
|
"learning_rate": 9.05263157894737e-06, |
|
"loss": 0.4701, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.2123663425445557, |
|
"learning_rate": 9.026315789473685e-06, |
|
"loss": 0.4708, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.4378151893615723, |
|
"learning_rate": 9e-06, |
|
"loss": 0.4572, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 3.701718330383301, |
|
"learning_rate": 8.973684210526317e-06, |
|
"loss": 0.467, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.7244162559509277, |
|
"learning_rate": 8.947368421052632e-06, |
|
"loss": 0.4642, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 3.887258291244507, |
|
"learning_rate": 8.921052631578949e-06, |
|
"loss": 0.4687, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 9.12584400177002, |
|
"learning_rate": 8.894736842105264e-06, |
|
"loss": 0.4651, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.1384148597717285, |
|
"learning_rate": 8.86842105263158e-06, |
|
"loss": 0.4649, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 1.476319432258606, |
|
"learning_rate": 8.842105263157895e-06, |
|
"loss": 0.4729, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.3843348026275635, |
|
"learning_rate": 8.81578947368421e-06, |
|
"loss": 0.465, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.3237972259521484, |
|
"learning_rate": 8.789473684210527e-06, |
|
"loss": 0.4696, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.5998685359954834, |
|
"learning_rate": 8.763157894736842e-06, |
|
"loss": 0.4673, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.7698214054107666, |
|
"learning_rate": 8.736842105263158e-06, |
|
"loss": 0.4721, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.030637741088867, |
|
"learning_rate": 8.710526315789475e-06, |
|
"loss": 0.4578, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 1.9983584880828857, |
|
"learning_rate": 8.68421052631579e-06, |
|
"loss": 0.4629, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.231105327606201, |
|
"learning_rate": 8.657894736842107e-06, |
|
"loss": 0.4617, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.0504438877105713, |
|
"learning_rate": 8.63263157894737e-06, |
|
"loss": 0.4675, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.8002873659133911, |
|
"learning_rate": 8.606315789473684e-06, |
|
"loss": 0.469, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.8942711353302, |
|
"learning_rate": 8.580000000000001e-06, |
|
"loss": 0.4668, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 3.3724169731140137, |
|
"learning_rate": 8.553684210526316e-06, |
|
"loss": 0.467, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 8.831938743591309, |
|
"learning_rate": 8.527368421052632e-06, |
|
"loss": 0.4659, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.8694584369659424, |
|
"learning_rate": 8.501052631578947e-06, |
|
"loss": 0.4727, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.35543155670166, |
|
"learning_rate": 8.474736842105264e-06, |
|
"loss": 0.4629, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.251941204071045, |
|
"learning_rate": 8.44842105263158e-06, |
|
"loss": 0.46, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 1.5079283714294434, |
|
"learning_rate": 8.422105263157896e-06, |
|
"loss": 0.4669, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"eval_loss": 0.43367624282836914, |
|
"eval_runtime": 130.0756, |
|
"eval_samples_per_second": 39.446, |
|
"eval_steps_per_second": 19.727, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.216298818588257, |
|
"learning_rate": 8.395789473684212e-06, |
|
"loss": 0.4584, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.3113179206848145, |
|
"learning_rate": 8.369473684210527e-06, |
|
"loss": 0.4678, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.3224334716796875, |
|
"learning_rate": 8.343157894736842e-06, |
|
"loss": 0.4565, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 3.6630237102508545, |
|
"learning_rate": 8.316842105263159e-06, |
|
"loss": 0.4652, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.178736448287964, |
|
"learning_rate": 8.290526315789474e-06, |
|
"loss": 0.4615, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.6931562423706055, |
|
"learning_rate": 8.26421052631579e-06, |
|
"loss": 0.4649, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 3.884937286376953, |
|
"learning_rate": 8.237894736842106e-06, |
|
"loss": 0.4665, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.139063835144043, |
|
"learning_rate": 8.211578947368422e-06, |
|
"loss": 0.4629, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 1.8598359823226929, |
|
"learning_rate": 8.185263157894737e-06, |
|
"loss": 0.4681, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.011505126953125, |
|
"learning_rate": 8.158947368421052e-06, |
|
"loss": 0.4639, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.256636381149292, |
|
"learning_rate": 8.132631578947369e-06, |
|
"loss": 0.4686, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.452007293701172, |
|
"learning_rate": 8.106315789473684e-06, |
|
"loss": 0.4655, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.439833164215088, |
|
"learning_rate": 8.08e-06, |
|
"loss": 0.4635, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 3.101173162460327, |
|
"learning_rate": 8.053684210526317e-06, |
|
"loss": 0.4616, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 3.1077115535736084, |
|
"learning_rate": 8.027368421052632e-06, |
|
"loss": 0.4645, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.397768020629883, |
|
"learning_rate": 8.001052631578949e-06, |
|
"loss": 0.4551, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 1.6911340951919556, |
|
"learning_rate": 7.974736842105264e-06, |
|
"loss": 0.4693, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 4.298758506774902, |
|
"learning_rate": 7.94842105263158e-06, |
|
"loss": 0.463, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.41021728515625, |
|
"learning_rate": 7.922105263157895e-06, |
|
"loss": 0.4597, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.7337491512298584, |
|
"learning_rate": 7.89578947368421e-06, |
|
"loss": 0.4572, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.6781957149505615, |
|
"learning_rate": 7.869473684210527e-06, |
|
"loss": 0.4588, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.573647975921631, |
|
"learning_rate": 7.843157894736842e-06, |
|
"loss": 0.4662, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.0476982593536377, |
|
"learning_rate": 7.816842105263159e-06, |
|
"loss": 0.4592, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 1.947978138923645, |
|
"learning_rate": 7.790526315789474e-06, |
|
"loss": 0.4629, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.7212955951690674, |
|
"learning_rate": 7.76421052631579e-06, |
|
"loss": 0.4593, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.7885591983795166, |
|
"learning_rate": 7.737894736842105e-06, |
|
"loss": 0.469, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.266343116760254, |
|
"learning_rate": 7.711578947368422e-06, |
|
"loss": 0.4541, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.3520565032958984, |
|
"learning_rate": 7.685263157894739e-06, |
|
"loss": 0.4608, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.0906155109405518, |
|
"learning_rate": 7.658947368421054e-06, |
|
"loss": 0.4585, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.396610736846924, |
|
"learning_rate": 7.632631578947369e-06, |
|
"loss": 0.4607, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.4076027870178223, |
|
"learning_rate": 7.606315789473685e-06, |
|
"loss": 0.4622, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 5.321292400360107, |
|
"learning_rate": 7.58e-06, |
|
"loss": 0.4594, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 4.2904253005981445, |
|
"learning_rate": 7.553684210526316e-06, |
|
"loss": 0.4596, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 3.1583385467529297, |
|
"learning_rate": 7.527368421052632e-06, |
|
"loss": 0.4703, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.3681633472442627, |
|
"learning_rate": 7.501052631578948e-06, |
|
"loss": 0.4618, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.9283578395843506, |
|
"learning_rate": 7.4747368421052635e-06, |
|
"loss": 0.461, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 2.1495721340179443, |
|
"learning_rate": 7.448421052631579e-06, |
|
"loss": 0.4591, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.898043394088745, |
|
"learning_rate": 7.422105263157895e-06, |
|
"loss": 0.4597, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.5551648139953613, |
|
"learning_rate": 7.395789473684211e-06, |
|
"loss": 0.4598, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.92994225025177, |
|
"learning_rate": 7.369473684210528e-06, |
|
"loss": 0.4647, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"eval_loss": 0.4316975176334381, |
|
"eval_runtime": 128.6261, |
|
"eval_samples_per_second": 39.891, |
|
"eval_steps_per_second": 19.949, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.059903383255005, |
|
"learning_rate": 7.3431578947368435e-06, |
|
"loss": 0.4634, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 1.8484073877334595, |
|
"learning_rate": 7.3168421052631585e-06, |
|
"loss": 0.4622, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 2.5850799083709717, |
|
"learning_rate": 7.290526315789474e-06, |
|
"loss": 0.4479, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 3.510633945465088, |
|
"learning_rate": 7.26421052631579e-06, |
|
"loss": 0.4585, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 3.163177490234375, |
|
"learning_rate": 7.237894736842106e-06, |
|
"loss": 0.4682, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 3.7852284908294678, |
|
"learning_rate": 7.211578947368422e-06, |
|
"loss": 0.4552, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.6458024978637695, |
|
"learning_rate": 7.1852631578947375e-06, |
|
"loss": 0.4623, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 2.3463425636291504, |
|
"learning_rate": 7.158947368421053e-06, |
|
"loss": 0.4645, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 3.481792688369751, |
|
"learning_rate": 7.132631578947369e-06, |
|
"loss": 0.4562, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.2543387413024902, |
|
"learning_rate": 7.106315789473684e-06, |
|
"loss": 0.4577, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 3.5041863918304443, |
|
"learning_rate": 7.08e-06, |
|
"loss": 0.4614, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 1.6661858558654785, |
|
"learning_rate": 7.053684210526316e-06, |
|
"loss": 0.4611, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.6855933666229248, |
|
"learning_rate": 7.027368421052632e-06, |
|
"loss": 0.4638, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.8656867742538452, |
|
"learning_rate": 7.001052631578948e-06, |
|
"loss": 0.4633, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.9860706329345703, |
|
"learning_rate": 6.974736842105264e-06, |
|
"loss": 0.4609, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 3.0542991161346436, |
|
"learning_rate": 6.94842105263158e-06, |
|
"loss": 0.4532, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 2.680880308151245, |
|
"learning_rate": 6.922105263157896e-06, |
|
"loss": 0.4561, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.62613582611084, |
|
"learning_rate": 6.8957894736842116e-06, |
|
"loss": 0.4646, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 2.2299606800079346, |
|
"learning_rate": 6.869473684210527e-06, |
|
"loss": 0.4591, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 3.0914087295532227, |
|
"learning_rate": 6.843157894736842e-06, |
|
"loss": 0.4625, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 2.1178531646728516, |
|
"learning_rate": 6.816842105263158e-06, |
|
"loss": 0.4627, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 2.3285298347473145, |
|
"learning_rate": 6.790526315789474e-06, |
|
"loss": 0.454, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.802120327949524, |
|
"learning_rate": 6.76421052631579e-06, |
|
"loss": 0.4637, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 3.2270822525024414, |
|
"learning_rate": 6.737894736842106e-06, |
|
"loss": 0.4633, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 2.21866512298584, |
|
"learning_rate": 6.7115789473684214e-06, |
|
"loss": 0.4627, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.067976951599121, |
|
"learning_rate": 6.685263157894737e-06, |
|
"loss": 0.4609, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.3835887908935547, |
|
"learning_rate": 6.658947368421054e-06, |
|
"loss": 0.4547, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 3.4934396743774414, |
|
"learning_rate": 6.63263157894737e-06, |
|
"loss": 0.4513, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 2.1658859252929688, |
|
"learning_rate": 6.606315789473685e-06, |
|
"loss": 0.4568, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.9029380083084106, |
|
"learning_rate": 6.5800000000000005e-06, |
|
"loss": 0.4554, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.7163124084472656, |
|
"learning_rate": 6.553684210526316e-06, |
|
"loss": 0.4619, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 4.556973457336426, |
|
"learning_rate": 6.527368421052632e-06, |
|
"loss": 0.4598, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.304401397705078, |
|
"learning_rate": 6.501052631578948e-06, |
|
"loss": 0.4613, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.5970966815948486, |
|
"learning_rate": 6.474736842105264e-06, |
|
"loss": 0.4617, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.3288301229476929, |
|
"learning_rate": 6.44842105263158e-06, |
|
"loss": 0.4555, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 1.891694188117981, |
|
"learning_rate": 6.4221052631578954e-06, |
|
"loss": 0.4534, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.017850637435913, |
|
"learning_rate": 6.39578947368421e-06, |
|
"loss": 0.4577, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 3.5200083255767822, |
|
"learning_rate": 6.369473684210526e-06, |
|
"loss": 0.4496, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.430833339691162, |
|
"learning_rate": 6.343157894736842e-06, |
|
"loss": 0.4471, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.285391092300415, |
|
"learning_rate": 6.316842105263158e-06, |
|
"loss": 0.4596, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"eval_loss": 0.4282490015029907, |
|
"eval_runtime": 127.4696, |
|
"eval_samples_per_second": 40.253, |
|
"eval_steps_per_second": 20.13, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": Infinity, |
|
"learning_rate": 6.291578947368422e-06, |
|
"loss": 0.4625, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.3426144123077393, |
|
"learning_rate": 6.265263157894738e-06, |
|
"loss": 0.461, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 2.6181225776672363, |
|
"learning_rate": 6.238947368421054e-06, |
|
"loss": 0.4601, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 4.169466018676758, |
|
"learning_rate": 6.212631578947369e-06, |
|
"loss": 0.4582, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 2.541663408279419, |
|
"learning_rate": 6.1863157894736845e-06, |
|
"loss": 0.462, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 6.109897613525391, |
|
"learning_rate": 6.16e-06, |
|
"loss": 0.4574, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 2.4529943466186523, |
|
"learning_rate": 6.133684210526316e-06, |
|
"loss": 0.4622, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 2.091356039047241, |
|
"learning_rate": 6.107368421052632e-06, |
|
"loss": 0.4624, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 2.5841152667999268, |
|
"learning_rate": 6.081052631578948e-06, |
|
"loss": 0.4573, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 2.812778949737549, |
|
"learning_rate": 6.054736842105264e-06, |
|
"loss": 0.4529, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.070568084716797, |
|
"learning_rate": 6.0284210526315786e-06, |
|
"loss": 0.4621, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.89155650138855, |
|
"learning_rate": 6.002105263157896e-06, |
|
"loss": 0.4534, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.926348328590393, |
|
"learning_rate": 5.975789473684212e-06, |
|
"loss": 0.4558, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 2.3729255199432373, |
|
"learning_rate": 5.949473684210527e-06, |
|
"loss": 0.4569, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 2.6525228023529053, |
|
"learning_rate": 5.923157894736843e-06, |
|
"loss": 0.4593, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 2.258516311645508, |
|
"learning_rate": 5.8968421052631585e-06, |
|
"loss": 0.4563, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 3.1756558418273926, |
|
"learning_rate": 5.870526315789474e-06, |
|
"loss": 0.4565, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.4328861236572266, |
|
"learning_rate": 5.84421052631579e-06, |
|
"loss": 0.457, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 2.02143931388855, |
|
"learning_rate": 5.817894736842106e-06, |
|
"loss": 0.4579, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.9647924900054932, |
|
"learning_rate": 5.791578947368422e-06, |
|
"loss": 0.4577, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 3.5917584896087646, |
|
"learning_rate": 5.765263157894737e-06, |
|
"loss": 0.4486, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 3.461472749710083, |
|
"learning_rate": 5.7389473684210526e-06, |
|
"loss": 0.4603, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 2.0499308109283447, |
|
"learning_rate": 5.712631578947368e-06, |
|
"loss": 0.4572, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 2.1889617443084717, |
|
"learning_rate": 5.686315789473684e-06, |
|
"loss": 0.4567, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.788960576057434, |
|
"learning_rate": 5.66e-06, |
|
"loss": 0.4523, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 3.5286545753479004, |
|
"learning_rate": 5.633684210526317e-06, |
|
"loss": 0.4568, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.9870589971542358, |
|
"learning_rate": 5.6073684210526325e-06, |
|
"loss": 0.4586, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 3.326669216156006, |
|
"learning_rate": 5.581052631578948e-06, |
|
"loss": 0.4629, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 1.8193014860153198, |
|
"learning_rate": 5.554736842105264e-06, |
|
"loss": 0.4508, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 2.358961343765259, |
|
"learning_rate": 5.52842105263158e-06, |
|
"loss": 0.4572, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 2.7740235328674316, |
|
"learning_rate": 5.502105263157895e-06, |
|
"loss": 0.4644, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 4.845912933349609, |
|
"learning_rate": 5.475789473684211e-06, |
|
"loss": 0.4558, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 2.390437364578247, |
|
"learning_rate": 5.4494736842105266e-06, |
|
"loss": 0.4605, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 2.571261405944824, |
|
"learning_rate": 5.423157894736842e-06, |
|
"loss": 0.4562, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 2.0128092765808105, |
|
"learning_rate": 5.396842105263158e-06, |
|
"loss": 0.46, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.0535905361175537, |
|
"learning_rate": 5.370526315789474e-06, |
|
"loss": 0.4505, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 10.284537315368652, |
|
"learning_rate": 5.34421052631579e-06, |
|
"loss": 0.4598, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 1.640753984451294, |
|
"learning_rate": 5.317894736842105e-06, |
|
"loss": 0.4558, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 3.6907098293304443, |
|
"learning_rate": 5.291578947368422e-06, |
|
"loss": 0.4521, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 3.541050672531128, |
|
"learning_rate": 5.265263157894738e-06, |
|
"loss": 0.4528, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"eval_loss": 0.4256468117237091, |
|
"eval_runtime": 131.0319, |
|
"eval_samples_per_second": 39.158, |
|
"eval_steps_per_second": 19.583, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 3.7902073860168457, |
|
"learning_rate": 5.238947368421053e-06, |
|
"loss": 0.4571, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 4.066291332244873, |
|
"learning_rate": 5.212631578947369e-06, |
|
"loss": 0.4512, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 2.505599021911621, |
|
"learning_rate": 5.186315789473685e-06, |
|
"loss": 0.4468, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.8621853590011597, |
|
"learning_rate": 5.1600000000000006e-06, |
|
"loss": 0.4523, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 2.166684865951538, |
|
"learning_rate": 5.133684210526316e-06, |
|
"loss": 0.4545, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 2.113370656967163, |
|
"learning_rate": 5.107368421052632e-06, |
|
"loss": 0.4564, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 1.8998031616210938, |
|
"learning_rate": 5.081052631578948e-06, |
|
"loss": 0.4527, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.5921194553375244, |
|
"learning_rate": 5.054736842105263e-06, |
|
"loss": 0.4562, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 6.103330612182617, |
|
"learning_rate": 5.028421052631579e-06, |
|
"loss": 0.4549, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 3.6465394496917725, |
|
"learning_rate": 5.002105263157895e-06, |
|
"loss": 0.4576, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 2.6215322017669678, |
|
"learning_rate": 4.975789473684211e-06, |
|
"loss": 0.4587, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 2.8484139442443848, |
|
"learning_rate": 4.949473684210527e-06, |
|
"loss": 0.4524, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 2.0593345165252686, |
|
"learning_rate": 4.923157894736842e-06, |
|
"loss": 0.4486, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 2.749562978744507, |
|
"learning_rate": 4.896842105263158e-06, |
|
"loss": 0.4556, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 5.627275466918945, |
|
"learning_rate": 4.870526315789474e-06, |
|
"loss": 0.4535, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 4.1247992515563965, |
|
"learning_rate": 4.845263157894737e-06, |
|
"loss": 0.4585, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.740677833557129, |
|
"learning_rate": 4.818947368421053e-06, |
|
"loss": 0.4613, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 2.885836362838745, |
|
"learning_rate": 4.792631578947369e-06, |
|
"loss": 0.4509, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 2.179293394088745, |
|
"learning_rate": 4.7663157894736845e-06, |
|
"loss": 0.4626, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 2.766608953475952, |
|
"learning_rate": 4.74e-06, |
|
"loss": 0.4532, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 2.0567455291748047, |
|
"learning_rate": 4.713684210526316e-06, |
|
"loss": 0.456, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 3.076902151107788, |
|
"learning_rate": 4.687368421052632e-06, |
|
"loss": 0.4456, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 2.915499687194824, |
|
"learning_rate": 4.661052631578948e-06, |
|
"loss": 0.4504, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 3.0495471954345703, |
|
"learning_rate": 4.634736842105264e-06, |
|
"loss": 0.4539, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.6944961547851562, |
|
"learning_rate": 4.6084210526315794e-06, |
|
"loss": 0.4541, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 2.135935068130493, |
|
"learning_rate": 4.582105263157895e-06, |
|
"loss": 0.4525, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 6.483078956604004, |
|
"learning_rate": 4.55578947368421e-06, |
|
"loss": 0.4535, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 2.811405658721924, |
|
"learning_rate": 4.529473684210527e-06, |
|
"loss": 0.4538, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 3.5844473838806152, |
|
"learning_rate": 4.503157894736843e-06, |
|
"loss": 0.4556, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 2.2661666870117188, |
|
"learning_rate": 4.4768421052631585e-06, |
|
"loss": 0.4585, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.6553196907043457, |
|
"learning_rate": 4.450526315789474e-06, |
|
"loss": 0.441, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 2.779641628265381, |
|
"learning_rate": 4.424210526315789e-06, |
|
"loss": 0.4552, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 11.688698768615723, |
|
"learning_rate": 4.397894736842105e-06, |
|
"loss": 0.4496, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 1.6974663734436035, |
|
"learning_rate": 4.371578947368421e-06, |
|
"loss": 0.4552, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 3.191411256790161, |
|
"learning_rate": 4.345263157894738e-06, |
|
"loss": 0.4522, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 2.5754339694976807, |
|
"learning_rate": 4.3189473684210535e-06, |
|
"loss": 0.4565, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 2.6307075023651123, |
|
"learning_rate": 4.2926315789473684e-06, |
|
"loss": 0.4446, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 2.7968878746032715, |
|
"learning_rate": 4.266315789473684e-06, |
|
"loss": 0.4522, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 4.277090549468994, |
|
"learning_rate": 4.24e-06, |
|
"loss": 0.454, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 4.684088706970215, |
|
"learning_rate": 4.213684210526316e-06, |
|
"loss": 0.4564, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"eval_loss": 0.4263022243976593, |
|
"eval_runtime": 129.2969, |
|
"eval_samples_per_second": 39.684, |
|
"eval_steps_per_second": 19.846, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 2.7073895931243896, |
|
"learning_rate": 4.1873684210526325e-06, |
|
"loss": 0.4527, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 1.904488444328308, |
|
"learning_rate": 4.1610526315789475e-06, |
|
"loss": 0.454, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 2.016899347305298, |
|
"learning_rate": 4.134736842105263e-06, |
|
"loss": 0.4482, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 2.5908937454223633, |
|
"learning_rate": 4.108421052631579e-06, |
|
"loss": 0.4545, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 2.05924129486084, |
|
"learning_rate": 4.082105263157895e-06, |
|
"loss": 0.4561, |
|
"step": 6125 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 1.9701108932495117, |
|
"learning_rate": 4.055789473684211e-06, |
|
"loss": 0.4505, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 2.8021397590637207, |
|
"learning_rate": 4.029473684210527e-06, |
|
"loss": 0.4512, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 1.7641443014144897, |
|
"learning_rate": 4.0031578947368424e-06, |
|
"loss": 0.4492, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 2.424809217453003, |
|
"learning_rate": 3.976842105263158e-06, |
|
"loss": 0.4591, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 3.036320686340332, |
|
"learning_rate": 3.950526315789474e-06, |
|
"loss": 0.448, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 2.072425127029419, |
|
"learning_rate": 3.92421052631579e-06, |
|
"loss": 0.4459, |
|
"step": 6275 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 14.710461616516113, |
|
"learning_rate": 3.897894736842106e-06, |
|
"loss": 0.4552, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 2.3576061725616455, |
|
"learning_rate": 3.8715789473684215e-06, |
|
"loss": 0.4528, |
|
"step": 6325 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.984506368637085, |
|
"learning_rate": 3.845263157894737e-06, |
|
"loss": 0.4501, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 2.9575440883636475, |
|
"learning_rate": 3.818947368421053e-06, |
|
"loss": 0.4505, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 2.4890646934509277, |
|
"learning_rate": 3.792631578947369e-06, |
|
"loss": 0.4546, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 1.8990737199783325, |
|
"learning_rate": 3.766315789473685e-06, |
|
"loss": 0.4534, |
|
"step": 6425 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 3.2801966667175293, |
|
"learning_rate": 3.74e-06, |
|
"loss": 0.4517, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 2.2833831310272217, |
|
"learning_rate": 3.713684210526316e-06, |
|
"loss": 0.4501, |
|
"step": 6475 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 3.27669620513916, |
|
"learning_rate": 3.687368421052632e-06, |
|
"loss": 0.4596, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 3.1152820587158203, |
|
"learning_rate": 3.6610526315789472e-06, |
|
"loss": 0.4577, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 3.170578718185425, |
|
"learning_rate": 3.6347368421052635e-06, |
|
"loss": 0.4542, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 4.5797295570373535, |
|
"learning_rate": 3.6084210526315793e-06, |
|
"loss": 0.4467, |
|
"step": 6575 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 2.258434295654297, |
|
"learning_rate": 3.582105263157895e-06, |
|
"loss": 0.4511, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 2.859511137008667, |
|
"learning_rate": 3.555789473684211e-06, |
|
"loss": 0.449, |
|
"step": 6625 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 3.1564037799835205, |
|
"learning_rate": 3.5294736842105263e-06, |
|
"loss": 0.4488, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 2.4222614765167236, |
|
"learning_rate": 3.503157894736842e-06, |
|
"loss": 0.4505, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 6.269469261169434, |
|
"learning_rate": 3.476842105263158e-06, |
|
"loss": 0.4512, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 2.1644115447998047, |
|
"learning_rate": 3.450526315789474e-06, |
|
"loss": 0.4533, |
|
"step": 6725 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 4.101073741912842, |
|
"learning_rate": 3.42421052631579e-06, |
|
"loss": 0.4538, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 2.309544563293457, |
|
"learning_rate": 3.3978947368421054e-06, |
|
"loss": 0.4518, |
|
"step": 6775 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 2.7543396949768066, |
|
"learning_rate": 3.3715789473684212e-06, |
|
"loss": 0.4488, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 1.9031325578689575, |
|
"learning_rate": 3.345263157894737e-06, |
|
"loss": 0.4572, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 1.8639992475509644, |
|
"learning_rate": 3.318947368421053e-06, |
|
"loss": 0.4547, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 2.6859822273254395, |
|
"learning_rate": 3.292631578947369e-06, |
|
"loss": 0.4504, |
|
"step": 6875 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 2.0838868618011475, |
|
"learning_rate": 3.2663157894736845e-06, |
|
"loss": 0.4503, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 1.9140487909317017, |
|
"learning_rate": 3.2400000000000003e-06, |
|
"loss": 0.4508, |
|
"step": 6925 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 2.2218894958496094, |
|
"learning_rate": 3.213684210526316e-06, |
|
"loss": 0.4542, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 1.9918988943099976, |
|
"learning_rate": 3.187368421052632e-06, |
|
"loss": 0.4488, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 2.9417710304260254, |
|
"learning_rate": 3.1610526315789474e-06, |
|
"loss": 0.4529, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"eval_loss": 0.42716261744499207, |
|
"eval_runtime": 125.3767, |
|
"eval_samples_per_second": 40.925, |
|
"eval_steps_per_second": 20.466, |
|
"step": 7000 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 1000, |
|
"total_flos": 2.5358032943217216e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|