{
  "best_metric": 0.3675082325935364,
  "best_model_checkpoint": "mikhail-panzo/zlm_b128_le5_s8000/checkpoint-7000",
  "epoch": 11.727748691099476,
  "eval_steps": 500,
  "global_step": 7000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08376963350785341,
      "grad_norm": 6.784719467163086,
      "learning_rate": 2.4500000000000004e-07,
      "loss": 1.1115,
      "step": 50
    },
    {
      "epoch": 0.16753926701570682,
      "grad_norm": 12.37061595916748,
      "learning_rate": 4.95e-07,
      "loss": 1.1117,
      "step": 100
    },
    {
      "epoch": 0.2513089005235602,
      "grad_norm": 2.4181911945343018,
      "learning_rate": 7.450000000000001e-07,
      "loss": 0.9761,
      "step": 150
    },
    {
      "epoch": 0.33507853403141363,
      "grad_norm": 5.356767177581787,
      "learning_rate": 9.950000000000002e-07,
      "loss": 0.8691,
      "step": 200
    },
    {
      "epoch": 0.418848167539267,
      "grad_norm": 1.991185188293457,
      "learning_rate": 1.2450000000000002e-06,
      "loss": 0.816,
      "step": 250
    },
    {
      "epoch": 0.5026178010471204,
      "grad_norm": 1.8864444494247437,
      "learning_rate": 1.495e-06,
      "loss": 0.7841,
      "step": 300
    },
    {
      "epoch": 0.5863874345549738,
      "grad_norm": 2.3990137577056885,
      "learning_rate": 1.745e-06,
      "loss": 0.7553,
      "step": 350
    },
    {
      "epoch": 0.6701570680628273,
      "grad_norm": 3.0834362506866455,
      "learning_rate": 1.9950000000000004e-06,
      "loss": 0.7231,
      "step": 400
    },
    {
      "epoch": 0.7539267015706806,
      "grad_norm": 1.6590628623962402,
      "learning_rate": 2.245e-06,
      "loss": 0.6887,
      "step": 450
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 2.2680208683013916,
      "learning_rate": 2.4950000000000003e-06,
      "loss": 0.6645,
      "step": 500
    },
    {
      "epoch": 0.837696335078534,
      "eval_loss": 0.5697648525238037,
      "eval_runtime": 253.6,
      "eval_samples_per_second": 33.474,
      "eval_steps_per_second": 4.188,
      "step": 500
    },
    {
      "epoch": 0.9214659685863874,
      "grad_norm": 1.2383062839508057,
      "learning_rate": 2.7450000000000004e-06,
      "loss": 0.6485,
      "step": 550
    },
    {
      "epoch": 1.0052356020942408,
      "grad_norm": 1.7865532636642456,
      "learning_rate": 2.995e-06,
      "loss": 0.6305,
      "step": 600
    },
    {
      "epoch": 1.0890052356020943,
      "grad_norm": 5.1390790939331055,
      "learning_rate": 3.2450000000000003e-06,
      "loss": 0.625,
      "step": 650
    },
    {
      "epoch": 1.1727748691099475,
      "grad_norm": 1.56719172000885,
      "learning_rate": 3.495e-06,
      "loss": 0.6164,
      "step": 700
    },
    {
      "epoch": 1.256544502617801,
      "grad_norm": 2.732139825820923,
      "learning_rate": 3.745e-06,
      "loss": 0.5949,
      "step": 750
    },
    {
      "epoch": 1.3403141361256545,
      "grad_norm": 2.0998775959014893,
      "learning_rate": 3.995000000000001e-06,
      "loss": 0.5885,
      "step": 800
    },
    {
      "epoch": 1.4240837696335078,
      "grad_norm": 9.37508487701416,
      "learning_rate": 4.245e-06,
      "loss": 0.5763,
      "step": 850
    },
    {
      "epoch": 1.5078534031413613,
      "grad_norm": 2.467970132827759,
      "learning_rate": 4.495e-06,
      "loss": 0.5671,
      "step": 900
    },
    {
      "epoch": 1.5916230366492146,
      "grad_norm": 1.5992038249969482,
      "learning_rate": 4.745e-06,
      "loss": 0.5569,
      "step": 950
    },
    {
      "epoch": 1.675392670157068,
      "grad_norm": 2.854398727416992,
      "learning_rate": 4.9950000000000005e-06,
      "loss": 0.5581,
      "step": 1000
    },
    {
      "epoch": 1.675392670157068,
      "eval_loss": 0.4793892800807953,
      "eval_runtime": 254.3858,
      "eval_samples_per_second": 33.371,
      "eval_steps_per_second": 4.175,
      "step": 1000
    },
    {
      "epoch": 1.7591623036649215,
      "grad_norm": 2.9313721656799316,
      "learning_rate": 5.245e-06,
      "loss": 0.5573,
      "step": 1050
    },
    {
      "epoch": 1.8429319371727748,
      "grad_norm": 1.984640121459961,
      "learning_rate": 5.495000000000001e-06,
      "loss": 0.5455,
      "step": 1100
    },
    {
      "epoch": 1.9267015706806283,
      "grad_norm": 2.0967133045196533,
      "learning_rate": 5.745000000000001e-06,
      "loss": 0.5301,
      "step": 1150
    },
    {
      "epoch": 2.0104712041884816,
      "grad_norm": 2.864819049835205,
      "learning_rate": 5.995000000000001e-06,
      "loss": 0.5249,
      "step": 1200
    },
    {
      "epoch": 2.094240837696335,
      "grad_norm": 1.5830483436584473,
      "learning_rate": 6.245000000000001e-06,
      "loss": 0.5265,
      "step": 1250
    },
    {
      "epoch": 2.1780104712041886,
      "grad_norm": 1.706629753112793,
      "learning_rate": 6.4950000000000005e-06,
      "loss": 0.5226,
      "step": 1300
    },
    {
      "epoch": 2.261780104712042,
      "grad_norm": 2.3449299335479736,
      "learning_rate": 6.745000000000001e-06,
      "loss": 0.5159,
      "step": 1350
    },
    {
      "epoch": 2.345549738219895,
      "grad_norm": 2.4408295154571533,
      "learning_rate": 6.995000000000001e-06,
      "loss": 0.517,
      "step": 1400
    },
    {
      "epoch": 2.4293193717277486,
      "grad_norm": 1.8883627653121948,
      "learning_rate": 7.245000000000001e-06,
      "loss": 0.5139,
      "step": 1450
    },
    {
      "epoch": 2.513089005235602,
      "grad_norm": 1.5606210231781006,
      "learning_rate": 7.495000000000001e-06,
      "loss": 0.5045,
      "step": 1500
    },
    {
      "epoch": 2.513089005235602,
      "eval_loss": 0.44668668508529663,
      "eval_runtime": 252.363,
      "eval_samples_per_second": 33.638,
      "eval_steps_per_second": 4.208,
      "step": 1500
    },
    {
      "epoch": 2.5968586387434556,
      "grad_norm": 1.6376454830169678,
      "learning_rate": 7.745e-06,
      "loss": 0.503,
      "step": 1550
    },
    {
      "epoch": 2.680628272251309,
      "grad_norm": 1.7104960680007935,
      "learning_rate": 7.995e-06,
      "loss": 0.4988,
      "step": 1600
    },
    {
      "epoch": 2.7643979057591626,
      "grad_norm": 2.3372185230255127,
      "learning_rate": 8.245000000000002e-06,
      "loss": 0.4954,
      "step": 1650
    },
    {
      "epoch": 2.8481675392670156,
      "grad_norm": 2.170474052429199,
      "learning_rate": 8.495e-06,
      "loss": 0.4971,
      "step": 1700
    },
    {
      "epoch": 2.931937172774869,
      "grad_norm": 2.3041465282440186,
      "learning_rate": 8.745000000000002e-06,
      "loss": 0.4948,
      "step": 1750
    },
    {
      "epoch": 3.0157068062827226,
      "grad_norm": 2.1798529624938965,
      "learning_rate": 8.995000000000001e-06,
      "loss": 0.4849,
      "step": 1800
    },
    {
      "epoch": 3.099476439790576,
      "grad_norm": 1.8813133239746094,
      "learning_rate": 9.245e-06,
      "loss": 0.4839,
      "step": 1850
    },
    {
      "epoch": 3.183246073298429,
      "grad_norm": 1.8749632835388184,
      "learning_rate": 9.49e-06,
      "loss": 0.4871,
      "step": 1900
    },
    {
      "epoch": 3.2670157068062826,
      "grad_norm": 1.917149305343628,
      "learning_rate": 9.74e-06,
      "loss": 0.4724,
      "step": 1950
    },
    {
      "epoch": 3.350785340314136,
      "grad_norm": 1.9401865005493164,
      "learning_rate": 9.990000000000001e-06,
      "loss": 0.4776,
      "step": 2000
    },
    {
      "epoch": 3.350785340314136,
      "eval_loss": 0.4236186444759369,
      "eval_runtime": 261.4044,
      "eval_samples_per_second": 32.475,
      "eval_steps_per_second": 4.063,
      "step": 2000
    },
    {
      "epoch": 3.4345549738219896,
      "grad_norm": 1.67532479763031,
      "learning_rate": 9.921666666666667e-06,
      "loss": 0.4797,
      "step": 2050
    },
    {
      "epoch": 3.518324607329843,
      "grad_norm": 2.044435977935791,
      "learning_rate": 9.838333333333334e-06,
      "loss": 0.475,
      "step": 2100
    },
    {
      "epoch": 3.6020942408376966,
      "grad_norm": 1.7103309631347656,
      "learning_rate": 9.755e-06,
      "loss": 0.4729,
      "step": 2150
    },
    {
      "epoch": 3.6858638743455496,
      "grad_norm": 2.557506799697876,
      "learning_rate": 9.671666666666668e-06,
      "loss": 0.4671,
      "step": 2200
    },
    {
      "epoch": 3.769633507853403,
      "grad_norm": 2.5080385208129883,
      "learning_rate": 9.588333333333334e-06,
      "loss": 0.4613,
      "step": 2250
    },
    {
      "epoch": 3.8534031413612566,
      "grad_norm": 1.6658800840377808,
      "learning_rate": 9.505000000000001e-06,
      "loss": 0.4614,
      "step": 2300
    },
    {
      "epoch": 3.93717277486911,
      "grad_norm": 2.167924404144287,
      "learning_rate": 9.421666666666668e-06,
      "loss": 0.4685,
      "step": 2350
    },
    {
      "epoch": 4.020942408376963,
      "grad_norm": 4.781851768493652,
      "learning_rate": 9.338333333333333e-06,
      "loss": 0.4578,
      "step": 2400
    },
    {
      "epoch": 4.104712041884817,
      "grad_norm": 2.1949734687805176,
      "learning_rate": 9.255e-06,
      "loss": 0.4613,
      "step": 2450
    },
    {
      "epoch": 4.18848167539267,
      "grad_norm": 1.9870787858963013,
      "learning_rate": 9.171666666666667e-06,
      "loss": 0.4553,
      "step": 2500
    },
    {
      "epoch": 4.18848167539267,
      "eval_loss": 0.40933898091316223,
      "eval_runtime": 252.812,
      "eval_samples_per_second": 33.578,
      "eval_steps_per_second": 4.201,
      "step": 2500
    },
    {
      "epoch": 4.272251308900524,
      "grad_norm": 5.594339370727539,
      "learning_rate": 9.088333333333334e-06,
      "loss": 0.4592,
      "step": 2550
    },
    {
      "epoch": 4.356020942408377,
      "grad_norm": 2.930290699005127,
      "learning_rate": 9.005000000000001e-06,
      "loss": 0.4549,
      "step": 2600
    },
    {
      "epoch": 4.439790575916231,
      "grad_norm": 2.199352741241455,
      "learning_rate": 8.921666666666668e-06,
      "loss": 0.4512,
      "step": 2650
    },
    {
      "epoch": 4.523560209424084,
      "grad_norm": 2.435898780822754,
      "learning_rate": 8.838333333333335e-06,
      "loss": 0.4557,
      "step": 2700
    },
    {
      "epoch": 4.607329842931938,
      "grad_norm": 3.2397773265838623,
      "learning_rate": 8.755e-06,
      "loss": 0.449,
      "step": 2750
    },
    {
      "epoch": 4.69109947643979,
      "grad_norm": 1.9361001253128052,
      "learning_rate": 8.671666666666667e-06,
      "loss": 0.4525,
      "step": 2800
    },
    {
      "epoch": 4.774869109947644,
      "grad_norm": 2.1852686405181885,
      "learning_rate": 8.588333333333334e-06,
      "loss": 0.4486,
      "step": 2850
    },
    {
      "epoch": 4.858638743455497,
      "grad_norm": 1.8136191368103027,
      "learning_rate": 8.505e-06,
      "loss": 0.4517,
      "step": 2900
    },
    {
      "epoch": 4.942408376963351,
      "grad_norm": 2.694406032562256,
      "learning_rate": 8.421666666666668e-06,
      "loss": 0.4505,
      "step": 2950
    },
    {
      "epoch": 5.026178010471204,
      "grad_norm": 1.251187801361084,
      "learning_rate": 8.338333333333335e-06,
      "loss": 0.4489,
      "step": 3000
    },
    {
      "epoch": 5.026178010471204,
      "eval_loss": 0.39678552746772766,
      "eval_runtime": 256.1618,
      "eval_samples_per_second": 33.139,
      "eval_steps_per_second": 4.146,
      "step": 3000
    },
    {
      "epoch": 5.109947643979058,
      "grad_norm": 1.1454211473464966,
      "learning_rate": 8.255000000000001e-06,
      "loss": 0.4422,
      "step": 3050
    },
    {
      "epoch": 5.193717277486911,
      "grad_norm": 1.8685294389724731,
      "learning_rate": 8.171666666666668e-06,
      "loss": 0.4426,
      "step": 3100
    },
    {
      "epoch": 5.277486910994765,
      "grad_norm": 1.6863799095153809,
      "learning_rate": 8.088333333333334e-06,
      "loss": 0.4398,
      "step": 3150
    },
    {
      "epoch": 5.361256544502618,
      "grad_norm": 2.249805212020874,
      "learning_rate": 8.005e-06,
      "loss": 0.4384,
      "step": 3200
    },
    {
      "epoch": 5.445026178010472,
      "grad_norm": 2.1187326908111572,
      "learning_rate": 7.921666666666667e-06,
      "loss": 0.4431,
      "step": 3250
    },
    {
      "epoch": 5.528795811518324,
      "grad_norm": 1.8476357460021973,
      "learning_rate": 7.838333333333334e-06,
      "loss": 0.4434,
      "step": 3300
    },
    {
      "epoch": 5.612565445026178,
      "grad_norm": 1.6522760391235352,
      "learning_rate": 7.755000000000001e-06,
      "loss": 0.4343,
      "step": 3350
    },
    {
      "epoch": 5.696335078534031,
      "grad_norm": 1.3926664590835571,
      "learning_rate": 7.671666666666668e-06,
      "loss": 0.4365,
      "step": 3400
    },
    {
      "epoch": 5.780104712041885,
      "grad_norm": 2.1967947483062744,
      "learning_rate": 7.588333333333334e-06,
      "loss": 0.4362,
      "step": 3450
    },
    {
      "epoch": 5.863874345549738,
      "grad_norm": 1.5428054332733154,
      "learning_rate": 7.505e-06,
      "loss": 0.4337,
      "step": 3500
    },
    {
      "epoch": 5.863874345549738,
      "eval_loss": 0.39255669713020325,
      "eval_runtime": 251.4241,
      "eval_samples_per_second": 33.764,
      "eval_steps_per_second": 4.224,
      "step": 3500
    },
    {
      "epoch": 5.947643979057592,
      "grad_norm": 1.7545124292373657,
      "learning_rate": 7.421666666666667e-06,
      "loss": 0.4292,
      "step": 3550
    },
    {
      "epoch": 6.031413612565445,
      "grad_norm": 1.1912785768508911,
      "learning_rate": 7.338333333333334e-06,
      "loss": 0.4293,
      "step": 3600
    },
    {
      "epoch": 6.115183246073299,
      "grad_norm": 1.549297571182251,
      "learning_rate": 7.255000000000001e-06,
      "loss": 0.4333,
      "step": 3650
    },
    {
      "epoch": 6.198952879581152,
      "grad_norm": 1.8822449445724487,
      "learning_rate": 7.171666666666667e-06,
      "loss": 0.4308,
      "step": 3700
    },
    {
      "epoch": 6.282722513089006,
      "grad_norm": 1.6894242763519287,
      "learning_rate": 7.088333333333334e-06,
      "loss": 0.4336,
      "step": 3750
    },
    {
      "epoch": 6.366492146596858,
      "grad_norm": 1.63300621509552,
      "learning_rate": 7.005000000000001e-06,
      "loss": 0.432,
      "step": 3800
    },
    {
      "epoch": 6.450261780104712,
      "grad_norm": 1.4569647312164307,
      "learning_rate": 6.921666666666668e-06,
      "loss": 0.4302,
      "step": 3850
    },
    {
      "epoch": 6.534031413612565,
      "grad_norm": 1.3649786710739136,
      "learning_rate": 6.838333333333334e-06,
      "loss": 0.4277,
      "step": 3900
    },
    {
      "epoch": 6.617801047120419,
      "grad_norm": 1.6974161863327026,
      "learning_rate": 6.7550000000000005e-06,
      "loss": 0.4329,
      "step": 3950
    },
    {
      "epoch": 6.701570680628272,
      "grad_norm": 8.575983047485352,
      "learning_rate": 6.6716666666666674e-06,
      "loss": 0.4282,
      "step": 4000
    },
    {
      "epoch": 6.701570680628272,
      "eval_loss": 0.3837336301803589,
      "eval_runtime": 252.6194,
      "eval_samples_per_second": 33.604,
      "eval_steps_per_second": 4.204,
      "step": 4000
    },
    {
      "epoch": 6.785340314136126,
      "grad_norm": 1.308820366859436,
      "learning_rate": 6.588333333333334e-06,
      "loss": 0.4233,
      "step": 4050
    },
    {
      "epoch": 6.869109947643979,
      "grad_norm": 1.6496186256408691,
      "learning_rate": 6.505e-06,
      "loss": 0.4235,
      "step": 4100
    },
    {
      "epoch": 6.952879581151833,
      "grad_norm": 2.9797613620758057,
      "learning_rate": 6.421666666666667e-06,
      "loss": 0.424,
      "step": 4150
    },
    {
      "epoch": 7.036649214659686,
      "grad_norm": 1.366092324256897,
      "learning_rate": 6.338333333333334e-06,
      "loss": 0.4241,
      "step": 4200
    },
    {
      "epoch": 7.12041884816754,
      "grad_norm": 1.2892742156982422,
      "learning_rate": 6.255e-06,
      "loss": 0.4306,
      "step": 4250
    },
    {
      "epoch": 7.204188481675392,
      "grad_norm": 1.769739031791687,
      "learning_rate": 6.171666666666667e-06,
      "loss": 0.4206,
      "step": 4300
    },
    {
      "epoch": 7.287958115183246,
      "grad_norm": 1.2120298147201538,
      "learning_rate": 6.088333333333334e-06,
      "loss": 0.4245,
      "step": 4350
    },
    {
      "epoch": 7.371727748691099,
      "grad_norm": 2.545499086380005,
      "learning_rate": 6.005000000000001e-06,
      "loss": 0.4234,
      "step": 4400
    },
    {
      "epoch": 7.455497382198953,
      "grad_norm": 1.57191801071167,
      "learning_rate": 5.921666666666667e-06,
      "loss": 0.426,
      "step": 4450
    },
    {
      "epoch": 7.539267015706806,
      "grad_norm": 1.3846454620361328,
      "learning_rate": 5.838333333333334e-06,
      "loss": 0.4188,
      "step": 4500
    },
    {
      "epoch": 7.539267015706806,
      "eval_loss": 0.37984684109687805,
      "eval_runtime": 250.9224,
      "eval_samples_per_second": 33.831,
      "eval_steps_per_second": 4.232,
      "step": 4500
    },
    {
      "epoch": 7.62303664921466,
      "grad_norm": 1.3911954164505005,
      "learning_rate": 5.755000000000001e-06,
      "loss": 0.42,
      "step": 4550
    },
    {
      "epoch": 7.706806282722513,
      "grad_norm": 1.3781930208206177,
      "learning_rate": 5.671666666666668e-06,
      "loss": 0.4234,
      "step": 4600
    },
    {
      "epoch": 7.790575916230367,
      "grad_norm": 1.5225090980529785,
      "learning_rate": 5.588333333333334e-06,
      "loss": 0.4242,
      "step": 4650
    },
    {
      "epoch": 7.87434554973822,
      "grad_norm": 1.4195870161056519,
      "learning_rate": 5.505000000000001e-06,
      "loss": 0.4243,
      "step": 4700
    },
    {
      "epoch": 7.958115183246074,
      "grad_norm": 1.9713116884231567,
      "learning_rate": 5.4216666666666676e-06,
      "loss": 0.4158,
      "step": 4750
    },
    {
      "epoch": 8.041884816753926,
      "grad_norm": 1.3215075731277466,
      "learning_rate": 5.3383333333333345e-06,
      "loss": 0.4141,
      "step": 4800
    },
    {
      "epoch": 8.12565445026178,
      "grad_norm": 4.483395576477051,
      "learning_rate": 5.2550000000000005e-06,
      "loss": 0.4165,
      "step": 4850
    },
    {
      "epoch": 8.209424083769633,
      "grad_norm": 1.4759974479675293,
      "learning_rate": 5.171666666666667e-06,
      "loss": 0.4291,
      "step": 4900
    },
    {
      "epoch": 8.293193717277488,
      "grad_norm": 1.3336604833602905,
      "learning_rate": 5.088333333333334e-06,
      "loss": 0.4134,
      "step": 4950
    },
    {
      "epoch": 8.37696335078534,
      "grad_norm": 1.5618665218353271,
      "learning_rate": 5.0049999999999995e-06,
      "loss": 0.4222,
      "step": 5000
    },
    {
      "epoch": 8.37696335078534,
      "eval_loss": 0.37838345766067505,
      "eval_runtime": 256.1892,
      "eval_samples_per_second": 33.136,
      "eval_steps_per_second": 4.145,
      "step": 5000
    },
    {
      "epoch": 8.460732984293193,
      "grad_norm": 1.079275369644165,
      "learning_rate": 4.921666666666666e-06,
      "loss": 0.4143,
      "step": 5050
    },
    {
      "epoch": 8.544502617801047,
      "grad_norm": 1.0949628353118896,
      "learning_rate": 4.838333333333334e-06,
      "loss": 0.4123,
      "step": 5100
    },
    {
      "epoch": 8.6282722513089,
      "grad_norm": 1.5005624294281006,
      "learning_rate": 4.755e-06,
      "loss": 0.4218,
      "step": 5150
    },
    {
      "epoch": 8.712041884816754,
      "grad_norm": 5.408727169036865,
      "learning_rate": 4.671666666666667e-06,
      "loss": 0.4123,
      "step": 5200
    },
    {
      "epoch": 8.795811518324607,
      "grad_norm": 1.0426135063171387,
      "learning_rate": 4.588333333333333e-06,
      "loss": 0.4159,
      "step": 5250
    },
    {
      "epoch": 8.879581151832461,
      "grad_norm": 2.312485456466675,
      "learning_rate": 4.505e-06,
      "loss": 0.4149,
      "step": 5300
    },
    {
      "epoch": 8.963350785340314,
      "grad_norm": 1.6531749963760376,
      "learning_rate": 4.421666666666667e-06,
      "loss": 0.4147,
      "step": 5350
    },
    {
      "epoch": 9.047120418848168,
      "grad_norm": 1.2448443174362183,
      "learning_rate": 4.338333333333334e-06,
      "loss": 0.4191,
      "step": 5400
    },
    {
      "epoch": 9.13089005235602,
      "grad_norm": 1.1506507396697998,
      "learning_rate": 4.255e-06,
      "loss": 0.4094,
      "step": 5450
    },
    {
      "epoch": 9.214659685863875,
      "grad_norm": 1.2973967790603638,
      "learning_rate": 4.171666666666667e-06,
      "loss": 0.412,
      "step": 5500
    },
    {
      "epoch": 9.214659685863875,
      "eval_loss": 0.3729405403137207,
      "eval_runtime": 258.7159,
      "eval_samples_per_second": 32.812,
      "eval_steps_per_second": 4.105,
      "step": 5500
    },
    {
      "epoch": 9.298429319371728,
      "grad_norm": 1.0570714473724365,
      "learning_rate": 4.088333333333334e-06,
      "loss": 0.4144,
      "step": 5550
    },
    {
      "epoch": 9.38219895287958,
      "grad_norm": 1.1278653144836426,
      "learning_rate": 4.005000000000001e-06,
      "loss": 0.4154,
      "step": 5600
    },
    {
      "epoch": 9.465968586387435,
      "grad_norm": 5.811945915222168,
      "learning_rate": 3.921666666666667e-06,
      "loss": 0.4085,
      "step": 5650
    },
    {
      "epoch": 9.549738219895287,
      "grad_norm": 1.6973522901535034,
      "learning_rate": 3.8383333333333336e-06,
      "loss": 0.4133,
      "step": 5700
    },
    {
      "epoch": 9.633507853403142,
      "grad_norm": 1.209333062171936,
      "learning_rate": 3.7550000000000005e-06,
      "loss": 0.4123,
      "step": 5750
    },
    {
      "epoch": 9.717277486910994,
      "grad_norm": 3.592991590499878,
      "learning_rate": 3.6716666666666665e-06,
      "loss": 0.4126,
      "step": 5800
    },
    {
      "epoch": 9.801047120418849,
      "grad_norm": 1.152239203453064,
      "learning_rate": 3.588333333333334e-06,
      "loss": 0.4115,
      "step": 5850
    },
    {
      "epoch": 9.884816753926701,
      "grad_norm": 1.6118751764297485,
      "learning_rate": 3.505e-06,
      "loss": 0.4036,
      "step": 5900
    },
    {
      "epoch": 9.968586387434556,
      "grad_norm": 1.4384329319000244,
      "learning_rate": 3.4216666666666672e-06,
      "loss": 0.4103,
      "step": 5950
    },
    {
      "epoch": 10.052356020942408,
      "grad_norm": 1.447549819946289,
      "learning_rate": 3.3383333333333333e-06,
      "loss": 0.4056,
      "step": 6000
    },
    {
      "epoch": 10.052356020942408,
      "eval_loss": 0.3696598410606384,
      "eval_runtime": 256.5857,
      "eval_samples_per_second": 33.084,
      "eval_steps_per_second": 4.139,
      "step": 6000
    },
    {
      "epoch": 10.136125654450261,
      "grad_norm": 1.2832478284835815,
      "learning_rate": 3.255e-06,
      "loss": 0.4086,
      "step": 6050
    },
    {
      "epoch": 10.219895287958115,
      "grad_norm": 1.0580389499664307,
      "learning_rate": 3.1716666666666667e-06,
      "loss": 0.4128,
      "step": 6100
    },
    {
      "epoch": 10.303664921465968,
      "grad_norm": 1.5535204410552979,
      "learning_rate": 3.0883333333333336e-06,
      "loss": 0.4088,
      "step": 6150
    },
    {
      "epoch": 10.387434554973822,
      "grad_norm": 0.8630754947662354,
      "learning_rate": 3.005e-06,
      "loss": 0.4073,
      "step": 6200
    },
    {
      "epoch": 10.471204188481675,
      "grad_norm": 1.248646855354309,
      "learning_rate": 2.921666666666667e-06,
      "loss": 0.4074,
      "step": 6250
    },
    {
      "epoch": 10.55497382198953,
      "grad_norm": 0.9990864396095276,
      "learning_rate": 2.8383333333333334e-06,
      "loss": 0.4076,
      "step": 6300
    },
    {
      "epoch": 10.638743455497382,
      "grad_norm": 1.100101351737976,
      "learning_rate": 2.7550000000000003e-06,
      "loss": 0.4081,
      "step": 6350
    },
    {
      "epoch": 10.722513089005236,
      "grad_norm": 1.1483354568481445,
      "learning_rate": 2.6716666666666668e-06,
      "loss": 0.4088,
      "step": 6400
    },
    {
      "epoch": 10.806282722513089,
      "grad_norm": 0.8169103860855103,
      "learning_rate": 2.5883333333333337e-06,
      "loss": 0.406,
      "step": 6450
    },
    {
      "epoch": 10.890052356020943,
      "grad_norm": 1.0567216873168945,
      "learning_rate": 2.505e-06,
      "loss": 0.4065,
      "step": 6500
    },
    {
      "epoch": 10.890052356020943,
      "eval_loss": 0.36852771043777466,
      "eval_runtime": 255.1911,
      "eval_samples_per_second": 33.265,
      "eval_steps_per_second": 4.162,
      "step": 6500
    },
    {
      "epoch": 10.973821989528796,
      "grad_norm": 1.0459740161895752,
      "learning_rate": 2.421666666666667e-06,
      "loss": 0.4074,
      "step": 6550
    },
    {
      "epoch": 11.057591623036648,
      "grad_norm": 0.9029247760772705,
      "learning_rate": 2.3383333333333335e-06,
      "loss": 0.4075,
      "step": 6600
    },
    {
      "epoch": 11.141361256544503,
      "grad_norm": 1.5372889041900635,
      "learning_rate": 2.2550000000000004e-06,
      "loss": 0.4088,
      "step": 6650
    },
    {
      "epoch": 11.225130890052355,
      "grad_norm": 0.9959379434585571,
      "learning_rate": 2.171666666666667e-06,
      "loss": 0.4044,
      "step": 6700
    },
    {
      "epoch": 11.30890052356021,
      "grad_norm": 1.3793728351593018,
      "learning_rate": 2.088333333333334e-06,
      "loss": 0.4034,
      "step": 6750
    },
    {
      "epoch": 11.392670157068062,
      "grad_norm": 1.2086491584777832,
      "learning_rate": 2.0050000000000003e-06,
      "loss": 0.4073,
      "step": 6800
    },
    {
      "epoch": 11.476439790575917,
      "grad_norm": 1.07647705078125,
      "learning_rate": 1.9216666666666668e-06,
      "loss": 0.405,
      "step": 6850
    },
    {
      "epoch": 11.56020942408377,
      "grad_norm": 0.9849846363067627,
      "learning_rate": 1.8383333333333334e-06,
      "loss": 0.4037,
      "step": 6900
    },
    {
      "epoch": 11.643979057591624,
      "grad_norm": 1.2623456716537476,
      "learning_rate": 1.7550000000000001e-06,
      "loss": 0.4042,
      "step": 6950
    },
    {
      "epoch": 11.727748691099476,
      "grad_norm": 0.9488279819488525,
      "learning_rate": 1.6716666666666666e-06,
      "loss": 0.4069,
      "step": 7000
    },
    {
      "epoch": 11.727748691099476,
      "eval_loss": 0.3675082325935364,
      "eval_runtime": 257.7947,
      "eval_samples_per_second": 32.929,
      "eval_steps_per_second": 4.12,
      "step": 7000
    }
  ],
  "logging_steps": 50,
  "max_steps": 8000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 14,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2541009593096864e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}