{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.4032768978700165,
  "eval_steps": 500,
  "global_step": 11000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.10922992900054615,
      "grad_norm": 0.9561710357666016,
      "learning_rate": 1.4564520827264783e-05,
      "loss": 0.7634,
      "step": 500
    },
    {
      "epoch": 0.10922992900054615,
      "eval_loss": 0.3866455852985382,
      "eval_runtime": 53.8542,
      "eval_samples_per_second": 1.727,
      "eval_steps_per_second": 1.727,
      "step": 500
    },
    {
      "epoch": 0.2184598580010923,
      "grad_norm": 0.6381384134292603,
      "learning_rate": 2.9129041654529566e-05,
      "loss": 0.2265,
      "step": 1000
    },
    {
      "epoch": 0.2184598580010923,
      "eval_loss": 0.28045815229415894,
      "eval_runtime": 53.8781,
      "eval_samples_per_second": 1.726,
      "eval_steps_per_second": 1.726,
      "step": 1000
    },
    {
      "epoch": 0.3276897870016384,
      "grad_norm": 0.80877685546875,
      "learning_rate": 4.3693562481794354e-05,
      "loss": 0.13,
      "step": 1500
    },
    {
      "epoch": 0.3276897870016384,
      "eval_loss": 0.24824748933315277,
      "eval_runtime": 53.7906,
      "eval_samples_per_second": 1.729,
      "eval_steps_per_second": 1.729,
      "step": 1500
    },
    {
      "epoch": 0.4369197160021846,
      "grad_norm": 0.7288264632225037,
      "learning_rate": 5.8228954267404603e-05,
      "loss": 0.1067,
      "step": 2000
    },
    {
      "epoch": 0.4369197160021846,
      "eval_loss": 0.24914385378360748,
      "eval_runtime": 53.8469,
      "eval_samples_per_second": 1.727,
      "eval_steps_per_second": 1.727,
      "step": 2000
    },
    {
      "epoch": 0.5461496450027308,
      "grad_norm": 1.1100046634674072,
      "learning_rate": 7.27934750946694e-05,
      "loss": 0.1018,
      "step": 2500
    },
    {
      "epoch": 0.5461496450027308,
      "eval_loss": 0.27210181951522827,
      "eval_runtime": 53.5513,
      "eval_samples_per_second": 1.737,
      "eval_steps_per_second": 1.737,
      "step": 2500
    },
    {
      "epoch": 0.6553795740032768,
      "grad_norm": 0.8480390310287476,
      "learning_rate": 8.732886688027965e-05,
      "loss": 0.0976,
      "step": 3000
    },
    {
      "epoch": 0.6553795740032768,
      "eval_loss": 0.2787032425403595,
      "eval_runtime": 51.6926,
      "eval_samples_per_second": 1.799,
      "eval_steps_per_second": 1.799,
      "step": 3000
    },
    {
      "epoch": 0.764609503003823,
      "grad_norm": 0.8974778056144714,
      "learning_rate": 0.00010189338770754442,
      "loss": 0.0948,
      "step": 3500
    },
    {
      "epoch": 0.764609503003823,
      "eval_loss": 0.26553070545196533,
      "eval_runtime": 52.0101,
      "eval_samples_per_second": 1.788,
      "eval_steps_per_second": 1.788,
      "step": 3500
    },
    {
      "epoch": 0.8738394320043692,
      "grad_norm": 0.4318696856498718,
      "learning_rate": 0.00011645790853480921,
      "loss": 0.0941,
      "step": 4000
    },
    {
      "epoch": 0.8738394320043692,
      "eval_loss": 0.2772391140460968,
      "eval_runtime": 51.821,
      "eval_samples_per_second": 1.795,
      "eval_steps_per_second": 1.795,
      "step": 4000
    },
    {
      "epoch": 0.9830693610049154,
      "grad_norm": 0.22688086330890656,
      "learning_rate": 0.00013102242936207398,
      "loss": 0.0916,
      "step": 4500
    },
    {
      "epoch": 0.9830693610049154,
      "eval_loss": 0.2516581118106842,
      "eval_runtime": 51.8197,
      "eval_samples_per_second": 1.795,
      "eval_steps_per_second": 1.795,
      "step": 4500
    },
    {
      "epoch": 1.0922992900054616,
      "grad_norm": 0.053704842925071716,
      "learning_rate": 0.0001455869501893388,
      "loss": 0.084,
      "step": 5000
    },
    {
      "epoch": 1.0922992900054616,
      "eval_loss": 0.24150623381137848,
      "eval_runtime": 50.6724,
      "eval_samples_per_second": 1.835,
      "eval_steps_per_second": 1.835,
      "step": 5000
    },
    {
      "epoch": 1.2015292190060076,
      "grad_norm": 0.30040594935417175,
      "learning_rate": 0.00016015147101660357,
      "loss": 0.0882,
      "step": 5500
    },
    {
      "epoch": 1.2015292190060076,
      "eval_loss": 0.24166980385780334,
      "eval_runtime": 50.7195,
      "eval_samples_per_second": 1.834,
      "eval_steps_per_second": 1.834,
      "step": 5500
    },
    {
      "epoch": 1.3107591480065537,
      "grad_norm": 0.21068286895751953,
      "learning_rate": 0.00017471599184386836,
      "loss": 0.0868,
      "step": 6000
    },
    {
      "epoch": 1.3107591480065537,
      "eval_loss": 0.2663758397102356,
      "eval_runtime": 50.6208,
      "eval_samples_per_second": 1.837,
      "eval_steps_per_second": 1.837,
      "step": 6000
    },
    {
      "epoch": 1.4199890770071,
      "grad_norm": 0.1310437172651291,
      "learning_rate": 0.00018928051267113314,
      "loss": 0.087,
      "step": 6500
    },
    {
      "epoch": 1.4199890770071,
      "eval_loss": 0.2654285728931427,
      "eval_runtime": 50.5893,
      "eval_samples_per_second": 1.838,
      "eval_steps_per_second": 1.838,
      "step": 6500
    },
    {
      "epoch": 1.529219006007646,
      "grad_norm": 0.1630202829837799,
      "learning_rate": 0.00019988197347556582,
      "loss": 0.0841,
      "step": 7000
    },
    {
      "epoch": 1.529219006007646,
      "eval_loss": 0.25736120343208313,
      "eval_runtime": 50.5692,
      "eval_samples_per_second": 1.839,
      "eval_steps_per_second": 1.839,
      "step": 7000
    },
    {
      "epoch": 1.6385581649371928,
      "grad_norm": 0.07697170972824097,
      "learning_rate": 0.00019943149055787806,
      "loss": 0.0807,
      "step": 7500
    },
    {
      "epoch": 1.6385581649371928,
      "eval_loss": 0.265057772397995,
      "eval_runtime": 50.3751,
      "eval_samples_per_second": 1.846,
      "eval_steps_per_second": 1.846,
      "step": 7500
    },
    {
      "epoch": 1.7477880939377388,
      "grad_norm": 0.10595889389514923,
      "learning_rate": 0.0001989810076401903,
      "loss": 0.0782,
      "step": 8000
    },
    {
      "epoch": 1.7477880939377388,
      "eval_loss": 0.2732037305831909,
      "eval_runtime": 50.5346,
      "eval_samples_per_second": 1.84,
      "eval_steps_per_second": 1.84,
      "step": 8000
    },
    {
      "epoch": 1.857018022938285,
      "grad_norm": 0.13792632520198822,
      "learning_rate": 0.00019853052472250252,
      "loss": 0.0757,
      "step": 8500
    },
    {
      "epoch": 1.857018022938285,
      "eval_loss": 0.25934740900993347,
      "eval_runtime": 50.3603,
      "eval_samples_per_second": 1.847,
      "eval_steps_per_second": 1.847,
      "step": 8500
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.1341710239648819,
      "learning_rate": 0.00019808004180481476,
      "loss": 0.0764,
      "step": 9000
    },
    {
      "epoch": 1.97,
      "eval_loss": 0.25228968262672424,
      "eval_runtime": 47.204,
      "eval_samples_per_second": 1.97,
      "eval_steps_per_second": 1.97,
      "step": 9000
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.011377187445759773,
      "learning_rate": 0.00019726869847708111,
      "loss": 0.0743,
      "step": 9500
    },
    {
      "epoch": 2.16,
      "eval_loss": 0.1291830837726593,
      "eval_runtime": 193.506,
      "eval_samples_per_second": 2.393,
      "eval_steps_per_second": 2.393,
      "step": 9500
    },
    {
      "epoch": 2.184817039868924,
      "grad_norm": 0.39175206422805786,
      "learning_rate": 0.00019717907596943925,
      "loss": 0.0717,
      "step": 10000
    },
    {
      "epoch": 2.184817039868924,
      "eval_loss": 0.2626956105232239,
      "eval_runtime": 51.1714,
      "eval_samples_per_second": 1.817,
      "eval_steps_per_second": 1.817,
      "step": 10000
    },
    {
      "epoch": 2.2940469688694702,
      "grad_norm": 0.17580239474773407,
      "learning_rate": 0.0001967285930517515,
      "loss": 0.0703,
      "step": 10500
    },
    {
      "epoch": 2.2940469688694702,
      "eval_loss": 0.28000086545944214,
      "eval_runtime": 51.2031,
      "eval_samples_per_second": 1.816,
      "eval_steps_per_second": 1.816,
      "step": 10500
    },
    {
      "epoch": 2.4032768978700165,
      "grad_norm": 0.27358973026275635,
      "learning_rate": 0.00019627811013406371,
      "loss": 0.0707,
      "step": 11000
    },
    {
      "epoch": 2.4032768978700165,
      "eval_loss": 0.27782928943634033,
      "eval_runtime": 50.7155,
      "eval_samples_per_second": 1.834,
      "eval_steps_per_second": 1.834,
      "step": 11000
    }
  ],
  "logging_steps": 500,
  "max_steps": 228850,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "total_flos": 2.298922015943301e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}