|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.383378016085791,
  "eval_steps": 500,
  "global_step": 7000,
  "is_hyper_param_search": true,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.6702412868632708,
      "grad_norm": 12.96187973022461,
      "learning_rate": 5.852478627669732e-06,
      "loss": 0.5779,
      "step": 500
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.7619845867156982,
      "eval_loss": 0.48268476128578186,
      "eval_runtime": 9.469,
      "eval_samples_per_second": 315.027,
      "eval_steps_per_second": 19.749,
      "step": 746
    },
    {
      "epoch": 1.3404825737265416,
      "grad_norm": 19.64900779724121,
      "learning_rate": 5.432041944647482e-06,
      "loss": 0.464,
      "step": 1000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.7680187821388245,
      "eval_loss": 0.5030698180198669,
      "eval_runtime": 9.5015,
      "eval_samples_per_second": 313.951,
      "eval_steps_per_second": 19.681,
      "step": 1492
    },
    {
      "epoch": 2.0107238605898123,
      "grad_norm": 9.11796760559082,
      "learning_rate": 5.0116052616252305e-06,
      "loss": 0.4025,
      "step": 1500
    },
    {
      "epoch": 2.680965147453083,
      "grad_norm": 11.002016067504883,
      "learning_rate": 4.59116857860298e-06,
      "loss": 0.2688,
      "step": 2000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.7747234106063843,
      "eval_loss": 0.5899690389633179,
      "eval_runtime": 9.4806,
      "eval_samples_per_second": 314.643,
      "eval_steps_per_second": 19.725,
      "step": 2238
    },
    {
      "epoch": 3.351206434316354,
      "grad_norm": 3.2045211791992188,
      "learning_rate": 4.170731895580729e-06,
      "loss": 0.2182,
      "step": 2500
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.7673482894897461,
      "eval_loss": 0.7394132614135742,
      "eval_runtime": 9.6202,
      "eval_samples_per_second": 310.077,
      "eval_steps_per_second": 19.438,
      "step": 2984
    },
    {
      "epoch": 4.021447721179625,
      "grad_norm": 0.5618192553520203,
      "learning_rate": 3.7502952125584775e-06,
      "loss": 0.1644,
      "step": 3000
    },
    {
      "epoch": 4.6916890080428955,
      "grad_norm": 19.86639976501465,
      "learning_rate": 3.3298585295362272e-06,
      "loss": 0.1173,
      "step": 3500
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.7646664381027222,
      "eval_loss": 0.7920505404472351,
      "eval_runtime": 9.4974,
      "eval_samples_per_second": 314.085,
      "eval_steps_per_second": 19.69,
      "step": 3730
    },
    {
      "epoch": 5.361930294906166,
      "grad_norm": 7.497531414031982,
      "learning_rate": 2.909421846513976e-06,
      "loss": 0.1026,
      "step": 4000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.7740529775619507,
      "eval_loss": 0.9123848676681519,
      "eval_runtime": 9.4955,
      "eval_samples_per_second": 314.147,
      "eval_steps_per_second": 19.693,
      "step": 4476
    },
    {
      "epoch": 6.032171581769437,
      "grad_norm": 21.95626449584961,
      "learning_rate": 2.488985163491725e-06,
      "loss": 0.082,
      "step": 4500
    },
    {
      "epoch": 6.702412868632708,
      "grad_norm": 0.22410695254802704,
      "learning_rate": 2.0685484804694742e-06,
      "loss": 0.065,
      "step": 5000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.7720415592193604,
      "eval_loss": 0.9457674026489258,
      "eval_runtime": 9.4723,
      "eval_samples_per_second": 314.918,
      "eval_steps_per_second": 19.742,
      "step": 5222
    },
    {
      "epoch": 7.372654155495979,
      "grad_norm": 53.26298904418945,
      "learning_rate": 1.6481117974472235e-06,
      "loss": 0.0629,
      "step": 5500
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.7740529775619507,
      "eval_loss": 0.9820337891578674,
      "eval_runtime": 9.5123,
      "eval_samples_per_second": 313.596,
      "eval_steps_per_second": 19.659,
      "step": 5968
    },
    {
      "epoch": 8.04289544235925,
      "grad_norm": 13.058491706848145,
      "learning_rate": 1.2276751144249726e-06,
      "loss": 0.0518,
      "step": 6000
    },
    {
      "epoch": 8.71313672922252,
      "grad_norm": 2.8812756538391113,
      "learning_rate": 8.072384314027216e-07,
      "loss": 0.0435,
      "step": 6500
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.7743881940841675,
      "eval_loss": 0.9872733354568481,
      "eval_runtime": 9.4773,
      "eval_samples_per_second": 314.753,
      "eval_steps_per_second": 19.731,
      "step": 6714
    },
    {
      "epoch": 9.383378016085791,
      "grad_norm": 2.8628196716308594,
      "learning_rate": 3.868017483804708e-07,
      "loss": 0.037,
      "step": 7000
    }
  ],
  "logging_steps": 500,
  "max_steps": 7460,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 1.8861065792123148e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": {
    "learning_rate": 6.272915310691983e-06,
    "per_device_train_batch_size": 16
  }
}
|
|