{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.71313672922252, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6702412868632708, "grad_norm": 12.96187973022461, "learning_rate": 5.852478627669732e-06, "loss": 0.5779, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.7619845867156982, "eval_loss": 0.48268476128578186, "eval_runtime": 9.469, "eval_samples_per_second": 315.027, "eval_steps_per_second": 19.749, "step": 746 }, { "epoch": 1.3404825737265416, "grad_norm": 19.64900779724121, "learning_rate": 5.432041944647482e-06, "loss": 0.464, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.7680187821388245, "eval_loss": 0.5030698180198669, "eval_runtime": 9.5015, "eval_samples_per_second": 313.951, "eval_steps_per_second": 19.681, "step": 1492 }, { "epoch": 2.0107238605898123, "grad_norm": 9.11796760559082, "learning_rate": 5.0116052616252305e-06, "loss": 0.4025, "step": 1500 }, { "epoch": 2.680965147453083, "grad_norm": 11.002016067504883, "learning_rate": 4.59116857860298e-06, "loss": 0.2688, "step": 2000 }, { "epoch": 3.0, "eval_accuracy": 0.7747234106063843, "eval_loss": 0.5899690389633179, "eval_runtime": 9.4806, "eval_samples_per_second": 314.643, "eval_steps_per_second": 19.725, "step": 2238 }, { "epoch": 3.351206434316354, "grad_norm": 3.2045211791992188, "learning_rate": 4.170731895580729e-06, "loss": 0.2182, "step": 2500 }, { "epoch": 4.0, "eval_accuracy": 0.7673482894897461, "eval_loss": 0.7394132614135742, "eval_runtime": 9.6202, "eval_samples_per_second": 310.077, "eval_steps_per_second": 19.438, "step": 2984 }, { "epoch": 4.021447721179625, "grad_norm": 0.5618192553520203, "learning_rate": 3.7502952125584775e-06, "loss": 0.1644, "step": 3000 }, { "epoch": 4.6916890080428955, "grad_norm": 19.86639976501465, "learning_rate": 3.3298585295362272e-06, "loss": 0.1173, "step": 3500 }, { "epoch": 5.0, "eval_accuracy": 0.7646664381027222, "eval_loss": 0.7920505404472351, "eval_runtime": 9.4974, "eval_samples_per_second": 314.085, "eval_steps_per_second": 19.69, "step": 3730 }, { "epoch": 5.361930294906166, "grad_norm": 7.497531414031982, "learning_rate": 2.909421846513976e-06, "loss": 0.1026, "step": 4000 }, { "epoch": 6.0, "eval_accuracy": 0.7740529775619507, "eval_loss": 0.9123848676681519, "eval_runtime": 9.4955, "eval_samples_per_second": 314.147, "eval_steps_per_second": 19.693, "step": 4476 }, { "epoch": 6.032171581769437, "grad_norm": 21.95626449584961, "learning_rate": 2.488985163491725e-06, "loss": 0.082, "step": 4500 }, { "epoch": 6.702412868632708, "grad_norm": 0.22410695254802704, "learning_rate": 2.0685484804694742e-06, "loss": 0.065, "step": 5000 }, { "epoch": 7.0, "eval_accuracy": 0.7720415592193604, "eval_loss": 0.9457674026489258, "eval_runtime": 9.4723, "eval_samples_per_second": 314.918, "eval_steps_per_second": 19.742, "step": 5222 }, { "epoch": 7.372654155495979, "grad_norm": 53.26298904418945, "learning_rate": 1.6481117974472235e-06, "loss": 0.0629, "step": 5500 }, { "epoch": 8.0, "eval_accuracy": 0.7740529775619507, "eval_loss": 0.9820337891578674, "eval_runtime": 9.5123, "eval_samples_per_second": 313.596, "eval_steps_per_second": 19.659, "step": 5968 }, { "epoch": 8.04289544235925, "grad_norm": 13.058491706848145, "learning_rate": 1.2276751144249726e-06, "loss": 0.0518, "step": 6000 }, { "epoch": 8.71313672922252, "grad_norm": 2.8812756538391113, "learning_rate": 8.072384314027216e-07, "loss": 0.0435, "step": 6500 } ], "logging_steps": 500, "max_steps": 7460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.7562958955832816e+16, "train_batch_size": 16, "trial_name": null, "trial_params": { "learning_rate": 6.272915310691983e-06, "per_device_train_batch_size": 16 } }