{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.71313672922252, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6702412868632708, "grad_norm": 17.923147201538086, "learning_rate": 1.3001640168623715e-05, "loss": 0.5514, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.7683539986610413, "eval_loss": 0.46452510356903076, "eval_runtime": 8.9706, "eval_samples_per_second": 332.531, "eval_steps_per_second": 20.846, "step": 746 }, { "epoch": 1.3404825737265416, "grad_norm": 19.32151222229004, "learning_rate": 1.2067614294440976e-05, "loss": 0.3969, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.7643312215805054, "eval_loss": 0.5671606063842773, "eval_runtime": 9.0896, "eval_samples_per_second": 328.177, "eval_steps_per_second": 20.573, "step": 1492 }, { "epoch": 2.0107238605898123, "grad_norm": 17.702476501464844, "learning_rate": 1.1133588420258238e-05, "loss": 0.2998, "step": 1500 }, { "epoch": 2.680965147453083, "grad_norm": 6.516891956329346, "learning_rate": 1.0199562546075501e-05, "loss": 0.1319, "step": 2000 }, { "epoch": 3.0, "eval_accuracy": 0.7770700454711914, "eval_loss": 0.7936307191848755, "eval_runtime": 9.0031, "eval_samples_per_second": 331.332, "eval_steps_per_second": 20.771, "step": 2238 }, { "epoch": 3.351206434316354, "grad_norm": 0.19944968819618225, "learning_rate": 9.265536671892763e-06, "loss": 0.0932, "step": 2500 }, { "epoch": 4.0, "eval_accuracy": 0.7750586867332458, "eval_loss": 1.1848183870315552, "eval_runtime": 9.064, "eval_samples_per_second": 329.104, "eval_steps_per_second": 20.631, "step": 2984 }, { "epoch": 4.021447721179625, "grad_norm": 0.03240065276622772, "learning_rate": 8.331510797710023e-06, "loss": 0.0618, "step": 3000 }, { "epoch": 4.6916890080428955, "grad_norm": 23.173229217529297, "learning_rate": 7.397484923527286e-06, "loss": 0.0314, "step": 3500 }, { "epoch": 5.0, "eval_accuracy": 0.7707006335258484, "eval_loss": 1.372557520866394, "eval_runtime": 9.0505, "eval_samples_per_second": 329.594, "eval_steps_per_second": 20.662, "step": 3730 }, { "epoch": 5.361930294906166, "grad_norm": 0.01009325310587883, "learning_rate": 6.463459049344548e-06, "loss": 0.0281, "step": 4000 }, { "epoch": 6.0, "eval_accuracy": 0.777405321598053, "eval_loss": 1.31714928150177, "eval_runtime": 9.0862, "eval_samples_per_second": 328.299, "eval_steps_per_second": 20.581, "step": 4476 }, { "epoch": 6.032171581769437, "grad_norm": 0.6762986779212952, "learning_rate": 5.529433175161809e-06, "loss": 0.0202, "step": 4500 }, { "epoch": 6.702412868632708, "grad_norm": 0.610667884349823, "learning_rate": 4.595407300979072e-06, "loss": 0.0144, "step": 5000 }, { "epoch": 7.0, "eval_accuracy": 0.7767348289489746, "eval_loss": 1.4879707098007202, "eval_runtime": 8.9951, "eval_samples_per_second": 331.626, "eval_steps_per_second": 20.789, "step": 5222 }, { "epoch": 7.372654155495979, "grad_norm": 0.01202669832855463, "learning_rate": 3.6613814267963338e-06, "loss": 0.0084, "step": 5500 }, { "epoch": 8.0, "eval_accuracy": 0.7763996124267578, "eval_loss": 1.6179240942001343, "eval_runtime": 9.0802, "eval_samples_per_second": 328.518, "eval_steps_per_second": 20.594, "step": 5968 }, { "epoch": 8.04289544235925, "grad_norm": 0.007873074151575565, "learning_rate": 2.7273555526135954e-06, "loss": 0.0046, "step": 6000 }, { "epoch": 8.71313672922252, "grad_norm": 0.029083162546157837, "learning_rate": 1.7933296784308572e-06, "loss": 0.0044, "step": 6500 } ], "logging_steps": 500, "max_steps": 7460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.5429010800414744e+16, "train_batch_size": 16, "trial_name": null, "trial_params": { "learning_rate": 1.3935666042806453e-05, "per_device_train_batch_size": 16 } }