{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.148272017837236, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.56, "grad_norm": 4.136017322540283, "learning_rate": 4.75e-05, "loss": 1.4862, "step": 500 }, { "epoch": 1.11, "grad_norm": 3.5059304237365723, "learning_rate": 4.5e-05, "loss": 1.2182, "step": 1000 }, { "epoch": 1.67, "grad_norm": 3.1735312938690186, "learning_rate": 4.25e-05, "loss": 1.0549, "step": 1500 }, { "epoch": 2.23, "grad_norm": 3.1064469814300537, "learning_rate": 4e-05, "loss": 0.9527, "step": 2000 }, { "epoch": 2.79, "grad_norm": 2.8415098190307617, "learning_rate": 3.7500000000000003e-05, "loss": 0.8717, "step": 2500 }, { "epoch": 3.34, "grad_norm": 2.4496607780456543, "learning_rate": 3.5e-05, "loss": 0.783, "step": 3000 }, { "epoch": 3.9, "grad_norm": 2.9042794704437256, "learning_rate": 3.2500000000000004e-05, "loss": 0.7372, "step": 3500 }, { "epoch": 4.46, "grad_norm": 2.628190517425537, "learning_rate": 3e-05, "loss": 0.6509, "step": 4000 }, { "epoch": 5.02, "grad_norm": 2.9309325218200684, "learning_rate": 2.7500000000000004e-05, "loss": 0.6398, "step": 4500 }, { "epoch": 5.57, "grad_norm": 2.357445240020752, "learning_rate": 2.5e-05, "loss": 0.56, "step": 5000 }, { "epoch": 6.13, "grad_norm": 2.1420226097106934, "learning_rate": 2.25e-05, "loss": 0.5543, "step": 5500 }, { "epoch": 6.69, "grad_norm": 2.6394078731536865, "learning_rate": 2e-05, "loss": 0.5104, "step": 6000 }, { "epoch": 7.25, "grad_norm": 2.395305633544922, "learning_rate": 1.75e-05, "loss": 0.4832, "step": 6500 }, { "epoch": 7.8, "grad_norm": 2.048461675643921, "learning_rate": 1.5e-05, "loss": 0.4705, "step": 7000 }, { "epoch": 8.36, "grad_norm": 2.379493236541748, "learning_rate": 1.25e-05, "loss": 0.442, "step": 7500 }, { "epoch": 8.92, "grad_norm": 2.170802116394043, "learning_rate": 1e-05, "loss": 0.4264, "step": 8000 }, { "epoch": 9.48, "grad_norm": 2.277756690979004, "learning_rate": 7.5e-06, "loss": 0.405, "step": 8500 }, { "epoch": 10.03, "grad_norm": 2.3412973880767822, "learning_rate": 5e-06, "loss": 0.4069, "step": 9000 }, { "epoch": 10.59, "grad_norm": 2.220499038696289, "learning_rate": 2.5e-06, "loss": 0.3937, "step": 9500 }, { "epoch": 11.15, "grad_norm": 2.1124320030212402, "learning_rate": 0.0, "loss": 0.3872, "step": 10000 }, { "epoch": 11.15, "step": 10000, "total_flos": 5225840640000000.0, "train_loss": 0.6717062408447265, "train_runtime": 2130.1373, "train_samples_per_second": 4.695, "train_steps_per_second": 4.695 } ], "logging_steps": 500, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 1000, "total_flos": 5225840640000000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }