{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 13336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15, "grad_norm": 5.067293643951416, "learning_rate": 9.625074985003e-06, "loss": 1.3508, "step": 500 }, { "epoch": 0.3, "grad_norm": 6.026179313659668, "learning_rate": 9.250149970005999e-06, "loss": 1.3055, "step": 1000 }, { "epoch": 0.45, "grad_norm": 6.276480674743652, "learning_rate": 8.875224955008999e-06, "loss": 1.2235, "step": 1500 }, { "epoch": 0.6, "grad_norm": 5.172286510467529, "learning_rate": 8.500299940011997e-06, "loss": 1.2637, "step": 2000 }, { "epoch": 0.75, "grad_norm": 5.915102958679199, "learning_rate": 8.125374925014997e-06, "loss": 1.2232, "step": 2500 }, { "epoch": 0.9, "grad_norm": 5.094091415405273, "learning_rate": 7.750449910017997e-06, "loss": 1.1814, "step": 3000 }, { "epoch": 1.05, "grad_norm": 4.9657793045043945, "learning_rate": 7.375524895020996e-06, "loss": 1.1449, "step": 3500 }, { "epoch": 1.2, "grad_norm": 5.007996082305908, "learning_rate": 7.000599880023996e-06, "loss": 1.0923, "step": 4000 }, { "epoch": 1.35, "grad_norm": 5.494340896606445, "learning_rate": 6.6256748650269955e-06, "loss": 1.0908, "step": 4500 }, { "epoch": 1.5, "grad_norm": 4.9410271644592285, "learning_rate": 6.250749850029995e-06, "loss": 1.0804, "step": 5000 }, { "epoch": 1.65, "grad_norm": 5.13407564163208, "learning_rate": 5.875824835032994e-06, "loss": 1.0556, "step": 5500 }, { "epoch": 1.8, "grad_norm": 4.397137641906738, "learning_rate": 5.500899820035993e-06, "loss": 1.0886, "step": 6000 }, { "epoch": 1.95, "grad_norm": 5.924018383026123, "learning_rate": 5.125974805038992e-06, "loss": 1.0694, "step": 6500 }, { "epoch": 2.1, "grad_norm": 3.952533006668091, "learning_rate": 4.751049790041992e-06, "loss": 1.0158, "step": 7000 }, { "epoch": 2.25, "grad_norm": 6.275745868682861, "learning_rate": 4.376124775044991e-06, "loss": 1.0082, "step": 7500 }, { "epoch": 2.4, "grad_norm": 7.6413116455078125, "learning_rate": 4.001199760047991e-06, "loss": 0.9991, "step": 8000 }, { "epoch": 2.55, "grad_norm": 5.2266387939453125, "learning_rate": 3.6262747450509898e-06, "loss": 0.976, "step": 8500 }, { "epoch": 2.7, "grad_norm": 4.824028968811035, "learning_rate": 3.2513497300539893e-06, "loss": 0.9826, "step": 9000 }, { "epoch": 2.85, "grad_norm": 7.193837642669678, "learning_rate": 2.876424715056989e-06, "loss": 0.9731, "step": 9500 }, { "epoch": 3.0, "grad_norm": 6.571595191955566, "learning_rate": 2.5014997000599884e-06, "loss": 1.0054, "step": 10000 }, { "epoch": 3.15, "grad_norm": 4.61974573135376, "learning_rate": 2.1265746850629876e-06, "loss": 0.9583, "step": 10500 }, { "epoch": 3.3, "grad_norm": 5.337657451629639, "learning_rate": 1.751649670065987e-06, "loss": 0.9296, "step": 11000 }, { "epoch": 3.45, "grad_norm": 5.575818061828613, "learning_rate": 1.3767246550689864e-06, "loss": 0.9576, "step": 11500 }, { "epoch": 3.6, "grad_norm": 6.3436431884765625, "learning_rate": 1.0017996400719856e-06, "loss": 0.9241, "step": 12000 }, { "epoch": 3.75, "grad_norm": 3.276711940765381, "learning_rate": 6.26874625074985e-07, "loss": 0.9307, "step": 12500 }, { "epoch": 3.9, "grad_norm": 5.67604923248291, "learning_rate": 2.519496100779844e-07, "loss": 0.9255, "step": 13000 }, { "epoch": 4.0, "step": 13336, "total_flos": 1.66894187000832e+16, "train_loss": 1.0640255470939504, "train_runtime": 2426.9638, "train_samples_per_second": 16.481, "train_steps_per_second": 5.495 } ], "logging_steps": 500, "max_steps": 13336, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 4000, "total_flos": 1.66894187000832e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }