{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997401022264576, "eval_steps": 500, "global_step": 2885, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 22.53300666809082, "learning_rate": 6.71280276816609e-06, "loss": 3.1491, "step": 100 }, { "epoch": 0.07, "grad_norm": 36.242984771728516, "learning_rate": 1.356401384083045e-05, "loss": 2.6131, "step": 200 }, { "epoch": 0.1, "grad_norm": 25.216327667236328, "learning_rate": 1.999973639055537e-05, "loss": 2.5726, "step": 300 }, { "epoch": 0.14, "grad_norm": 14.273802757263184, "learning_rate": 1.9917836961775225e-05, "loss": 2.4989, "step": 400 }, { "epoch": 0.17, "grad_norm": 27.216812133789062, "learning_rate": 1.969086765436979e-05, "loss": 2.5906, "step": 500 }, { "epoch": 0.21, "grad_norm": 18.74100112915039, "learning_rate": 1.9322148386785378e-05, "loss": 2.4275, "step": 600 }, { "epoch": 0.24, "grad_norm": 20.627084732055664, "learning_rate": 1.8817072478109763e-05, "loss": 2.5103, "step": 700 }, { "epoch": 0.28, "grad_norm": 15.611855506896973, "learning_rate": 1.818302775908169e-05, "loss": 2.3706, "step": 800 }, { "epoch": 0.31, "grad_norm": 25.303524017333984, "learning_rate": 1.7429288509041197e-05, "loss": 2.3601, "step": 900 }, { "epoch": 0.35, "grad_norm": 20.18657875061035, "learning_rate": 1.6566879799477148e-05, "loss": 2.5054, "step": 1000 }, { "epoch": 0.38, "grad_norm": 17.65004539489746, "learning_rate": 1.560841622844192e-05, "loss": 2.3717, "step": 1100 }, { "epoch": 0.42, "grad_norm": 19.5482177734375, "learning_rate": 1.4578679381126853e-05, "loss": 2.3772, "step": 1200 }, { "epoch": 0.45, "grad_norm": 14.92688274383545, "learning_rate": 1.3471954275891059e-05, "loss": 2.2991, "step": 1300 }, { "epoch": 0.49, "grad_norm": 10.425432205200195, "learning_rate": 1.2314444308256605e-05, "loss": 2.2865, "step": 1400 }, { "epoch": 0.52, "grad_norm": 16.403301239013672, "learning_rate": 1.1123080572287608e-05, "loss": 2.2595, "step": 1500 }, { "epoch": 0.55, "grad_norm": 11.935959815979004, "learning_rate": 9.915289346843219e-06, "loss": 2.3662, "step": 1600 }, { "epoch": 0.59, "grad_norm": 18.410987854003906, "learning_rate": 8.708737198449509e-06, "loss": 2.2021, "step": 1700 }, { "epoch": 0.62, "grad_norm": 15.293601036071777, "learning_rate": 7.521072569442963e-06, "loss": 2.2545, "step": 1800 }, { "epoch": 0.66, "grad_norm": 16.34610939025879, "learning_rate": 6.369667631219584e-06, "loss": 2.3199, "step": 1900 }, { "epoch": 0.69, "grad_norm": 15.948208808898926, "learning_rate": 5.2713641785457504e-06, "loss": 2.2029, "step": 2000 }, { "epoch": 0.73, "grad_norm": 27.17706298828125, "learning_rate": 4.242227281777747e-06, "loss": 2.2861, "step": 2100 }, { "epoch": 0.76, "grad_norm": 19.407489776611328, "learning_rate": 3.297310300360622e-06, "loss": 2.2157, "step": 2200 }, { "epoch": 0.8, "grad_norm": 11.622710227966309, "learning_rate": 2.450434694793621e-06, "loss": 2.2724, "step": 2300 }, { "epoch": 0.83, "grad_norm": 16.701732635498047, "learning_rate": 1.7139878577898772e-06, "loss": 2.1622, "step": 2400 }, { "epoch": 0.87, "grad_norm": 10.720149040222168, "learning_rate": 1.0987419217881333e-06, "loss": 2.2026, "step": 2500 }, { "epoch": 0.9, "grad_norm": 14.398381233215332, "learning_rate": 6.136961931496943e-07, "loss": 2.2619, "step": 2600 }, { "epoch": 0.94, "grad_norm": 16.95086669921875, "learning_rate": 2.6594551778223896e-07, "loss": 2.2626, "step": 2700 }, { "epoch": 0.97, "grad_norm": 12.132495880126953, "learning_rate": 6.057650362879753e-08, "loss": 2.1139, "step": 2800 }, { "epoch": 1.0, "step": 2885, "total_flos": 1.1131515504795648e+16, "train_loss": 2.3624265621191913, "train_runtime": 6766.2575, "train_samples_per_second": 1.706, "train_steps_per_second": 0.426 } ], "logging_steps": 100, "max_steps": 2885, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1.1131515504795648e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }