{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.112676056338028, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0880281690140845, "grad_norm": 1643.14404296875, "learning_rate": 4.166666666666667e-06, "loss": 136.5152, "step": 100 }, { "epoch": 0.176056338028169, "grad_norm": 1259.6392822265625, "learning_rate": 8.333333333333334e-06, "loss": 127.3215, "step": 200 }, { "epoch": 0.2640845070422535, "grad_norm": 1478.6107177734375, "learning_rate": 9.980973490458728e-06, "loss": 137.0093, "step": 300 }, { "epoch": 0.352112676056338, "grad_norm": 586.53076171875, "learning_rate": 9.86522435289912e-06, "loss": 136.2538, "step": 400 }, { "epoch": 0.44014084507042256, "grad_norm": 748.0945434570312, "learning_rate": 9.646737621134112e-06, "loss": 110.4259, "step": 500 }, { "epoch": 0.528169014084507, "grad_norm": 1178.682373046875, "learning_rate": 9.330127018922195e-06, "loss": 99.7847, "step": 600 }, { "epoch": 0.6161971830985915, "grad_norm": 890.4047241210938, "learning_rate": 8.92207832459788e-06, "loss": 111.7352, "step": 700 }, { "epoch": 0.704225352112676, "grad_norm": 333.7227783203125, "learning_rate": 8.43120818934367e-06, "loss": 104.0762, "step": 800 }, { "epoch": 0.7922535211267606, "grad_norm": 1009.5026245117188, "learning_rate": 7.86788218175523e-06, "loss": 103.2902, "step": 900 }, { "epoch": 0.8802816901408451, "grad_norm": 542.9478759765625, "learning_rate": 7.243995901002312e-06, "loss": 107.7297, "step": 1000 }, { "epoch": 0.9683098591549296, "grad_norm": 864.6980590820312, "learning_rate": 6.572723780758069e-06, "loss": 88.3369, "step": 1100 }, { "epoch": 1.0, "eval_loss": 200.542236328125, "eval_runtime": 31.8227, "eval_samples_per_second": 31.738, "eval_steps_per_second": 3.991, "step": 1136 }, { "epoch": 1.056338028169014, "grad_norm": 1790.6683349609375, "learning_rate": 5.8682408883346535e-06, "loss": 91.1943, "step": 1200 }, { "epoch": 1.1443661971830985, "grad_norm": 1222.1258544921875, "learning_rate": 5.145423593715558e-06, "loss": 101.148, "step": 1300 }, { "epoch": 1.232394366197183, "grad_norm": 352.8750915527344, "learning_rate": 4.4195354293738484e-06, "loss": 74.3622, "step": 1400 }, { "epoch": 1.3204225352112675, "grad_norm": 2005.60986328125, "learning_rate": 3.705904774487396e-06, "loss": 85.063, "step": 1500 }, { "epoch": 1.408450704225352, "grad_norm": 680.5648193359375, "learning_rate": 3.019601169804216e-06, "loss": 81.2217, "step": 1600 }, { "epoch": 1.4964788732394365, "grad_norm": 394.0007629394531, "learning_rate": 2.3751170983272e-06, "loss": 83.0847, "step": 1700 }, { "epoch": 1.584507042253521, "grad_norm": 868.7428588867188, "learning_rate": 1.7860619515673034e-06, "loss": 68.1166, "step": 1800 }, { "epoch": 1.6725352112676055, "grad_norm": 275.2021179199219, "learning_rate": 1.264874643795021e-06, "loss": 88.6323, "step": 1900 }, { "epoch": 1.76056338028169, "grad_norm": 370.32763671875, "learning_rate": 8.225609429353187e-07, "loss": 75.0886, "step": 2000 }, { "epoch": 1.8485915492957745, "grad_norm": 514.0765380859375, "learning_rate": 4.6846106481675035e-07, "loss": 70.9229, "step": 2100 }, { "epoch": 1.936619718309859, "grad_norm": 687.5501708984375, "learning_rate": 2.1005243842255552e-07, "loss": 77.3133, "step": 2200 }, { "epoch": 2.0, "eval_loss": 195.50889587402344, "eval_runtime": 32.1653, "eval_samples_per_second": 31.4, "eval_steps_per_second": 3.948, "step": 2272 }, { "epoch": 2.0246478873239435, "grad_norm": 885.4177856445312, "learning_rate": 5.279180709527765e-08, "loss": 76.5441, "step": 2300 }, { "epoch": 2.112676056338028, "grad_norm": 815.9951782226562, "learning_rate": 0.0, "loss": 72.9512, "step": 2400 } ], "logging_steps": 100, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 600, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }