{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2416626389560174, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 75.8236083984375, "learning_rate": 4.0000000000000003e-07, "loss": 3.7148, "step": 10 }, { "epoch": 0.01, "grad_norm": 52.23726272583008, "learning_rate": 9e-07, "loss": 3.2721, "step": 20 }, { "epoch": 0.01, "grad_norm": 35.391998291015625, "learning_rate": 1.4000000000000001e-06, "loss": 2.4262, "step": 30 }, { "epoch": 0.02, "grad_norm": 15.225934028625488, "learning_rate": 1.9e-06, "loss": 1.8828, "step": 40 }, { "epoch": 0.02, "grad_norm": 13.488654136657715, "learning_rate": 2.4000000000000003e-06, "loss": 1.5398, "step": 50 }, { "epoch": 0.03, "grad_norm": 8.773133277893066, "learning_rate": 2.8500000000000002e-06, "loss": 1.2772, "step": 60 }, { "epoch": 0.03, "grad_norm": 6.736795902252197, "learning_rate": 3.3500000000000005e-06, "loss": 1.1159, "step": 70 }, { "epoch": 0.04, "grad_norm": 2.9463019371032715, "learning_rate": 3.85e-06, "loss": 0.9883, "step": 80 }, { "epoch": 0.04, "grad_norm": 1.3851969242095947, "learning_rate": 4.35e-06, "loss": 0.902, "step": 90 }, { "epoch": 0.05, "grad_norm": 1.4642181396484375, "learning_rate": 4.85e-06, "loss": 0.8701, "step": 100 }, { "epoch": 0.05, "grad_norm": 0.6514687538146973, "learning_rate": 5.3500000000000004e-06, "loss": 0.8548, "step": 110 }, { "epoch": 0.06, "grad_norm": 0.4448617994785309, "learning_rate": 5.850000000000001e-06, "loss": 0.8038, "step": 120 }, { "epoch": 0.06, "grad_norm": 0.5425733327865601, "learning_rate": 6.35e-06, "loss": 0.825, "step": 130 }, { "epoch": 0.07, "grad_norm": 0.47352057695388794, "learning_rate": 6.8500000000000005e-06, "loss": 0.812, "step": 140 }, { "epoch": 0.07, "grad_norm": 0.40401744842529297, "learning_rate": 7.35e-06, "loss": 0.7912, "step": 150 }, { "epoch": 0.08, "grad_norm": 0.3112718164920807, "learning_rate": 7.850000000000001e-06, "loss": 0.8057, "step": 160 }, { "epoch": 0.08, "grad_norm": 0.45351892709732056, "learning_rate": 8.350000000000001e-06, "loss": 0.7912, "step": 170 }, { "epoch": 0.09, "grad_norm": 0.38080641627311707, "learning_rate": 8.85e-06, "loss": 0.7956, "step": 180 }, { "epoch": 0.09, "grad_norm": 0.22949431836605072, "learning_rate": 9.35e-06, "loss": 0.7785, "step": 190 }, { "epoch": 0.1, "grad_norm": 0.8331744074821472, "learning_rate": 9.85e-06, "loss": 0.7769, "step": 200 }, { "epoch": 0.1, "grad_norm": 0.30213871598243713, "learning_rate": 1.035e-05, "loss": 0.7797, "step": 210 }, { "epoch": 0.11, "grad_norm": 0.32015591859817505, "learning_rate": 1.0850000000000001e-05, "loss": 0.7941, "step": 220 }, { "epoch": 0.11, "grad_norm": 0.22447015345096588, "learning_rate": 1.1350000000000001e-05, "loss": 0.8014, "step": 230 }, { "epoch": 0.12, "grad_norm": 0.2546006143093109, "learning_rate": 1.185e-05, "loss": 0.771, "step": 240 }, { "epoch": 0.12, "grad_norm": 0.23833389580249786, "learning_rate": 1.235e-05, "loss": 0.7956, "step": 250 }, { "epoch": 0.13, "grad_norm": 0.1831669956445694, "learning_rate": 1.285e-05, "loss": 0.7833, "step": 260 }, { "epoch": 0.13, "grad_norm": 0.19860319793224335, "learning_rate": 1.3350000000000001e-05, "loss": 0.7956, "step": 270 }, { "epoch": 0.14, "grad_norm": 0.21220380067825317, "learning_rate": 1.3850000000000001e-05, "loss": 0.7942, "step": 280 }, { "epoch": 0.14, "grad_norm": 0.26025038957595825, "learning_rate": 1.435e-05, "loss": 0.7877, "step": 290 }, { "epoch": 0.14, "grad_norm": 0.2029470056295395, "learning_rate": 1.485e-05, "loss": 0.7637, "step": 300 }, { "epoch": 0.15, "grad_norm": 0.21639183163642883, "learning_rate": 1.535e-05, "loss": 0.7885, "step": 310 }, { "epoch": 0.15, "grad_norm": 0.23670387268066406, "learning_rate": 1.5850000000000002e-05, "loss": 0.7558, "step": 320 }, { "epoch": 0.16, "grad_norm": 0.2855335772037506, "learning_rate": 1.635e-05, "loss": 0.7854, "step": 330 }, { "epoch": 0.16, "grad_norm": 0.21515759825706482, "learning_rate": 1.6850000000000003e-05, "loss": 0.7577, "step": 340 }, { "epoch": 0.17, "grad_norm": 0.18981383740901947, "learning_rate": 1.7349999999999998e-05, "loss": 0.7802, "step": 350 }, { "epoch": 0.17, "grad_norm": 0.2674611508846283, "learning_rate": 1.785e-05, "loss": 0.7809, "step": 360 }, { "epoch": 0.18, "grad_norm": 0.24821774661540985, "learning_rate": 1.8350000000000002e-05, "loss": 0.7965, "step": 370 }, { "epoch": 0.18, "grad_norm": 0.3644929528236389, "learning_rate": 1.885e-05, "loss": 0.7957, "step": 380 }, { "epoch": 0.19, "grad_norm": 0.3568972647190094, "learning_rate": 1.9350000000000003e-05, "loss": 0.7882, "step": 390 }, { "epoch": 0.19, "grad_norm": 0.3236319422721863, "learning_rate": 1.985e-05, "loss": 0.7841, "step": 400 }, { "epoch": 0.2, "grad_norm": 0.24553723633289337, "learning_rate": 2.035e-05, "loss": 0.7824, "step": 410 }, { "epoch": 0.2, "grad_norm": 0.1855400949716568, "learning_rate": 2.085e-05, "loss": 0.7645, "step": 420 }, { "epoch": 0.21, "grad_norm": 0.26212435960769653, "learning_rate": 2.135e-05, "loss": 0.7824, "step": 430 }, { "epoch": 0.21, "grad_norm": 0.3167509138584137, "learning_rate": 2.1850000000000003e-05, "loss": 0.7908, "step": 440 }, { "epoch": 0.22, "grad_norm": 0.21845023334026337, "learning_rate": 2.235e-05, "loss": 0.7888, "step": 450 }, { "epoch": 0.22, "grad_norm": 0.30318892002105713, "learning_rate": 2.2850000000000003e-05, "loss": 0.7682, "step": 460 }, { "epoch": 0.23, "grad_norm": 0.28828638792037964, "learning_rate": 2.3350000000000002e-05, "loss": 0.7588, "step": 470 }, { "epoch": 0.23, "grad_norm": 0.2422240674495697, "learning_rate": 2.385e-05, "loss": 0.7787, "step": 480 }, { "epoch": 0.24, "grad_norm": 0.3579326272010803, "learning_rate": 2.435e-05, "loss": 0.7932, "step": 490 }, { "epoch": 0.24, "grad_norm": 0.3549540042877197, "learning_rate": 2.485e-05, "loss": 0.7634, "step": 500 }, { "epoch": 0.24, "eval_loss": 0.7744565606117249, "eval_runtime": 43.9626, "eval_samples_per_second": 45.493, "eval_steps_per_second": 0.364, "step": 500 } ], "logging_steps": 10, "max_steps": 10345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.2648108400962437e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }