{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.605851979345955,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.17,
      "grad_norm": 1.3079030513763428,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.4896,
      "step": 100
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.4786765575408936,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.4113,
      "step": 200
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.2072057723999023,
      "learning_rate": 1.2e-05,
      "loss": 0.3924,
      "step": 300
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.5262311697006226,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.3768,
      "step": 400
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.114892840385437,
      "learning_rate": 2e-05,
      "loss": 0.353,
      "step": 500
    },
    {
      "epoch": 0.86,
      "eval_loss": 0.2806718051433563,
      "eval_runtime": 64.9966,
      "eval_samples_per_second": 15.385,
      "eval_steps_per_second": 3.846,
      "step": 500
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.1068037748336792,
      "learning_rate": 1.9555555555555557e-05,
      "loss": 0.3373,
      "step": 600
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.0337824821472168,
      "learning_rate": 1.9111111111111113e-05,
      "loss": 0.2773,
      "step": 700
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.1779000759124756,
      "learning_rate": 1.866666666666667e-05,
      "loss": 0.2757,
      "step": 800
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.2178924083709717,
      "learning_rate": 1.8222222222222224e-05,
      "loss": 0.2891,
      "step": 900
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.1651198863983154,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 0.2648,
      "step": 1000
    },
    {
      "epoch": 1.72,
      "eval_loss": 0.23123475909233093,
      "eval_runtime": 64.9606,
      "eval_samples_per_second": 15.394,
      "eval_steps_per_second": 3.848,
      "step": 1000
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.941031813621521,
      "learning_rate": 1.7333333333333336e-05,
      "loss": 0.2556,
      "step": 1100
    },
    {
      "epoch": 2.07,
      "grad_norm": 1.5538479089736938,
      "learning_rate": 1.688888888888889e-05,
      "loss": 0.2188,
      "step": 1200
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.7369763851165771,
      "learning_rate": 1.6444444444444444e-05,
      "loss": 0.1589,
      "step": 1300
    },
    {
      "epoch": 2.41,
      "grad_norm": 1.8593116998672485,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.1526,
      "step": 1400
    },
    {
      "epoch": 2.58,
      "grad_norm": 0.9025155305862427,
      "learning_rate": 1.555555555555556e-05,
      "loss": 0.1512,
      "step": 1500
    },
    {
      "epoch": 2.58,
      "eval_loss": 0.23454324901103973,
      "eval_runtime": 64.9882,
      "eval_samples_per_second": 15.387,
      "eval_steps_per_second": 3.847,
      "step": 1500
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.2440764904022217,
      "learning_rate": 1.5111111111111112e-05,
      "loss": 0.1596,
      "step": 1600
    },
    {
      "epoch": 2.93,
      "grad_norm": 0.9365782141685486,
      "learning_rate": 1.4666666666666666e-05,
      "loss": 0.1603,
      "step": 1700
    },
    {
      "epoch": 3.1,
      "grad_norm": 0.977448582649231,
      "learning_rate": 1.4222222222222224e-05,
      "loss": 0.1086,
      "step": 1800
    },
    {
      "epoch": 3.27,
      "grad_norm": 0.9493745565414429,
      "learning_rate": 1.377777777777778e-05,
      "loss": 0.0822,
      "step": 1900
    },
    {
      "epoch": 3.44,
      "grad_norm": 1.437554955482483,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.0838,
      "step": 2000
    },
    {
      "epoch": 3.44,
      "eval_loss": 0.258994460105896,
      "eval_runtime": 65.0096,
      "eval_samples_per_second": 15.382,
      "eval_steps_per_second": 3.846,
      "step": 2000
    },
    {
      "epoch": 3.61,
      "grad_norm": 1.1978514194488525,
      "learning_rate": 1.288888888888889e-05,
      "loss": 0.0849,
      "step": 2100
    },
    {
      "epoch": 3.79,
      "grad_norm": 1.1100072860717773,
      "learning_rate": 1.2444444444444446e-05,
      "loss": 0.0822,
      "step": 2200
    },
    {
      "epoch": 3.96,
      "grad_norm": 1.0157575607299805,
      "learning_rate": 1.2e-05,
      "loss": 0.0799,
      "step": 2300
    },
    {
      "epoch": 4.13,
      "grad_norm": 1.1923385858535767,
      "learning_rate": 1.1555555555555556e-05,
      "loss": 0.05,
      "step": 2400
    },
    {
      "epoch": 4.3,
      "grad_norm": 1.1894739866256714,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 0.0401,
      "step": 2500
    },
    {
      "epoch": 4.3,
      "eval_loss": 0.3230312466621399,
      "eval_runtime": 65.0356,
      "eval_samples_per_second": 15.376,
      "eval_steps_per_second": 3.844,
      "step": 2500
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.5902525186538696,
      "learning_rate": 1.0666666666666667e-05,
      "loss": 0.0404,
      "step": 2600
    },
    {
      "epoch": 4.65,
      "grad_norm": 1.3404467105865479,
      "learning_rate": 1.0222222222222223e-05,
      "loss": 0.0418,
      "step": 2700
    },
    {
      "epoch": 4.82,
      "grad_norm": 1.4325445890426636,
      "learning_rate": 9.777777777777779e-06,
      "loss": 0.0441,
      "step": 2800
    },
    {
      "epoch": 4.99,
      "grad_norm": 1.3849271535873413,
      "learning_rate": 9.333333333333334e-06,
      "loss": 0.0429,
      "step": 2900
    },
    {
      "epoch": 5.16,
      "grad_norm": 1.0262218713760376,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.0196,
      "step": 3000
    },
    {
      "epoch": 5.16,
      "eval_loss": 0.38104376196861267,
      "eval_runtime": 64.9932,
      "eval_samples_per_second": 15.386,
      "eval_steps_per_second": 3.847,
      "step": 3000
    },
    {
      "epoch": 5.34,
      "grad_norm": 1.4700788259506226,
      "learning_rate": 8.444444444444446e-06,
      "loss": 0.0205,
      "step": 3100
    },
    {
      "epoch": 5.51,
      "grad_norm": 0.9892378449440002,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.0207,
      "step": 3200
    },
    {
      "epoch": 5.68,
      "grad_norm": 0.4820214509963989,
      "learning_rate": 7.555555555555556e-06,
      "loss": 0.0204,
      "step": 3300
    },
    {
      "epoch": 5.85,
      "grad_norm": 1.2511119842529297,
      "learning_rate": 7.111111111111112e-06,
      "loss": 0.0228,
      "step": 3400
    },
    {
      "epoch": 6.02,
      "grad_norm": 0.32089540362358093,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.0197,
      "step": 3500
    },
    {
      "epoch": 6.02,
      "eval_loss": 0.3612504303455353,
      "eval_runtime": 65.1059,
      "eval_samples_per_second": 15.36,
      "eval_steps_per_second": 3.84,
      "step": 3500
    },
    {
      "epoch": 6.2,
      "grad_norm": 0.4687187671661377,
      "learning_rate": 6.222222222222223e-06,
      "loss": 0.0085,
      "step": 3600
    },
    {
      "epoch": 6.37,
      "grad_norm": 0.696907103061676,
      "learning_rate": 5.777777777777778e-06,
      "loss": 0.0096,
      "step": 3700
    },
    {
      "epoch": 6.54,
      "grad_norm": 0.5151516795158386,
      "learning_rate": 5.333333333333334e-06,
      "loss": 0.009,
      "step": 3800
    },
    {
      "epoch": 6.71,
      "grad_norm": 0.8852140307426453,
      "learning_rate": 4.888888888888889e-06,
      "loss": 0.012,
      "step": 3900
    },
    {
      "epoch": 6.88,
      "grad_norm": 0.8389159440994263,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.0093,
      "step": 4000
    },
    {
      "epoch": 6.88,
      "eval_loss": 0.3735957741737366,
      "eval_runtime": 65.1742,
      "eval_samples_per_second": 15.343,
      "eval_steps_per_second": 3.836,
      "step": 4000
    },
    {
      "epoch": 7.06,
      "grad_norm": 0.158839613199234,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.0071,
      "step": 4100
    },
    {
      "epoch": 7.23,
      "grad_norm": 0.4010579586029053,
      "learning_rate": 3.555555555555556e-06,
      "loss": 0.0047,
      "step": 4200
    },
    {
      "epoch": 7.4,
      "grad_norm": 0.16058029234409332,
      "learning_rate": 3.1111111111111116e-06,
      "loss": 0.0037,
      "step": 4300
    },
    {
      "epoch": 7.57,
      "grad_norm": 0.1508949100971222,
      "learning_rate": 2.666666666666667e-06,
      "loss": 0.0039,
      "step": 4400
    },
    {
      "epoch": 7.75,
      "grad_norm": 0.3169790804386139,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.0037,
      "step": 4500
    },
    {
      "epoch": 7.75,
      "eval_loss": 0.38906845450401306,
      "eval_runtime": 65.0854,
      "eval_samples_per_second": 15.364,
      "eval_steps_per_second": 3.841,
      "step": 4500
    },
    {
      "epoch": 7.92,
      "grad_norm": 0.46604883670806885,
      "learning_rate": 1.777777777777778e-06,
      "loss": 0.0043,
      "step": 4600
    },
    {
      "epoch": 8.09,
      "grad_norm": 0.08523246645927429,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 0.0042,
      "step": 4700
    },
    {
      "epoch": 8.26,
      "grad_norm": 0.2368686944246292,
      "learning_rate": 8.88888888888889e-07,
      "loss": 0.0032,
      "step": 4800
    },
    {
      "epoch": 8.43,
      "grad_norm": 0.11481987684965134,
      "learning_rate": 4.444444444444445e-07,
      "loss": 0.0019,
      "step": 4900
    },
    {
      "epoch": 8.61,
      "grad_norm": 0.13208821415901184,
      "learning_rate": 0.0,
      "loss": 0.002,
      "step": 5000
    },
    {
      "epoch": 8.61,
      "eval_loss": 0.4305289387702942,
      "eval_runtime": 65.0773,
      "eval_samples_per_second": 15.366,
      "eval_steps_per_second": 3.842,
      "step": 5000
    }
  ],
  "logging_steps": 100,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9,
  "save_steps": 500,
  "total_flos": 1.258569996863275e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}