{ "best_metric": null, "best_model_checkpoint": null, "epoch": 32.6530612244898, "eval_steps": 20, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.82, "grad_norm": 0.18377472460269928, "learning_rate": 2.9999999999999997e-05, "loss": 1.861, "step": 10 }, { "epoch": 1.63, "grad_norm": 0.35202744603157043, "learning_rate": 5.9999999999999995e-05, "loss": 1.7263, "step": 20 }, { "epoch": 1.63, "eval_loss": 1.4457755088806152, "eval_runtime": 89.8938, "eval_samples_per_second": 4.305, "eval_steps_per_second": 0.545, "step": 20 }, { "epoch": 2.45, "grad_norm": 0.928983747959137, "learning_rate": 8.999999999999999e-05, "loss": 1.1718, "step": 30 }, { "epoch": 3.27, "grad_norm": 0.253262996673584, "learning_rate": 0.00011999999999999999, "loss": 0.4789, "step": 40 }, { "epoch": 3.27, "eval_loss": 0.3332095146179199, "eval_runtime": 89.9804, "eval_samples_per_second": 4.301, "eval_steps_per_second": 0.545, "step": 40 }, { "epoch": 4.08, "grad_norm": 0.12236642092466354, "learning_rate": 0.00015, "loss": 0.3568, "step": 50 }, { "epoch": 4.9, "grad_norm": 0.09160923212766647, "learning_rate": 0.00017999999999999998, "loss": 0.3256, "step": 60 }, { "epoch": 4.9, "eval_loss": 0.2753114104270935, "eval_runtime": 90.3206, "eval_samples_per_second": 4.285, "eval_steps_per_second": 0.543, "step": 60 }, { "epoch": 5.71, "grad_norm": 0.10242326557636261, "learning_rate": 0.00020999999999999998, "loss": 0.2841, "step": 70 }, { "epoch": 6.53, "grad_norm": 0.1305350810289383, "learning_rate": 0.00023999999999999998, "loss": 0.2615, "step": 80 }, { "epoch": 6.53, "eval_loss": 0.2476309835910797, "eval_runtime": 90.525, "eval_samples_per_second": 4.275, "eval_steps_per_second": 0.541, "step": 80 }, { "epoch": 7.35, "grad_norm": 0.17941106855869293, "learning_rate": 0.00027, "loss": 0.2216, "step": 90 }, { "epoch": 8.16, "grad_norm": 0.20095375180244446, "learning_rate": 0.0003, "loss": 0.1832, "step": 100 }, { "epoch": 8.16, "eval_loss": 0.2350914180278778, "eval_runtime": 90.3919, "eval_samples_per_second": 4.281, "eval_steps_per_second": 0.542, "step": 100 }, { "epoch": 8.98, "grad_norm": 0.2600422501564026, "learning_rate": 0.00029, "loss": 0.1441, "step": 110 }, { "epoch": 9.8, "grad_norm": 0.20544037222862244, "learning_rate": 0.00028, "loss": 0.1186, "step": 120 }, { "epoch": 9.8, "eval_loss": 0.23090216517448425, "eval_runtime": 90.3144, "eval_samples_per_second": 4.285, "eval_steps_per_second": 0.543, "step": 120 }, { "epoch": 10.61, "grad_norm": 0.2158157229423523, "learning_rate": 0.00027, "loss": 0.0947, "step": 130 }, { "epoch": 11.43, "grad_norm": 0.18916285037994385, "learning_rate": 0.00026, "loss": 0.0768, "step": 140 }, { "epoch": 11.43, "eval_loss": 0.24214179813861847, "eval_runtime": 90.2597, "eval_samples_per_second": 4.288, "eval_steps_per_second": 0.543, "step": 140 }, { "epoch": 12.24, "grad_norm": 0.22263498604297638, "learning_rate": 0.00025, "loss": 0.0615, "step": 150 }, { "epoch": 13.06, "grad_norm": 0.21315976977348328, "learning_rate": 0.00023999999999999998, "loss": 0.054, "step": 160 }, { "epoch": 13.06, "eval_loss": 0.25932466983795166, "eval_runtime": 89.8439, "eval_samples_per_second": 4.307, "eval_steps_per_second": 0.545, "step": 160 }, { "epoch": 13.88, "grad_norm": 0.18338361382484436, "learning_rate": 0.00023, "loss": 0.0455, "step": 170 }, { "epoch": 14.69, "grad_norm": 0.17157459259033203, "learning_rate": 0.00021999999999999995, "loss": 0.0393, "step": 180 }, { "epoch": 14.69, "eval_loss": 0.27233538031578064, "eval_runtime": 90.1364, "eval_samples_per_second": 4.293, "eval_steps_per_second": 0.544, "step": 180 }, { "epoch": 15.51, "grad_norm": 0.1541435867547989, "learning_rate": 0.00020999999999999998, "loss": 0.0352, "step": 190 }, { "epoch": 16.33, "grad_norm": 0.1553652435541153, "learning_rate": 0.00019999999999999998, "loss": 0.0325, "step": 200 }, { "epoch": 16.33, "eval_loss": 0.28704825043678284, "eval_runtime": 89.7951, "eval_samples_per_second": 4.31, "eval_steps_per_second": 0.546, "step": 200 }, { "epoch": 17.14, "grad_norm": 0.13403691351413727, "learning_rate": 0.00018999999999999998, "loss": 0.0297, "step": 210 }, { "epoch": 17.96, "grad_norm": 0.14512716233730316, "learning_rate": 0.00017999999999999998, "loss": 0.0279, "step": 220 }, { "epoch": 17.96, "eval_loss": 0.2964874505996704, "eval_runtime": 89.7009, "eval_samples_per_second": 4.314, "eval_steps_per_second": 0.546, "step": 220 }, { "epoch": 18.78, "grad_norm": 0.12400835007429123, "learning_rate": 0.00016999999999999999, "loss": 0.0263, "step": 230 }, { "epoch": 19.59, "grad_norm": 0.1139909029006958, "learning_rate": 0.00015999999999999999, "loss": 0.0246, "step": 240 }, { "epoch": 19.59, "eval_loss": 0.30519917607307434, "eval_runtime": 89.8387, "eval_samples_per_second": 4.308, "eval_steps_per_second": 0.545, "step": 240 }, { "epoch": 20.41, "grad_norm": 0.12317101657390594, "learning_rate": 0.00015, "loss": 0.0235, "step": 250 }, { "epoch": 21.22, "grad_norm": 0.12494686245918274, "learning_rate": 0.00014, "loss": 0.0224, "step": 260 }, { "epoch": 21.22, "eval_loss": 0.314134418964386, "eval_runtime": 89.7974, "eval_samples_per_second": 4.31, "eval_steps_per_second": 0.546, "step": 260 }, { "epoch": 22.04, "grad_norm": 0.1180659756064415, "learning_rate": 0.00013, "loss": 0.022, "step": 270 }, { "epoch": 22.86, "grad_norm": 0.09653373062610626, "learning_rate": 0.00011999999999999999, "loss": 0.0212, "step": 280 }, { "epoch": 22.86, "eval_loss": 0.3175604045391083, "eval_runtime": 89.9764, "eval_samples_per_second": 4.301, "eval_steps_per_second": 0.545, "step": 280 }, { "epoch": 23.67, "grad_norm": 0.10445748269557953, "learning_rate": 0.00010999999999999998, "loss": 0.0208, "step": 290 }, { "epoch": 24.49, "grad_norm": 0.09245337545871735, "learning_rate": 9.999999999999999e-05, "loss": 0.0199, "step": 300 }, { "epoch": 24.49, "eval_loss": 0.32360976934432983, "eval_runtime": 89.8613, "eval_samples_per_second": 4.307, "eval_steps_per_second": 0.545, "step": 300 }, { "epoch": 25.31, "grad_norm": 0.09468758851289749, "learning_rate": 8.999999999999999e-05, "loss": 0.0197, "step": 310 }, { "epoch": 26.12, "grad_norm": 0.0891977846622467, "learning_rate": 7.999999999999999e-05, "loss": 0.0192, "step": 320 }, { "epoch": 26.12, "eval_loss": 0.3267403841018677, "eval_runtime": 90.3063, "eval_samples_per_second": 4.285, "eval_steps_per_second": 0.543, "step": 320 }, { "epoch": 26.94, "grad_norm": 0.08574336767196655, "learning_rate": 7e-05, "loss": 0.0188, "step": 330 }, { "epoch": 27.76, "grad_norm": 0.08517367392778397, "learning_rate": 5.9999999999999995e-05, "loss": 0.0184, "step": 340 }, { "epoch": 27.76, "eval_loss": 0.33063870668411255, "eval_runtime": 89.8041, "eval_samples_per_second": 4.309, "eval_steps_per_second": 0.546, "step": 340 }, { "epoch": 28.57, "grad_norm": 0.08357132971286774, "learning_rate": 4.9999999999999996e-05, "loss": 0.0181, "step": 350 }, { "epoch": 29.39, "grad_norm": 0.08679915964603424, "learning_rate": 3.9999999999999996e-05, "loss": 0.0181, "step": 360 }, { "epoch": 29.39, "eval_loss": 0.3352932929992676, "eval_runtime": 90.2438, "eval_samples_per_second": 4.288, "eval_steps_per_second": 0.543, "step": 360 }, { "epoch": 30.2, "grad_norm": 0.07208231836557388, "learning_rate": 2.9999999999999997e-05, "loss": 0.0178, "step": 370 }, { "epoch": 31.02, "grad_norm": 0.08611435443162918, "learning_rate": 1.9999999999999998e-05, "loss": 0.0175, "step": 380 }, { "epoch": 31.02, "eval_loss": 0.338119238615036, "eval_runtime": 90.2767, "eval_samples_per_second": 4.287, "eval_steps_per_second": 0.543, "step": 380 }, { "epoch": 31.84, "grad_norm": 0.07755295187234879, "learning_rate": 9.999999999999999e-06, "loss": 0.0176, "step": 390 }, { "epoch": 32.65, "grad_norm": 0.07367673516273499, "learning_rate": 0.0, "loss": 0.0173, "step": 400 }, { "epoch": 32.65, "eval_loss": 0.3382853865623474, "eval_runtime": 90.0437, "eval_samples_per_second": 4.298, "eval_steps_per_second": 0.544, "step": 400 } ], "logging_steps": 10, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 34, "save_steps": 20, "total_flos": 2.2944565601689928e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }