{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.256410256410255, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5128205128205128, "grad_norm": 92.9238510131836, "learning_rate": 9.9907e-06, "loss": 2.4016, "step": 100 }, { "epoch": 0.5128205128205128, "eval_loss": 2.1255013942718506, "eval_runtime": 34.4318, "eval_samples_per_second": 11.414, "eval_steps_per_second": 1.452, "step": 100 }, { "epoch": 1.0256410256410255, "grad_norm": 47.019439697265625, "learning_rate": 9.980700000000001e-06, "loss": 2.0696, "step": 200 }, { "epoch": 1.0256410256410255, "eval_loss": 1.9405728578567505, "eval_runtime": 34.0713, "eval_samples_per_second": 11.535, "eval_steps_per_second": 1.468, "step": 200 }, { "epoch": 1.5384615384615383, "grad_norm": 52.230751037597656, "learning_rate": 9.970700000000001e-06, "loss": 1.9983, "step": 300 }, { "epoch": 1.5384615384615383, "eval_loss": 1.900020718574524, "eval_runtime": 34.1694, "eval_samples_per_second": 11.502, "eval_steps_per_second": 1.463, "step": 300 }, { "epoch": 2.051282051282051, "grad_norm": 39.62374496459961, "learning_rate": 9.960800000000001e-06, "loss": 1.8888, "step": 400 }, { "epoch": 2.051282051282051, "eval_loss": 1.8164124488830566, "eval_runtime": 34.1213, "eval_samples_per_second": 11.518, "eval_steps_per_second": 1.465, "step": 400 }, { "epoch": 2.564102564102564, "grad_norm": 39.697731018066406, "learning_rate": 9.9508e-06, "loss": 1.8456, "step": 500 }, { "epoch": 2.564102564102564, "eval_loss": 1.7753618955612183, "eval_runtime": 34.149, "eval_samples_per_second": 11.508, "eval_steps_per_second": 1.464, "step": 500 }, { "epoch": 3.076923076923077, "grad_norm": 52.823184967041016, "learning_rate": 9.9408e-06, "loss": 1.7839, "step": 600 }, { "epoch": 3.076923076923077, "eval_loss": 1.7344136238098145, "eval_runtime": 34.121, "eval_samples_per_second": 11.518, "eval_steps_per_second": 1.465, "step": 600 }, { "epoch": 3.58974358974359, "grad_norm": 107.05223083496094, "learning_rate": 9.9308e-06, "loss": 1.7544, "step": 700 }, { "epoch": 3.58974358974359, "eval_loss": 1.6865615844726562, "eval_runtime": 34.0842, "eval_samples_per_second": 11.53, "eval_steps_per_second": 1.467, "step": 700 }, { "epoch": 4.102564102564102, "grad_norm": 53.641353607177734, "learning_rate": 9.9208e-06, "loss": 1.6812, "step": 800 }, { "epoch": 4.102564102564102, "eval_loss": 1.6568766832351685, "eval_runtime": 34.432, "eval_samples_per_second": 11.414, "eval_steps_per_second": 1.452, "step": 800 }, { "epoch": 4.615384615384615, "grad_norm": 40.92328643798828, "learning_rate": 9.9109e-06, "loss": 1.6501, "step": 900 }, { "epoch": 4.615384615384615, "eval_loss": 1.6080188751220703, "eval_runtime": 34.1751, "eval_samples_per_second": 11.5, "eval_steps_per_second": 1.463, "step": 900 }, { "epoch": 5.128205128205128, "grad_norm": 28.52039909362793, "learning_rate": 9.9009e-06, "loss": 1.6579, "step": 1000 }, { "epoch": 5.128205128205128, "eval_loss": 1.6059809923171997, "eval_runtime": 34.1634, "eval_samples_per_second": 11.504, "eval_steps_per_second": 1.464, "step": 1000 }, { "epoch": 5.641025641025641, "grad_norm": 73.21099090576172, "learning_rate": 9.8909e-06, "loss": 1.6286, "step": 1100 }, { "epoch": 5.641025641025641, "eval_loss": 1.5779187679290771, "eval_runtime": 34.1489, "eval_samples_per_second": 11.508, "eval_steps_per_second": 1.464, "step": 1100 }, { "epoch": 6.153846153846154, "grad_norm": 36.768428802490234, "learning_rate": 9.8809e-06, "loss": 1.5871, "step": 1200 }, { "epoch": 6.153846153846154, "eval_loss": 1.5641562938690186, "eval_runtime": 34.1081, "eval_samples_per_second": 11.522, "eval_steps_per_second": 1.466, "step": 1200 }, { "epoch": 6.666666666666667, "grad_norm": 28.098352432250977, "learning_rate": 9.8709e-06, "loss": 1.6231, "step": 1300 }, { "epoch": 6.666666666666667, "eval_loss": 1.530659556388855, "eval_runtime": 34.0717, "eval_samples_per_second": 11.534, "eval_steps_per_second": 1.467, "step": 1300 }, { "epoch": 7.17948717948718, "grad_norm": 48.131195068359375, "learning_rate": 9.8609e-06, "loss": 1.5178, "step": 1400 }, { "epoch": 7.17948717948718, "eval_loss": 1.5207794904708862, "eval_runtime": 34.0224, "eval_samples_per_second": 11.551, "eval_steps_per_second": 1.47, "step": 1400 }, { "epoch": 7.6923076923076925, "grad_norm": 15.9362211227417, "learning_rate": 9.8509e-06, "loss": 1.5434, "step": 1500 }, { "epoch": 7.6923076923076925, "eval_loss": 1.4978805780410767, "eval_runtime": 34.0269, "eval_samples_per_second": 11.55, "eval_steps_per_second": 1.469, "step": 1500 }, { "epoch": 8.205128205128204, "grad_norm": 21.21210479736328, "learning_rate": 9.840900000000001e-06, "loss": 1.5368, "step": 1600 }, { "epoch": 8.205128205128204, "eval_loss": 1.4791795015335083, "eval_runtime": 34.5402, "eval_samples_per_second": 11.378, "eval_steps_per_second": 1.448, "step": 1600 }, { "epoch": 8.717948717948717, "grad_norm": 39.53807830810547, "learning_rate": 9.830900000000001e-06, "loss": 1.5163, "step": 1700 }, { "epoch": 8.717948717948717, "eval_loss": 1.4630368947982788, "eval_runtime": 34.1482, "eval_samples_per_second": 11.509, "eval_steps_per_second": 1.464, "step": 1700 }, { "epoch": 9.23076923076923, "grad_norm": 42.564842224121094, "learning_rate": 9.820900000000001e-06, "loss": 1.483, "step": 1800 }, { "epoch": 9.23076923076923, "eval_loss": 1.4529582262039185, "eval_runtime": 34.2374, "eval_samples_per_second": 11.479, "eval_steps_per_second": 1.46, "step": 1800 }, { "epoch": 9.743589743589745, "grad_norm": 55.969017028808594, "learning_rate": 9.810900000000001e-06, "loss": 1.4795, "step": 1900 }, { "epoch": 9.743589743589745, "eval_loss": 1.4357120990753174, "eval_runtime": 34.2522, "eval_samples_per_second": 11.474, "eval_steps_per_second": 1.46, "step": 1900 }, { "epoch": 10.256410256410255, "grad_norm": 25.308517456054688, "learning_rate": 9.800900000000001e-06, "loss": 1.4151, "step": 2000 }, { "epoch": 10.256410256410255, "eval_loss": 1.432659387588501, "eval_runtime": 34.2016, "eval_samples_per_second": 11.491, "eval_steps_per_second": 1.462, "step": 2000 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 513, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.64486332416e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }