{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 410, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.24390243902439024, "grad_norm": 7.625, "learning_rate": 0.00019970658011837404, "loss": 2.0086, "step": 10 }, { "epoch": 0.4878048780487805, "grad_norm": 0.59033203125, "learning_rate": 0.00019882804237803488, "loss": 1.1443, "step": 20 }, { "epoch": 0.7317073170731707, "grad_norm": 0.736328125, "learning_rate": 0.00019736954238777792, "loss": 0.985, "step": 30 }, { "epoch": 0.975609756097561, "grad_norm": 0.50048828125, "learning_rate": 0.00019533963920549306, "loss": 0.918, "step": 40 }, { "epoch": 1.2195121951219512, "grad_norm": 0.36083984375, "learning_rate": 0.0001927502451102095, "loss": 0.7771, "step": 50 }, { "epoch": 1.4634146341463414, "grad_norm": 0.376953125, "learning_rate": 0.00018961655569610557, "loss": 0.8079, "step": 60 }, { "epoch": 1.7073170731707317, "grad_norm": 0.314208984375, "learning_rate": 0.00018595696069872013, "loss": 0.7491, "step": 70 }, { "epoch": 1.951219512195122, "grad_norm": 0.3203125, "learning_rate": 0.00018179293607667178, "loss": 0.7349, "step": 80 }, { "epoch": 2.1951219512195124, "grad_norm": 0.331787109375, "learning_rate": 0.0001771489179821943, "loss": 0.7468, "step": 90 }, { "epoch": 2.4390243902439024, "grad_norm": 0.374755859375, "learning_rate": 0.0001720521593600787, "loss": 0.7582, "step": 100 }, { "epoch": 2.682926829268293, "grad_norm": 0.3662109375, "learning_rate": 0.00016653257001655652, "loss": 0.715, "step": 110 }, { "epoch": 2.926829268292683, "grad_norm": 0.413818359375, "learning_rate": 0.0001606225410966638, "loss": 0.7404, "step": 120 }, { "epoch": 3.1707317073170733, "grad_norm": 0.349609375, "learning_rate": 0.00015435675500012212, "loss": 0.6844, "step": 130 }, { "epoch": 3.4146341463414633, "grad_norm": 0.48095703125, "learning_rate": 0.0001477719818512263, "loss": 0.7568, "step": 140 }, { "epoch": 3.658536585365854, "grad_norm": 0.369384765625, "learning_rate": 0.00014090686371713402, "loss": 0.6647, "step": 150 }, { "epoch": 3.902439024390244, "grad_norm": 0.422607421875, "learning_rate": 0.00013380168784085027, "loss": 0.698, "step": 160 }, { "epoch": 4.146341463414634, "grad_norm": 0.38427734375, "learning_rate": 0.0001264981502196662, "loss": 0.6861, "step": 170 }, { "epoch": 4.390243902439025, "grad_norm": 0.38525390625, "learning_rate": 0.00011903911091646684, "loss": 0.6855, "step": 180 }, { "epoch": 4.634146341463414, "grad_norm": 0.384033203125, "learning_rate": 0.00011146834253984006, "loss": 0.6797, "step": 190 }, { "epoch": 4.878048780487805, "grad_norm": 0.365966796875, "learning_rate": 0.00010383027336900355, "loss": 0.6865, "step": 200 }, { "epoch": 5.121951219512195, "grad_norm": 0.39794921875, "learning_rate": 9.616972663099647e-05, "loss": 0.6859, "step": 210 }, { "epoch": 5.365853658536586, "grad_norm": 0.3837890625, "learning_rate": 8.853165746015997e-05, "loss": 0.6636, "step": 220 }, { "epoch": 5.609756097560975, "grad_norm": 0.445068359375, "learning_rate": 8.096088908353315e-05, "loss": 0.6883, "step": 230 }, { "epoch": 5.853658536585366, "grad_norm": 0.4130859375, "learning_rate": 7.350184978033386e-05, "loss": 0.6432, "step": 240 }, { "epoch": 6.097560975609756, "grad_norm": 0.380859375, "learning_rate": 6.619831215914974e-05, "loss": 0.6597, "step": 250 }, { "epoch": 6.341463414634147, "grad_norm": 0.407958984375, "learning_rate": 5.909313628286601e-05, "loss": 0.6574, "step": 260 }, { "epoch": 6.585365853658536, "grad_norm": 0.3974609375, "learning_rate": 5.222801814877369e-05, "loss": 0.6499, "step": 270 }, { "epoch": 6.829268292682927, "grad_norm": 0.40625, "learning_rate": 4.56432449998779e-05, "loss": 0.6436, "step": 280 }, { "epoch": 7.073170731707317, "grad_norm": 0.3994140625, "learning_rate": 3.937745890333623e-05, "loss": 0.6644, "step": 290 }, { "epoch": 7.317073170731708, "grad_norm": 0.40673828125, "learning_rate": 3.346742998344348e-05, "loss": 0.656, "step": 300 }, { "epoch": 7.560975609756097, "grad_norm": 0.408447265625, "learning_rate": 2.794784063992131e-05, "loss": 0.6244, "step": 310 }, { "epoch": 7.804878048780488, "grad_norm": 0.41015625, "learning_rate": 2.2851082017805703e-05, "loss": 0.6594, "step": 320 }, { "epoch": 8.048780487804878, "grad_norm": 0.437744140625, "learning_rate": 1.8207063923328237e-05, "loss": 0.6423, "step": 330 }, { "epoch": 8.292682926829269, "grad_norm": 0.43212890625, "learning_rate": 1.4043039301279903e-05, "loss": 0.635, "step": 340 }, { "epoch": 8.536585365853659, "grad_norm": 0.4013671875, "learning_rate": 1.0383444303894452e-05, "loss": 0.6408, "step": 350 }, { "epoch": 8.78048780487805, "grad_norm": 0.40869140625, "learning_rate": 7.249754889790539e-06, "loss": 0.6438, "step": 360 }, { "epoch": 9.024390243902438, "grad_norm": 0.3857421875, "learning_rate": 4.660360794506946e-06, "loss": 0.6282, "step": 370 }, { "epoch": 9.268292682926829, "grad_norm": 0.400634765625, "learning_rate": 2.6304576122221035e-06, "loss": 0.6359, "step": 380 }, { "epoch": 9.512195121951219, "grad_norm": 0.3935546875, "learning_rate": 1.1719576219651585e-06, "loss": 0.6467, "step": 390 }, { "epoch": 9.75609756097561, "grad_norm": 0.4140625, "learning_rate": 2.934198816259559e-07, "loss": 0.6479, "step": 400 }, { "epoch": 10.0, "grad_norm": 0.400390625, "learning_rate": 0.0, "loss": 0.6203, "step": 410 }, { "epoch": 10.0, "step": 410, "total_flos": 2.000692923334656e+16, "train_loss": 0.7381390141277778, "train_runtime": 366.1191, "train_samples_per_second": 4.479, "train_steps_per_second": 1.12 } ], "logging_steps": 10, "max_steps": 410, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.000692923334656e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }