{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.820359281437126, "eval_steps": 500, "global_step": 410, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.23952095808383234, "grad_norm": 2.16796875, "learning_rate": 0.00019970658011837404, "loss": 0.5442, "step": 10 }, { "epoch": 0.47904191616766467, "grad_norm": 2.02734375, "learning_rate": 0.00019882804237803488, "loss": 0.5445, "step": 20 }, { "epoch": 0.718562874251497, "grad_norm": 1.2587890625, "learning_rate": 0.00019736954238777792, "loss": 0.5499, "step": 30 }, { "epoch": 0.9580838323353293, "grad_norm": 1.3486328125, "learning_rate": 0.00019533963920549306, "loss": 0.5429, "step": 40 }, { "epoch": 1.1976047904191618, "grad_norm": 2.08984375, "learning_rate": 0.0001927502451102095, "loss": 0.5336, "step": 50 }, { "epoch": 1.437125748502994, "grad_norm": 1.7734375, "learning_rate": 0.00018961655569610557, "loss": 0.5333, "step": 60 }, { "epoch": 1.6766467065868262, "grad_norm": 2.63671875, "learning_rate": 0.00018595696069872013, "loss": 0.5401, "step": 70 }, { "epoch": 1.9161676646706587, "grad_norm": 2.19921875, "learning_rate": 0.00018179293607667178, "loss": 0.5414, "step": 80 }, { "epoch": 2.155688622754491, "grad_norm": 1.908203125, "learning_rate": 0.0001771489179821943, "loss": 0.5713, "step": 90 }, { "epoch": 2.3952095808383236, "grad_norm": 4.09765625, "learning_rate": 0.0001720521593600787, "loss": 0.538, "step": 100 }, { "epoch": 2.6347305389221556, "grad_norm": 2.0390625, "learning_rate": 0.00016653257001655652, "loss": 0.5199, "step": 110 }, { "epoch": 2.874251497005988, "grad_norm": 1.8154296875, "learning_rate": 0.0001606225410966638, "loss": 0.5204, "step": 120 }, { "epoch": 3.1137724550898205, "grad_norm": 1.9189453125, "learning_rate": 0.00015435675500012212, "loss": 0.5396, "step": 130 }, { "epoch": 3.3532934131736525, "grad_norm": 1.8896484375, "learning_rate": 0.0001477719818512263, "loss": 0.5294, "step": 140 }, { "epoch": 3.592814371257485, "grad_norm": 2.302734375, "learning_rate": 0.00014090686371713402, "loss": 0.5394, "step": 150 }, { "epoch": 3.8323353293413174, "grad_norm": 2.16796875, "learning_rate": 0.00013380168784085027, "loss": 0.5279, "step": 160 }, { "epoch": 4.07185628742515, "grad_norm": 2.11328125, "learning_rate": 0.0001264981502196662, "loss": 0.5182, "step": 170 }, { "epoch": 4.311377245508982, "grad_norm": 1.9189453125, "learning_rate": 0.00011903911091646684, "loss": 0.5304, "step": 180 }, { "epoch": 4.550898203592815, "grad_norm": 2.48046875, "learning_rate": 0.00011146834253984006, "loss": 0.5386, "step": 190 }, { "epoch": 4.790419161676647, "grad_norm": 2.62890625, "learning_rate": 0.00010383027336900355, "loss": 0.5276, "step": 200 }, { "epoch": 5.029940119760479, "grad_norm": 3.021484375, "learning_rate": 9.616972663099647e-05, "loss": 0.5229, "step": 210 }, { "epoch": 5.269461077844311, "grad_norm": 3.259765625, "learning_rate": 8.853165746015997e-05, "loss": 0.5243, "step": 220 }, { "epoch": 5.508982035928144, "grad_norm": 2.876953125, "learning_rate": 8.096088908353315e-05, "loss": 0.5206, "step": 230 }, { "epoch": 5.748502994011976, "grad_norm": 3.349609375, "learning_rate": 7.350184978033386e-05, "loss": 0.5374, "step": 240 }, { "epoch": 5.9880239520958085, "grad_norm": 3.791015625, "learning_rate": 6.619831215914974e-05, "loss": 0.5258, "step": 250 }, { "epoch": 6.227544910179641, "grad_norm": 3.240234375, "learning_rate": 5.909313628286601e-05, "loss": 0.5166, "step": 260 }, { "epoch": 6.467065868263473, "grad_norm": 3.248046875, "learning_rate": 5.222801814877369e-05, "loss": 0.5329, "step": 270 }, { "epoch": 6.706586826347305, "grad_norm": 2.83984375, "learning_rate": 4.56432449998779e-05, "loss": 0.5117, "step": 280 }, { "epoch": 6.946107784431137, "grad_norm": 4.03515625, "learning_rate": 3.937745890333623e-05, "loss": 0.5269, "step": 290 }, { "epoch": 7.18562874251497, "grad_norm": 4.08203125, "learning_rate": 3.346742998344348e-05, "loss": 0.5299, "step": 300 }, { "epoch": 7.425149700598802, "grad_norm": 3.03125, "learning_rate": 2.794784063992131e-05, "loss": 0.5217, "step": 310 }, { "epoch": 7.664670658682635, "grad_norm": 3.00390625, "learning_rate": 2.2851082017805703e-05, "loss": 0.5241, "step": 320 }, { "epoch": 7.904191616766467, "grad_norm": 3.27734375, "learning_rate": 1.8207063923328237e-05, "loss": 0.5066, "step": 330 }, { "epoch": 8.1437125748503, "grad_norm": 2.927734375, "learning_rate": 1.4043039301279903e-05, "loss": 0.5111, "step": 340 }, { "epoch": 8.383233532934131, "grad_norm": 3.3984375, "learning_rate": 1.0383444303894452e-05, "loss": 0.5137, "step": 350 }, { "epoch": 8.622754491017965, "grad_norm": 3.4375, "learning_rate": 7.249754889790539e-06, "loss": 0.5279, "step": 360 }, { "epoch": 8.862275449101796, "grad_norm": 3.19140625, "learning_rate": 4.660360794506946e-06, "loss": 0.5125, "step": 370 }, { "epoch": 9.10179640718563, "grad_norm": 4.23046875, "learning_rate": 2.6304576122221035e-06, "loss": 0.5141, "step": 380 }, { "epoch": 9.341317365269461, "grad_norm": 2.779296875, "learning_rate": 1.1719576219651585e-06, "loss": 0.5172, "step": 390 }, { "epoch": 9.580838323353294, "grad_norm": 3.619140625, "learning_rate": 2.934198816259559e-07, "loss": 0.5389, "step": 400 }, { "epoch": 9.820359281437126, "grad_norm": 4.30078125, "learning_rate": 0.0, "loss": 0.5024, "step": 410 }, { "epoch": 9.820359281437126, "step": 410, "total_flos": 2.67670788243456e+16, "train_loss": 0.5285330202521348, "train_runtime": 355.5253, "train_samples_per_second": 4.697, "train_steps_per_second": 1.153 } ], "logging_steps": 10, "max_steps": 410, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.67670788243456e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }