{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9647058823529413, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 17.495570135172613, "learning_rate": 1.0526315789473685e-06, "loss": 1.5715, "step": 1 }, { "epoch": 0.08, "grad_norm": 10.76150785270408, "learning_rate": 5.263157894736842e-06, "loss": 1.4595, "step": 5 }, { "epoch": 0.16, "grad_norm": 4.725613013527171, "learning_rate": 1.0526315789473684e-05, "loss": 1.0131, "step": 10 }, { "epoch": 0.24, "grad_norm": 1.9709789387558339, "learning_rate": 1.578947368421053e-05, "loss": 0.9452, "step": 15 }, { "epoch": 0.31, "grad_norm": 1.5718916258677555, "learning_rate": 1.9998292504580528e-05, "loss": 0.8657, "step": 20 }, { "epoch": 0.39, "grad_norm": 1.2587381103664048, "learning_rate": 1.993859136895274e-05, "loss": 0.7733, "step": 25 }, { "epoch": 0.47, "grad_norm": 1.0676841590173272, "learning_rate": 1.979409767601366e-05, "loss": 0.7396, "step": 30 }, { "epoch": 0.55, "grad_norm": 0.9830974732786681, "learning_rate": 1.956604419500441e-05, "loss": 0.712, "step": 35 }, { "epoch": 0.63, "grad_norm": 0.9898127673571485, "learning_rate": 1.9256376597815565e-05, "loss": 0.6854, "step": 40 }, { "epoch": 0.71, "grad_norm": 0.9235108049390474, "learning_rate": 1.886773685920062e-05, "loss": 0.6959, "step": 45 }, { "epoch": 0.78, "grad_norm": 0.9176451831476561, "learning_rate": 1.840344071637893e-05, "loss": 0.6707, "step": 50 }, { "epoch": 0.86, "grad_norm": 1.0570214869530683, "learning_rate": 1.7867449380334834e-05, "loss": 0.6703, "step": 55 }, { "epoch": 0.94, "grad_norm": 0.8971457648885479, "learning_rate": 1.7264335740162244e-05, "loss": 0.6791, "step": 60 }, { "epoch": 0.99, "eval_loss": 0.675278902053833, "eval_runtime": 7.6872, "eval_samples_per_second": 19.773, "eval_steps_per_second": 2.472, "step": 63 }, { "epoch": 1.02, "grad_norm": 1.0643549643394743, "learning_rate": 1.659924534878723e-05, "loss": 0.6235, "step": 65 }, { "epoch": 1.1, "grad_norm": 0.9490599850362894, "learning_rate": 1.5877852522924733e-05, "loss": 0.4562, "step": 70 }, { "epoch": 1.18, "grad_norm": 0.9144117495898926, "learning_rate": 1.510631193180907e-05, "loss": 0.4195, "step": 75 }, { "epoch": 1.25, "grad_norm": 0.8786872768630574, "learning_rate": 1.429120608772609e-05, "loss": 0.4257, "step": 80 }, { "epoch": 1.33, "grad_norm": 0.815215205518059, "learning_rate": 1.3439489186339283e-05, "loss": 0.4233, "step": 85 }, { "epoch": 1.41, "grad_norm": 0.8699270815065504, "learning_rate": 1.2558427775944357e-05, "loss": 0.4208, "step": 90 }, { "epoch": 1.49, "grad_norm": 0.8685370301630462, "learning_rate": 1.16555387618413e-05, "loss": 0.4242, "step": 95 }, { "epoch": 1.57, "grad_norm": 1.1593148629393575, "learning_rate": 1.073852527474874e-05, "loss": 0.4409, "step": 100 }, { "epoch": 1.65, "grad_norm": 0.8148155791935645, "learning_rate": 9.815210950408703e-06, "loss": 0.4196, "step": 105 }, { "epoch": 1.73, "grad_norm": 0.8416381894160022, "learning_rate": 8.893473181084993e-06, "loss": 0.4375, "step": 110 }, { "epoch": 1.8, "grad_norm": 0.8570128253866367, "learning_rate": 7.9811759084299e-06, "loss": 0.42, "step": 115 }, { "epoch": 1.88, "grad_norm": 0.9019501174530336, "learning_rate": 7.086102531106755e-06, "loss": 0.4289, "step": 120 }, { "epoch": 1.96, "grad_norm": 0.8041684983988516, "learning_rate": 6.215889499576898e-06, "loss": 0.4164, "step": 125 }, { "epoch": 1.99, "eval_loss": 0.7019588351249695, "eval_runtime": 7.6577, "eval_samples_per_second": 19.849, "eval_steps_per_second": 2.481, "step": 127 }, { "epoch": 2.04, "grad_norm": 0.8404401588139048, "learning_rate": 5.3779611645968696e-06, "loss": 0.3314, "step": 130 }, { "epoch": 2.12, "grad_norm": 1.0503604307814172, "learning_rate": 4.579466435275506e-06, "loss": 0.2445, "step": 135 }, { "epoch": 2.2, "grad_norm": 0.7976145439951375, "learning_rate": 3.827217787102072e-06, "loss": 0.2222, "step": 140 }, { "epoch": 2.27, "grad_norm": 0.7577793168033735, "learning_rate": 3.1276331403073733e-06, "loss": 0.2256, "step": 145 }, { "epoch": 2.35, "grad_norm": 0.7242853721207333, "learning_rate": 2.4866811044312667e-06, "loss": 0.2153, "step": 150 }, { "epoch": 2.43, "grad_norm": 0.7007575307345968, "learning_rate": 1.9098300562505266e-06, "loss": 0.2148, "step": 155 }, { "epoch": 2.51, "grad_norm": 0.7109375761250729, "learning_rate": 1.4020014855162755e-06, "loss": 0.2203, "step": 160 }, { "epoch": 2.59, "grad_norm": 0.7041755633099551, "learning_rate": 9.675280065387117e-07, "loss": 0.2123, "step": 165 }, { "epoch": 2.67, "grad_norm": 0.6924970223941338, "learning_rate": 6.101163938494359e-07, "loss": 0.2207, "step": 170 }, { "epoch": 2.75, "grad_norm": 0.6779904104237012, "learning_rate": 3.328159573081258e-07, "loss": 0.2098, "step": 175 }, { "epoch": 2.82, "grad_norm": 0.717318798866986, "learning_rate": 1.3799252646597428e-07, "loss": 0.2168, "step": 180 }, { "epoch": 2.9, "grad_norm": 0.7202766403388026, "learning_rate": 2.7308266142119788e-08, "loss": 0.2065, "step": 185 }, { "epoch": 2.96, "eval_loss": 0.8261664509773254, "eval_runtime": 7.6483, "eval_samples_per_second": 19.874, "eval_steps_per_second": 2.484, "step": 189 }, { "epoch": 2.96, "step": 189, "total_flos": 64372667842560.0, "train_loss": 0.4923532526329081, "train_runtime": 4087.1741, "train_samples_per_second": 1.494, "train_steps_per_second": 0.046 } ], "logging_steps": 5, "max_steps": 189, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 64372667842560.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }