{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.949238578680204, "eval_steps": 500, "global_step": 490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.20304568527918782, "grad_norm": 0.2015380859375, "learning_rate": 0.00019979453927503364, "loss": 0.7307, "step": 10 }, { "epoch": 0.40609137055837563, "grad_norm": 0.255615234375, "learning_rate": 0.0001991790013823246, "loss": 0.7211, "step": 20 }, { "epoch": 0.6091370558375635, "grad_norm": 0.306640625, "learning_rate": 0.00019815591569910654, "loss": 0.7372, "step": 30 }, { "epoch": 0.8121827411167513, "grad_norm": 0.359130859375, "learning_rate": 0.00019672948630390294, "loss": 0.7826, "step": 40 }, { "epoch": 1.015228426395939, "grad_norm": 0.385986328125, "learning_rate": 0.00019490557470106686, "loss": 0.8238, "step": 50 }, { "epoch": 1.218274111675127, "grad_norm": 0.359130859375, "learning_rate": 0.0001926916757346022, "loss": 0.7958, "step": 60 }, { "epoch": 1.4213197969543148, "grad_norm": 0.34912109375, "learning_rate": 0.0001900968867902419, "loss": 0.7704, "step": 70 }, { "epoch": 1.6243654822335025, "grad_norm": 0.378662109375, "learning_rate": 0.00018713187041233896, "loss": 0.7576, "step": 80 }, { "epoch": 1.8274111675126905, "grad_norm": 0.38818359375, "learning_rate": 0.00018380881048918405, "loss": 0.7895, "step": 90 }, { "epoch": 2.030456852791878, "grad_norm": 0.3798828125, "learning_rate": 0.00018014136218679567, "loss": 0.7953, "step": 100 }, { "epoch": 2.233502538071066, "grad_norm": 0.40869140625, "learning_rate": 0.00017614459583691346, "loss": 0.7621, "step": 110 }, { "epoch": 2.436548223350254, "grad_norm": 0.409912109375, "learning_rate": 0.00017183493500977278, "loss": 0.7806, "step": 120 }, { "epoch": 2.6395939086294415, "grad_norm": 0.41162109375, "learning_rate": 0.0001672300890261317, "loss": 0.7414, "step": 130 }, { "epoch": 2.8426395939086295, "grad_norm": 0.4326171875, "learning_rate": 0.00016234898018587337, "loss": 0.7326, "step": 140 }, { "epoch": 3.045685279187817, "grad_norm": 0.468994140625, "learning_rate": 0.00015721166601221698, "loss": 0.7233, "step": 150 }, { "epoch": 3.248730964467005, "grad_norm": 0.443359375, "learning_rate": 0.00015183925683105254, "loss": 0.7304, "step": 160 }, { "epoch": 3.451776649746193, "grad_norm": 0.47705078125, "learning_rate": 0.00014625382902408356, "loss": 0.7311, "step": 170 }, { "epoch": 3.6548223350253806, "grad_norm": 0.44287109375, "learning_rate": 0.00014047833431223938, "loss": 0.7204, "step": 180 }, { "epoch": 3.8578680203045685, "grad_norm": 0.490478515625, "learning_rate": 0.00013453650544213076, "loss": 0.6965, "step": 190 }, { "epoch": 4.060913705583756, "grad_norm": 0.42578125, "learning_rate": 0.00012845275866310324, "loss": 0.7102, "step": 200 }, { "epoch": 4.2639593908629445, "grad_norm": 0.46240234375, "learning_rate": 0.00012225209339563145, "loss": 0.7164, "step": 210 }, { "epoch": 4.467005076142132, "grad_norm": 0.449951171875, "learning_rate": 0.00011595998950333793, "loss": 0.7068, "step": 220 }, { "epoch": 4.67005076142132, "grad_norm": 0.54296875, "learning_rate": 0.00010960230259076818, "loss": 0.7092, "step": 230 }, { "epoch": 4.873096446700508, "grad_norm": 0.5244140625, "learning_rate": 0.00010320515775716555, "loss": 0.6712, "step": 240 }, { "epoch": 5.0761421319796955, "grad_norm": 0.47705078125, "learning_rate": 9.679484224283449e-05, "loss": 0.6593, "step": 250 }, { "epoch": 5.279187817258883, "grad_norm": 0.50830078125, "learning_rate": 9.039769740923183e-05, "loss": 0.6991, "step": 260 }, { "epoch": 5.482233502538071, "grad_norm": 0.51318359375, "learning_rate": 8.404001049666211e-05, "loss": 0.6612, "step": 270 }, { "epoch": 5.685279187817259, "grad_norm": 0.50341796875, "learning_rate": 7.774790660436858e-05, "loss": 0.6829, "step": 280 }, { "epoch": 5.888324873096447, "grad_norm": 0.513671875, "learning_rate": 7.154724133689677e-05, "loss": 0.6576, "step": 290 }, { "epoch": 6.091370558375634, "grad_norm": 0.471435546875, "learning_rate": 6.546349455786926e-05, "loss": 0.6669, "step": 300 }, { "epoch": 6.2944162436548226, "grad_norm": 0.55224609375, "learning_rate": 5.952166568776062e-05, "loss": 0.6569, "step": 310 }, { "epoch": 6.49746192893401, "grad_norm": 0.5986328125, "learning_rate": 5.37461709759165e-05, "loss": 0.6517, "step": 320 }, { "epoch": 6.700507614213198, "grad_norm": 0.5849609375, "learning_rate": 4.8160743168947496e-05, "loss": 0.6627, "step": 330 }, { "epoch": 6.903553299492386, "grad_norm": 0.54345703125, "learning_rate": 4.278833398778306e-05, "loss": 0.6526, "step": 340 }, { "epoch": 7.106598984771574, "grad_norm": 0.53125, "learning_rate": 3.7651019814126654e-05, "loss": 0.652, "step": 350 }, { "epoch": 7.309644670050761, "grad_norm": 0.51611328125, "learning_rate": 3.276991097386831e-05, "loss": 0.6237, "step": 360 }, { "epoch": 7.5126903553299496, "grad_norm": 0.5654296875, "learning_rate": 2.8165064990227252e-05, "loss": 0.6313, "step": 370 }, { "epoch": 7.715736040609137, "grad_norm": 0.5810546875, "learning_rate": 2.3855404163086558e-05, "loss": 0.6452, "step": 380 }, { "epoch": 7.918781725888325, "grad_norm": 0.59423828125, "learning_rate": 1.985863781320435e-05, "loss": 0.6412, "step": 390 }, { "epoch": 8.121827411167512, "grad_norm": 0.55712890625, "learning_rate": 1.619118951081594e-05, "loss": 0.6532, "step": 400 }, { "epoch": 8.3248730964467, "grad_norm": 0.60595703125, "learning_rate": 1.286812958766106e-05, "loss": 0.6271, "step": 410 }, { "epoch": 8.527918781725889, "grad_norm": 0.580078125, "learning_rate": 9.903113209758096e-06, "loss": 0.6202, "step": 420 }, { "epoch": 8.730964467005077, "grad_norm": 0.54833984375, "learning_rate": 7.308324265397836e-06, "loss": 0.6382, "step": 430 }, { "epoch": 8.934010152284264, "grad_norm": 0.53759765625, "learning_rate": 5.094425298933136e-06, "loss": 0.6374, "step": 440 }, { "epoch": 9.137055837563452, "grad_norm": 0.591796875, "learning_rate": 3.270513696097055e-06, "loss": 0.6319, "step": 450 }, { "epoch": 9.34010152284264, "grad_norm": 0.54638671875, "learning_rate": 1.8440843008934561e-06, "loss": 0.6336, "step": 460 }, { "epoch": 9.543147208121827, "grad_norm": 0.56494140625, "learning_rate": 8.209986176753948e-07, "loss": 0.6319, "step": 470 }, { "epoch": 9.746192893401016, "grad_norm": 0.55029296875, "learning_rate": 2.054607249663665e-07, "loss": 0.6295, "step": 480 }, { "epoch": 9.949238578680204, "grad_norm": 0.5234375, "learning_rate": 0.0, "loss": 0.6314, "step": 490 }, { "epoch": 9.949238578680204, "step": 490, "total_flos": 2.391072030326784e+16, "train_loss": 0.6941771516994554, "train_runtime": 436.887, "train_samples_per_second": 4.509, "train_steps_per_second": 1.122 } ], "logging_steps": 10, "max_steps": 490, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.391072030326784e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }