{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0061002732414056045, "eval_steps": 500, "global_step": 384, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002541780517252335, "grad_norm": 11.004426002502441, "learning_rate": 3.90625e-06, "loss": 11.0192, "step": 16 }, { "epoch": 0.000508356103450467, "grad_norm": 11.49954605102539, "learning_rate": 7.8125e-06, "loss": 10.7833, "step": 32 }, { "epoch": 0.0007625341551757006, "grad_norm": 13.807037353515625, "learning_rate": 1.171875e-05, "loss": 10.0912, "step": 48 }, { "epoch": 0.001016712206900934, "grad_norm": 20.37409019470215, "learning_rate": 1.5625e-05, "loss": 7.7976, "step": 64 }, { "epoch": 0.0012708902586261676, "grad_norm": 5.419431686401367, "learning_rate": 1.953125e-05, "loss": 4.0052, "step": 80 }, { "epoch": 0.0015250683103514011, "grad_norm": 1.0872530937194824, "learning_rate": 2.34375e-05, "loss": 2.9534, "step": 96 }, { "epoch": 0.0017792463620766347, "grad_norm": 0.634519636631012, "learning_rate": 2.734375e-05, "loss": 2.7875, "step": 112 }, { "epoch": 0.002033424413801868, "grad_norm": 0.551450788974762, "learning_rate": 3.125e-05, "loss": 2.7139, "step": 128 }, { "epoch": 0.002287602465527102, "grad_norm": 0.7493265271186829, "learning_rate": 3.5156250000000004e-05, "loss": 2.4613, "step": 144 }, { "epoch": 0.002541780517252335, "grad_norm": 0.5198466181755066, "learning_rate": 3.90625e-05, "loss": 2.2055, "step": 160 }, { "epoch": 0.002795958568977569, "grad_norm": 0.326664000749588, "learning_rate": 4.296875e-05, "loss": 2.0689, "step": 176 }, { "epoch": 0.0030501366207028023, "grad_norm": 0.26472437381744385, "learning_rate": 4.6875e-05, "loss": 2.0621, "step": 192 }, { "epoch": 0.003304314672428036, "grad_norm": 0.20516088604927063, "learning_rate": 5.0781250000000004e-05, "loss": 2.0306, "step": 208 }, { "epoch": 0.0035584927241532694, "grad_norm": 0.1534327268600464, "learning_rate": 5.46875e-05, "loss": 1.8969, "step": 224 }, { "epoch": 0.0038126707758785027, "grad_norm": 0.12809991836547852, "learning_rate": 5.859375e-05, "loss": 1.8912, "step": 240 }, { "epoch": 0.004066848827603736, "grad_norm": 0.1291392296552658, "learning_rate": 6.25e-05, "loss": 1.8995, "step": 256 }, { "epoch": 0.00432102687932897, "grad_norm": 0.10523363947868347, "learning_rate": 6.640625e-05, "loss": 1.9408, "step": 272 }, { "epoch": 0.004575204931054204, "grad_norm": 0.11328139156103134, "learning_rate": 7.031250000000001e-05, "loss": 1.9119, "step": 288 }, { "epoch": 0.004829382982779437, "grad_norm": 0.10714733600616455, "learning_rate": 7.421875e-05, "loss": 1.8455, "step": 304 }, { "epoch": 0.00508356103450467, "grad_norm": 0.09541799128055573, "learning_rate": 7.8125e-05, "loss": 1.868, "step": 320 }, { "epoch": 0.005337739086229904, "grad_norm": 0.10002260655164719, "learning_rate": 8.203125e-05, "loss": 1.7988, "step": 336 }, { "epoch": 0.005591917137955138, "grad_norm": 0.08132267743349075, "learning_rate": 8.59375e-05, "loss": 1.8745, "step": 352 }, { "epoch": 0.005846095189680371, "grad_norm": 0.08185765147209167, "learning_rate": 8.984375e-05, "loss": 1.8365, "step": 368 }, { "epoch": 0.0061002732414056045, "grad_norm": 0.12349283695220947, "learning_rate": 9.375e-05, "loss": 1.8622, "step": 384 } ], "logging_steps": 16, "max_steps": 251792, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 384, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, 
"should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6637127819264e+16, "train_batch_size": 48, "trial_name": null, "trial_params": null }