{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990586758707248, "eval_steps": 500, "global_step": 398, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025101976780671477, "grad_norm": 11.570974273934945, "learning_rate": 5e-06, "loss": 0.9144, "step": 10 }, { "epoch": 0.050203953561342954, "grad_norm": 3.2868828349894663, "learning_rate": 5e-06, "loss": 0.7934, "step": 20 }, { "epoch": 0.07530593034201444, "grad_norm": 1.3621580098977089, "learning_rate": 5e-06, "loss": 0.7581, "step": 30 }, { "epoch": 0.10040790712268591, "grad_norm": 1.1465743337293792, "learning_rate": 5e-06, "loss": 0.718, "step": 40 }, { "epoch": 0.12550988390335738, "grad_norm": 1.8313362803176332, "learning_rate": 5e-06, "loss": 0.6988, "step": 50 }, { "epoch": 0.15061186068402888, "grad_norm": 0.8106493906617342, "learning_rate": 5e-06, "loss": 0.6753, "step": 60 }, { "epoch": 0.17571383746470035, "grad_norm": 1.3540466482658315, "learning_rate": 5e-06, "loss": 0.6672, "step": 70 }, { "epoch": 0.20081581424537182, "grad_norm": 0.5964949627381437, "learning_rate": 5e-06, "loss": 0.6551, "step": 80 }, { "epoch": 0.2259177910260433, "grad_norm": 0.5688152556152026, "learning_rate": 5e-06, "loss": 0.6532, "step": 90 }, { "epoch": 0.25101976780671476, "grad_norm": 0.6773755092137309, "learning_rate": 5e-06, "loss": 0.6464, "step": 100 }, { "epoch": 0.27612174458738625, "grad_norm": 0.5774859088110325, "learning_rate": 5e-06, "loss": 0.6405, "step": 110 }, { "epoch": 0.30122372136805775, "grad_norm": 0.5522163553259679, "learning_rate": 5e-06, "loss": 0.6312, "step": 120 }, { "epoch": 0.3263256981487292, "grad_norm": 0.5727079237004361, "learning_rate": 5e-06, "loss": 0.6291, "step": 130 }, { "epoch": 0.3514276749294007, "grad_norm": 0.5373258099366229, "learning_rate": 5e-06, "loss": 0.6348, "step": 140 }, { "epoch": 0.3765296517100722, "grad_norm": 0.5642619689588081, "learning_rate": 5e-06, "loss": 0.6252, "step": 150 }, { "epoch": 0.40163162849074363, "grad_norm": 0.647603552543901, "learning_rate": 5e-06, "loss": 0.6219, "step": 160 }, { "epoch": 0.42673360527141513, "grad_norm": 0.6137422702457818, "learning_rate": 5e-06, "loss": 0.6234, "step": 170 }, { "epoch": 0.4518355820520866, "grad_norm": 0.5654990824228964, "learning_rate": 5e-06, "loss": 0.6139, "step": 180 }, { "epoch": 0.47693755883275807, "grad_norm": 0.7024949610501936, "learning_rate": 5e-06, "loss": 0.6195, "step": 190 }, { "epoch": 0.5020395356134295, "grad_norm": 0.5820569269038963, "learning_rate": 5e-06, "loss": 0.6125, "step": 200 }, { "epoch": 0.527141512394101, "grad_norm": 0.7020933693248848, "learning_rate": 5e-06, "loss": 0.6086, "step": 210 }, { "epoch": 0.5522434891747725, "grad_norm": 0.604199603689463, "learning_rate": 5e-06, "loss": 0.6188, "step": 220 }, { "epoch": 0.577345465955444, "grad_norm": 0.5647954363455674, "learning_rate": 5e-06, "loss": 0.6032, "step": 230 }, { "epoch": 0.6024474427361155, "grad_norm": 0.62231194375424, "learning_rate": 5e-06, "loss": 0.6097, "step": 240 }, { "epoch": 0.627549419516787, "grad_norm": 0.6954867326199731, "learning_rate": 5e-06, "loss": 0.6062, "step": 250 }, { "epoch": 0.6526513962974584, "grad_norm": 0.6683159364143533, "learning_rate": 5e-06, "loss": 0.6081, "step": 260 }, { "epoch": 0.6777533730781299, "grad_norm": 0.7460632306114617, "learning_rate": 5e-06, "loss": 0.6051, "step": 270 }, { "epoch": 0.7028553498588014, "grad_norm": 0.5069551815996469, "learning_rate": 5e-06, "loss": 0.6035, 
"step": 280 }, { "epoch": 0.7279573266394729, "grad_norm": 0.7002297529886885, "learning_rate": 5e-06, "loss": 0.6037, "step": 290 }, { "epoch": 0.7530593034201444, "grad_norm": 0.6956058201700761, "learning_rate": 5e-06, "loss": 0.5979, "step": 300 }, { "epoch": 0.7781612802008158, "grad_norm": 0.5991932647306116, "learning_rate": 5e-06, "loss": 0.5981, "step": 310 }, { "epoch": 0.8032632569814873, "grad_norm": 0.5275465592448361, "learning_rate": 5e-06, "loss": 0.5933, "step": 320 }, { "epoch": 0.8283652337621588, "grad_norm": 0.6290119559084576, "learning_rate": 5e-06, "loss": 0.5945, "step": 330 }, { "epoch": 0.8534672105428303, "grad_norm": 0.5130802873233022, "learning_rate": 5e-06, "loss": 0.5936, "step": 340 }, { "epoch": 0.8785691873235018, "grad_norm": 0.7421613361549775, "learning_rate": 5e-06, "loss": 0.5948, "step": 350 }, { "epoch": 0.9036711641041733, "grad_norm": 0.6153969499038726, "learning_rate": 5e-06, "loss": 0.5824, "step": 360 }, { "epoch": 0.9287731408848446, "grad_norm": 0.6034204487183606, "learning_rate": 5e-06, "loss": 0.5847, "step": 370 }, { "epoch": 0.9538751176655161, "grad_norm": 0.592301687922246, "learning_rate": 5e-06, "loss": 0.5908, "step": 380 }, { "epoch": 0.9789770944461876, "grad_norm": 0.6008165953019412, "learning_rate": 5e-06, "loss": 0.5911, "step": 390 }, { "epoch": 0.9990586758707248, "eval_loss": 0.5867129564285278, "eval_runtime": 271.3794, "eval_samples_per_second": 39.553, "eval_steps_per_second": 0.619, "step": 398 } ], "logging_steps": 10, "max_steps": 1194, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 667083582996480.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }