{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.990490124359912, "eval_steps": 500, "global_step": 510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.058522311631309436, "grad_norm": 1.5647390529813434, "learning_rate": 5e-06, "loss": 0.8022, "step": 10 }, { "epoch": 0.11704462326261887, "grad_norm": 2.4298906740936874, "learning_rate": 5e-06, "loss": 0.7306, "step": 20 }, { "epoch": 0.1755669348939283, "grad_norm": 1.1278550883615037, "learning_rate": 5e-06, "loss": 0.7122, "step": 30 }, { "epoch": 0.23408924652523774, "grad_norm": 1.1246574687423805, "learning_rate": 5e-06, "loss": 0.6975, "step": 40 }, { "epoch": 0.29261155815654716, "grad_norm": 1.0811775432928663, "learning_rate": 5e-06, "loss": 0.6821, "step": 50 }, { "epoch": 0.3511338697878566, "grad_norm": 0.8345121386846462, "learning_rate": 5e-06, "loss": 0.6822, "step": 60 }, { "epoch": 0.40965618141916604, "grad_norm": 0.502423577542533, "learning_rate": 5e-06, "loss": 0.6631, "step": 70 }, { "epoch": 0.4681784930504755, "grad_norm": 0.3206403744702351, "learning_rate": 5e-06, "loss": 0.6566, "step": 80 }, { "epoch": 0.5267008046817849, "grad_norm": 0.3373586439028653, "learning_rate": 5e-06, "loss": 0.6613, "step": 90 }, { "epoch": 0.5852231163130943, "grad_norm": 0.27440465078078524, "learning_rate": 5e-06, "loss": 0.6497, "step": 100 }, { "epoch": 0.6437454279444038, "grad_norm": 0.25729298157504654, "learning_rate": 5e-06, "loss": 0.6506, "step": 110 }, { "epoch": 0.7022677395757132, "grad_norm": 0.2774645357576214, "learning_rate": 5e-06, "loss": 0.6479, "step": 120 }, { "epoch": 0.7607900512070227, "grad_norm": 0.2750786001903561, "learning_rate": 5e-06, "loss": 0.6509, "step": 130 }, { "epoch": 0.8193123628383321, "grad_norm": 0.3018000353503424, "learning_rate": 5e-06, "loss": 0.6532, "step": 140 }, { "epoch": 0.8778346744696416, "grad_norm": 0.2651836266343764, "learning_rate": 5e-06, "loss": 0.6407, "step": 150 }, { "epoch": 0.936356986100951, "grad_norm": 0.2621590800809169, "learning_rate": 5e-06, "loss": 0.646, "step": 160 }, { "epoch": 0.9948792977322605, "grad_norm": 0.274425449696112, "learning_rate": 5e-06, "loss": 0.6429, "step": 170 }, { "epoch": 0.9948792977322605, "eval_loss": 0.6455708742141724, "eval_runtime": 172.3102, "eval_samples_per_second": 53.444, "eval_steps_per_second": 0.418, "step": 170 }, { "epoch": 1.0563277249451353, "grad_norm": 0.32659804954217203, "learning_rate": 5e-06, "loss": 0.6596, "step": 180 }, { "epoch": 1.1148500365764447, "grad_norm": 0.29235225467356285, "learning_rate": 5e-06, "loss": 0.6219, "step": 190 }, { "epoch": 1.1733723482077543, "grad_norm": 0.2686911846272285, "learning_rate": 5e-06, "loss": 0.6246, "step": 200 }, { "epoch": 1.2318946598390637, "grad_norm": 0.2689856133611371, "learning_rate": 5e-06, "loss": 0.618, "step": 210 }, { "epoch": 1.290416971470373, "grad_norm": 0.26872283242131406, "learning_rate": 5e-06, "loss": 0.6202, "step": 220 }, { "epoch": 1.3489392831016827, "grad_norm": 0.301091252809549, "learning_rate": 5e-06, "loss": 0.618, "step": 230 }, { "epoch": 1.4074615947329918, "grad_norm": 0.2920775430394786, "learning_rate": 5e-06, "loss": 0.6142, "step": 240 }, { "epoch": 1.4659839063643014, "grad_norm": 0.2456820075171799, "learning_rate": 5e-06, "loss": 0.6155, "step": 250 }, { "epoch": 1.5245062179956108, "grad_norm": 0.2938378044663654, "learning_rate": 5e-06, "loss": 0.6187, "step": 260 }, { "epoch": 1.5830285296269202, "grad_norm": 0.32438651891226156, "learning_rate": 5e-06, "loss": 0.6219, "step": 270 }, { "epoch": 1.6415508412582298, "grad_norm": 0.25545801371272864, "learning_rate": 5e-06, "loss": 0.616, "step": 280 }, { "epoch": 1.700073152889539, "grad_norm": 0.26294073057220163, "learning_rate": 5e-06, "loss": 0.6127, "step": 290 }, { "epoch": 1.7585954645208486, "grad_norm": 0.26462245389002803, "learning_rate": 5e-06, "loss": 0.6168, "step": 300 }, { "epoch": 1.817117776152158, "grad_norm": 0.2847262707293318, "learning_rate": 5e-06, "loss": 0.6172, "step": 310 }, { "epoch": 1.8756400877834674, "grad_norm": 0.2669714428041422, "learning_rate": 5e-06, "loss": 0.614, "step": 320 }, { "epoch": 1.934162399414777, "grad_norm": 0.25457144598514, "learning_rate": 5e-06, "loss": 0.6166, "step": 330 }, { "epoch": 1.9926847110460864, "grad_norm": 0.2608967910015083, "learning_rate": 5e-06, "loss": 0.6126, "step": 340 }, { "epoch": 1.9926847110460864, "eval_loss": 0.6363422274589539, "eval_runtime": 172.306, "eval_samples_per_second": 53.446, "eval_steps_per_second": 0.418, "step": 340 }, { "epoch": 2.0541331382589614, "grad_norm": 0.271883921683299, "learning_rate": 5e-06, "loss": 0.6297, "step": 350 }, { "epoch": 2.1126554498902705, "grad_norm": 0.24729272080119263, "learning_rate": 5e-06, "loss": 0.5897, "step": 360 }, { "epoch": 2.17117776152158, "grad_norm": 0.27092891797600144, "learning_rate": 5e-06, "loss": 0.5946, "step": 370 }, { "epoch": 2.2297000731528893, "grad_norm": 0.3032127102208398, "learning_rate": 5e-06, "loss": 0.594, "step": 380 }, { "epoch": 2.288222384784199, "grad_norm": 0.25853126440367846, "learning_rate": 5e-06, "loss": 0.588, "step": 390 }, { "epoch": 2.3467446964155085, "grad_norm": 0.3077689025344159, "learning_rate": 5e-06, "loss": 0.5943, "step": 400 }, { "epoch": 2.4052670080468177, "grad_norm": 0.2827487146132787, "learning_rate": 5e-06, "loss": 0.5933, "step": 410 }, { "epoch": 2.4637893196781273, "grad_norm": 0.2519214403191199, "learning_rate": 5e-06, "loss": 0.5898, "step": 420 }, { "epoch": 2.522311631309437, "grad_norm": 0.2751668540595721, "learning_rate": 5e-06, "loss": 0.588, "step": 430 }, { "epoch": 2.580833942940746, "grad_norm": 0.2530698336402752, "learning_rate": 5e-06, "loss": 0.5883, "step": 440 }, { "epoch": 2.6393562545720557, "grad_norm": 0.25471213766207895, "learning_rate": 5e-06, "loss": 0.5951, "step": 450 }, { "epoch": 2.6978785662033653, "grad_norm": 0.29077003251470107, "learning_rate": 5e-06, "loss": 0.5914, "step": 460 }, { "epoch": 2.7564008778346745, "grad_norm": 0.30152118910674564, "learning_rate": 5e-06, "loss": 0.5917, "step": 470 }, { "epoch": 2.8149231894659836, "grad_norm": 0.26709177034419973, "learning_rate": 5e-06, "loss": 0.5923, "step": 480 }, { "epoch": 2.8734455010972932, "grad_norm": 0.23395614388611538, "learning_rate": 5e-06, "loss": 0.5888, "step": 490 }, { "epoch": 2.931967812728603, "grad_norm": 0.28669880402317394, "learning_rate": 5e-06, "loss": 0.5865, "step": 500 }, { "epoch": 2.990490124359912, "grad_norm": 0.2531977873715163, "learning_rate": 5e-06, "loss": 0.5898, "step": 510 }, { "epoch": 2.990490124359912, "eval_loss": 0.63369220495224, "eval_runtime": 171.9297, "eval_samples_per_second": 53.563, "eval_steps_per_second": 0.419, "step": 510 }, { "epoch": 2.990490124359912, "step": 510, "total_flos": 2138433883471872.0, "train_loss": 0.6292863135244332, "train_runtime": 27713.9731, "train_samples_per_second": 18.939, "train_steps_per_second": 0.018 } ], "logging_steps": 10, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2138433883471872.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }