{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0125, "grad_norm": 2.3149551745756822, "learning_rate": 8.333333333333334e-06, "loss": 10.3811, "step": 25 }, { "epoch": 0.025, "grad_norm": 4.476125395642584, "learning_rate": 1.6666666666666667e-05, "loss": 9.7962, "step": 50 }, { "epoch": 0.0375, "grad_norm": 3.2343787109724715, "learning_rate": 2.5e-05, "loss": 9.7445, "step": 75 }, { "epoch": 0.05, "grad_norm": 4.718081425339817, "learning_rate": 3.3333333333333335e-05, "loss": 9.7173, "step": 100 }, { "epoch": 0.05, "eval_loss": 9.740926742553711, "eval_runtime": 1.8462, "eval_samples_per_second": 3.25, "eval_steps_per_second": 0.542, "step": 100 }, { "epoch": 0.0625, "grad_norm": 3.2306919106602656, "learning_rate": 4.166666666666667e-05, "loss": 9.6781, "step": 125 }, { "epoch": 0.075, "grad_norm": 10.171404197937134, "learning_rate": 5e-05, "loss": 9.6586, "step": 150 }, { "epoch": 0.0875, "grad_norm": 5.483508791501675, "learning_rate": 4.9977474155117045e-05, "loss": 9.4947, "step": 175 }, { "epoch": 0.1, "grad_norm": 5.9759460744813975, "learning_rate": 4.9909937213563165e-05, "loss": 9.0607, "step": 200 }, { "epoch": 0.1, "eval_loss": 9.167929649353027, "eval_runtime": 1.8244, "eval_samples_per_second": 3.289, "eval_steps_per_second": 0.548, "step": 200 }, { "epoch": 0.1125, "grad_norm": 6.341385080860075, "learning_rate": 4.979751088147192e-05, "loss": 8.3272, "step": 225 }, { "epoch": 0.125, "grad_norm": 13.948046206815455, "learning_rate": 4.9640397758692715e-05, "loss": 7.2865, "step": 250 }, { "epoch": 0.1375, "grad_norm": 10.72817097098191, "learning_rate": 4.943888097369216e-05, "loss": 6.2547, "step": 275 }, { "epoch": 0.15, "grad_norm": 13.287424334539534, "learning_rate": 4.9193323673337476e-05, "loss": 5.4209, "step": 300 }, { "epoch": 0.15, "eval_loss": 8.03414249420166, "eval_runtime": 1.8224, "eval_samples_per_second": 3.292, "eval_steps_per_second": 0.549, "step": 300 }, { "epoch": 0.1625, "grad_norm": 12.101041260647385, "learning_rate": 4.890416836848127e-05, "loss": 4.4191, "step": 325 }, { "epoch": 0.175, "grad_norm": 9.80750427080603, "learning_rate": 4.857193613652711e-05, "loss": 3.2966, "step": 350 }, { "epoch": 0.1875, "grad_norm": 10.790073899029933, "learning_rate": 4.819722568241274e-05, "loss": 2.585, "step": 375 }, { "epoch": 0.2, "grad_norm": 8.653177439153787, "learning_rate": 4.77807122597034e-05, "loss": 2.083, "step": 400 }, { "epoch": 0.2, "eval_loss": 8.081077575683594, "eval_runtime": 1.7997, "eval_samples_per_second": 3.334, "eval_steps_per_second": 0.556, "step": 400 }, { "epoch": 0.2125, "grad_norm": 8.250257658909273, "learning_rate": 4.732314645373921e-05, "loss": 1.5624, "step": 425 }, { "epoch": 0.225, "grad_norm": 6.533065280213431, "learning_rate": 4.6825352829029705e-05, "loss": 1.0716, "step": 450 }, { "epoch": 0.2375, "grad_norm": 5.728706047757792, "learning_rate": 4.628822844333278e-05, "loss": 0.8864, "step": 475 }, { "epoch": 0.25, "grad_norm": 5.544182938195155, "learning_rate": 4.571274123109606e-05, "loss": 0.7022, "step": 500 }, { "epoch": 0.25, "eval_loss": 8.360109329223633, "eval_runtime": 1.8082, "eval_samples_per_second": 3.318, "eval_steps_per_second": 0.553, "step": 500 }, { "epoch": 0.2625, "grad_norm": 4.383684345209745, "learning_rate": 4.5099928259173516e-05, "loss": 0.5413, "step": 525 }, { "epoch": 0.275, "grad_norm": 4.371192250634983, "learning_rate": 4.445089385796099e-05, "loss": 0.4765, "step": 550 }, { "epoch": 0.2875, "grad_norm": 4.402713772495511, "learning_rate": 4.3766807631318106e-05, "loss": 0.4365, "step": 575 }, { "epoch": 0.3, "grad_norm": 3.8540612536413033, "learning_rate": 4.3048902348863116e-05, "loss": 0.3897, "step": 600 }, { "epoch": 0.3, "eval_loss": 8.43352222442627, "eval_runtime": 1.8212, "eval_samples_per_second": 3.294, "eval_steps_per_second": 0.549, "step": 600 }, { "epoch": 0.3125, "grad_norm": 3.7090862448632533, "learning_rate": 4.229847172443866e-05, "loss": 0.3055, "step": 625 }, { "epoch": 0.325, "grad_norm": 4.311250485288747, "learning_rate": 4.151686808475204e-05, "loss": 0.2775, "step": 650 }, { "epoch": 0.3375, "grad_norm": 3.6226429353917773, "learning_rate": 4.070549993239106e-05, "loss": 0.2508, "step": 675 }, { "epoch": 0.35, "grad_norm": 3.0403924264023483, "learning_rate": 3.986582940760717e-05, "loss": 0.2069, "step": 700 }, { "epoch": 0.35, "eval_loss": 8.551321983337402, "eval_runtime": 1.8234, "eval_samples_per_second": 3.291, "eval_steps_per_second": 0.548, "step": 700 }, { "epoch": 0.3625, "grad_norm": 2.964832791061043, "learning_rate": 3.899936965343989e-05, "loss": 0.1849, "step": 725 }, { "epoch": 0.375, "grad_norm": 3.120939904209259, "learning_rate": 3.8107682088930794e-05, "loss": 0.1814, "step": 750 }, { "epoch": 0.3875, "grad_norm": 2.871839463342702, "learning_rate": 3.719237359534087e-05, "loss": 0.1624, "step": 775 }, { "epoch": 0.4, "grad_norm": 3.613374823862504, "learning_rate": 3.6255093620441834e-05, "loss": 0.137, "step": 800 }, { "epoch": 0.4, "eval_loss": 8.811583518981934, "eval_runtime": 1.8083, "eval_samples_per_second": 3.318, "eval_steps_per_second": 0.553, "step": 800 }, { "epoch": 0.4125, "grad_norm": 2.8236114670073817, "learning_rate": 3.529753120609982e-05, "loss": 0.1381, "step": 825 }, { "epoch": 0.425, "grad_norm": 2.7770161359397014, "learning_rate": 3.432141194450772e-05, "loss": 0.1286, "step": 850 }, { "epoch": 0.4375, "grad_norm": 2.284640802585632, "learning_rate": 3.332849486855144e-05, "loss": 0.1231, "step": 875 }, { "epoch": 0.45, "grad_norm": 2.209989274456459, "learning_rate": 3.232056928191376e-05, "loss": 0.113, "step": 900 }, { "epoch": 0.45, "eval_loss": 8.732518196105957, "eval_runtime": 1.8171, "eval_samples_per_second": 3.302, "eval_steps_per_second": 0.55, "step": 900 }, { "epoch": 0.4625, "grad_norm": 2.276743831010252, "learning_rate": 3.1299451534628135e-05, "loss": 0.1108, "step": 925 }, { "epoch": 0.475, "grad_norm": 2.1372418619941436, "learning_rate": 3.0266981749893157e-05, "loss": 0.1032, "step": 950 }, { "epoch": 0.4875, "grad_norm": 2.043672361244138, "learning_rate": 2.9225020508046232e-05, "loss": 0.132, "step": 975 }, { "epoch": 0.5, "grad_norm": 1.9741157214596652, "learning_rate": 2.8175445493671972e-05, "loss": 0.0908, "step": 1000 }, { "epoch": 0.5, "eval_loss": 8.757107734680176, "eval_runtime": 1.8115, "eval_samples_per_second": 3.312, "eval_steps_per_second": 0.552, "step": 1000 }, { "epoch": 0.5125, "grad_norm": 1.9698096326525745, "learning_rate": 2.7120148111887732e-05, "loss": 0.1028, "step": 1025 }, { "epoch": 0.525, "grad_norm": 1.5292668769105346, "learning_rate": 2.606103007990371e-05, "loss": 0.1041, "step": 1050 }, { "epoch": 0.5375, "grad_norm": 1.6321666587082082, "learning_rate": 2.5e-05, "loss": 0.0738, "step": 1075 }, { "epoch": 0.55, "grad_norm": 1.6792190003403862, "learning_rate": 2.39389699200963e-05, "loss": 0.0781, "step": 1100 }, { "epoch": 0.55, "eval_loss": 8.770298957824707, "eval_runtime": 1.822, "eval_samples_per_second": 3.293, "eval_steps_per_second": 0.549, "step": 1100 }, { "epoch": 0.5625, "grad_norm": 1.7765341038422486, "learning_rate": 2.287985188811228e-05, "loss": 0.0759, "step": 1125 }, { "epoch": 0.575, "grad_norm": 1.4659843578972909, "learning_rate": 2.182455450632803e-05, "loss": 0.0688, "step": 1150 }, { "epoch": 0.5875, "grad_norm": 1.6449380567389906, "learning_rate": 2.0774979491953777e-05, "loss": 0.0681, "step": 1175 }, { "epoch": 0.6, "grad_norm": 1.643618719970827, "learning_rate": 1.973301825010685e-05, "loss": 0.0669, "step": 1200 }, { "epoch": 0.6, "eval_loss": 8.826465606689453, "eval_runtime": 1.8186, "eval_samples_per_second": 3.299, "eval_steps_per_second": 0.55, "step": 1200 }, { "epoch": 0.6125, "grad_norm": 1.4801071508891448, "learning_rate": 1.8700548465371874e-05, "loss": 0.0715, "step": 1225 }, { "epoch": 0.625, "grad_norm": 1.217350865691875, "learning_rate": 1.7679430718086243e-05, "loss": 0.0514, "step": 1250 }, { "epoch": 0.6375, "grad_norm": 1.4646837440162195, "learning_rate": 1.667150513144856e-05, "loss": 0.0558, "step": 1275 }, { "epoch": 0.65, "grad_norm": 1.479394797108792, "learning_rate": 1.567858805549229e-05, "loss": 0.0663, "step": 1300 }, { "epoch": 0.65, "eval_loss": 8.926370620727539, "eval_runtime": 1.8078, "eval_samples_per_second": 3.319, "eval_steps_per_second": 0.553, "step": 1300 }, { "epoch": 0.6625, "grad_norm": 1.3062143881028578, "learning_rate": 1.4702468793900188e-05, "loss": 0.0659, "step": 1325 }, { "epoch": 0.675, "grad_norm": 1.2676295117324594, "learning_rate": 1.3744906379558165e-05, "loss": 0.0547, "step": 1350 }, { "epoch": 0.6875, "grad_norm": 1.2398361251400454, "learning_rate": 1.2807626404659142e-05, "loss": 0.0639, "step": 1375 }, { "epoch": 0.7, "grad_norm": 1.255879844869929, "learning_rate": 1.1892317911069212e-05, "loss": 0.0552, "step": 1400 }, { "epoch": 0.7, "eval_loss": 8.901784896850586, "eval_runtime": 1.8039, "eval_samples_per_second": 3.326, "eval_steps_per_second": 0.554, "step": 1400 }, { "epoch": 0.7125, "grad_norm": 1.2575893365273807, "learning_rate": 1.1000630346560117e-05, "loss": 0.0535, "step": 1425 }, { "epoch": 0.725, "grad_norm": 1.258140331853837, "learning_rate": 1.0134170592392836e-05, "loss": 0.0637, "step": 1450 }, { "epoch": 0.7375, "grad_norm": 0.9287968743161679, "learning_rate": 9.29450006760894e-06, "loss": 0.0487, "step": 1475 }, { "epoch": 0.75, "grad_norm": 1.0046137241749171, "learning_rate": 8.483131915247968e-06, "loss": 0.0424, "step": 1500 }, { "epoch": 0.75, "eval_loss": 8.992987632751465, "eval_runtime": 1.8084, "eval_samples_per_second": 3.318, "eval_steps_per_second": 0.553, "step": 1500 }, { "epoch": 0.7625, "grad_norm": 0.9579032095373672, "learning_rate": 7.701528275561348e-06, "loss": 0.0576, "step": 1525 }, { "epoch": 0.775, "grad_norm": 1.125673487118933, "learning_rate": 6.951097651136889e-06, "loss": 0.0466, "step": 1550 }, { "epoch": 0.7875, "grad_norm": 1.0874236683551293, "learning_rate": 6.23319236868189e-06, "loss": 0.0437, "step": 1575 }, { "epoch": 0.8, "grad_norm": 0.8479867385237028, "learning_rate": 5.549106142039018e-06, "loss": 0.0405, "step": 1600 }, { "epoch": 0.8, "eval_loss": 9.112229347229004, "eval_runtime": 1.8057, "eval_samples_per_second": 3.323, "eval_steps_per_second": 0.554, "step": 1600 }, { "epoch": 0.8125, "grad_norm": 0.9113596204684477, "learning_rate": 4.900071740826489e-06, "loss": 0.0379, "step": 1625 }, { "epoch": 0.825, "grad_norm": 1.1432797389197238, "learning_rate": 4.2872587689039484e-06, "loss": 0.0494, "step": 1650 }, { "epoch": 0.8375, "grad_norm": 0.7567826498210426, "learning_rate": 3.711771556667218e-06, "loss": 0.0433, "step": 1675 }, { "epoch": 0.85, "grad_norm": 0.8659616887551557, "learning_rate": 3.1746471709702964e-06, "loss": 0.0402, "step": 1700 }, { "epoch": 0.85, "eval_loss": 9.140899658203125, "eval_runtime": 1.8129, "eval_samples_per_second": 3.31, "eval_steps_per_second": 0.552, "step": 1700 }, { "epoch": 0.8625, "grad_norm": 0.727880872866104, "learning_rate": 2.676853546260791e-06, "loss": 0.0491, "step": 1725 }, { "epoch": 0.875, "grad_norm": 1.0286223878402718, "learning_rate": 2.219287740296605e-06, "loss": 0.0344, "step": 1750 }, { "epoch": 0.8875, "grad_norm": 0.8881665782796215, "learning_rate": 1.8027743175872664e-06, "loss": 0.0413, "step": 1775 }, { "epoch": 0.9, "grad_norm": 0.9885092207331243, "learning_rate": 1.428063863472895e-06, "loss": 0.048, "step": 1800 }, { "epoch": 0.9, "eval_loss": 9.0552339553833, "eval_runtime": 1.8028, "eval_samples_per_second": 3.328, "eval_steps_per_second": 0.555, "step": 1800 }, { "epoch": 0.9125, "grad_norm": 0.9002848001580246, "learning_rate": 1.095831631518729e-06, "loss": 0.042, "step": 1825 }, { "epoch": 0.925, "grad_norm": 1.0463010120550167, "learning_rate": 8.066763266625282e-07, "loss": 0.051, "step": 1850 }, { "epoch": 0.9375, "grad_norm": 0.9826340095548511, "learning_rate": 5.611190263078464e-07, "loss": 0.0405, "step": 1875 }, { "epoch": 0.95, "grad_norm": 0.922633621151681, "learning_rate": 3.5960224130728857e-07, "loss": 0.0497, "step": 1900 }, { "epoch": 0.95, "eval_loss": 9.048293113708496, "eval_runtime": 1.8038, "eval_samples_per_second": 3.326, "eval_steps_per_second": 0.554, "step": 1900 }, { "epoch": 0.9625, "grad_norm": 1.0603289653790473, "learning_rate": 2.0248911852807916e-07, "loss": 0.0444, "step": 1925 }, { "epoch": 0.975, "grad_norm": 0.6803418512340275, "learning_rate": 9.006278643683696e-08, "loss": 0.0403, "step": 1950 }, { "epoch": 0.9875, "grad_norm": 0.7680662875965549, "learning_rate": 2.2525844882964607e-08, "loss": 0.0455, "step": 1975 }, { "epoch": 1.0, "grad_norm": 0.9001414861768687, "learning_rate": 0.0, "loss": 0.0556, "step": 2000 }, { "epoch": 1.0, "eval_loss": 9.248927116394043, "eval_runtime": 1.8161, "eval_samples_per_second": 3.304, "eval_steps_per_second": 0.551, "step": 2000 } ], "logging_steps": 25, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 78507556208640.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }