{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02557544757033248, "grad_norm": 1.875, "learning_rate": 2.5e-05, "loss": 0.8604, "step": 10 }, { "epoch": 0.05115089514066496, "grad_norm": 1.03125, "learning_rate": 5e-05, "loss": 0.6219, "step": 20 }, { "epoch": 0.07672634271099744, "grad_norm": 0.80078125, "learning_rate": 7.500000000000001e-05, "loss": 0.5142, "step": 30 }, { "epoch": 0.10230179028132992, "grad_norm": 0.71484375, "learning_rate": 0.0001, "loss": 0.4471, "step": 40 }, { "epoch": 0.1278772378516624, "grad_norm": 0.6640625, "learning_rate": 9.715099715099715e-05, "loss": 0.4005, "step": 50 }, { "epoch": 0.1534526854219949, "grad_norm": 0.58984375, "learning_rate": 9.430199430199431e-05, "loss": 0.3637, "step": 60 }, { "epoch": 0.17902813299232737, "grad_norm": 0.57421875, "learning_rate": 9.145299145299146e-05, "loss": 0.35, "step": 70 }, { "epoch": 0.20460358056265984, "grad_norm": 0.5078125, "learning_rate": 8.860398860398861e-05, "loss": 0.3432, "step": 80 }, { "epoch": 0.23017902813299232, "grad_norm": 0.50390625, "learning_rate": 8.575498575498576e-05, "loss": 0.3338, "step": 90 }, { "epoch": 0.2557544757033248, "grad_norm": 0.58203125, "learning_rate": 8.290598290598292e-05, "loss": 0.3331, "step": 100 }, { "epoch": 0.2813299232736573, "grad_norm": 0.55859375, "learning_rate": 8.005698005698006e-05, "loss": 0.3215, "step": 110 }, { "epoch": 0.3069053708439898, "grad_norm": 0.546875, "learning_rate": 7.720797720797721e-05, "loss": 0.3185, "step": 120 }, { "epoch": 0.33248081841432225, "grad_norm": 0.51953125, "learning_rate": 7.435897435897436e-05, "loss": 0.3186, "step": 130 }, { "epoch": 0.35805626598465473, "grad_norm": 0.46484375, "learning_rate": 7.150997150997152e-05, "loss": 0.3156, "step": 140 }, { "epoch": 0.3836317135549872, "grad_norm": 0.5859375, "learning_rate": 6.866096866096867e-05, "loss": 0.3106, "step": 150 }, { "epoch": 0.4092071611253197, "grad_norm": 0.48046875, "learning_rate": 6.581196581196581e-05, "loss": 0.3146, "step": 160 }, { "epoch": 0.43478260869565216, "grad_norm": 0.53515625, "learning_rate": 6.296296296296296e-05, "loss": 0.3135, "step": 170 }, { "epoch": 0.46035805626598464, "grad_norm": 0.490234375, "learning_rate": 6.011396011396012e-05, "loss": 0.3059, "step": 180 }, { "epoch": 0.4859335038363171, "grad_norm": 0.609375, "learning_rate": 5.726495726495726e-05, "loss": 0.3049, "step": 190 }, { "epoch": 0.5115089514066496, "grad_norm": 0.53125, "learning_rate": 5.441595441595442e-05, "loss": 0.3032, "step": 200 }, { "epoch": 0.5370843989769821, "grad_norm": 0.53125, "learning_rate": 5.156695156695157e-05, "loss": 0.2889, "step": 210 }, { "epoch": 0.5626598465473146, "grad_norm": 0.50390625, "learning_rate": 4.871794871794872e-05, "loss": 0.2971, "step": 220 }, { "epoch": 0.5882352941176471, "grad_norm": 0.546875, "learning_rate": 4.586894586894587e-05, "loss": 0.3088, "step": 230 }, { "epoch": 0.6138107416879796, "grad_norm": 0.57421875, "learning_rate": 4.301994301994302e-05, "loss": 0.2977, "step": 240 }, { "epoch": 0.639386189258312, "grad_norm": 0.578125, "learning_rate": 4.0170940170940174e-05, "loss": 0.2956, "step": 250 }, { "epoch": 0.6649616368286445, "grad_norm": 0.546875, "learning_rate": 3.732193732193732e-05, "loss": 0.2953, "step": 260 }, { "epoch": 0.690537084398977, "grad_norm": 0.49609375, "learning_rate": 3.4472934472934476e-05, "loss": 0.2955, "step": 270 }, { "epoch": 0.7161125319693095, "grad_norm": 0.49609375, "learning_rate": 3.162393162393162e-05, "loss": 0.2892, "step": 280 }, { "epoch": 0.7416879795396419, "grad_norm": 0.486328125, "learning_rate": 2.8774928774928778e-05, "loss": 0.281, "step": 290 }, { "epoch": 0.7672634271099744, "grad_norm": 0.4921875, "learning_rate": 2.5925925925925925e-05, "loss": 0.2911, "step": 300 }, { "epoch": 0.7928388746803069, "grad_norm": 0.61328125, "learning_rate": 2.307692307692308e-05, "loss": 0.2943, "step": 310 }, { "epoch": 0.8184143222506394, "grad_norm": 0.53125, "learning_rate": 2.022792022792023e-05, "loss": 0.293, "step": 320 }, { "epoch": 0.8439897698209718, "grad_norm": 0.4609375, "learning_rate": 1.737891737891738e-05, "loss": 0.2815, "step": 330 }, { "epoch": 0.8695652173913043, "grad_norm": 0.4609375, "learning_rate": 1.4529914529914531e-05, "loss": 0.2871, "step": 340 }, { "epoch": 0.8951406649616368, "grad_norm": 0.55078125, "learning_rate": 1.168091168091168e-05, "loss": 0.2832, "step": 350 }, { "epoch": 0.9207161125319693, "grad_norm": 0.55859375, "learning_rate": 8.831908831908831e-06, "loss": 0.289, "step": 360 }, { "epoch": 0.9462915601023018, "grad_norm": 0.53125, "learning_rate": 5.982905982905984e-06, "loss": 0.2936, "step": 370 }, { "epoch": 0.9718670076726342, "grad_norm": 0.484375, "learning_rate": 3.133903133903134e-06, "loss": 0.2828, "step": 380 }, { "epoch": 0.9974424552429667, "grad_norm": 0.53515625, "learning_rate": 2.8490028490028494e-07, "loss": 0.2916, "step": 390 }, { "epoch": 1.0, "step": 391, "total_flos": 1.382893920190464e+16, "train_loss": 0.3390924896273162, "train_runtime": 504.4341, "train_samples_per_second": 49.56, "train_steps_per_second": 0.775 } ], "logging_steps": 10, "max_steps": 391, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.382893920190464e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }