{ "best_metric": 2.326810598373413, "best_model_checkpoint": "./output/training_results/C015_llama3-8b-base_pretrain_20240428_005832/checkpoint-155", "epoch": 1.0064935064935066, "eval_steps": 5, "global_step": 155, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006493506493506494, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.6141, "step": 1 }, { "epoch": 0.006493506493506494, "eval_loss": 2.635435104370117, "eval_runtime": 13.3872, "eval_samples_per_second": 81.421, "eval_steps_per_second": 0.672, "step": 1 }, { "epoch": 0.032467532467532464, "grad_norm": 3.736102227472634, "learning_rate": 2.25e-06, "loss": 2.657, "step": 5 }, { "epoch": 0.032467532467532464, "eval_loss": 2.6205813884735107, "eval_runtime": 13.3001, "eval_samples_per_second": 81.955, "eval_steps_per_second": 0.677, "step": 5 }, { "epoch": 0.06493506493506493, "grad_norm": 2.9360680843951172, "learning_rate": 5.25e-06, "loss": 2.6337, "step": 10 }, { "epoch": 0.06493506493506493, "eval_loss": 2.5846035480499268, "eval_runtime": 13.3634, "eval_samples_per_second": 81.566, "eval_steps_per_second": 0.673, "step": 10 }, { "epoch": 0.09740259740259741, "grad_norm": 2.4441290480949354, "learning_rate": 9e-06, "loss": 2.5268, "step": 15 }, { "epoch": 0.09740259740259741, "eval_loss": 2.5515594482421875, "eval_runtime": 13.3452, "eval_samples_per_second": 81.677, "eval_steps_per_second": 0.674, "step": 15 }, { "epoch": 0.12987012987012986, "grad_norm": 2.5052749297132455, "learning_rate": 1.275e-05, "loss": 2.5275, "step": 20 }, { "epoch": 0.12987012987012986, "eval_loss": 2.5321404933929443, "eval_runtime": 13.3665, "eval_samples_per_second": 81.547, "eval_steps_per_second": 0.673, "step": 20 }, { "epoch": 0.16233766233766234, "grad_norm": 2.570439112365278, "learning_rate": 1.4457320927100615e-05, "loss": 2.5005, "step": 25 }, { "epoch": 0.16233766233766234, "eval_loss": 2.513052225112915, "eval_runtime": 13.3936, "eval_samples_per_second": 81.382, "eval_steps_per_second": 0.672, "step": 25 }, { "epoch": 0.19480519480519481, "grad_norm": 2.1653661003420974, "learning_rate": 1.3178060763055965e-05, "loss": 2.5339, "step": 30 }, { "epoch": 0.19480519480519481, "eval_loss": 2.496060609817505, "eval_runtime": 13.4225, "eval_samples_per_second": 81.207, "eval_steps_per_second": 0.671, "step": 30 }, { "epoch": 0.22727272727272727, "grad_norm": 2.4298156508230955, "learning_rate": 1.200291011775234e-05, "loss": 2.5335, "step": 35 }, { "epoch": 0.22727272727272727, "eval_loss": 2.4807701110839844, "eval_runtime": 13.3964, "eval_samples_per_second": 81.365, "eval_steps_per_second": 0.672, "step": 35 }, { "epoch": 0.2597402597402597, "grad_norm": 2.371474812035467, "learning_rate": 1.092418047398154e-05, "loss": 2.4252, "step": 40 }, { "epoch": 0.2597402597402597, "eval_loss": 2.464339256286621, "eval_runtime": 13.3751, "eval_samples_per_second": 81.495, "eval_steps_per_second": 0.673, "step": 40 }, { "epoch": 0.2922077922077922, "grad_norm": 2.233872363768768, "learning_rate": 9.934692235419926e-06, "loss": 2.4445, "step": 45 }, { "epoch": 0.2922077922077922, "eval_loss": 2.4518375396728516, "eval_runtime": 13.4165, "eval_samples_per_second": 81.243, "eval_steps_per_second": 0.671, "step": 45 }, { "epoch": 0.3246753246753247, "grad_norm": 2.228351075418733, "learning_rate": 9.02774500281382e-06, "loss": 2.4594, "step": 50 }, { "epoch": 0.3246753246753247, "eval_loss": 2.4393582344055176, "eval_runtime": 13.3552, "eval_samples_per_second": 81.616, "eval_steps_per_second": 0.674, "step": 50 }, { "epoch": 0.35714285714285715, "grad_norm": 2.2182357621988693, "learning_rate": 8.197089350822288e-06, "loss": 2.4498, "step": 55 }, { "epoch": 0.35714285714285715, "eval_loss": 2.4287211894989014, "eval_runtime": 13.3236, "eval_samples_per_second": 81.809, "eval_steps_per_second": 0.675, "step": 55 }, { "epoch": 0.38961038961038963, "grad_norm": 2.3864673038770854, "learning_rate": 7.436900041840997e-06, "loss": 2.3821, "step": 60 }, { "epoch": 0.38961038961038963, "eval_loss": 2.4184141159057617, "eval_runtime": 13.3882, "eval_samples_per_second": 81.415, "eval_steps_per_second": 0.672, "step": 60 }, { "epoch": 0.42207792207792205, "grad_norm": 1.9510161344368442, "learning_rate": 6.741750615310939e-06, "loss": 2.4317, "step": 65 }, { "epoch": 0.42207792207792205, "eval_loss": 2.4091267585754395, "eval_runtime": 13.358, "eval_samples_per_second": 81.599, "eval_steps_per_second": 0.674, "step": 65 }, { "epoch": 0.45454545454545453, "grad_norm": 1.9482390057148204, "learning_rate": 6.106589293139538e-06, "loss": 2.3931, "step": 70 }, { "epoch": 0.45454545454545453, "eval_loss": 2.40012788772583, "eval_runtime": 13.4261, "eval_samples_per_second": 81.185, "eval_steps_per_second": 0.67, "step": 70 }, { "epoch": 0.487012987012987, "grad_norm": 2.112671713085814, "learning_rate": 5.526716143930102e-06, "loss": 2.3695, "step": 75 }, { "epoch": 0.487012987012987, "eval_loss": 2.3934359550476074, "eval_runtime": 13.3826, "eval_samples_per_second": 81.449, "eval_steps_per_second": 0.673, "step": 75 }, { "epoch": 0.5194805194805194, "grad_norm": 2.0831282196798315, "learning_rate": 4.997761450728939e-06, "loss": 2.3981, "step": 80 }, { "epoch": 0.5194805194805194, "eval_loss": 2.3855459690093994, "eval_runtime": 13.4481, "eval_samples_per_second": 81.052, "eval_steps_per_second": 0.669, "step": 80 }, { "epoch": 0.551948051948052, "grad_norm": 1.850619691727126, "learning_rate": 4.515665228960038e-06, "loss": 2.3952, "step": 85 }, { "epoch": 0.551948051948052, "eval_loss": 2.37890887260437, "eval_runtime": 13.4148, "eval_samples_per_second": 81.254, "eval_steps_per_second": 0.671, "step": 85 }, { "epoch": 0.5844155844155844, "grad_norm": 1.92650957243675, "learning_rate": 4.0766578431245434e-06, "loss": 2.4137, "step": 90 }, { "epoch": 0.5844155844155844, "eval_loss": 2.3720638751983643, "eval_runtime": 13.3735, "eval_samples_per_second": 81.504, "eval_steps_per_second": 0.673, "step": 90 }, { "epoch": 0.6168831168831169, "grad_norm": 1.9166696037048836, "learning_rate": 3.6772416726983343e-06, "loss": 2.3614, "step": 95 }, { "epoch": 0.6168831168831169, "eval_loss": 2.366936683654785, "eval_runtime": 13.4784, "eval_samples_per_second": 80.87, "eval_steps_per_second": 0.668, "step": 95 }, { "epoch": 0.6493506493506493, "grad_norm": 1.9337017607143168, "learning_rate": 3.3141737794662055e-06, "loss": 2.3467, "step": 100 }, { "epoch": 0.6493506493506493, "eval_loss": 2.361203193664551, "eval_runtime": 13.4232, "eval_samples_per_second": 81.203, "eval_steps_per_second": 0.67, "step": 100 }, { "epoch": 0.6818181818181818, "grad_norm": 1.800114241259619, "learning_rate": 2.984449530286649e-06, "loss": 2.4012, "step": 105 }, { "epoch": 0.6818181818181818, "eval_loss": 2.3568994998931885, "eval_runtime": 13.3387, "eval_samples_per_second": 81.717, "eval_steps_per_second": 0.675, "step": 105 }, { "epoch": 0.7142857142857143, "grad_norm": 1.8774879479561137, "learning_rate": 2.685287130987944e-06, "loss": 2.3224, "step": 110 }, { "epoch": 0.7142857142857143, "eval_loss": 2.352806329727173, "eval_runtime": 13.3565, "eval_samples_per_second": 81.608, "eval_steps_per_second": 0.674, "step": 110 }, { "epoch": 0.7467532467532467, "grad_norm": 1.8730632952307495, "learning_rate": 2.4141130287548048e-06, "loss": 2.3348, "step": 115 }, { "epoch": 0.7467532467532467, "eval_loss": 2.348268985748291, "eval_runtime": 13.3814, "eval_samples_per_second": 81.456, "eval_steps_per_second": 0.673, "step": 115 }, { "epoch": 0.7792207792207793, "grad_norm": 1.9700141360167842, "learning_rate": 2.168548141976706e-06, "loss": 2.3573, "step": 120 }, { "epoch": 0.7792207792207793, "eval_loss": 2.3447518348693848, "eval_runtime": 13.3872, "eval_samples_per_second": 81.421, "eval_steps_per_second": 0.672, "step": 120 }, { "epoch": 0.8116883116883117, "grad_norm": 1.8580210267921047, "learning_rate": 1.946394878094437e-06, "loss": 2.306, "step": 125 }, { "epoch": 0.8116883116883117, "eval_loss": 2.3411996364593506, "eval_runtime": 13.4315, "eval_samples_per_second": 81.153, "eval_steps_per_second": 0.67, "step": 125 }, { "epoch": 0.8441558441558441, "grad_norm": 1.8818877549057513, "learning_rate": 1.745624901501792e-06, "loss": 2.342, "step": 130 }, { "epoch": 0.8441558441558441, "eval_loss": 2.338190793991089, "eval_runtime": 13.3467, "eval_samples_per_second": 81.668, "eval_steps_per_second": 0.674, "step": 130 }, { "epoch": 0.8766233766233766, "grad_norm": 1.9087045844270383, "learning_rate": 1.564367615035273e-06, "loss": 2.3045, "step": 135 }, { "epoch": 0.8766233766233766, "eval_loss": 2.3356211185455322, "eval_runtime": 13.3981, "eval_samples_per_second": 81.355, "eval_steps_per_second": 0.672, "step": 135 }, { "epoch": 0.9090909090909091, "grad_norm": 1.8238462929669437, "learning_rate": 1.4008993200171148e-06, "loss": 2.2959, "step": 140 }, { "epoch": 0.9090909090909091, "eval_loss": 2.3329813480377197, "eval_runtime": 13.4034, "eval_samples_per_second": 81.322, "eval_steps_per_second": 0.671, "step": 140 }, { "epoch": 0.9415584415584416, "grad_norm": 1.824603087231258, "learning_rate": 1.253633021206854e-06, "loss": 2.3545, "step": 145 }, { "epoch": 0.9415584415584416, "eval_loss": 2.330482006072998, "eval_runtime": 13.3871, "eval_samples_per_second": 81.422, "eval_steps_per_second": 0.672, "step": 145 }, { "epoch": 0.974025974025974, "grad_norm": 1.7913666925001428, "learning_rate": 1.1211088443646446e-06, "loss": 2.3446, "step": 150 }, { "epoch": 0.974025974025974, "eval_loss": 2.3284924030303955, "eval_runtime": 13.3985, "eval_samples_per_second": 81.353, "eval_steps_per_second": 0.672, "step": 150 }, { "epoch": 1.0064935064935066, "grad_norm": 3.0555751125596067, "learning_rate": 1.0019850354367667e-06, "loss": 2.2502, "step": 155 }, { "epoch": 1.0064935064935066, "eval_loss": 2.326810598373413, "eval_runtime": 13.3767, "eval_samples_per_second": 81.485, "eval_steps_per_second": 0.673, "step": 155 } ], "logging_steps": 5, "max_steps": 616, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5, "total_flos": 15912853831680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }