|
{ |
|
"best_metric": 2.326810598373413, |
|
"best_model_checkpoint": "./output/training_results/C015_llama3-8b-base_pretrain_20240428_005832/checkpoint-155", |
|
"epoch": 1.0064935064935066, |
|
"eval_steps": 5, |
|
"global_step": 155, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006493506493506494, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 2.6141, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006493506493506494, |
|
"eval_loss": 2.635435104370117, |
|
"eval_runtime": 13.3872, |
|
"eval_samples_per_second": 81.421, |
|
"eval_steps_per_second": 0.672, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.032467532467532464, |
|
"grad_norm": 3.736102227472634, |
|
"learning_rate": 2.25e-06, |
|
"loss": 2.657, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.032467532467532464, |
|
"eval_loss": 2.6205813884735107, |
|
"eval_runtime": 13.3001, |
|
"eval_samples_per_second": 81.955, |
|
"eval_steps_per_second": 0.677, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06493506493506493, |
|
"grad_norm": 2.9360680843951172, |
|
"learning_rate": 5.25e-06, |
|
"loss": 2.6337, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06493506493506493, |
|
"eval_loss": 2.5846035480499268, |
|
"eval_runtime": 13.3634, |
|
"eval_samples_per_second": 81.566, |
|
"eval_steps_per_second": 0.673, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09740259740259741, |
|
"grad_norm": 2.4441290480949354, |
|
"learning_rate": 9e-06, |
|
"loss": 2.5268, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09740259740259741, |
|
"eval_loss": 2.5515594482421875, |
|
"eval_runtime": 13.3452, |
|
"eval_samples_per_second": 81.677, |
|
"eval_steps_per_second": 0.674, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.12987012987012986, |
|
"grad_norm": 2.5052749297132455, |
|
"learning_rate": 1.275e-05, |
|
"loss": 2.5275, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12987012987012986, |
|
"eval_loss": 2.5321404933929443, |
|
"eval_runtime": 13.3665, |
|
"eval_samples_per_second": 81.547, |
|
"eval_steps_per_second": 0.673, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.16233766233766234, |
|
"grad_norm": 2.570439112365278, |
|
"learning_rate": 1.4457320927100615e-05, |
|
"loss": 2.5005, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16233766233766234, |
|
"eval_loss": 2.513052225112915, |
|
"eval_runtime": 13.3936, |
|
"eval_samples_per_second": 81.382, |
|
"eval_steps_per_second": 0.672, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.19480519480519481, |
|
"grad_norm": 2.1653661003420974, |
|
"learning_rate": 1.3178060763055965e-05, |
|
"loss": 2.5339, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.19480519480519481, |
|
"eval_loss": 2.496060609817505, |
|
"eval_runtime": 13.4225, |
|
"eval_samples_per_second": 81.207, |
|
"eval_steps_per_second": 0.671, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 2.4298156508230955, |
|
"learning_rate": 1.200291011775234e-05, |
|
"loss": 2.5335, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"eval_loss": 2.4807701110839844, |
|
"eval_runtime": 13.3964, |
|
"eval_samples_per_second": 81.365, |
|
"eval_steps_per_second": 0.672, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2597402597402597, |
|
"grad_norm": 2.371474812035467, |
|
"learning_rate": 1.092418047398154e-05, |
|
"loss": 2.4252, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2597402597402597, |
|
"eval_loss": 2.464339256286621, |
|
"eval_runtime": 13.3751, |
|
"eval_samples_per_second": 81.495, |
|
"eval_steps_per_second": 0.673, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2922077922077922, |
|
"grad_norm": 2.233872363768768, |
|
"learning_rate": 9.934692235419926e-06, |
|
"loss": 2.4445, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2922077922077922, |
|
"eval_loss": 2.4518375396728516, |
|
"eval_runtime": 13.4165, |
|
"eval_samples_per_second": 81.243, |
|
"eval_steps_per_second": 0.671, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3246753246753247, |
|
"grad_norm": 2.228351075418733, |
|
"learning_rate": 9.02774500281382e-06, |
|
"loss": 2.4594, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3246753246753247, |
|
"eval_loss": 2.4393582344055176, |
|
"eval_runtime": 13.3552, |
|
"eval_samples_per_second": 81.616, |
|
"eval_steps_per_second": 0.674, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 2.2182357621988693, |
|
"learning_rate": 8.197089350822288e-06, |
|
"loss": 2.4498, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"eval_loss": 2.4287211894989014, |
|
"eval_runtime": 13.3236, |
|
"eval_samples_per_second": 81.809, |
|
"eval_steps_per_second": 0.675, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.38961038961038963, |
|
"grad_norm": 2.3864673038770854, |
|
"learning_rate": 7.436900041840997e-06, |
|
"loss": 2.3821, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.38961038961038963, |
|
"eval_loss": 2.4184141159057617, |
|
"eval_runtime": 13.3882, |
|
"eval_samples_per_second": 81.415, |
|
"eval_steps_per_second": 0.672, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.42207792207792205, |
|
"grad_norm": 1.9510161344368442, |
|
"learning_rate": 6.741750615310939e-06, |
|
"loss": 2.4317, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.42207792207792205, |
|
"eval_loss": 2.4091267585754395, |
|
"eval_runtime": 13.358, |
|
"eval_samples_per_second": 81.599, |
|
"eval_steps_per_second": 0.674, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 1.9482390057148204, |
|
"learning_rate": 6.106589293139538e-06, |
|
"loss": 2.3931, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"eval_loss": 2.40012788772583, |
|
"eval_runtime": 13.4261, |
|
"eval_samples_per_second": 81.185, |
|
"eval_steps_per_second": 0.67, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.487012987012987, |
|
"grad_norm": 2.112671713085814, |
|
"learning_rate": 5.526716143930102e-06, |
|
"loss": 2.3695, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.487012987012987, |
|
"eval_loss": 2.3934359550476074, |
|
"eval_runtime": 13.3826, |
|
"eval_samples_per_second": 81.449, |
|
"eval_steps_per_second": 0.673, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5194805194805194, |
|
"grad_norm": 2.0831282196798315, |
|
"learning_rate": 4.997761450728939e-06, |
|
"loss": 2.3981, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5194805194805194, |
|
"eval_loss": 2.3855459690093994, |
|
"eval_runtime": 13.4481, |
|
"eval_samples_per_second": 81.052, |
|
"eval_steps_per_second": 0.669, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.551948051948052, |
|
"grad_norm": 1.850619691727126, |
|
"learning_rate": 4.515665228960038e-06, |
|
"loss": 2.3952, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.551948051948052, |
|
"eval_loss": 2.37890887260437, |
|
"eval_runtime": 13.4148, |
|
"eval_samples_per_second": 81.254, |
|
"eval_steps_per_second": 0.671, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5844155844155844, |
|
"grad_norm": 1.92650957243675, |
|
"learning_rate": 4.0766578431245434e-06, |
|
"loss": 2.4137, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5844155844155844, |
|
"eval_loss": 2.3720638751983643, |
|
"eval_runtime": 13.3735, |
|
"eval_samples_per_second": 81.504, |
|
"eval_steps_per_second": 0.673, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6168831168831169, |
|
"grad_norm": 1.9166696037048836, |
|
"learning_rate": 3.6772416726983343e-06, |
|
"loss": 2.3614, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6168831168831169, |
|
"eval_loss": 2.366936683654785, |
|
"eval_runtime": 13.4784, |
|
"eval_samples_per_second": 80.87, |
|
"eval_steps_per_second": 0.668, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"grad_norm": 1.9337017607143168, |
|
"learning_rate": 3.3141737794662055e-06, |
|
"loss": 2.3467, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"eval_loss": 2.361203193664551, |
|
"eval_runtime": 13.4232, |
|
"eval_samples_per_second": 81.203, |
|
"eval_steps_per_second": 0.67, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 1.800114241259619, |
|
"learning_rate": 2.984449530286649e-06, |
|
"loss": 2.4012, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"eval_loss": 2.3568994998931885, |
|
"eval_runtime": 13.3387, |
|
"eval_samples_per_second": 81.717, |
|
"eval_steps_per_second": 0.675, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 1.8774879479561137, |
|
"learning_rate": 2.685287130987944e-06, |
|
"loss": 2.3224, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"eval_loss": 2.352806329727173, |
|
"eval_runtime": 13.3565, |
|
"eval_samples_per_second": 81.608, |
|
"eval_steps_per_second": 0.674, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7467532467532467, |
|
"grad_norm": 1.8730632952307495, |
|
"learning_rate": 2.4141130287548048e-06, |
|
"loss": 2.3348, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7467532467532467, |
|
"eval_loss": 2.348268985748291, |
|
"eval_runtime": 13.3814, |
|
"eval_samples_per_second": 81.456, |
|
"eval_steps_per_second": 0.673, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7792207792207793, |
|
"grad_norm": 1.9700141360167842, |
|
"learning_rate": 2.168548141976706e-06, |
|
"loss": 2.3573, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7792207792207793, |
|
"eval_loss": 2.3447518348693848, |
|
"eval_runtime": 13.3872, |
|
"eval_samples_per_second": 81.421, |
|
"eval_steps_per_second": 0.672, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8116883116883117, |
|
"grad_norm": 1.8580210267921047, |
|
"learning_rate": 1.946394878094437e-06, |
|
"loss": 2.306, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8116883116883117, |
|
"eval_loss": 2.3411996364593506, |
|
"eval_runtime": 13.4315, |
|
"eval_samples_per_second": 81.153, |
|
"eval_steps_per_second": 0.67, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8441558441558441, |
|
"grad_norm": 1.8818877549057513, |
|
"learning_rate": 1.745624901501792e-06, |
|
"loss": 2.342, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8441558441558441, |
|
"eval_loss": 2.338190793991089, |
|
"eval_runtime": 13.3467, |
|
"eval_samples_per_second": 81.668, |
|
"eval_steps_per_second": 0.674, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8766233766233766, |
|
"grad_norm": 1.9087045844270383, |
|
"learning_rate": 1.564367615035273e-06, |
|
"loss": 2.3045, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8766233766233766, |
|
"eval_loss": 2.3356211185455322, |
|
"eval_runtime": 13.3981, |
|
"eval_samples_per_second": 81.355, |
|
"eval_steps_per_second": 0.672, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 1.8238462929669437, |
|
"learning_rate": 1.4008993200171148e-06, |
|
"loss": 2.2959, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"eval_loss": 2.3329813480377197, |
|
"eval_runtime": 13.4034, |
|
"eval_samples_per_second": 81.322, |
|
"eval_steps_per_second": 0.671, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9415584415584416, |
|
"grad_norm": 1.824603087231258, |
|
"learning_rate": 1.253633021206854e-06, |
|
"loss": 2.3545, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9415584415584416, |
|
"eval_loss": 2.330482006072998, |
|
"eval_runtime": 13.3871, |
|
"eval_samples_per_second": 81.422, |
|
"eval_steps_per_second": 0.672, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.974025974025974, |
|
"grad_norm": 1.7913666925001428, |
|
"learning_rate": 1.1211088443646446e-06, |
|
"loss": 2.3446, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.974025974025974, |
|
"eval_loss": 2.3284924030303955, |
|
"eval_runtime": 13.3985, |
|
"eval_samples_per_second": 81.353, |
|
"eval_steps_per_second": 0.672, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0064935064935066, |
|
"grad_norm": 3.0555751125596067, |
|
"learning_rate": 1.0019850354367667e-06, |
|
"loss": 2.2502, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0064935064935066, |
|
"eval_loss": 2.326810598373413, |
|
"eval_runtime": 13.3767, |
|
"eval_samples_per_second": 81.485, |
|
"eval_steps_per_second": 0.673, |
|
"step": 155 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 616, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 5, |
|
"total_flos": 15912853831680.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|