{
  "best_metric": 2.4018473625183105,
  "best_model_checkpoint": "./output/training_results/C017_random_sample_Meta-Llama-3-8B_pretrain_20240724/checkpoint-35802",
  "epoch": 4.0,
  "eval_steps": 3978,
  "global_step": 39772,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00010057326762546515,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 2.7164,
      "step": 1
    },
    {
      "epoch": 0.20004022930705018,
      "grad_norm": 1.9595824499713421,
      "learning_rate": 1.9943010392222595e-06,
      "loss": 2.5584,
      "step": 1989
    },
    {
      "epoch": 0.40008045861410035,
      "grad_norm": 1.8475727372215602,
      "learning_rate": 2.237952084257793e-06,
      "loss": 2.4822,
      "step": 3978
    },
    {
      "epoch": 0.40008045861410035,
      "eval_loss": 2.457235813140869,
      "eval_runtime": 809.2413,
      "eval_samples_per_second": 87.369,
      "eval_steps_per_second": 0.683,
      "step": 3978
    },
    {
      "epoch": 0.6001206879211506,
      "grad_norm": 1.7662911861496735,
      "learning_rate": 1.2175319826671093e-06,
      "loss": 2.4419,
      "step": 5967
    },
    {
      "epoch": 0.8001609172282007,
      "grad_norm": 1.7983814582083166,
      "learning_rate": 6.499668127721108e-07,
      "loss": 2.4215,
      "step": 7956
    },
    {
      "epoch": 0.8001609172282007,
      "eval_loss": 2.4154767990112305,
      "eval_runtime": 808.5614,
      "eval_samples_per_second": 87.443,
      "eval_steps_per_second": 0.684,
      "step": 7956
    },
    {
      "epoch": 1.000201146535251,
      "grad_norm": 1.7969883421186013,
      "learning_rate": 3.4535991044489427e-07,
      "loss": 2.4128,
      "step": 9945
    },
    {
      "epoch": 1.2002413758423012,
      "grad_norm": 1.8600898537077046,
      "learning_rate": 1.8847567083143102e-07,
      "loss": 2.3585,
      "step": 11934
    },
    {
      "epoch": 1.2002413758423012,
      "eval_loss": 2.407348394393921,
      "eval_runtime": 809.2343,
      "eval_samples_per_second": 87.37,
      "eval_steps_per_second": 0.683,
      "step": 11934
    },
    {
      "epoch": 1.4002816051493512,
      "grad_norm": 1.9169506816386161,
      "learning_rate": 1.1133168818945991e-07,
      "loss": 2.3538,
      "step": 13923
    },
    {
      "epoch": 1.6003218344564014,
      "grad_norm": 1.8761137775542636,
      "learning_rate": 7.547427840102147e-08,
      "loss": 2.3526,
      "step": 15912
    },
    {
      "epoch": 1.6003218344564014,
      "eval_loss": 2.40483021736145,
      "eval_runtime": 808.8539,
      "eval_samples_per_second": 87.411,
      "eval_steps_per_second": 0.684,
      "step": 15912
    },
    {
      "epoch": 1.8003620637634516,
      "grad_norm": 1.761004624388482,
      "learning_rate": 5.979347528334647e-08,
      "loss": 2.3525,
      "step": 17901
    },
    {
      "epoch": 2.000402293070502,
      "grad_norm": 1.813766837169378,
      "learning_rate": 5.34373848297726e-08,
      "loss": 2.3542,
      "step": 19890
    },
    {
      "epoch": 2.000402293070502,
      "eval_loss": 2.40364408493042,
      "eval_runtime": 808.719,
      "eval_samples_per_second": 87.426,
      "eval_steps_per_second": 0.684,
      "step": 19890
    },
    {
      "epoch": 2.200442522377552,
      "grad_norm": 1.8622400969038313,
      "learning_rate": 5.108120698150338e-08,
      "loss": 2.3457,
      "step": 21879
    },
    {
      "epoch": 2.4004827516846023,
      "grad_norm": 1.9759538842891853,
      "learning_rate": 5.029702769777328e-08,
      "loss": 2.3441,
      "step": 23868
    },
    {
      "epoch": 2.4004827516846023,
      "eval_loss": 2.4035539627075195,
      "eval_runtime": 809.1929,
      "eval_samples_per_second": 87.375,
      "eval_steps_per_second": 0.683,
      "step": 23868
    },
    {
      "epoch": 2.6005229809916526,
      "grad_norm": 1.9777564770023575,
      "learning_rate": 5.0068636563154646e-08,
      "loss": 2.3446,
      "step": 25857
    },
    {
      "epoch": 2.8005632102987024,
      "grad_norm": 1.8192679060780033,
      "learning_rate": 5.001264640771992e-08,
      "loss": 2.3451,
      "step": 27846
    },
    {
      "epoch": 2.8005632102987024,
      "eval_loss": 2.4028046131134033,
      "eval_runtime": 808.0983,
      "eval_samples_per_second": 87.493,
      "eval_steps_per_second": 0.684,
      "step": 27846
    },
    {
      "epoch": 3.0006034396057526,
      "grad_norm": 2.0137358755507666,
      "learning_rate": 5.0001713989719166e-08,
      "loss": 2.3443,
      "step": 29835
    },
    {
      "epoch": 3.200643668912803,
      "grad_norm": 1.8381002615524122,
      "learning_rate": 5.0000148985750135e-08,
      "loss": 2.3415,
      "step": 31824
    },
    {
      "epoch": 3.200643668912803,
      "eval_loss": 2.402500867843628,
      "eval_runtime": 371.3559,
      "eval_samples_per_second": 190.392,
      "eval_steps_per_second": 1.489,
      "step": 31824
    },
    {
      "epoch": 3.400683898219853,
      "grad_norm": 1.9296099508223659,
      "learning_rate": 5.0000006382565474e-08,
      "loss": 2.34,
      "step": 33813
    },
    {
      "epoch": 3.6007241275269033,
      "grad_norm": 1.9423815954292634,
      "learning_rate": 5.0000000076751155e-08,
      "loss": 2.3406,
      "step": 35802
    },
    {
      "epoch": 3.6007241275269033,
      "eval_loss": 2.4018473625183105,
      "eval_runtime": 346.8193,
      "eval_samples_per_second": 203.861,
      "eval_steps_per_second": 1.594,
      "step": 35802
    },
    {
      "epoch": 3.8007643568339535,
      "grad_norm": 1.8386633630301288,
      "learning_rate": 5.0000000000041456e-08,
      "loss": 2.3406,
      "step": 37791
    },
    {
      "epoch": 4.0,
      "step": 39772,
      "total_flos": 4159012790599680.0,
      "train_loss": 0.7021765150223461,
      "train_runtime": 17437.0559,
      "train_samples_per_second": 145.971,
      "train_steps_per_second": 2.281
    }
  ],
  "logging_steps": 1989,
  "max_steps": 39772,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 3978,
  "total_flos": 4159012790599680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}