|
{ |
|
"best_metric": 0.8514555096626282, |
|
"best_model_checkpoint": "./output/training_results/C016_random_sample_llama3-8b-base_instruct_20240504_181744/checkpoint-40", |
|
"epoch": 4.0, |
|
"eval_steps": 20, |
|
"global_step": 192, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.020833333333333332, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.9711, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10416666666666667, |
|
"grad_norm": 12.43094855680643, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.949, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 4.621787623957677, |
|
"learning_rate": 5.25e-06, |
|
"loss": 0.8756, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 5.102324743913745, |
|
"learning_rate": 9e-06, |
|
"loss": 0.8441, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 5.046721891827742, |
|
"learning_rate": 1.275e-05, |
|
"loss": 0.8263, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"eval_loss": 0.8584555983543396, |
|
"eval_runtime": 2.0011, |
|
"eval_samples_per_second": 169.909, |
|
"eval_steps_per_second": 1.499, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5208333333333334, |
|
"grad_norm": 4.324316961991125, |
|
"learning_rate": 1.3195176200175283e-05, |
|
"loss": 0.8289, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 3.9781224609845345, |
|
"learning_rate": 9.515676612044427e-06, |
|
"loss": 0.9015, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.7291666666666666, |
|
"grad_norm": 3.851416633553399, |
|
"learning_rate": 6.797580677308734e-06, |
|
"loss": 0.823, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 4.17708940737461, |
|
"learning_rate": 4.808575415542887e-06, |
|
"loss": 0.8014, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"eval_loss": 0.8514555096626282, |
|
"eval_runtime": 1.9698, |
|
"eval_samples_per_second": 172.604, |
|
"eval_steps_per_second": 1.523, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 3.865023549249989, |
|
"learning_rate": 3.3676619069852654e-06, |
|
"loss": 0.8814, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.0416666666666667, |
|
"grad_norm": 3.6024413325254803, |
|
"learning_rate": 2.334947896124909e-06, |
|
"loss": 0.7239, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.1458333333333333, |
|
"grad_norm": 2.9479655076003395, |
|
"learning_rate": 1.603233215095547e-06, |
|
"loss": 0.466, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.8904458700460633, |
|
"learning_rate": 1.0911174606561334e-06, |
|
"loss": 0.4375, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.871760368347168, |
|
"eval_runtime": 1.9682, |
|
"eval_samples_per_second": 172.743, |
|
"eval_steps_per_second": 1.524, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.3541666666666667, |
|
"grad_norm": 3.7825224108872644, |
|
"learning_rate": 7.373930741131784e-07, |
|
"loss": 0.3976, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.4583333333333333, |
|
"grad_norm": 3.9946225754826044, |
|
"learning_rate": 4.965174334325768e-07, |
|
"loss": 0.4467, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 4.437633052742651, |
|
"learning_rate": 3.349849877937343e-07, |
|
"loss": 0.4382, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 3.8225073330864148, |
|
"learning_rate": 2.2844505627726646e-07, |
|
"loss": 0.4593, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"eval_loss": 0.855841338634491, |
|
"eval_runtime": 1.9784, |
|
"eval_samples_per_second": 171.854, |
|
"eval_steps_per_second": 1.516, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7708333333333335, |
|
"grad_norm": 3.2403588491446373, |
|
"learning_rate": 1.594328760942437e-07, |
|
"loss": 0.4262, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 3.3007958768222556, |
|
"learning_rate": 1.156010161291434e-07, |
|
"loss": 0.4236, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9791666666666665, |
|
"grad_norm": 3.7194840629840744, |
|
"learning_rate": 8.835555547373544e-08, |
|
"loss": 0.4859, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 3.4233566225496928, |
|
"learning_rate": 7.181664349277562e-08, |
|
"loss": 0.3969, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"eval_loss": 0.8527613282203674, |
|
"eval_runtime": 1.973, |
|
"eval_samples_per_second": 172.329, |
|
"eval_steps_per_second": 1.521, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 3.255038919801069, |
|
"learning_rate": 6.203637972657601e-08, |
|
"loss": 0.3693, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.2916666666666665, |
|
"grad_norm": 3.1973516190995497, |
|
"learning_rate": 5.6418543066491835e-08, |
|
"loss": 0.4039, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.3958333333333335, |
|
"grad_norm": 3.463654018617807, |
|
"learning_rate": 5.329471712759216e-08, |
|
"loss": 0.3573, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 4.418086411896532, |
|
"learning_rate": 5.161995210302015e-08, |
|
"loss": 0.3982, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.8576084971427917, |
|
"eval_runtime": 1.9748, |
|
"eval_samples_per_second": 172.17, |
|
"eval_steps_per_second": 1.519, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.6041666666666665, |
|
"grad_norm": 3.4030059443316723, |
|
"learning_rate": 5.075841465580837e-08, |
|
"loss": 0.3815, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.7083333333333335, |
|
"grad_norm": 3.7277312294575444, |
|
"learning_rate": 5.033564114946932e-08, |
|
"loss": 0.3762, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 3.3451733690112104, |
|
"learning_rate": 5.013915282607116e-08, |
|
"loss": 0.3809, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 3.217687429197331, |
|
"learning_rate": 5.005343402153039e-08, |
|
"loss": 0.3742, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"eval_loss": 0.8624204993247986, |
|
"eval_runtime": 1.974, |
|
"eval_samples_per_second": 172.243, |
|
"eval_steps_per_second": 1.52, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.0208333333333335, |
|
"grad_norm": 3.0189998840641734, |
|
"learning_rate": 5.001872829857116e-08, |
|
"loss": 0.3793, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 3.1862692972750577, |
|
"learning_rate": 5.000587713853837e-08, |
|
"loss": 0.3734, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.2291666666666665, |
|
"grad_norm": 3.179336124252825, |
|
"learning_rate": 5.0001608748597456e-08, |
|
"loss": 0.3713, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 4.66026630563327, |
|
"learning_rate": 5.0000370319656156e-08, |
|
"loss": 0.3692, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"eval_loss": 0.8662496209144592, |
|
"eval_runtime": 1.9702, |
|
"eval_samples_per_second": 172.572, |
|
"eval_steps_per_second": 1.523, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 3.286706597182968, |
|
"learning_rate": 5.0000067945715855e-08, |
|
"loss": 0.3657, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.5416666666666665, |
|
"grad_norm": 3.4539000739999817, |
|
"learning_rate": 5.0000009144677036e-08, |
|
"loss": 0.3513, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.6458333333333335, |
|
"grad_norm": 3.3076231790188904, |
|
"learning_rate": 5.0000000785521776e-08, |
|
"loss": 0.3683, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 3.4401672686211464, |
|
"learning_rate": 5.000000003317662e-08, |
|
"loss": 0.3667, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.8689968585968018, |
|
"eval_runtime": 1.9682, |
|
"eval_samples_per_second": 172.746, |
|
"eval_steps_per_second": 1.524, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.8541666666666665, |
|
"grad_norm": 3.709318957772044, |
|
"learning_rate": 5.000000000038355e-08, |
|
"loss": 0.3741, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.9583333333333335, |
|
"grad_norm": 3.1576885454409456, |
|
"learning_rate": 5.000000000000018e-08, |
|
"loss": 0.3694, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 192, |
|
"total_flos": 5362900008960.0, |
|
"train_loss": 0.5138544160872698, |
|
"train_runtime": 1043.9459, |
|
"train_samples_per_second": 11.702, |
|
"train_steps_per_second": 0.184 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 192, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 20, |
|
"total_flos": 5362900008960.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|