{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.6923076923076925,
  "eval_steps": 100,
  "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 92.9238510131836,
      "learning_rate": 9.9907e-06,
      "loss": 2.4016,
      "step": 100
    },
    {
      "epoch": 0.5128205128205128,
      "eval_loss": 2.1255013942718506,
      "eval_runtime": 34.4318,
      "eval_samples_per_second": 11.414,
      "eval_steps_per_second": 1.452,
      "step": 100
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 47.019439697265625,
      "learning_rate": 9.980700000000001e-06,
      "loss": 2.0696,
      "step": 200
    },
    {
      "epoch": 1.0256410256410255,
      "eval_loss": 1.9405728578567505,
      "eval_runtime": 34.0713,
      "eval_samples_per_second": 11.535,
      "eval_steps_per_second": 1.468,
      "step": 200
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 52.230751037597656,
      "learning_rate": 9.970700000000001e-06,
      "loss": 1.9983,
      "step": 300
    },
    {
      "epoch": 1.5384615384615383,
      "eval_loss": 1.900020718574524,
      "eval_runtime": 34.1694,
      "eval_samples_per_second": 11.502,
      "eval_steps_per_second": 1.463,
      "step": 300
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 39.62374496459961,
      "learning_rate": 9.960800000000001e-06,
      "loss": 1.8888,
      "step": 400
    },
    {
      "epoch": 2.051282051282051,
      "eval_loss": 1.8164124488830566,
      "eval_runtime": 34.1213,
      "eval_samples_per_second": 11.518,
      "eval_steps_per_second": 1.465,
      "step": 400
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 39.697731018066406,
      "learning_rate": 9.9508e-06,
      "loss": 1.8456,
      "step": 500
    },
    {
      "epoch": 2.564102564102564,
      "eval_loss": 1.7753618955612183,
      "eval_runtime": 34.149,
      "eval_samples_per_second": 11.508,
      "eval_steps_per_second": 1.464,
      "step": 500
    },
    {
      "epoch": 3.076923076923077,
      "grad_norm": 52.823184967041016,
      "learning_rate": 9.9408e-06,
      "loss": 1.7839,
      "step": 600
    },
    {
      "epoch": 3.076923076923077,
      "eval_loss": 1.7344136238098145,
      "eval_runtime": 34.121,
      "eval_samples_per_second": 11.518,
      "eval_steps_per_second": 1.465,
      "step": 600
    },
    {
      "epoch": 3.58974358974359,
      "grad_norm": 107.05223083496094,
      "learning_rate": 9.9308e-06,
      "loss": 1.7544,
      "step": 700
    },
    {
      "epoch": 3.58974358974359,
      "eval_loss": 1.6865615844726562,
      "eval_runtime": 34.0842,
      "eval_samples_per_second": 11.53,
      "eval_steps_per_second": 1.467,
      "step": 700
    },
    {
      "epoch": 4.102564102564102,
      "grad_norm": 53.641353607177734,
      "learning_rate": 9.9208e-06,
      "loss": 1.6812,
      "step": 800
    },
    {
      "epoch": 4.102564102564102,
      "eval_loss": 1.6568766832351685,
      "eval_runtime": 34.432,
      "eval_samples_per_second": 11.414,
      "eval_steps_per_second": 1.452,
      "step": 800
    },
    {
      "epoch": 4.615384615384615,
      "grad_norm": 40.92328643798828,
      "learning_rate": 9.9109e-06,
      "loss": 1.6501,
      "step": 900
    },
    {
      "epoch": 4.615384615384615,
      "eval_loss": 1.6080188751220703,
      "eval_runtime": 34.1751,
      "eval_samples_per_second": 11.5,
      "eval_steps_per_second": 1.463,
      "step": 900
    },
    {
      "epoch": 5.128205128205128,
      "grad_norm": 28.52039909362793,
      "learning_rate": 9.9009e-06,
      "loss": 1.6579,
      "step": 1000
    },
    {
      "epoch": 5.128205128205128,
      "eval_loss": 1.6059809923171997,
      "eval_runtime": 34.1634,
      "eval_samples_per_second": 11.504,
      "eval_steps_per_second": 1.464,
      "step": 1000
    },
    {
      "epoch": 5.641025641025641,
      "grad_norm": 73.21099090576172,
      "learning_rate": 9.8909e-06,
      "loss": 1.6286,
      "step": 1100
    },
    {
      "epoch": 5.641025641025641,
      "eval_loss": 1.5779187679290771,
      "eval_runtime": 34.1489,
      "eval_samples_per_second": 11.508,
      "eval_steps_per_second": 1.464,
      "step": 1100
    },
    {
      "epoch": 6.153846153846154,
      "grad_norm": 36.768428802490234,
      "learning_rate": 9.8809e-06,
      "loss": 1.5871,
      "step": 1200
    },
    {
      "epoch": 6.153846153846154,
      "eval_loss": 1.5641562938690186,
      "eval_runtime": 34.1081,
      "eval_samples_per_second": 11.522,
      "eval_steps_per_second": 1.466,
      "step": 1200
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 28.098352432250977,
      "learning_rate": 9.8709e-06,
      "loss": 1.6231,
      "step": 1300
    },
    {
      "epoch": 6.666666666666667,
      "eval_loss": 1.530659556388855,
      "eval_runtime": 34.0717,
      "eval_samples_per_second": 11.534,
      "eval_steps_per_second": 1.467,
      "step": 1300
    },
    {
      "epoch": 7.17948717948718,
      "grad_norm": 48.131195068359375,
      "learning_rate": 9.8609e-06,
      "loss": 1.5178,
      "step": 1400
    },
    {
      "epoch": 7.17948717948718,
      "eval_loss": 1.5207794904708862,
      "eval_runtime": 34.0224,
      "eval_samples_per_second": 11.551,
      "eval_steps_per_second": 1.47,
      "step": 1400
    },
    {
      "epoch": 7.6923076923076925,
      "grad_norm": 15.9362211227417,
      "learning_rate": 9.8509e-06,
      "loss": 1.5434,
      "step": 1500
    },
    {
      "epoch": 7.6923076923076925,
      "eval_loss": 1.4978805780410767,
      "eval_runtime": 34.0269,
      "eval_samples_per_second": 11.55,
      "eval_steps_per_second": 1.469,
      "step": 1500
    }
  ],
  "logging_steps": 100,
  "max_steps": 100000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 513,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.73364749312e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}