{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8583690987124464,
  "eval_steps": 4,
  "global_step": 25,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.034334763948497854,
      "grad_norm": 19.15982437133789,
      "learning_rate": 2e-05,
      "loss": 13.5674,
      "step": 1
    },
    {
      "epoch": 0.034334763948497854,
      "eval_loss": 1.6727509498596191,
      "eval_runtime": 8.0648,
      "eval_samples_per_second": 6.076,
      "eval_steps_per_second": 1.612,
      "step": 1
    },
    {
      "epoch": 0.06866952789699571,
      "grad_norm": 16.63048553466797,
      "learning_rate": 4e-05,
      "loss": 14.7517,
      "step": 2
    },
    {
      "epoch": 0.10300429184549356,
      "grad_norm": 15.25049877166748,
      "learning_rate": 6e-05,
      "loss": 13.1345,
      "step": 3
    },
    {
      "epoch": 0.13733905579399142,
      "grad_norm": 17.173583984375,
      "learning_rate": 8e-05,
      "loss": 11.7391,
      "step": 4
    },
    {
      "epoch": 0.13733905579399142,
      "eval_loss": 1.4845433235168457,
      "eval_runtime": 8.0677,
      "eval_samples_per_second": 6.074,
      "eval_steps_per_second": 1.611,
      "step": 4
    },
    {
      "epoch": 0.17167381974248927,
      "grad_norm": 15.846421241760254,
      "learning_rate": 0.0001,
      "loss": 13.1035,
      "step": 5
    },
    {
      "epoch": 0.20600858369098712,
      "grad_norm": 13.740381240844727,
      "learning_rate": 0.00012,
      "loss": 10.3992,
      "step": 6
    },
    {
      "epoch": 0.24034334763948498,
      "grad_norm": 10.496570587158203,
      "learning_rate": 0.00014,
      "loss": 9.9128,
      "step": 7
    },
    {
      "epoch": 0.27467811158798283,
      "grad_norm": 8.85119342803955,
      "learning_rate": 0.00016,
      "loss": 7.9807,
      "step": 8
    },
    {
      "epoch": 0.27467811158798283,
      "eval_loss": 0.9227344989776611,
      "eval_runtime": 8.0856,
      "eval_samples_per_second": 6.06,
      "eval_steps_per_second": 1.608,
      "step": 8
    },
    {
      "epoch": 0.3090128755364807,
      "grad_norm": 8.351367950439453,
      "learning_rate": 0.00018,
      "loss": 6.646,
      "step": 9
    },
    {
      "epoch": 0.34334763948497854,
      "grad_norm": 11.750950813293457,
      "learning_rate": 0.0002,
      "loss": 7.369,
      "step": 10
    },
    {
      "epoch": 0.3776824034334764,
      "grad_norm": 12.794264793395996,
      "learning_rate": 0.0001996917333733128,
      "loss": 5.8694,
      "step": 11
    },
    {
      "epoch": 0.41201716738197425,
      "grad_norm": 21.229198455810547,
      "learning_rate": 0.00019876883405951377,
      "loss": 6.5926,
      "step": 12
    },
    {
      "epoch": 0.41201716738197425,
      "eval_loss": 0.6635634899139404,
      "eval_runtime": 8.081,
      "eval_samples_per_second": 6.064,
      "eval_steps_per_second": 1.609,
      "step": 12
    },
    {
      "epoch": 0.44635193133047213,
      "grad_norm": 15.183062553405762,
      "learning_rate": 0.00019723699203976766,
      "loss": 5.3242,
      "step": 13
    },
    {
      "epoch": 0.48068669527896996,
      "grad_norm": 11.120061874389648,
      "learning_rate": 0.00019510565162951537,
      "loss": 5.0979,
      "step": 14
    },
    {
      "epoch": 0.5150214592274678,
      "grad_norm": 9.379684448242188,
      "learning_rate": 0.0001923879532511287,
      "loss": 4.2185,
      "step": 15
    },
    {
      "epoch": 0.5493562231759657,
      "grad_norm": 9.187313079833984,
      "learning_rate": 0.0001891006524188368,
      "loss": 3.9636,
      "step": 16
    },
    {
      "epoch": 0.5493562231759657,
      "eval_loss": 0.5391715168952942,
      "eval_runtime": 8.0725,
      "eval_samples_per_second": 6.07,
      "eval_steps_per_second": 1.61,
      "step": 16
    },
    {
      "epoch": 0.5836909871244635,
      "grad_norm": 7.920268535614014,
      "learning_rate": 0.00018526401643540922,
      "loss": 3.4412,
      "step": 17
    },
    {
      "epoch": 0.6180257510729614,
      "grad_norm": 8.719185829162598,
      "learning_rate": 0.00018090169943749476,
      "loss": 4.7588,
      "step": 18
    },
    {
      "epoch": 0.6523605150214592,
      "grad_norm": 7.354686260223389,
      "learning_rate": 0.0001760405965600031,
      "loss": 3.7329,
      "step": 19
    },
    {
      "epoch": 0.6866952789699571,
      "grad_norm": 6.449216842651367,
      "learning_rate": 0.00017071067811865476,
      "loss": 3.0476,
      "step": 20
    },
    {
      "epoch": 0.6866952789699571,
      "eval_loss": 0.4746493995189667,
      "eval_runtime": 8.0715,
      "eval_samples_per_second": 6.071,
      "eval_steps_per_second": 1.611,
      "step": 20
    },
    {
      "epoch": 0.721030042918455,
      "grad_norm": 10.40072250366211,
      "learning_rate": 0.00016494480483301836,
      "loss": 3.6035,
      "step": 21
    },
    {
      "epoch": 0.7553648068669528,
      "grad_norm": 9.310944557189941,
      "learning_rate": 0.00015877852522924732,
      "loss": 3.1352,
      "step": 22
    },
    {
      "epoch": 0.7896995708154506,
      "grad_norm": 9.59798526763916,
      "learning_rate": 0.0001522498564715949,
      "loss": 3.9863,
      "step": 23
    },
    {
      "epoch": 0.8240343347639485,
      "grad_norm": 7.370415210723877,
      "learning_rate": 0.00014539904997395468,
      "loss": 3.0432,
      "step": 24
    },
    {
      "epoch": 0.8240343347639485,
      "eval_loss": 0.4330078065395355,
      "eval_runtime": 8.0682,
      "eval_samples_per_second": 6.073,
      "eval_steps_per_second": 1.611,
      "step": 24
    },
    {
      "epoch": 0.8583690987124464,
      "grad_norm": 9.128814697265625,
      "learning_rate": 0.000138268343236509,
      "loss": 3.4839,
      "step": 25
    }
  ],
  "logging_steps": 1,
  "max_steps": 50,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.306916403806208e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}