{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.092526690391459,
  "eval_steps": 25,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.17793594306049823,
      "grad_norm": 3.1571543216705322,
      "learning_rate": 0.0001951951951951952,
      "loss": 2.3406,
      "step": 25
    },
    {
      "epoch": 0.35587188612099646,
      "grad_norm": 1.42654550075531,
      "learning_rate": 0.0001901901901901902,
      "loss": 1.3793,
      "step": 50
    },
    {
      "epoch": 0.5338078291814946,
      "grad_norm": 1.1719086170196533,
      "learning_rate": 0.0001851851851851852,
      "loss": 1.2397,
      "step": 75
    },
    {
      "epoch": 0.7117437722419929,
      "grad_norm": 1.6048916578292847,
      "learning_rate": 0.00018018018018018018,
      "loss": 1.2513,
      "step": 100
    },
    {
      "epoch": 0.8896797153024911,
      "grad_norm": 0.9349364638328552,
      "learning_rate": 0.0001751751751751752,
      "loss": 1.1693,
      "step": 125
    },
    {
      "epoch": 1.0640569395017794,
      "grad_norm": 1.2133198976516724,
      "learning_rate": 0.0001701701701701702,
      "loss": 1.2021,
      "step": 150
    },
    {
      "epoch": 1.2419928825622777,
      "grad_norm": 1.7000682353973389,
      "learning_rate": 0.00016516516516516518,
      "loss": 1.0343,
      "step": 175
    },
    {
      "epoch": 1.4199288256227758,
      "grad_norm": 1.325333833694458,
      "learning_rate": 0.00016016016016016018,
      "loss": 1.105,
      "step": 200
    },
    {
      "epoch": 1.5978647686832739,
      "grad_norm": 1.2120150327682495,
      "learning_rate": 0.00015515515515515516,
      "loss": 1.2193,
      "step": 225
    },
    {
      "epoch": 1.7758007117437722,
      "grad_norm": 1.1473174095153809,
      "learning_rate": 0.00015015015015015014,
      "loss": 1.0786,
      "step": 250
    },
    {
      "epoch": 1.9537366548042705,
      "grad_norm": 1.087303876876831,
      "learning_rate": 0.00014514514514514515,
      "loss": 1.0669,
      "step": 275
    },
    {
      "epoch": 2.1281138790035588,
      "grad_norm": 1.2311229705810547,
      "learning_rate": 0.00014014014014014013,
      "loss": 1.1659,
      "step": 300
    },
    {
      "epoch": 2.306049822064057,
      "grad_norm": 1.148964524269104,
      "learning_rate": 0.00013513513513513514,
      "loss": 1.1135,
      "step": 325
    },
    {
      "epoch": 2.4839857651245554,
      "grad_norm": 1.0676476955413818,
      "learning_rate": 0.00013013013013013014,
      "loss": 0.9726,
      "step": 350
    },
    {
      "epoch": 2.6619217081850532,
      "grad_norm": 1.005470633506775,
      "learning_rate": 0.00012512512512512512,
      "loss": 0.9146,
      "step": 375
    },
    {
      "epoch": 2.8398576512455516,
      "grad_norm": 1.5128366947174072,
      "learning_rate": 0.00012012012012012013,
      "loss": 1.0593,
      "step": 400
    },
    {
      "epoch": 3.01423487544484,
      "grad_norm": 0.8878112435340881,
      "learning_rate": 0.00011511511511511512,
      "loss": 1.0623,
      "step": 425
    },
    {
      "epoch": 3.192170818505338,
      "grad_norm": 1.3680928945541382,
      "learning_rate": 0.00011011011011011012,
      "loss": 0.9638,
      "step": 450
    },
    {
      "epoch": 3.3701067615658364,
      "grad_norm": 1.0474141836166382,
      "learning_rate": 0.00010510510510510511,
      "loss": 0.9672,
      "step": 475
    },
    {
      "epoch": 3.5480427046263348,
      "grad_norm": 0.9096865057945251,
      "learning_rate": 0.00010010010010010012,
      "loss": 0.9519,
      "step": 500
    },
    {
      "epoch": 3.7259786476868326,
      "grad_norm": 1.9101051092147827,
      "learning_rate": 9.50950950950951e-05,
      "loss": 0.9851,
      "step": 525
    },
    {
      "epoch": 3.903914590747331,
      "grad_norm": 1.1126383543014526,
      "learning_rate": 9.009009009009009e-05,
      "loss": 0.9335,
      "step": 550
    },
    {
      "epoch": 4.07829181494662,
      "grad_norm": 1.3586586713790894,
      "learning_rate": 8.50850850850851e-05,
      "loss": 0.9603,
      "step": 575
    },
    {
      "epoch": 4.2562277580071175,
      "grad_norm": 1.1161298751831055,
      "learning_rate": 8.008008008008009e-05,
      "loss": 0.8657,
      "step": 600
    },
    {
      "epoch": 4.434163701067615,
      "grad_norm": 1.347420573234558,
      "learning_rate": 7.507507507507507e-05,
      "loss": 0.982,
      "step": 625
    },
    {
      "epoch": 4.612099644128114,
      "grad_norm": 1.450341820716858,
      "learning_rate": 7.007007007007007e-05,
      "loss": 0.8658,
      "step": 650
    },
    {
      "epoch": 4.790035587188612,
      "grad_norm": 1.0948293209075928,
      "learning_rate": 6.506506506506507e-05,
      "loss": 0.9306,
      "step": 675
    },
    {
      "epoch": 4.967971530249111,
      "grad_norm": 1.7064975500106812,
      "learning_rate": 6.0060060060060066e-05,
      "loss": 0.8281,
      "step": 700
    },
    {
      "epoch": 5.142348754448399,
      "grad_norm": 1.314003586769104,
      "learning_rate": 5.505505505505506e-05,
      "loss": 0.7796,
      "step": 725
    },
    {
      "epoch": 5.320284697508897,
      "grad_norm": 1.3033592700958252,
      "learning_rate": 5.005005005005006e-05,
      "loss": 0.8998,
      "step": 750
    },
    {
      "epoch": 5.498220640569395,
      "grad_norm": 2.4211108684539795,
      "learning_rate": 4.5045045045045046e-05,
      "loss": 0.8154,
      "step": 775
    },
    {
      "epoch": 5.6761565836298935,
      "grad_norm": 1.4809461832046509,
      "learning_rate": 4.0040040040040046e-05,
      "loss": 0.8276,
      "step": 800
    },
    {
      "epoch": 5.854092526690391,
      "grad_norm": 1.4405505657196045,
      "learning_rate": 3.503503503503503e-05,
      "loss": 0.8643,
      "step": 825
    },
    {
      "epoch": 6.02846975088968,
      "grad_norm": 1.4615403413772583,
      "learning_rate": 3.0030030030030033e-05,
      "loss": 0.8953,
      "step": 850
    },
    {
      "epoch": 6.2064056939501775,
      "grad_norm": 1.3846161365509033,
      "learning_rate": 2.502502502502503e-05,
      "loss": 0.7582,
      "step": 875
    },
    {
      "epoch": 6.384341637010676,
      "grad_norm": 1.0929971933364868,
      "learning_rate": 2.0020020020020023e-05,
      "loss": 0.8246,
      "step": 900
    },
    {
      "epoch": 6.562277580071174,
      "grad_norm": 1.1935001611709595,
      "learning_rate": 1.5015015015015016e-05,
      "loss": 0.8572,
      "step": 925
    },
    {
      "epoch": 6.740213523131673,
      "grad_norm": 1.4628770351409912,
      "learning_rate": 1.0010010010010011e-05,
      "loss": 0.8221,
      "step": 950
    },
    {
      "epoch": 6.918149466192171,
      "grad_norm": 1.2371264696121216,
      "learning_rate": 5.005005005005006e-06,
      "loss": 0.7659,
      "step": 975
    },
    {
      "epoch": 7.092526690391459,
      "grad_norm": 1.589613676071167,
      "learning_rate": 0.0,
      "loss": 0.7643,
      "step": 1000
    }
  ],
  "logging_steps": 25,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.060019431812096e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}