|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.025288179883252902, |
|
"eval_steps": 13, |
|
"global_step": 150, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00016858786588835268, |
|
"eval_loss": 0.9273973107337952, |
|
"eval_runtime": 305.4938, |
|
"eval_samples_per_second": 32.701, |
|
"eval_steps_per_second": 8.177, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0005057635976650581, |
|
"grad_norm": 0.5028714537620544, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9043, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0010115271953301161, |
|
"grad_norm": 0.5011371970176697, |
|
"learning_rate": 6e-05, |
|
"loss": 0.8721, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0015172907929951742, |
|
"grad_norm": 0.8638129234313965, |
|
"learning_rate": 9e-05, |
|
"loss": 0.8597, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0020230543906602323, |
|
"grad_norm": 0.5769167542457581, |
|
"learning_rate": 9.994965332706573e-05, |
|
"loss": 0.8598, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.002191642256548585, |
|
"eval_loss": 0.8061522245407104, |
|
"eval_runtime": 306.4617, |
|
"eval_samples_per_second": 32.598, |
|
"eval_steps_per_second": 8.151, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.00252881798832529, |
|
"grad_norm": 0.3618827164173126, |
|
"learning_rate": 9.968561049466214e-05, |
|
"loss": 0.7392, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0030345815859903484, |
|
"grad_norm": 0.3715335428714752, |
|
"learning_rate": 9.919647942993148e-05, |
|
"loss": 0.77, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0035403451836554063, |
|
"grad_norm": 0.35133299231529236, |
|
"learning_rate": 9.848447601883435e-05, |
|
"loss": 0.7486, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.004046108781320465, |
|
"grad_norm": 0.39814493060112, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.7931, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.00438328451309717, |
|
"eval_loss": 0.779468834400177, |
|
"eval_runtime": 306.7465, |
|
"eval_samples_per_second": 32.568, |
|
"eval_steps_per_second": 8.144, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.004551872378985523, |
|
"grad_norm": 0.35872843861579895, |
|
"learning_rate": 9.640574942595196e-05, |
|
"loss": 0.7735, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00505763597665058, |
|
"grad_norm": 0.334773987531662, |
|
"learning_rate": 9.504844339512095e-05, |
|
"loss": 0.7485, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.005563399574315639, |
|
"grad_norm": 0.3601718544960022, |
|
"learning_rate": 9.348705665778478e-05, |
|
"loss": 0.8453, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.006069163171980697, |
|
"grad_norm": 0.3790546655654907, |
|
"learning_rate": 9.172866268606513e-05, |
|
"loss": 0.7661, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.006574926769645755, |
|
"grad_norm": 0.2799323499202728, |
|
"learning_rate": 8.978122744408906e-05, |
|
"loss": 0.7374, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.006574926769645755, |
|
"eval_loss": 0.7738218903541565, |
|
"eval_runtime": 306.4708, |
|
"eval_samples_per_second": 32.597, |
|
"eval_steps_per_second": 8.151, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.007080690367310813, |
|
"grad_norm": 0.3466595411300659, |
|
"learning_rate": 8.765357330018056e-05, |
|
"loss": 0.7734, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.007586453964975871, |
|
"grad_norm": 0.33398839831352234, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 0.7472, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.00809221756264093, |
|
"grad_norm": 0.3427945673465729, |
|
"learning_rate": 8.289693629698564e-05, |
|
"loss": 0.7724, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.008597981160305987, |
|
"grad_norm": 0.33757373690605164, |
|
"learning_rate": 8.0289502192041e-05, |
|
"loss": 0.7546, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.00876656902619434, |
|
"eval_loss": 0.7691455483436584, |
|
"eval_runtime": 306.7906, |
|
"eval_samples_per_second": 32.563, |
|
"eval_steps_per_second": 8.142, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.009103744757971046, |
|
"grad_norm": 0.32756149768829346, |
|
"learning_rate": 7.754484907260513e-05, |
|
"loss": 0.7358, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.009609508355636103, |
|
"grad_norm": 0.3512526750564575, |
|
"learning_rate": 7.467541090321735e-05, |
|
"loss": 0.8495, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.01011527195330116, |
|
"grad_norm": 0.3337990939617157, |
|
"learning_rate": 7.169418695587791e-05, |
|
"loss": 0.7544, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01062103555096622, |
|
"grad_norm": 0.2693156599998474, |
|
"learning_rate": 6.861468292009727e-05, |
|
"loss": 0.7507, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.010958211282742925, |
|
"eval_loss": 0.7673783302307129, |
|
"eval_runtime": 306.4834, |
|
"eval_samples_per_second": 32.596, |
|
"eval_steps_per_second": 8.151, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.011126799148631277, |
|
"grad_norm": 0.34625276923179626, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.8253, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.011632562746296335, |
|
"grad_norm": 0.32037216424942017, |
|
"learning_rate": 6.22170203068947e-05, |
|
"loss": 0.7514, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.012138326343961394, |
|
"grad_norm": 0.2649615406990051, |
|
"learning_rate": 5.8927844739931834e-05, |
|
"loss": 0.7331, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.012644089941626451, |
|
"grad_norm": 0.3438839018344879, |
|
"learning_rate": 5.559822380516539e-05, |
|
"loss": 0.7573, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.01314985353929151, |
|
"grad_norm": 0.3098960518836975, |
|
"learning_rate": 5.2243241517525754e-05, |
|
"loss": 0.7062, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.01314985353929151, |
|
"eval_loss": 0.7658212184906006, |
|
"eval_runtime": 306.7841, |
|
"eval_samples_per_second": 32.564, |
|
"eval_steps_per_second": 8.143, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.013655617136956568, |
|
"grad_norm": 0.32385092973709106, |
|
"learning_rate": 4.887809678520976e-05, |
|
"loss": 0.7489, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.014161380734621625, |
|
"grad_norm": 0.3278336822986603, |
|
"learning_rate": 4.551803455482833e-05, |
|
"loss": 0.7979, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.014667144332286684, |
|
"grad_norm": 0.2766069769859314, |
|
"learning_rate": 4.2178276747988446e-05, |
|
"loss": 0.7464, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.015172907929951742, |
|
"grad_norm": 0.3032001256942749, |
|
"learning_rate": 3.887395330218429e-05, |
|
"loss": 0.6977, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.015341495795840094, |
|
"eval_loss": 0.7647122740745544, |
|
"eval_runtime": 306.5924, |
|
"eval_samples_per_second": 32.584, |
|
"eval_steps_per_second": 8.148, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.0156786715276168, |
|
"grad_norm": 0.3444535434246063, |
|
"learning_rate": 3.562003362839914e-05, |
|
"loss": 0.7746, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.01618443512528186, |
|
"grad_norm": 0.3450552225112915, |
|
"learning_rate": 3.243125879593286e-05, |
|
"loss": 0.76, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.016690198722946917, |
|
"grad_norm": 0.3032662868499756, |
|
"learning_rate": 2.932207475167398e-05, |
|
"loss": 0.7742, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.017195962320611973, |
|
"grad_norm": 0.34202754497528076, |
|
"learning_rate": 2.630656687635007e-05, |
|
"loss": 0.7749, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.01753313805238868, |
|
"eval_loss": 0.7638227343559265, |
|
"eval_runtime": 306.6036, |
|
"eval_samples_per_second": 32.583, |
|
"eval_steps_per_second": 8.147, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.017701725918277032, |
|
"grad_norm": 0.3035350739955902, |
|
"learning_rate": 2.3398396174233178e-05, |
|
"loss": 0.7123, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.01820748951594209, |
|
"grad_norm": 0.2928071916103363, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.7931, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.018713253113607147, |
|
"grad_norm": 0.3145976960659027, |
|
"learning_rate": 1.7956219300748793e-05, |
|
"loss": 0.7774, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.019219016711272206, |
|
"grad_norm": 0.32552438974380493, |
|
"learning_rate": 1.544686755065677e-05, |
|
"loss": 0.7435, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.019724780308937265, |
|
"grad_norm": 0.304483562707901, |
|
"learning_rate": 1.3094050125632972e-05, |
|
"loss": 0.7509, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.019724780308937265, |
|
"eval_loss": 0.7630516886711121, |
|
"eval_runtime": 306.6709, |
|
"eval_samples_per_second": 32.576, |
|
"eval_steps_per_second": 8.146, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.02023054390660232, |
|
"grad_norm": 0.3333122432231903, |
|
"learning_rate": 1.090842587659851e-05, |
|
"loss": 0.8215, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.02073630750426738, |
|
"grad_norm": 0.3269135355949402, |
|
"learning_rate": 8.899896227604509e-06, |
|
"loss": 0.8011, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.02124207110193244, |
|
"grad_norm": 0.279905766248703, |
|
"learning_rate": 7.077560319906695e-06, |
|
"loss": 0.7328, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.021747834699597495, |
|
"grad_norm": 0.2915093004703522, |
|
"learning_rate": 5.449673790581611e-06, |
|
"loss": 0.7201, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.02191642256548585, |
|
"eval_loss": 0.7627005577087402, |
|
"eval_runtime": 306.6146, |
|
"eval_samples_per_second": 32.582, |
|
"eval_steps_per_second": 8.147, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.022253598297262554, |
|
"grad_norm": 0.2774188816547394, |
|
"learning_rate": 4.023611372427471e-06, |
|
"loss": 0.6875, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.022759361894927613, |
|
"grad_norm": 0.3083136975765228, |
|
"learning_rate": 2.8058334845816213e-06, |
|
"loss": 0.7822, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.02326512549259267, |
|
"grad_norm": 0.2817324101924896, |
|
"learning_rate": 1.8018569652073381e-06, |
|
"loss": 0.7197, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.02377088909025773, |
|
"grad_norm": 0.2889103293418884, |
|
"learning_rate": 1.016230078838226e-06, |
|
"loss": 0.7419, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.024108064822034436, |
|
"eval_loss": 0.7625572085380554, |
|
"eval_runtime": 306.5341, |
|
"eval_samples_per_second": 32.59, |
|
"eval_steps_per_second": 8.149, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.024276652687922787, |
|
"grad_norm": 0.30902647972106934, |
|
"learning_rate": 4.52511911603265e-07, |
|
"loss": 0.734, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.024782416285587843, |
|
"grad_norm": 0.31932878494262695, |
|
"learning_rate": 1.132562476771959e-07, |
|
"loss": 0.7666, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.025288179883252902, |
|
"grad_norm": 0.31999635696411133, |
|
"learning_rate": 0.0, |
|
"loss": 0.7732, |
|
"step": 150 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 150, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 13, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.26927046262784e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|