|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 1469, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013614703880190605, |
|
"grad_norm": 46457.98828125, |
|
"learning_rate": 8.163265306122449e-07, |
|
"loss": 2.0792, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02722940776038121, |
|
"grad_norm": 35946.40234375, |
|
"learning_rate": 1.6326530612244897e-06, |
|
"loss": 2.0791, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04084411164057182, |
|
"grad_norm": 23938.412109375, |
|
"learning_rate": 2.4489795918367347e-06, |
|
"loss": 2.0782, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05445881552076242, |
|
"grad_norm": 32852.3359375, |
|
"learning_rate": 3.2653061224489794e-06, |
|
"loss": 2.0771, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06807351940095303, |
|
"grad_norm": 32399.712890625, |
|
"learning_rate": 4.081632653061225e-06, |
|
"loss": 2.0752, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08168822328114364, |
|
"grad_norm": 30975.478515625, |
|
"learning_rate": 4.897959183673469e-06, |
|
"loss": 2.0722, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09530292716133425, |
|
"grad_norm": 35666.22265625, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 2.0674, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.10891763104152484, |
|
"grad_norm": 36289.2578125, |
|
"learning_rate": 6.530612244897959e-06, |
|
"loss": 2.0555, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.12253233492171545, |
|
"grad_norm": 34320.37890625, |
|
"learning_rate": 7.346938775510204e-06, |
|
"loss": 2.0414, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13614703880190607, |
|
"grad_norm": 49061.66796875, |
|
"learning_rate": 8.16326530612245e-06, |
|
"loss": 2.0297, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14976174268209666, |
|
"grad_norm": 60060.45703125, |
|
"learning_rate": 8.979591836734694e-06, |
|
"loss": 2.0094, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16337644656228728, |
|
"grad_norm": 80072.6015625, |
|
"learning_rate": 9.795918367346939e-06, |
|
"loss": 1.9789, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17699115044247787, |
|
"grad_norm": 95368.8828125, |
|
"learning_rate": 1.0612244897959184e-05, |
|
"loss": 1.9378, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1906058543226685, |
|
"grad_norm": 70028.4765625, |
|
"learning_rate": 1.1428571428571429e-05, |
|
"loss": 1.9166, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2042205582028591, |
|
"grad_norm": 90220.5859375, |
|
"learning_rate": 1.2244897959183674e-05, |
|
"loss": 1.8492, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21783526208304968, |
|
"grad_norm": 59154.35546875, |
|
"learning_rate": 1.3061224489795918e-05, |
|
"loss": 1.8556, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.2314499659632403, |
|
"grad_norm": 129406.875, |
|
"learning_rate": 1.3877551020408165e-05, |
|
"loss": 1.8048, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2450646698434309, |
|
"grad_norm": 176714.859375, |
|
"learning_rate": 1.4693877551020408e-05, |
|
"loss": 1.743, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2586793737236215, |
|
"grad_norm": 225369.375, |
|
"learning_rate": 1.5510204081632655e-05, |
|
"loss": 1.708, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.27229407760381213, |
|
"grad_norm": 324853.53125, |
|
"learning_rate": 1.63265306122449e-05, |
|
"loss": 1.7032, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2859087814840027, |
|
"grad_norm": 77990.5, |
|
"learning_rate": 1.7142857142857142e-05, |
|
"loss": 1.7964, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2995234853641933, |
|
"grad_norm": 243999.53125, |
|
"learning_rate": 1.7959183673469387e-05, |
|
"loss": 1.7472, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3131381892443839, |
|
"grad_norm": 402316.78125, |
|
"learning_rate": 1.8775510204081636e-05, |
|
"loss": 1.6196, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.32675289312457456, |
|
"grad_norm": 173016.109375, |
|
"learning_rate": 1.9591836734693877e-05, |
|
"loss": 1.6543, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.34036759700476515, |
|
"grad_norm": 166789.75, |
|
"learning_rate": 2.0408163265306123e-05, |
|
"loss": 1.7865, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.34036759700476515, |
|
"eval_accuracy": 0.1981891348088531, |
|
"eval_loss": 1.6593838930130005, |
|
"eval_runtime": 296.9022, |
|
"eval_samples_per_second": 10.044, |
|
"eval_steps_per_second": 1.256, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.35398230088495575, |
|
"grad_norm": 160284.609375, |
|
"learning_rate": 2.1224489795918368e-05, |
|
"loss": 1.6528, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.36759700476514634, |
|
"grad_norm": 195971.28125, |
|
"learning_rate": 2.2040816326530613e-05, |
|
"loss": 1.7909, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.381211708645337, |
|
"grad_norm": 152683.9375, |
|
"learning_rate": 2.2857142857142858e-05, |
|
"loss": 1.6901, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3948264125255276, |
|
"grad_norm": 188117.421875, |
|
"learning_rate": 2.3673469387755103e-05, |
|
"loss": 1.7382, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.4084411164057182, |
|
"grad_norm": 221595.171875, |
|
"learning_rate": 2.448979591836735e-05, |
|
"loss": 1.7579, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.42205582028590877, |
|
"grad_norm": 250219.484375, |
|
"learning_rate": 2.5306122448979594e-05, |
|
"loss": 1.6842, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.43567052416609936, |
|
"grad_norm": 362297.6875, |
|
"learning_rate": 2.6122448979591835e-05, |
|
"loss": 1.6874, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.44928522804629, |
|
"grad_norm": 268312.65625, |
|
"learning_rate": 2.6938775510204084e-05, |
|
"loss": 1.6339, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.4628999319264806, |
|
"grad_norm": 301583.75, |
|
"learning_rate": 2.775510204081633e-05, |
|
"loss": 1.6268, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.4765146358066712, |
|
"grad_norm": 168292.046875, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 1.7985, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4901293396868618, |
|
"grad_norm": 274193.40625, |
|
"learning_rate": 2.9387755102040816e-05, |
|
"loss": 1.7689, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5037440435670524, |
|
"grad_norm": 142488.421875, |
|
"learning_rate": 2.9977307110438728e-05, |
|
"loss": 1.7201, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.517358747447243, |
|
"grad_norm": 273465.96875, |
|
"learning_rate": 2.9886535552193645e-05, |
|
"loss": 1.6105, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5309734513274337, |
|
"grad_norm": 408773.3125, |
|
"learning_rate": 2.9795763993948565e-05, |
|
"loss": 1.6843, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5445881552076243, |
|
"grad_norm": 187610.140625, |
|
"learning_rate": 2.970499243570348e-05, |
|
"loss": 1.7259, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5582028590878149, |
|
"grad_norm": 141979.921875, |
|
"learning_rate": 2.9614220877458398e-05, |
|
"loss": 1.7572, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.5718175629680055, |
|
"grad_norm": 183132.390625, |
|
"learning_rate": 2.9523449319213314e-05, |
|
"loss": 1.7094, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.585432266848196, |
|
"grad_norm": 299271.59375, |
|
"learning_rate": 2.943267776096823e-05, |
|
"loss": 1.7294, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5990469707283866, |
|
"grad_norm": 207611.09375, |
|
"learning_rate": 2.9341906202723147e-05, |
|
"loss": 1.7864, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6126616746085772, |
|
"grad_norm": 262759.0625, |
|
"learning_rate": 2.9251134644478064e-05, |
|
"loss": 1.7288, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6262763784887678, |
|
"grad_norm": 291027.125, |
|
"learning_rate": 2.916036308623298e-05, |
|
"loss": 1.6042, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6398910823689585, |
|
"grad_norm": 355390.71875, |
|
"learning_rate": 2.90695915279879e-05, |
|
"loss": 1.7426, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6535057862491491, |
|
"grad_norm": 87989.2578125, |
|
"learning_rate": 2.8978819969742813e-05, |
|
"loss": 1.6938, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.6671204901293397, |
|
"grad_norm": 302982.09375, |
|
"learning_rate": 2.888804841149773e-05, |
|
"loss": 1.8035, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.6807351940095303, |
|
"grad_norm": 197021.3125, |
|
"learning_rate": 2.879727685325265e-05, |
|
"loss": 1.7716, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6807351940095303, |
|
"eval_accuracy": 0.1981891348088531, |
|
"eval_loss": 1.6633707284927368, |
|
"eval_runtime": 291.4938, |
|
"eval_samples_per_second": 10.23, |
|
"eval_steps_per_second": 1.28, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6943498978897209, |
|
"grad_norm": 348515.0625, |
|
"learning_rate": 2.8706505295007566e-05, |
|
"loss": 1.7593, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.7079646017699115, |
|
"grad_norm": 252458.609375, |
|
"learning_rate": 2.8615733736762483e-05, |
|
"loss": 1.7406, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7215793056501021, |
|
"grad_norm": 179026.3125, |
|
"learning_rate": 2.8524962178517396e-05, |
|
"loss": 1.806, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.7351940095302927, |
|
"grad_norm": 230475.78125, |
|
"learning_rate": 2.8434190620272316e-05, |
|
"loss": 1.7327, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.7488087134104833, |
|
"grad_norm": 155040.359375, |
|
"learning_rate": 2.8343419062027232e-05, |
|
"loss": 1.6378, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.762423417290674, |
|
"grad_norm": 200565.359375, |
|
"learning_rate": 2.825264750378215e-05, |
|
"loss": 1.7561, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.7760381211708646, |
|
"grad_norm": 160959.6875, |
|
"learning_rate": 2.8161875945537065e-05, |
|
"loss": 1.7322, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.7896528250510552, |
|
"grad_norm": 277028.65625, |
|
"learning_rate": 2.8071104387291985e-05, |
|
"loss": 1.6283, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.8032675289312458, |
|
"grad_norm": 145514.578125, |
|
"learning_rate": 2.79803328290469e-05, |
|
"loss": 1.696, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.8168822328114363, |
|
"grad_norm": 274652.6875, |
|
"learning_rate": 2.7889561270801815e-05, |
|
"loss": 1.7019, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8304969366916269, |
|
"grad_norm": 108995.4453125, |
|
"learning_rate": 2.779878971255673e-05, |
|
"loss": 1.7189, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.8441116405718175, |
|
"grad_norm": 84566.8046875, |
|
"learning_rate": 2.770801815431165e-05, |
|
"loss": 1.8068, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.8577263444520081, |
|
"grad_norm": 239363.8125, |
|
"learning_rate": 2.7617246596066568e-05, |
|
"loss": 1.8275, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.8713410483321987, |
|
"grad_norm": 213879.875, |
|
"learning_rate": 2.752647503782148e-05, |
|
"loss": 1.7145, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.8849557522123894, |
|
"grad_norm": 201732.859375, |
|
"learning_rate": 2.74357034795764e-05, |
|
"loss": 1.648, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.89857045609258, |
|
"grad_norm": 262597.78125, |
|
"learning_rate": 2.7344931921331318e-05, |
|
"loss": 1.6619, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.9121851599727706, |
|
"grad_norm": 343452.15625, |
|
"learning_rate": 2.7254160363086234e-05, |
|
"loss": 1.6732, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9257998638529612, |
|
"grad_norm": 164955.390625, |
|
"learning_rate": 2.716338880484115e-05, |
|
"loss": 1.837, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.9394145677331518, |
|
"grad_norm": 243246.953125, |
|
"learning_rate": 2.7072617246596067e-05, |
|
"loss": 1.7863, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.9530292716133424, |
|
"grad_norm": 133524.296875, |
|
"learning_rate": 2.6981845688350984e-05, |
|
"loss": 1.671, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.966643975493533, |
|
"grad_norm": 187500.75, |
|
"learning_rate": 2.68910741301059e-05, |
|
"loss": 1.6882, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.9802586793737236, |
|
"grad_norm": 311251.9375, |
|
"learning_rate": 2.6800302571860817e-05, |
|
"loss": 1.6963, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.9938733832539143, |
|
"grad_norm": 451302.75, |
|
"learning_rate": 2.6709531013615737e-05, |
|
"loss": 1.7184, |
|
"step": 1460 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 7345, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.24301109962752e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|