{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.863013698630137,
  "eval_steps": 500,
  "global_step": 360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0273972602739726,
      "grad_norm": 1.8984375,
      "learning_rate": 5.555555555555556e-06,
      "loss": 3.0637,
      "step": 1
    },
    {
      "epoch": 0.136986301369863,
      "grad_norm": 2.265625,
      "learning_rate": 2.777777777777778e-05,
      "loss": 3.056,
      "step": 5
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 2.015625,
      "learning_rate": 5.555555555555556e-05,
      "loss": 3.0171,
      "step": 10
    },
    {
      "epoch": 0.410958904109589,
      "grad_norm": 1.9609375,
      "learning_rate": 8.333333333333334e-05,
      "loss": 2.8054,
      "step": 15
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 2.1875,
      "learning_rate": 0.00011111111111111112,
      "loss": 2.4802,
      "step": 20
    },
    {
      "epoch": 0.684931506849315,
      "grad_norm": 2.546875,
      "learning_rate": 0.0001388888888888889,
      "loss": 2.2946,
      "step": 25
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 2.109375,
      "learning_rate": 0.0001666666666666667,
      "loss": 2.0875,
      "step": 30
    },
    {
      "epoch": 0.958904109589041,
      "grad_norm": 0.57421875,
      "learning_rate": 0.00019444444444444446,
      "loss": 1.8806,
      "step": 35
    },
    {
      "epoch": 0.9863013698630136,
      "eval_loss": 2.6093997955322266,
      "eval_runtime": 0.5633,
      "eval_samples_per_second": 17.753,
      "eval_steps_per_second": 1.775,
      "step": 36
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 44.0,
      "learning_rate": 0.00019992479525042303,
      "loss": 1.6979,
      "step": 40
    },
    {
      "epoch": 1.2328767123287672,
      "grad_norm": 0.62890625,
      "learning_rate": 0.00019961946980917456,
      "loss": 1.5925,
      "step": 45
    },
    {
      "epoch": 1.36986301369863,
      "grad_norm": 0.4765625,
      "learning_rate": 0.00019908004033648453,
      "loss": 1.5152,
      "step": 50
    },
    {
      "epoch": 1.5068493150684932,
      "grad_norm": 0.4765625,
      "learning_rate": 0.00019830777448228603,
      "loss": 1.4565,
      "step": 55
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00019730448705798239,
      "loss": 1.3866,
      "step": 60
    },
    {
      "epoch": 1.7808219178082192,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00019607253577167205,
      "loss": 1.3522,
      "step": 65
    },
    {
      "epoch": 1.9178082191780823,
      "grad_norm": 0.369140625,
      "learning_rate": 0.00019461481568757506,
      "loss": 1.3239,
      "step": 70
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.535839080810547,
      "eval_runtime": 0.5617,
      "eval_samples_per_second": 17.804,
      "eval_steps_per_second": 1.78,
      "step": 73
    },
    {
      "epoch": 2.0547945205479454,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00019293475242268223,
      "loss": 1.3035,
      "step": 75
    },
    {
      "epoch": 2.191780821917808,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.0001910362940966147,
      "loss": 1.2854,
      "step": 80
    },
    {
      "epoch": 2.328767123287671,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00018892390205361062,
      "loss": 1.2525,
      "step": 85
    },
    {
      "epoch": 2.4657534246575343,
      "grad_norm": 0.578125,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.26,
      "step": 90
    },
    {
      "epoch": 2.602739726027397,
      "grad_norm": 0.54296875,
      "learning_rate": 0.00018407766423091034,
      "loss": 1.2323,
      "step": 95
    },
    {
      "epoch": 2.73972602739726,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00018135520702629675,
      "loss": 1.2267,
      "step": 100
    },
    {
      "epoch": 2.8767123287671232,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00017844156649195759,
      "loss": 1.2327,
      "step": 105
    },
    {
      "epoch": 2.9863013698630136,
      "eval_loss": 2.519155979156494,
      "eval_runtime": 0.5663,
      "eval_samples_per_second": 17.659,
      "eval_steps_per_second": 1.766,
      "step": 109
    },
    {
      "epoch": 3.0136986301369864,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00017534358963276607,
      "loss": 1.2242,
      "step": 110
    },
    {
      "epoch": 3.1506849315068495,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00017206855664077147,
      "loss": 1.1938,
      "step": 115
    },
    {
      "epoch": 3.287671232876712,
      "grad_norm": 0.51171875,
      "learning_rate": 0.0001686241637868734,
      "loss": 1.1879,
      "step": 120
    },
    {
      "epoch": 3.4246575342465753,
      "grad_norm": 0.48828125,
      "learning_rate": 0.00016501850533471836,
      "loss": 1.1748,
      "step": 125
    },
    {
      "epoch": 3.5616438356164384,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0001612600545193203,
      "loss": 1.1704,
      "step": 130
    },
    {
      "epoch": 3.6986301369863015,
      "grad_norm": 0.6015625,
      "learning_rate": 0.0001573576436351046,
      "loss": 1.1679,
      "step": 135
    },
    {
      "epoch": 3.8356164383561646,
      "grad_norm": 0.392578125,
      "learning_rate": 0.00015332044328016914,
      "loss": 1.1663,
      "step": 140
    },
    {
      "epoch": 3.9726027397260273,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00014915794080553707,
      "loss": 1.1735,
      "step": 145
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.520301103591919,
      "eval_runtime": 0.5527,
      "eval_samples_per_second": 18.093,
      "eval_steps_per_second": 1.809,
      "step": 146
    },
    {
      "epoch": 4.109589041095891,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00014487991802004623,
      "loss": 1.144,
      "step": 150
    },
    {
      "epoch": 4.2465753424657535,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00014049642820326735,
      "loss": 1.1273,
      "step": 155
    },
    {
      "epoch": 4.383561643835616,
      "grad_norm": 0.51953125,
      "learning_rate": 0.00013601777248047105,
      "loss": 1.1344,
      "step": 160
    },
    {
      "epoch": 4.52054794520548,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00013145447561516138,
      "loss": 1.1364,
      "step": 165
    },
    {
      "epoch": 4.657534246575342,
      "grad_norm": 0.50390625,
      "learning_rate": 0.00012681726127606376,
      "loss": 1.1344,
      "step": 170
    },
    {
      "epoch": 4.794520547945205,
      "grad_norm": 0.453125,
      "learning_rate": 0.00012211702683668878,
      "loss": 1.1252,
      "step": 175
    },
    {
      "epoch": 4.931506849315069,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00011736481776669306,
      "loss": 1.1354,
      "step": 180
    },
    {
      "epoch": 4.986301369863014,
      "eval_loss": 2.546712875366211,
      "eval_runtime": 0.5597,
      "eval_samples_per_second": 17.868,
      "eval_steps_per_second": 1.787,
      "step": 182
    },
    {
      "epoch": 5.068493150684931,
      "grad_norm": 0.419921875,
      "learning_rate": 0.00011257180167521629,
      "loss": 1.1069,
      "step": 185
    },
    {
      "epoch": 5.205479452054795,
      "grad_norm": 0.515625,
      "learning_rate": 0.0001077492420671931,
      "loss": 1.0983,
      "step": 190
    },
    {
      "epoch": 5.342465753424658,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00010290847187431113,
      "loss": 1.0952,
      "step": 195
    },
    {
      "epoch": 5.47945205479452,
      "grad_norm": 0.50390625,
      "learning_rate": 9.806086682281758e-05,
      "loss": 1.1105,
      "step": 200
    },
    {
      "epoch": 5.616438356164384,
      "grad_norm": 0.470703125,
      "learning_rate": 9.321781870075908e-05,
      "loss": 1.1041,
      "step": 205
    },
    {
      "epoch": 5.7534246575342465,
      "grad_norm": 0.4296875,
      "learning_rate": 8.839070858747697e-05,
      "loss": 1.0954,
      "step": 210
    },
    {
      "epoch": 5.890410958904109,
      "grad_norm": 0.361328125,
      "learning_rate": 8.35908801082676e-05,
      "loss": 1.1015,
      "step": 215
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.5496411323547363,
      "eval_runtime": 0.5525,
      "eval_samples_per_second": 18.099,
      "eval_steps_per_second": 1.81,
      "step": 219
    },
    {
      "epoch": 6.027397260273973,
      "grad_norm": 0.35546875,
      "learning_rate": 7.882961277705895e-05,
      "loss": 1.0967,
      "step": 220
    },
    {
      "epoch": 6.164383561643835,
      "grad_norm": 0.369140625,
      "learning_rate": 7.411809548974792e-05,
      "loss": 1.0792,
      "step": 225
    },
    {
      "epoch": 6.301369863013699,
      "grad_norm": 0.4140625,
      "learning_rate": 6.94674002304887e-05,
      "loss": 1.0787,
      "step": 230
    },
    {
      "epoch": 6.438356164383562,
      "grad_norm": 0.357421875,
      "learning_rate": 6.488845605272113e-05,
      "loss": 1.0699,
      "step": 235
    },
    {
      "epoch": 6.575342465753424,
      "grad_norm": 0.384765625,
      "learning_rate": 6.039202339608432e-05,
      "loss": 1.0875,
      "step": 240
    },
    {
      "epoch": 6.712328767123288,
      "grad_norm": 0.39453125,
      "learning_rate": 5.5988668799569545e-05,
      "loss": 1.0721,
      "step": 245
    },
    {
      "epoch": 6.8493150684931505,
      "grad_norm": 0.33984375,
      "learning_rate": 5.168874007033615e-05,
      "loss": 1.0769,
      "step": 250
    },
    {
      "epoch": 6.986301369863014,
      "grad_norm": 0.408203125,
      "learning_rate": 4.7502341966544e-05,
      "loss": 1.0858,
      "step": 255
    },
    {
      "epoch": 6.986301369863014,
      "eval_loss": 2.568040609359741,
      "eval_runtime": 0.5489,
      "eval_samples_per_second": 18.22,
      "eval_steps_per_second": 1.822,
      "step": 255
    },
    {
      "epoch": 7.123287671232877,
      "grad_norm": 0.36328125,
      "learning_rate": 4.343931245134616e-05,
      "loss": 1.0577,
      "step": 260
    },
    {
      "epoch": 7.260273972602739,
      "grad_norm": 0.515625,
      "learning_rate": 3.950919957384582e-05,
      "loss": 1.0614,
      "step": 265
    },
    {
      "epoch": 7.397260273972603,
      "grad_norm": 0.388671875,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 1.0698,
      "step": 270
    },
    {
      "epoch": 7.534246575342466,
      "grad_norm": 0.42578125,
      "learning_rate": 3.2084332465620694e-05,
      "loss": 1.0695,
      "step": 275
    },
    {
      "epoch": 7.671232876712329,
      "grad_norm": 0.408203125,
      "learning_rate": 2.8607026544210114e-05,
      "loss": 1.0702,
      "step": 280
    },
    {
      "epoch": 7.808219178082192,
      "grad_norm": 0.392578125,
      "learning_rate": 2.529749287590042e-05,
      "loss": 1.0632,
      "step": 285
    },
    {
      "epoch": 7.945205479452055,
      "grad_norm": 0.40625,
      "learning_rate": 2.2163508807583998e-05,
      "loss": 1.0624,
      "step": 290
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.572319746017456,
      "eval_runtime": 0.5428,
      "eval_samples_per_second": 18.424,
      "eval_steps_per_second": 1.842,
      "step": 292
    },
    {
      "epoch": 8.082191780821917,
      "grad_norm": 0.359375,
      "learning_rate": 1.921243914762889e-05,
      "loss": 1.0583,
      "step": 295
    },
    {
      "epoch": 8.219178082191782,
      "grad_norm": 0.33203125,
      "learning_rate": 1.6451218858706374e-05,
      "loss": 1.0536,
      "step": 300
    },
    {
      "epoch": 8.356164383561644,
      "grad_norm": 0.359375,
      "learning_rate": 1.388633676074862e-05,
      "loss": 1.0613,
      "step": 305
    },
    {
      "epoch": 8.493150684931507,
      "grad_norm": 0.353515625,
      "learning_rate": 1.1523820282334219e-05,
      "loss": 1.0589,
      "step": 310
    },
    {
      "epoch": 8.63013698630137,
      "grad_norm": 0.337890625,
      "learning_rate": 9.369221296335006e-06,
      "loss": 1.0664,
      "step": 315
    },
    {
      "epoch": 8.767123287671232,
      "grad_norm": 0.337890625,
      "learning_rate": 7.427603073110967e-06,
      "loss": 1.0656,
      "step": 320
    },
    {
      "epoch": 8.904109589041095,
      "grad_norm": 0.330078125,
      "learning_rate": 5.7035283819124155e-06,
      "loss": 1.0546,
      "step": 325
    },
    {
      "epoch": 8.986301369863014,
      "eval_loss": 2.5756430625915527,
      "eval_runtime": 0.5591,
      "eval_samples_per_second": 17.886,
      "eval_steps_per_second": 1.789,
      "step": 328
    },
    {
      "epoch": 9.04109589041096,
      "grad_norm": 0.33203125,
      "learning_rate": 4.20104876845111e-06,
      "loss": 1.0515,
      "step": 330
    },
    {
      "epoch": 9.178082191780822,
      "grad_norm": 0.3515625,
      "learning_rate": 2.9236950338380033e-06,
      "loss": 1.0521,
      "step": 335
    },
    {
      "epoch": 9.315068493150685,
      "grad_norm": 0.3359375,
      "learning_rate": 1.874468937261531e-06,
      "loss": 1.0563,
      "step": 340
    },
    {
      "epoch": 9.452054794520548,
      "grad_norm": 0.341796875,
      "learning_rate": 1.055836141905553e-06,
      "loss": 1.0535,
      "step": 345
    },
    {
      "epoch": 9.58904109589041,
      "grad_norm": 0.349609375,
      "learning_rate": 4.6972042068341714e-07,
      "loss": 1.0521,
      "step": 350
    },
    {
      "epoch": 9.726027397260275,
      "grad_norm": 0.341796875,
      "learning_rate": 1.1749913540496371e-07,
      "loss": 1.0664,
      "step": 355
    },
    {
      "epoch": 9.863013698630137,
      "grad_norm": 0.345703125,
      "learning_rate": 0.0,
      "loss": 1.0623,
      "step": 360
    },
    {
      "epoch": 9.863013698630137,
      "eval_loss": 2.5758092403411865,
      "eval_runtime": 0.5499,
      "eval_samples_per_second": 18.186,
      "eval_steps_per_second": 1.819,
      "step": 360
    },
    {
      "epoch": 9.863013698630137,
      "step": 360,
      "total_flos": 2.1145440197646746e+17,
      "train_loss": 1.288370986117257,
      "train_runtime": 1930.7273,
      "train_samples_per_second": 9.074,
      "train_steps_per_second": 0.186
    }
  ],
  "logging_steps": 5,
  "max_steps": 360,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.1145440197646746e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}