|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 1370, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.18248175182481752, |
|
"grad_norm": 23.73174476623535, |
|
"learning_rate": 0.00046, |
|
"loss": 4.0301, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.36496350364963503, |
|
"grad_norm": 2.086200714111328, |
|
"learning_rate": 0.00096, |
|
"loss": 1.0569, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5474452554744526, |
|
"grad_norm": 0.4636005461215973, |
|
"learning_rate": 0.0009825757575757576, |
|
"loss": 0.4231, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7299270072992701, |
|
"grad_norm": 0.905696451663971, |
|
"learning_rate": 0.0009636363636363637, |
|
"loss": 0.1587, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9124087591240876, |
|
"grad_norm": 0.7420445084571838, |
|
"learning_rate": 0.0009446969696969697, |
|
"loss": 0.1302, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.09637967497110367, |
|
"eval_runtime": 192.8711, |
|
"eval_samples_per_second": 4.241, |
|
"eval_steps_per_second": 0.71, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.094890510948905, |
|
"grad_norm": 0.5315635204315186, |
|
"learning_rate": 0.0009257575757575758, |
|
"loss": 0.1168, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.2773722627737225, |
|
"grad_norm": 2.168321132659912, |
|
"learning_rate": 0.0009068181818181819, |
|
"loss": 0.0957, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.4598540145985401, |
|
"grad_norm": 1.4148447513580322, |
|
"learning_rate": 0.000887878787878788, |
|
"loss": 0.0819, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6423357664233578, |
|
"grad_norm": 1.08322274684906, |
|
"learning_rate": 0.0008689393939393939, |
|
"loss": 0.0967, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.8248175182481752, |
|
"grad_norm": 0.34484872221946716, |
|
"learning_rate": 0.00085, |
|
"loss": 0.0897, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.05452003329992294, |
|
"eval_runtime": 193.1249, |
|
"eval_samples_per_second": 4.236, |
|
"eval_steps_per_second": 0.709, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.0072992700729926, |
|
"grad_norm": 0.4588640034198761, |
|
"learning_rate": 0.0008310606060606061, |
|
"loss": 0.078, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.18978102189781, |
|
"grad_norm": 0.9430391788482666, |
|
"learning_rate": 0.0008121212121212122, |
|
"loss": 0.047, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.372262773722628, |
|
"grad_norm": 1.2605994939804077, |
|
"learning_rate": 0.0007931818181818182, |
|
"loss": 0.0733, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.554744525547445, |
|
"grad_norm": 0.4879356026649475, |
|
"learning_rate": 0.0007742424242424244, |
|
"loss": 0.0595, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.7372262773722627, |
|
"grad_norm": 0.7468044757843018, |
|
"learning_rate": 0.0007553030303030303, |
|
"loss": 0.0581, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.9197080291970803, |
|
"grad_norm": 1.2231853008270264, |
|
"learning_rate": 0.0007363636363636363, |
|
"loss": 0.0691, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.039563629776239395, |
|
"eval_runtime": 191.6012, |
|
"eval_samples_per_second": 4.269, |
|
"eval_steps_per_second": 0.715, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.102189781021898, |
|
"grad_norm": 0.3177865445613861, |
|
"learning_rate": 0.0007174242424242424, |
|
"loss": 0.0421, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.2846715328467155, |
|
"grad_norm": 1.0183290243148804, |
|
"learning_rate": 0.0006984848484848485, |
|
"loss": 0.0485, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.4671532846715327, |
|
"grad_norm": 0.361370712518692, |
|
"learning_rate": 0.0006795454545454546, |
|
"loss": 0.0369, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.6496350364963503, |
|
"grad_norm": 1.0689235925674438, |
|
"learning_rate": 0.0006606060606060606, |
|
"loss": 0.0456, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.832116788321168, |
|
"grad_norm": 0.29606688022613525, |
|
"learning_rate": 0.0006416666666666667, |
|
"loss": 0.0482, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.027906004339456558, |
|
"eval_runtime": 190.9612, |
|
"eval_samples_per_second": 4.284, |
|
"eval_steps_per_second": 0.717, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 4.014598540145985, |
|
"grad_norm": 0.1826704740524292, |
|
"learning_rate": 0.0006227272727272727, |
|
"loss": 0.0451, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.197080291970803, |
|
"grad_norm": 0.1476697325706482, |
|
"learning_rate": 0.0006037878787878788, |
|
"loss": 0.0324, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 4.37956204379562, |
|
"grad_norm": 0.37707987427711487, |
|
"learning_rate": 0.0005848484848484848, |
|
"loss": 0.0277, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.562043795620438, |
|
"grad_norm": 0.18928465247154236, |
|
"learning_rate": 0.0005659090909090909, |
|
"loss": 0.0331, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.744525547445256, |
|
"grad_norm": 0.10301584750413895, |
|
"learning_rate": 0.000546969696969697, |
|
"loss": 0.0189, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.927007299270073, |
|
"grad_norm": 0.5264261960983276, |
|
"learning_rate": 0.0005280303030303031, |
|
"loss": 0.0374, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.017436422407627106, |
|
"eval_runtime": 189.4577, |
|
"eval_samples_per_second": 4.318, |
|
"eval_steps_per_second": 0.723, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 5.109489051094891, |
|
"grad_norm": 0.3798004984855652, |
|
"learning_rate": 0.000509090909090909, |
|
"loss": 0.0166, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.291970802919708, |
|
"grad_norm": 0.22640006244182587, |
|
"learning_rate": 0.0004901515151515152, |
|
"loss": 0.0224, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 5.474452554744525, |
|
"grad_norm": 0.2977535128593445, |
|
"learning_rate": 0.0004712121212121212, |
|
"loss": 0.0178, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 5.656934306569343, |
|
"grad_norm": 0.046165354549884796, |
|
"learning_rate": 0.00045227272727272727, |
|
"loss": 0.0188, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 5.839416058394161, |
|
"grad_norm": 0.20036080479621887, |
|
"learning_rate": 0.00043333333333333337, |
|
"loss": 0.0189, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.009602065198123455, |
|
"eval_runtime": 190.5013, |
|
"eval_samples_per_second": 4.294, |
|
"eval_steps_per_second": 0.719, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 6.021897810218978, |
|
"grad_norm": 0.009559686295688152, |
|
"learning_rate": 0.00041439393939393936, |
|
"loss": 0.0112, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 6.204379562043796, |
|
"grad_norm": 0.17128558456897736, |
|
"learning_rate": 0.00039545454545454546, |
|
"loss": 0.0115, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 6.386861313868613, |
|
"grad_norm": 0.12201745808124542, |
|
"learning_rate": 0.0003765151515151515, |
|
"loss": 0.0099, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 6.569343065693431, |
|
"grad_norm": 0.221001997590065, |
|
"learning_rate": 0.0003575757575757576, |
|
"loss": 0.0062, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.751824817518248, |
|
"grad_norm": 0.1375964730978012, |
|
"learning_rate": 0.00033863636363636366, |
|
"loss": 0.0104, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 6.934306569343065, |
|
"grad_norm": 0.18805921077728271, |
|
"learning_rate": 0.0003196969696969697, |
|
"loss": 0.0108, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.006272537633776665, |
|
"eval_runtime": 189.9453, |
|
"eval_samples_per_second": 4.307, |
|
"eval_steps_per_second": 0.721, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 7.116788321167883, |
|
"grad_norm": 0.04299464076757431, |
|
"learning_rate": 0.0003007575757575758, |
|
"loss": 0.0065, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 7.299270072992701, |
|
"grad_norm": 0.19402629137039185, |
|
"learning_rate": 0.0002818181818181818, |
|
"loss": 0.0089, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 7.481751824817518, |
|
"grad_norm": 0.1514054536819458, |
|
"learning_rate": 0.0002628787878787879, |
|
"loss": 0.007, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 7.664233576642336, |
|
"grad_norm": 0.011143018491566181, |
|
"learning_rate": 0.00024393939393939392, |
|
"loss": 0.0074, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 7.846715328467154, |
|
"grad_norm": 0.037931449711322784, |
|
"learning_rate": 0.00022500000000000002, |
|
"loss": 0.004, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.003884958801791072, |
|
"eval_runtime": 190.9247, |
|
"eval_samples_per_second": 4.284, |
|
"eval_steps_per_second": 0.718, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 8.02919708029197, |
|
"grad_norm": 0.09535694122314453, |
|
"learning_rate": 0.00020606060606060607, |
|
"loss": 0.0047, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.211678832116789, |
|
"grad_norm": 0.1803082972764969, |
|
"learning_rate": 0.00018712121212121214, |
|
"loss": 0.0047, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 8.394160583941606, |
|
"grad_norm": 0.07427278906106949, |
|
"learning_rate": 0.0001681818181818182, |
|
"loss": 0.0035, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 8.576642335766424, |
|
"grad_norm": 0.07938549667596817, |
|
"learning_rate": 0.00014924242424242424, |
|
"loss": 0.0038, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 8.75912408759124, |
|
"grad_norm": 0.047964416444301605, |
|
"learning_rate": 0.0001303030303030303, |
|
"loss": 0.0035, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 8.941605839416058, |
|
"grad_norm": 0.03935278207063675, |
|
"learning_rate": 0.00011136363636363636, |
|
"loss": 0.0039, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.002366352593526244, |
|
"eval_runtime": 191.3767, |
|
"eval_samples_per_second": 4.274, |
|
"eval_steps_per_second": 0.716, |
|
"step": 1233 |
|
}, |
|
{ |
|
"epoch": 9.124087591240876, |
|
"grad_norm": 0.06493715196847916, |
|
"learning_rate": 9.242424242424242e-05, |
|
"loss": 0.0025, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 9.306569343065693, |
|
"grad_norm": 0.02973032370209694, |
|
"learning_rate": 7.348484848484849e-05, |
|
"loss": 0.0031, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 9.489051094890511, |
|
"grad_norm": 0.03156217932701111, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 0.0013, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 9.671532846715328, |
|
"grad_norm": 0.01587042771279812, |
|
"learning_rate": 3.560606060606061e-05, |
|
"loss": 0.0036, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 9.854014598540147, |
|
"grad_norm": 0.02338283136487007, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.0026, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.001961252186447382, |
|
"eval_runtime": 191.871, |
|
"eval_samples_per_second": 4.263, |
|
"eval_steps_per_second": 0.714, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 1370, |
|
"total_flos": 2.4023145037824e+18, |
|
"train_loss": 0.1333546004319278, |
|
"train_runtime": 4360.7512, |
|
"train_samples_per_second": 1.876, |
|
"train_steps_per_second": 0.314 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1370, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.4023145037824e+18, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|