|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 50, |
|
"global_step": 685, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.072992700729927, |
|
"grad_norm": 1.2194373607635498, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 3.5968, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.145985401459854, |
|
"grad_norm": 1.1968055963516235, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 3.6726, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21897810218978103, |
|
"grad_norm": 1.1814980506896973, |
|
"learning_rate": 1.5e-06, |
|
"loss": 3.5953, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.291970802919708, |
|
"grad_norm": 1.1611206531524658, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 3.5831, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.36496350364963503, |
|
"grad_norm": 1.0345042943954468, |
|
"learning_rate": 2.5e-06, |
|
"loss": 3.6277, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.36496350364963503, |
|
"eval_loss": 3.6583080291748047, |
|
"eval_runtime": 4.5719, |
|
"eval_samples_per_second": 106.74, |
|
"eval_steps_per_second": 13.342, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.43795620437956206, |
|
"grad_norm": 1.1489139795303345, |
|
"learning_rate": 3e-06, |
|
"loss": 3.5835, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5109489051094891, |
|
"grad_norm": 1.3421690464019775, |
|
"learning_rate": 3.5e-06, |
|
"loss": 3.5992, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.583941605839416, |
|
"grad_norm": 1.0778014659881592, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 3.5575, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.656934306569343, |
|
"grad_norm": 0.9279437065124512, |
|
"learning_rate": 4.5e-06, |
|
"loss": 3.6968, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7299270072992701, |
|
"grad_norm": 0.8781314492225647, |
|
"learning_rate": 5e-06, |
|
"loss": 3.5835, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7299270072992701, |
|
"eval_loss": 3.6277682781219482, |
|
"eval_runtime": 4.5628, |
|
"eval_samples_per_second": 106.953, |
|
"eval_steps_per_second": 13.369, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8029197080291971, |
|
"grad_norm": 1.237952709197998, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 3.6226, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8759124087591241, |
|
"grad_norm": 1.1926833391189575, |
|
"learning_rate": 6e-06, |
|
"loss": 3.6014, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.948905109489051, |
|
"grad_norm": 0.8673391938209534, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 3.6303, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0218978102189782, |
|
"grad_norm": 1.0185987949371338, |
|
"learning_rate": 7e-06, |
|
"loss": 3.5962, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.094890510948905, |
|
"grad_norm": 0.8746767044067383, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 3.5365, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.094890510948905, |
|
"eval_loss": 3.578197956085205, |
|
"eval_runtime": 4.5695, |
|
"eval_samples_per_second": 106.796, |
|
"eval_steps_per_second": 13.349, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.167883211678832, |
|
"grad_norm": 0.9256734848022461, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 3.5504, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2408759124087592, |
|
"grad_norm": 0.7662177681922913, |
|
"learning_rate": 8.5e-06, |
|
"loss": 3.5484, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.313868613138686, |
|
"grad_norm": 0.7936022877693176, |
|
"learning_rate": 9e-06, |
|
"loss": 3.547, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3868613138686132, |
|
"grad_norm": 0.9237962961196899, |
|
"learning_rate": 9.5e-06, |
|
"loss": 3.5648, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4598540145985401, |
|
"grad_norm": 0.7705855369567871, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4902, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4598540145985401, |
|
"eval_loss": 3.515073299407959, |
|
"eval_runtime": 4.5781, |
|
"eval_samples_per_second": 106.595, |
|
"eval_steps_per_second": 13.324, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5328467153284673, |
|
"grad_norm": 0.7808473110198975, |
|
"learning_rate": 9.793814432989691e-06, |
|
"loss": 3.4803, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.6058394160583942, |
|
"grad_norm": 0.8358160853385925, |
|
"learning_rate": 9.587628865979383e-06, |
|
"loss": 3.4905, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6788321167883211, |
|
"grad_norm": 0.8808528184890747, |
|
"learning_rate": 9.381443298969073e-06, |
|
"loss": 3.4518, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.7518248175182483, |
|
"grad_norm": 0.6673774719238281, |
|
"learning_rate": 9.175257731958764e-06, |
|
"loss": 3.4631, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.8248175182481752, |
|
"grad_norm": 0.7009378671646118, |
|
"learning_rate": 8.969072164948455e-06, |
|
"loss": 3.4264, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.8248175182481752, |
|
"eval_loss": 3.4590413570404053, |
|
"eval_runtime": 4.6618, |
|
"eval_samples_per_second": 104.68, |
|
"eval_steps_per_second": 13.085, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.897810218978102, |
|
"grad_norm": 0.6876445412635803, |
|
"learning_rate": 8.762886597938146e-06, |
|
"loss": 3.398, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.9708029197080292, |
|
"grad_norm": 0.5596441626548767, |
|
"learning_rate": 8.556701030927836e-06, |
|
"loss": 3.4173, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.0437956204379564, |
|
"grad_norm": 0.5952299237251282, |
|
"learning_rate": 8.350515463917526e-06, |
|
"loss": 3.3986, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.116788321167883, |
|
"grad_norm": 0.6578339338302612, |
|
"learning_rate": 8.144329896907216e-06, |
|
"loss": 3.3949, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.18978102189781, |
|
"grad_norm": 0.7148367166519165, |
|
"learning_rate": 7.938144329896907e-06, |
|
"loss": 3.3845, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.18978102189781, |
|
"eval_loss": 3.4218263626098633, |
|
"eval_runtime": 4.6207, |
|
"eval_samples_per_second": 105.611, |
|
"eval_steps_per_second": 13.201, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.2627737226277373, |
|
"grad_norm": 0.5892521142959595, |
|
"learning_rate": 7.731958762886599e-06, |
|
"loss": 3.3821, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.335766423357664, |
|
"grad_norm": 0.6282438635826111, |
|
"learning_rate": 7.525773195876289e-06, |
|
"loss": 3.3522, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.408759124087591, |
|
"grad_norm": 0.5816407799720764, |
|
"learning_rate": 7.319587628865979e-06, |
|
"loss": 3.3891, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.4817518248175183, |
|
"grad_norm": 0.7334665656089783, |
|
"learning_rate": 7.113402061855671e-06, |
|
"loss": 3.3682, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.554744525547445, |
|
"grad_norm": 0.667533278465271, |
|
"learning_rate": 6.907216494845361e-06, |
|
"loss": 3.4053, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.554744525547445, |
|
"eval_loss": 3.3971924781799316, |
|
"eval_runtime": 4.558, |
|
"eval_samples_per_second": 107.065, |
|
"eval_steps_per_second": 13.383, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.627737226277372, |
|
"grad_norm": 0.7920149564743042, |
|
"learning_rate": 6.701030927835052e-06, |
|
"loss": 3.3418, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.7007299270072993, |
|
"grad_norm": 0.6718395352363586, |
|
"learning_rate": 6.494845360824743e-06, |
|
"loss": 3.379, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.7737226277372264, |
|
"grad_norm": 0.6475045680999756, |
|
"learning_rate": 6.288659793814433e-06, |
|
"loss": 3.3975, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.846715328467153, |
|
"grad_norm": 0.7203289270401001, |
|
"learning_rate": 6.082474226804124e-06, |
|
"loss": 3.4352, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.9197080291970803, |
|
"grad_norm": 0.5514132976531982, |
|
"learning_rate": 5.876288659793815e-06, |
|
"loss": 3.3763, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.9197080291970803, |
|
"eval_loss": 3.3808703422546387, |
|
"eval_runtime": 4.5435, |
|
"eval_samples_per_second": 107.407, |
|
"eval_steps_per_second": 13.426, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.9927007299270074, |
|
"grad_norm": 0.5394614934921265, |
|
"learning_rate": 5.670103092783505e-06, |
|
"loss": 3.3935, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.065693430656934, |
|
"grad_norm": 0.6785652041435242, |
|
"learning_rate": 5.463917525773196e-06, |
|
"loss": 3.3154, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.1386861313868613, |
|
"grad_norm": 0.546085774898529, |
|
"learning_rate": 5.257731958762888e-06, |
|
"loss": 3.323, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.2116788321167884, |
|
"grad_norm": 0.7874196171760559, |
|
"learning_rate": 5.051546391752578e-06, |
|
"loss": 3.3889, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.2846715328467155, |
|
"grad_norm": 0.5360156297683716, |
|
"learning_rate": 4.845360824742268e-06, |
|
"loss": 3.3871, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.2846715328467155, |
|
"eval_loss": 3.3709511756896973, |
|
"eval_runtime": 4.5554, |
|
"eval_samples_per_second": 107.126, |
|
"eval_steps_per_second": 13.391, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.3576642335766422, |
|
"grad_norm": 0.6126905083656311, |
|
"learning_rate": 4.639175257731959e-06, |
|
"loss": 3.4215, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.4306569343065694, |
|
"grad_norm": 0.7156729102134705, |
|
"learning_rate": 4.4329896907216494e-06, |
|
"loss": 3.3448, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.5036496350364965, |
|
"grad_norm": 0.5565685629844666, |
|
"learning_rate": 4.2268041237113405e-06, |
|
"loss": 3.3041, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.576642335766423, |
|
"grad_norm": 0.6424775123596191, |
|
"learning_rate": 4.020618556701032e-06, |
|
"loss": 3.3398, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.6496350364963503, |
|
"grad_norm": 0.5503811836242676, |
|
"learning_rate": 3.814432989690722e-06, |
|
"loss": 3.3639, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.6496350364963503, |
|
"eval_loss": 3.364408493041992, |
|
"eval_runtime": 4.5774, |
|
"eval_samples_per_second": 106.611, |
|
"eval_steps_per_second": 13.326, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.7226277372262775, |
|
"grad_norm": 0.8552574515342712, |
|
"learning_rate": 3.6082474226804126e-06, |
|
"loss": 3.3706, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.795620437956204, |
|
"grad_norm": 0.6950764060020447, |
|
"learning_rate": 3.4020618556701037e-06, |
|
"loss": 3.3846, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.8686131386861313, |
|
"grad_norm": 0.5460401177406311, |
|
"learning_rate": 3.195876288659794e-06, |
|
"loss": 3.3343, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.9416058394160585, |
|
"grad_norm": 0.5183790326118469, |
|
"learning_rate": 2.9896907216494846e-06, |
|
"loss": 3.3234, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.014598540145985, |
|
"grad_norm": 0.6364532113075256, |
|
"learning_rate": 2.7835051546391757e-06, |
|
"loss": 3.2898, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.014598540145985, |
|
"eval_loss": 3.36006236076355, |
|
"eval_runtime": 4.5449, |
|
"eval_samples_per_second": 107.373, |
|
"eval_steps_per_second": 13.422, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.087591240875913, |
|
"grad_norm": 0.6407883763313293, |
|
"learning_rate": 2.577319587628866e-06, |
|
"loss": 3.3432, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.160583941605839, |
|
"grad_norm": 0.7432695031166077, |
|
"learning_rate": 2.3711340206185566e-06, |
|
"loss": 3.3128, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.233576642335766, |
|
"grad_norm": 0.5630599856376648, |
|
"learning_rate": 2.1649484536082477e-06, |
|
"loss": 3.3308, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.306569343065694, |
|
"grad_norm": 0.691064178943634, |
|
"learning_rate": 1.9587628865979384e-06, |
|
"loss": 3.3607, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.37956204379562, |
|
"grad_norm": 0.6786036491394043, |
|
"learning_rate": 1.7525773195876288e-06, |
|
"loss": 3.3413, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.37956204379562, |
|
"eval_loss": 3.3572518825531006, |
|
"eval_runtime": 4.5354, |
|
"eval_samples_per_second": 107.597, |
|
"eval_steps_per_second": 13.45, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.452554744525547, |
|
"grad_norm": 0.5392524600028992, |
|
"learning_rate": 1.5463917525773197e-06, |
|
"loss": 3.3262, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.525547445255475, |
|
"grad_norm": 0.6452904343605042, |
|
"learning_rate": 1.3402061855670104e-06, |
|
"loss": 3.3672, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.598540145985401, |
|
"grad_norm": 0.6458753347396851, |
|
"learning_rate": 1.134020618556701e-06, |
|
"loss": 3.3539, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.671532846715328, |
|
"grad_norm": 0.579394519329071, |
|
"learning_rate": 9.278350515463919e-07, |
|
"loss": 3.3627, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.744525547445256, |
|
"grad_norm": 0.5088974237442017, |
|
"learning_rate": 7.216494845360824e-07, |
|
"loss": 3.3574, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.744525547445256, |
|
"eval_loss": 3.3557839393615723, |
|
"eval_runtime": 4.5731, |
|
"eval_samples_per_second": 106.712, |
|
"eval_steps_per_second": 13.339, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.817518248175182, |
|
"grad_norm": 0.5745378136634827, |
|
"learning_rate": 5.154639175257732e-07, |
|
"loss": 3.3426, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.89051094890511, |
|
"grad_norm": 0.5284786820411682, |
|
"learning_rate": 3.0927835051546394e-07, |
|
"loss": 3.3406, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.963503649635037, |
|
"grad_norm": 0.5814775824546814, |
|
"learning_rate": 1.0309278350515465e-07, |
|
"loss": 3.3041, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 685, |
|
"total_flos": 2.417304476319744e+16, |
|
"train_loss": 3.435254694249508, |
|
"train_runtime": 659.3834, |
|
"train_samples_per_second": 33.243, |
|
"train_steps_per_second": 1.039 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 685, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.417304476319744e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|