|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 100, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.2616782486438751, |
|
"learning_rate": 0.00025, |
|
"loss": 0.7467, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4230063855648041, |
|
"learning_rate": 0.0005, |
|
"loss": 0.0676, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.4279615581035614, |
|
"learning_rate": 0.0004993330709158879, |
|
"loss": 0.0669, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.26194414496421814, |
|
"learning_rate": 0.0004973358420187776, |
|
"loss": 0.0401, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.18304042518138885, |
|
"learning_rate": 0.0004940189693889818, |
|
"loss": 0.0267, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.15659470856189728, |
|
"learning_rate": 0.0004894001499771015, |
|
"loss": 0.0172, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.8856974840164185, |
|
"learning_rate": 0.00048350402718313703, |
|
"loss": 0.0141, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.2954822778701782, |
|
"learning_rate": 0.0004763620593732867, |
|
"loss": 0.0225, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.36144667863845825, |
|
"learning_rate": 0.00046801235203595195, |
|
"loss": 0.0182, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.20678772032260895, |
|
"learning_rate": 0.0004584994544724695, |
|
"loss": 0.013, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"eval_loss": 0.013783249072730541, |
|
"eval_runtime": 4.8183, |
|
"eval_samples_per_second": 39.433, |
|
"eval_steps_per_second": 1.66, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.3987113833427429, |
|
"learning_rate": 0.00044787412210731353, |
|
"loss": 0.0078, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.23118920624256134, |
|
"learning_rate": 0.00043619304568594545, |
|
"loss": 0.0104, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.1873694211244583, |
|
"learning_rate": 0.0004235185488051585, |
|
"loss": 0.0191, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.03233395144343376, |
|
"learning_rate": 0.0004099182553897228, |
|
"loss": 0.0065, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.5932971239089966, |
|
"learning_rate": 0.00039546472888948826, |
|
"loss": 0.0034, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.13725461065769196, |
|
"learning_rate": 0.0003802350851219826, |
|
"loss": 0.007, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.22745588421821594, |
|
"learning_rate": 0.00036431058082615964, |
|
"loss": 0.0058, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.0794651210308075, |
|
"learning_rate": 0.00034777618012253895, |
|
"loss": 0.0039, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.06090324744582176, |
|
"learning_rate": 0.00033072010119286155, |
|
"loss": 0.0035, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.17350110411643982, |
|
"learning_rate": 0.0003132333455979202, |
|
"loss": 0.0034, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"eval_loss": 0.01827273517847061, |
|
"eval_runtime": 4.8208, |
|
"eval_samples_per_second": 39.413, |
|
"eval_steps_per_second": 1.659, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.0679766833782196, |
|
"learning_rate": 0.0002954092127448591, |
|
"loss": 0.0043, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.2287779450416565, |
|
"learning_rate": 0.00027734280209446867, |
|
"loss": 0.0033, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 0.022623347118496895, |
|
"learning_rate": 0.00025913050576441477, |
|
"loss": 0.0051, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"grad_norm": 0.023388344794511795, |
|
"learning_rate": 0.00024086949423558527, |
|
"loss": 0.0011, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.56, |
|
"grad_norm": 0.06873177736997604, |
|
"learning_rate": 0.00022265719790553147, |
|
"loss": 0.0032, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 0.00556823518127203, |
|
"learning_rate": 0.0002045907872551409, |
|
"loss": 0.0016, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.00377739779651165, |
|
"learning_rate": 0.0001867666544020798, |
|
"loss": 0.0008, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"grad_norm": 0.017272206023335457, |
|
"learning_rate": 0.00016927989880713852, |
|
"loss": 0.0008, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"grad_norm": 0.028082353994250298, |
|
"learning_rate": 0.00015222381987746103, |
|
"loss": 0.0002, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.0010064858943223953, |
|
"learning_rate": 0.00013568941917384037, |
|
"loss": 0.0001, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"eval_loss": 0.01472838968038559, |
|
"eval_runtime": 4.8243, |
|
"eval_samples_per_second": 39.384, |
|
"eval_steps_per_second": 1.658, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.89, |
|
"grad_norm": 0.0007610166212543845, |
|
"learning_rate": 0.00011976491487801746, |
|
"loss": 0.0008, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 7.11, |
|
"grad_norm": 0.04695134982466698, |
|
"learning_rate": 0.00010453527111051184, |
|
"loss": 0.0002, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 7.33, |
|
"grad_norm": 0.0033624598290771246, |
|
"learning_rate": 9.008174461027724e-05, |
|
"loss": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"grad_norm": 0.003984487149864435, |
|
"learning_rate": 7.648145119484151e-05, |
|
"loss": 0.0001, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"grad_norm": 0.002355258446186781, |
|
"learning_rate": 6.380695431405453e-05, |
|
"loss": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.00041891244472935796, |
|
"learning_rate": 5.21258778926865e-05, |
|
"loss": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 8.22, |
|
"grad_norm": 0.0004031055432278663, |
|
"learning_rate": 4.150054552753055e-05, |
|
"loss": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.00040033855475485325, |
|
"learning_rate": 3.198764796404807e-05, |
|
"loss": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 8.67, |
|
"grad_norm": 0.0015161971095949411, |
|
"learning_rate": 2.3637940626713344e-05, |
|
"loss": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 8.89, |
|
"grad_norm": 0.0011603726306930184, |
|
"learning_rate": 1.649597281686302e-05, |
|
"loss": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.89, |
|
"eval_loss": 0.014345706440508366, |
|
"eval_runtime": 4.8186, |
|
"eval_samples_per_second": 39.43, |
|
"eval_steps_per_second": 1.66, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 9.11, |
|
"grad_norm": 0.0003027151105925441, |
|
"learning_rate": 1.0599850022898538e-05, |
|
"loss": 0.0001, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 9.33, |
|
"grad_norm": 0.0009049432119354606, |
|
"learning_rate": 5.981030611018234e-06, |
|
"loss": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 9.56, |
|
"grad_norm": 0.0018125231144949794, |
|
"learning_rate": 2.664157981222437e-06, |
|
"loss": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"grad_norm": 0.003249464090913534, |
|
"learning_rate": 6.66929084112089e-07, |
|
"loss": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.0004782461328431964, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 450, |
|
"total_flos": 4.3483361730730394e+17, |
|
"train_loss": 0.025018121934683425, |
|
"train_runtime": 909.2296, |
|
"train_samples_per_second": 11.834, |
|
"train_steps_per_second": 0.495 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 4.3483361730730394e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|