|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.558846453624318, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9726, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.000197979797979798, |
|
"loss": 1.3188, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00019595959595959596, |
|
"loss": 1.1313, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00019393939393939395, |
|
"loss": 1.0928, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00019191919191919191, |
|
"loss": 1.0552, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0001898989898989899, |
|
"loss": 1.0644, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0001878787878787879, |
|
"loss": 1.0013, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00018585858585858586, |
|
"loss": 1.0065, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00018383838383838384, |
|
"loss": 1.0054, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.9734, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0001797979797979798, |
|
"loss": 0.9654, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.9673, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00017575757575757578, |
|
"loss": 0.9584, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.00017373737373737377, |
|
"loss": 0.9694, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.00017171717171717173, |
|
"loss": 0.9773, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 0.00016969696969696972, |
|
"loss": 0.9531, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.00016767676767676768, |
|
"loss": 0.9775, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.00016565656565656567, |
|
"loss": 0.9261, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.00016363636363636366, |
|
"loss": 0.9715, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 0.00016161616161616162, |
|
"loss": 0.9392, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.0001595959595959596, |
|
"loss": 0.9722, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 0.00015757575757575757, |
|
"loss": 0.9546, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.00015555555555555556, |
|
"loss": 0.9177, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 0.00015353535353535353, |
|
"loss": 0.8817, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 0.00015151515151515152, |
|
"loss": 0.9303, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.0001494949494949495, |
|
"loss": 0.9237, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.00014747474747474747, |
|
"loss": 0.9136, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.9011, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.00014343434343434342, |
|
"loss": 0.9071, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.0001414141414141414, |
|
"loss": 0.9315, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.0001393939393939394, |
|
"loss": 0.8887, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 0.0001373737373737374, |
|
"loss": 0.9104, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.00013535353535353538, |
|
"loss": 0.9178, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.9236, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.00013131313131313133, |
|
"loss": 0.908, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 0.00012929292929292932, |
|
"loss": 0.9268, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 0.00012727272727272728, |
|
"loss": 0.8726, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.00012525252525252527, |
|
"loss": 0.9305, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 0.00012323232323232323, |
|
"loss": 0.9196, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 0.00012121212121212122, |
|
"loss": 0.8803, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.00011919191919191919, |
|
"loss": 0.8714, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.00011717171717171717, |
|
"loss": 0.903, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 0.00011515151515151516, |
|
"loss": 0.8812, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 0.00011313131313131313, |
|
"loss": 0.8887, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 0.8708, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.8859, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 0.00010707070707070708, |
|
"loss": 0.922, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.00010505050505050507, |
|
"loss": 0.8849, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 0.00010303030303030303, |
|
"loss": 0.8729, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 0.00010101010101010102, |
|
"loss": 0.8851, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 9.8989898989899e-05, |
|
"loss": 0.9178, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 9.696969696969698e-05, |
|
"loss": 0.8472, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 9.494949494949495e-05, |
|
"loss": 0.8691, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 9.292929292929293e-05, |
|
"loss": 0.8663, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.8658, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.8641, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 8.686868686868688e-05, |
|
"loss": 0.8331, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 8.484848484848486e-05, |
|
"loss": 0.8843, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 8.282828282828283e-05, |
|
"loss": 0.8753, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 8.080808080808081e-05, |
|
"loss": 0.8589, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 7.878787878787879e-05, |
|
"loss": 0.8395, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 7.676767676767676e-05, |
|
"loss": 0.852, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 7.474747474747475e-05, |
|
"loss": 0.8413, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.866, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"learning_rate": 7.07070707070707e-05, |
|
"loss": 0.8244, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"learning_rate": 6.86868686868687e-05, |
|
"loss": 0.79, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.7924, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 6.464646464646466e-05, |
|
"loss": 0.8056, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 6.262626262626264e-05, |
|
"loss": 0.7935, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 6.060606060606061e-05, |
|
"loss": 0.8212, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"learning_rate": 5.858585858585859e-05, |
|
"loss": 0.7993, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"learning_rate": 5.6565656565656563e-05, |
|
"loss": 0.8074, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 0.8426, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 5.2525252525252536e-05, |
|
"loss": 0.796, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"learning_rate": 5.050505050505051e-05, |
|
"loss": 0.783, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 4.848484848484849e-05, |
|
"loss": 0.801, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 4.6464646464646464e-05, |
|
"loss": 0.8028, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.7935, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 4.242424242424243e-05, |
|
"loss": 0.7957, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"learning_rate": 4.0404040404040405e-05, |
|
"loss": 0.8139, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 3.838383838383838e-05, |
|
"loss": 0.7718, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.8142, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 3.434343434343435e-05, |
|
"loss": 0.8083, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"learning_rate": 3.232323232323233e-05, |
|
"loss": 0.7984, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 0.7968, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 2.8282828282828282e-05, |
|
"loss": 0.7945, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"learning_rate": 2.6262626262626268e-05, |
|
"loss": 0.803, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"learning_rate": 2.4242424242424244e-05, |
|
"loss": 0.8167, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.7801, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 2.0202020202020203e-05, |
|
"loss": 0.8059, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.7927, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"learning_rate": 1.6161616161616165e-05, |
|
"loss": 0.7962, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 1.4141414141414141e-05, |
|
"loss": 0.7893, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 1.2121212121212122e-05, |
|
"loss": 0.7965, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"learning_rate": 1.0101010101010101e-05, |
|
"loss": 0.7817, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"learning_rate": 8.080808080808082e-06, |
|
"loss": 0.8159, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 6.060606060606061e-06, |
|
"loss": 0.7822, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 4.040404040404041e-06, |
|
"loss": 0.7937, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 2.0202020202020206e-06, |
|
"loss": 0.8044, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 0.0, |
|
"loss": 0.8113, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"step": 500, |
|
"total_flos": 1.3764276825690931e+17, |
|
"train_loss": 0.8930109763145446, |
|
"train_runtime": 25972.5576, |
|
"train_samples_per_second": 1.232, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 500, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"total_flos": 1.3764276825690931e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|