{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 14.646464646464647,
  "eval_steps": 20,
  "global_step": 580,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.25252525252525254,
      "grad_norm": 13.126054763793945,
      "learning_rate": 1.6949152542372882e-06,
      "loss": 0.8683,
      "step": 10
    },
    {
      "epoch": 0.5050505050505051,
      "grad_norm": 17.31291961669922,
      "learning_rate": 3.3898305084745763e-06,
      "loss": 0.7248,
      "step": 20
    },
    {
      "epoch": 0.7575757575757576,
      "grad_norm": 8.053046226501465,
      "learning_rate": 5.084745762711865e-06,
      "loss": 0.6134,
      "step": 30
    },
    {
      "epoch": 1.0101010101010102,
      "grad_norm": 9.220598220825195,
      "learning_rate": 6.779661016949153e-06,
      "loss": 0.5526,
      "step": 40
    },
    {
      "epoch": 1.2626262626262625,
      "grad_norm": 27.968978881835938,
      "learning_rate": 8.47457627118644e-06,
      "loss": 0.4393,
      "step": 50
    },
    {
      "epoch": 1.5151515151515151,
      "grad_norm": 5.585519313812256,
      "learning_rate": 9.980988593155894e-06,
      "loss": 0.34,
      "step": 60
    },
    {
      "epoch": 1.7676767676767677,
      "grad_norm": 4.464208602905273,
      "learning_rate": 9.790874524714829e-06,
      "loss": 0.3483,
      "step": 70
    },
    {
      "epoch": 2.0202020202020203,
      "grad_norm": 11.554718017578125,
      "learning_rate": 9.600760456273765e-06,
      "loss": 0.3142,
      "step": 80
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 6.658814907073975,
      "learning_rate": 9.4106463878327e-06,
      "loss": 0.2071,
      "step": 90
    },
    {
      "epoch": 2.525252525252525,
      "grad_norm": 6.011457443237305,
      "learning_rate": 9.220532319391637e-06,
      "loss": 0.2287,
      "step": 100
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 9.19509220123291,
      "learning_rate": 9.030418250950572e-06,
      "loss": 0.2189,
      "step": 110
    },
    {
      "epoch": 3.0303030303030303,
      "grad_norm": 4.081284523010254,
      "learning_rate": 8.840304182509506e-06,
      "loss": 0.1807,
      "step": 120
    },
    {
      "epoch": 3.282828282828283,
      "grad_norm": 4.492301940917969,
      "learning_rate": 8.650190114068441e-06,
      "loss": 0.1727,
      "step": 130
    },
    {
      "epoch": 3.5353535353535355,
      "grad_norm": 5.772425174713135,
      "learning_rate": 8.460076045627376e-06,
      "loss": 0.128,
      "step": 140
    },
    {
      "epoch": 3.787878787878788,
      "grad_norm": 24.84961700439453,
      "learning_rate": 8.269961977186313e-06,
      "loss": 0.1993,
      "step": 150
    },
    {
      "epoch": 4.040404040404041,
      "grad_norm": 2.998711585998535,
      "learning_rate": 8.079847908745247e-06,
      "loss": 0.1502,
      "step": 160
    },
    {
      "epoch": 4.292929292929293,
      "grad_norm": 6.577826499938965,
      "learning_rate": 7.889733840304184e-06,
      "loss": 0.1225,
      "step": 170
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 8.288976669311523,
      "learning_rate": 7.699619771863119e-06,
      "loss": 0.1113,
      "step": 180
    },
    {
      "epoch": 4.797979797979798,
      "grad_norm": 8.804610252380371,
      "learning_rate": 7.509505703422054e-06,
      "loss": 0.1692,
      "step": 190
    },
    {
      "epoch": 5.05050505050505,
      "grad_norm": 5.5263752937316895,
      "learning_rate": 7.319391634980989e-06,
      "loss": 0.1364,
      "step": 200
    },
    {
      "epoch": 5.303030303030303,
      "grad_norm": 3.2428457736968994,
      "learning_rate": 7.129277566539925e-06,
      "loss": 0.0863,
      "step": 210
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 6.467748165130615,
      "learning_rate": 6.93916349809886e-06,
      "loss": 0.1099,
      "step": 220
    },
    {
      "epoch": 5.808080808080808,
      "grad_norm": 8.438054084777832,
      "learning_rate": 6.749049429657795e-06,
      "loss": 0.1037,
      "step": 230
    },
    {
      "epoch": 6.0606060606060606,
      "grad_norm": 2.7920687198638916,
      "learning_rate": 6.55893536121673e-06,
      "loss": 0.1054,
      "step": 240
    },
    {
      "epoch": 6.313131313131313,
      "grad_norm": 14.06596565246582,
      "learning_rate": 6.368821292775666e-06,
      "loss": 0.0708,
      "step": 250
    },
    {
      "epoch": 6.565656565656566,
      "grad_norm": 3.4552762508392334,
      "learning_rate": 6.1787072243346015e-06,
      "loss": 0.0963,
      "step": 260
    },
    {
      "epoch": 6.818181818181818,
      "grad_norm": 4.3161234855651855,
      "learning_rate": 5.988593155893536e-06,
      "loss": 0.1237,
      "step": 270
    },
    {
      "epoch": 7.070707070707071,
      "grad_norm": 3.9048163890838623,
      "learning_rate": 5.798479087452472e-06,
      "loss": 0.0868,
      "step": 280
    },
    {
      "epoch": 7.3232323232323235,
      "grad_norm": 8.73560905456543,
      "learning_rate": 5.608365019011407e-06,
      "loss": 0.0681,
      "step": 290
    },
    {
      "epoch": 7.575757575757576,
      "grad_norm": 3.1624324321746826,
      "learning_rate": 5.418250950570343e-06,
      "loss": 0.0912,
      "step": 300
    },
    {
      "epoch": 7.828282828282829,
      "grad_norm": 3.010721206665039,
      "learning_rate": 5.228136882129278e-06,
      "loss": 0.0618,
      "step": 310
    },
    {
      "epoch": 8.080808080808081,
      "grad_norm": 2.488093137741089,
      "learning_rate": 5.038022813688214e-06,
      "loss": 0.0484,
      "step": 320
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 44.87844467163086,
      "learning_rate": 4.847908745247149e-06,
      "loss": 0.0665,
      "step": 330
    },
    {
      "epoch": 8.585858585858587,
      "grad_norm": 5.052587032318115,
      "learning_rate": 4.657794676806084e-06,
      "loss": 0.0643,
      "step": 340
    },
    {
      "epoch": 8.83838383838384,
      "grad_norm": 1.7123836278915405,
      "learning_rate": 4.467680608365019e-06,
      "loss": 0.0575,
      "step": 350
    },
    {
      "epoch": 9.090909090909092,
      "grad_norm": 1.540806770324707,
      "learning_rate": 4.277566539923955e-06,
      "loss": 0.0669,
      "step": 360
    },
    {
      "epoch": 9.343434343434343,
      "grad_norm": 6.937175750732422,
      "learning_rate": 4.08745247148289e-06,
      "loss": 0.0502,
      "step": 370
    },
    {
      "epoch": 9.595959595959595,
      "grad_norm": 1.7217215299606323,
      "learning_rate": 3.897338403041825e-06,
      "loss": 0.0586,
      "step": 380
    },
    {
      "epoch": 9.848484848484848,
      "grad_norm": 2.884972095489502,
      "learning_rate": 3.707224334600761e-06,
      "loss": 0.0691,
      "step": 390
    },
    {
      "epoch": 10.1010101010101,
      "grad_norm": 3.5889267921447754,
      "learning_rate": 3.517110266159696e-06,
      "loss": 0.0717,
      "step": 400
    },
    {
      "epoch": 10.353535353535353,
      "grad_norm": 3.033485174179077,
      "learning_rate": 3.3269961977186314e-06,
      "loss": 0.0491,
      "step": 410
    },
    {
      "epoch": 10.606060606060606,
      "grad_norm": 1.5832382440567017,
      "learning_rate": 3.136882129277567e-06,
      "loss": 0.0457,
      "step": 420
    },
    {
      "epoch": 10.858585858585858,
      "grad_norm": 0.04450301453471184,
      "learning_rate": 2.9467680608365023e-06,
      "loss": 0.0687,
      "step": 430
    },
    {
      "epoch": 11.11111111111111,
      "grad_norm": 4.540256977081299,
      "learning_rate": 2.756653992395438e-06,
      "loss": 0.0593,
      "step": 440
    },
    {
      "epoch": 11.363636363636363,
      "grad_norm": 4.446552753448486,
      "learning_rate": 2.5665399239543728e-06,
      "loss": 0.0404,
      "step": 450
    },
    {
      "epoch": 11.616161616161616,
      "grad_norm": 1.007643222808838,
      "learning_rate": 2.3764258555133084e-06,
      "loss": 0.0402,
      "step": 460
    },
    {
      "epoch": 11.868686868686869,
      "grad_norm": 0.96592116355896,
      "learning_rate": 2.1863117870722437e-06,
      "loss": 0.0448,
      "step": 470
    },
    {
      "epoch": 12.121212121212121,
      "grad_norm": 6.914277076721191,
      "learning_rate": 1.996197718631179e-06,
      "loss": 0.0536,
      "step": 480
    },
    {
      "epoch": 12.373737373737374,
      "grad_norm": 4.67967414855957,
      "learning_rate": 1.8060836501901142e-06,
      "loss": 0.0487,
      "step": 490
    },
    {
      "epoch": 12.626262626262626,
      "grad_norm": 2.039752721786499,
      "learning_rate": 1.6159695817490494e-06,
      "loss": 0.0468,
      "step": 500
    },
    {
      "epoch": 12.878787878787879,
      "grad_norm": 7.157741546630859,
      "learning_rate": 1.4258555133079848e-06,
      "loss": 0.0317,
      "step": 510
    },
    {
      "epoch": 13.131313131313131,
      "grad_norm": 4.240894317626953,
      "learning_rate": 1.2357414448669203e-06,
      "loss": 0.0424,
      "step": 520
    },
    {
      "epoch": 13.383838383838384,
      "grad_norm": 1.8747172355651855,
      "learning_rate": 1.0456273764258558e-06,
      "loss": 0.0396,
      "step": 530
    },
    {
      "epoch": 13.636363636363637,
      "grad_norm": 1.3047974109649658,
      "learning_rate": 8.555133079847909e-07,
      "loss": 0.0403,
      "step": 540
    },
    {
      "epoch": 13.88888888888889,
      "grad_norm": 1.6948643922805786,
      "learning_rate": 6.653992395437263e-07,
      "loss": 0.0372,
      "step": 550
    },
    {
      "epoch": 14.141414141414142,
      "grad_norm": 3.19494891166687,
      "learning_rate": 4.752851711026616e-07,
      "loss": 0.0422,
      "step": 560
    },
    {
      "epoch": 14.393939393939394,
      "grad_norm": 0.13894398510456085,
      "learning_rate": 2.85171102661597e-07,
      "loss": 0.0292,
      "step": 570
    },
    {
      "epoch": 14.646464646464647,
      "grad_norm": 0.04802278056740761,
      "learning_rate": 9.505703422053233e-08,
      "loss": 0.0377,
      "step": 580
    }
  ],
  "logging_steps": 10,
  "max_steps": 585,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5658543749477888e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}