|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.64, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 0.3705686628818512, |
|
"learning_rate": 0.00019987329060020616, |
|
"loss": 2.7276, |
|
"num_input_tokens_seen": 78104, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.28178921341896057, |
|
"learning_rate": 0.00019949348350626456, |
|
"loss": 2.6452, |
|
"num_input_tokens_seen": 157384, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.1864372342824936, |
|
"learning_rate": 0.00019886154122075343, |
|
"loss": 2.5708, |
|
"num_input_tokens_seen": 240256, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.1788010597229004, |
|
"learning_rate": 0.00019797906520422677, |
|
"loss": 2.5831, |
|
"num_input_tokens_seen": 319032, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.23142138123512268, |
|
"learning_rate": 0.00019684829181681234, |
|
"loss": 2.6138, |
|
"num_input_tokens_seen": 397992, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.21496976912021637, |
|
"learning_rate": 0.00019547208665085457, |
|
"loss": 2.5461, |
|
"num_input_tokens_seen": 479904, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.2536880671977997, |
|
"learning_rate": 0.0001938539372689649, |
|
"loss": 2.5797, |
|
"num_input_tokens_seen": 555448, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.39764249324798584, |
|
"learning_rate": 0.00019199794436588243, |
|
"loss": 2.5482, |
|
"num_input_tokens_seen": 630888, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.29237431287765503, |
|
"learning_rate": 0.00018990881137654258, |
|
"loss": 2.4726, |
|
"num_input_tokens_seen": 709456, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.24707558751106262, |
|
"learning_rate": 0.0001875918325566888, |
|
"loss": 2.5557, |
|
"num_input_tokens_seen": 788504, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.24359019100666046, |
|
"learning_rate": 0.00018505287956623297, |
|
"loss": 2.5357, |
|
"num_input_tokens_seen": 864984, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.23572562634944916, |
|
"learning_rate": 0.00018229838658936564, |
|
"loss": 2.4798, |
|
"num_input_tokens_seen": 939136, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.41712337732315063, |
|
"learning_rate": 0.00017933533402912354, |
|
"loss": 2.5619, |
|
"num_input_tokens_seen": 1018024, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.2777227759361267, |
|
"learning_rate": 0.00017617123081773591, |
|
"loss": 2.5334, |
|
"num_input_tokens_seen": 1095072, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.25098717212677, |
|
"learning_rate": 0.00017281409538757883, |
|
"loss": 2.5199, |
|
"num_input_tokens_seen": 1172824, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.24321608245372772, |
|
"learning_rate": 0.00016927243535095997, |
|
"loss": 2.4897, |
|
"num_input_tokens_seen": 1244688, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.25580549240112305, |
|
"learning_rate": 0.0001655552259402295, |
|
"loss": 2.513, |
|
"num_input_tokens_seen": 1321104, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.24990542232990265, |
|
"learning_rate": 0.00016167188726285434, |
|
"loss": 2.5533, |
|
"num_input_tokens_seen": 1402456, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.26301833987236023, |
|
"learning_rate": 0.00015763226042909455, |
|
"loss": 2.5297, |
|
"num_input_tokens_seen": 1483648, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.23519474267959595, |
|
"learning_rate": 0.0001534465826127801, |
|
"loss": 2.518, |
|
"num_input_tokens_seen": 1560608, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.26162955164909363, |
|
"learning_rate": 0.00014912546110838775, |
|
"loss": 2.5537, |
|
"num_input_tokens_seen": 1640728, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.2565276026725769, |
|
"learning_rate": 0.00014467984645016258, |
|
"loss": 2.489, |
|
"num_input_tokens_seen": 1722440, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.3414108455181122, |
|
"learning_rate": 0.00014012100466140578, |
|
"loss": 2.5448, |
|
"num_input_tokens_seen": 1805984, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.2590246796607971, |
|
"learning_rate": 0.00013546048870425356, |
|
"loss": 2.4148, |
|
"num_input_tokens_seen": 1890592, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2765924036502838, |
|
"learning_rate": 0.00013071010920229909, |
|
"loss": 2.5512, |
|
"num_input_tokens_seen": 1971272, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.2582058906555176, |
|
"learning_rate": 0.00012588190451025207, |
|
"loss": 2.5264, |
|
"num_input_tokens_seen": 2053352, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.2416098713874817, |
|
"learning_rate": 0.00012098811020648475, |
|
"loss": 2.5525, |
|
"num_input_tokens_seen": 2132624, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.3602483868598938, |
|
"learning_rate": 0.00011604112808577603, |
|
"loss": 2.5029, |
|
"num_input_tokens_seen": 2210376, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.2507006824016571, |
|
"learning_rate": 0.000111053494730832, |
|
"loss": 2.5546, |
|
"num_input_tokens_seen": 2292896, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.32158130407333374, |
|
"learning_rate": 0.00010603784974222861, |
|
"loss": 2.5431, |
|
"num_input_tokens_seen": 2376432, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.3036045730113983, |
|
"learning_rate": 0.00010100690370728755, |
|
"loss": 2.4882, |
|
"num_input_tokens_seen": 2451848, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.31646254658699036, |
|
"learning_rate": 9.597340598905852e-05, |
|
"loss": 2.4319, |
|
"num_input_tokens_seen": 2532816, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.2881500720977783, |
|
"learning_rate": 9.095011241703623e-05, |
|
"loss": 2.4688, |
|
"num_input_tokens_seen": 2616160, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.27659812569618225, |
|
"learning_rate": 8.594975296149076e-05, |
|
"loss": 2.5149, |
|
"num_input_tokens_seen": 2694864, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.4290514290332794, |
|
"learning_rate": 8.098499947332934e-05, |
|
"loss": 2.4754, |
|
"num_input_tokens_seen": 2770464, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.28775912523269653, |
|
"learning_rate": 7.606843357124426e-05, |
|
"loss": 2.4875, |
|
"num_input_tokens_seen": 2852888, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.2709747850894928, |
|
"learning_rate": 7.121251475752539e-05, |
|
"loss": 2.563, |
|
"num_input_tokens_seen": 2933520, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.3473074734210968, |
|
"learning_rate": 6.642954884333955e-05, |
|
"loss": 2.4903, |
|
"num_input_tokens_seen": 3011656, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.28855767846107483, |
|
"learning_rate": 6.173165676349103e-05, |
|
"loss": 2.5325, |
|
"num_input_tokens_seen": 3088960, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.27710187435150146, |
|
"learning_rate": 5.713074385969457e-05, |
|
"loss": 2.4344, |
|
"num_input_tokens_seen": 3165160, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 312, |
|
"num_input_tokens_seen": 3165160, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.30881325463765e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|