{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2675227394328518,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 7.3019086777975994,
      "learning_rate": 5e-06,
      "loss": 0.6939,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 5.150989333266983,
      "learning_rate": 1e-05,
      "loss": 0.7167,
      "step": 20
    },
    {
      "epoch": 0.02,
      "grad_norm": 3.640558809610037,
      "learning_rate": 1.5e-05,
      "loss": 0.5683,
      "step": 30
    },
    {
      "epoch": 0.02,
      "grad_norm": 7.517999397731128,
      "learning_rate": 2e-05,
      "loss": 0.5472,
      "step": 40
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.9687061679463425,
      "learning_rate": 2.5e-05,
      "loss": 0.4439,
      "step": 50
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.643479206523606,
      "learning_rate": 3e-05,
      "loss": 0.2486,
      "step": 60
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.2754773308695095,
      "learning_rate": 3.5e-05,
      "loss": 0.2217,
      "step": 70
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.7144730049127388,
      "learning_rate": 4e-05,
      "loss": 0.169,
      "step": 80
    },
    {
      "epoch": 0.05,
      "grad_norm": 3.4702829704135114,
      "learning_rate": 4.5e-05,
      "loss": 0.1994,
      "step": 90
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.3985127340985621,
      "learning_rate": 5e-05,
      "loss": 0.1612,
      "step": 100
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.375992184386137,
      "learning_rate": 4.982758620689655e-05,
      "loss": 0.1576,
      "step": 110
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.9528635753013313,
      "learning_rate": 4.9655172413793107e-05,
      "loss": 0.1393,
      "step": 120
    },
    {
      "epoch": 0.07,
      "grad_norm": 4.075169010198401,
      "learning_rate": 4.9482758620689655e-05,
      "loss": 0.1969,
      "step": 130
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.0953991165751207,
      "learning_rate": 4.931034482758621e-05,
      "loss": 0.1294,
      "step": 140
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.942660591044849,
      "learning_rate": 4.913793103448276e-05,
      "loss": 0.1306,
      "step": 150
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.1508904015728345,
      "learning_rate": 4.896551724137931e-05,
      "loss": 0.1526,
      "step": 160
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.9862795165358471,
      "learning_rate": 4.8793103448275864e-05,
      "loss": 0.1186,
      "step": 170
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.633061833817991,
      "learning_rate": 4.862068965517241e-05,
      "loss": 0.1457,
      "step": 180
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.8017052368446178,
      "learning_rate": 4.844827586206897e-05,
      "loss": 0.1234,
      "step": 190
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.1560694100709803,
      "learning_rate": 4.827586206896552e-05,
      "loss": 0.1346,
      "step": 200
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.5737689267430703,
      "learning_rate": 4.810344827586207e-05,
      "loss": 0.116,
      "step": 210
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.957864677854788,
      "learning_rate": 4.793103448275863e-05,
      "loss": 0.1692,
      "step": 220
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.215039223521855,
      "learning_rate": 4.7758620689655176e-05,
      "loss": 0.1245,
      "step": 230
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.370517239734168,
      "learning_rate": 4.7586206896551725e-05,
      "loss": 0.1476,
      "step": 240
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.7341334563022532,
      "learning_rate": 4.741379310344828e-05,
      "loss": 0.1236,
      "step": 250
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.5994298113068974,
      "learning_rate": 4.724137931034483e-05,
      "loss": 0.1161,
      "step": 260
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.5317433190951963,
      "learning_rate": 4.7068965517241385e-05,
      "loss": 0.1035,
      "step": 270
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.191977732539556,
      "learning_rate": 4.689655172413793e-05,
      "loss": 0.1427,
      "step": 280
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.6038667570691656,
      "learning_rate": 4.672413793103448e-05,
      "loss": 0.1225,
      "step": 290
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.577572731831179,
      "learning_rate": 4.655172413793104e-05,
      "loss": 0.1399,
      "step": 300
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.6199241001441385,
      "learning_rate": 4.6379310344827586e-05,
      "loss": 0.1242,
      "step": 310
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.236577821186196,
      "learning_rate": 4.6206896551724135e-05,
      "loss": 0.1656,
      "step": 320
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.7294690605254757,
      "learning_rate": 4.603448275862069e-05,
      "loss": 0.1382,
      "step": 330
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.196527516378511,
      "learning_rate": 4.586206896551724e-05,
      "loss": 0.1257,
      "step": 340
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.1057444340221463,
      "learning_rate": 4.5689655172413794e-05,
      "loss": 0.1238,
      "step": 350
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.5409556870328274,
      "learning_rate": 4.551724137931035e-05,
      "loss": 0.1383,
      "step": 360
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.5204083616874053,
      "learning_rate": 4.53448275862069e-05,
      "loss": 0.1068,
      "step": 370
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.3557725298931746,
      "learning_rate": 4.5172413793103454e-05,
      "loss": 0.1071,
      "step": 380
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.2601538460418644,
      "learning_rate": 4.5e-05,
      "loss": 0.125,
      "step": 390
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.9031725385762286,
      "learning_rate": 4.482758620689655e-05,
      "loss": 0.0991,
      "step": 400
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.3946050262183123,
      "learning_rate": 4.465517241379311e-05,
      "loss": 0.1156,
      "step": 410
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.097644875106397,
      "learning_rate": 4.4482758620689656e-05,
      "loss": 0.1366,
      "step": 420
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.37846299019108,
      "learning_rate": 4.431034482758621e-05,
      "loss": 0.126,
      "step": 430
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.8340152889320331,
      "learning_rate": 4.413793103448276e-05,
      "loss": 0.1066,
      "step": 440
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.8304505611337867,
      "learning_rate": 4.396551724137931e-05,
      "loss": 0.0868,
      "step": 450
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.550196490898523,
      "learning_rate": 4.3793103448275864e-05,
      "loss": 0.1286,
      "step": 460
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.176112247796248,
      "learning_rate": 4.362068965517241e-05,
      "loss": 0.1206,
      "step": 470
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.6589263894091213,
      "learning_rate": 4.344827586206897e-05,
      "loss": 0.1008,
      "step": 480
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.8349611508902046,
      "learning_rate": 4.327586206896552e-05,
      "loss": 0.1198,
      "step": 490
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.1218964920724126,
      "learning_rate": 4.3103448275862066e-05,
      "loss": 0.1166,
      "step": 500
    },
    {
      "epoch": 0.27,
      "eval_loss": 0.6078919172286987,
      "eval_runtime": 116.8471,
      "eval_samples_per_second": 11.288,
      "eval_steps_per_second": 2.824,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 3000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "total_flos": 14449508352000.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}