|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.42182968626417083, |
|
"eval_steps": 50, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010545742156604272, |
|
"grad_norm": 0.8949686288833618, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.0803, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.021091484313208543, |
|
"grad_norm": 0.8410115838050842, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.0584, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03163722646981281, |
|
"grad_norm": 0.8865960836410522, |
|
"learning_rate": 6e-06, |
|
"loss": 2.0496, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.042182968626417086, |
|
"grad_norm": 0.7669851779937744, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.0115, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.052728710783021354, |
|
"grad_norm": 0.7927244901657104, |
|
"learning_rate": 1e-05, |
|
"loss": 1.999, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.052728710783021354, |
|
"eval_loss": 1.9660409688949585, |
|
"eval_runtime": 1845.8031, |
|
"eval_samples_per_second": 1.827, |
|
"eval_steps_per_second": 0.457, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06327445293962562, |
|
"grad_norm": 0.6458373069763184, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.9319, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07382019509622989, |
|
"grad_norm": 0.566554069519043, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 1.9084, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08436593725283417, |
|
"grad_norm": 0.4990901052951813, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.8971, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09491167940943844, |
|
"grad_norm": 0.4458348751068115, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.8216, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10545742156604271, |
|
"grad_norm": 0.4464481472969055, |
|
"learning_rate": 2e-05, |
|
"loss": 1.8433, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10545742156604271, |
|
"eval_loss": 1.8134711980819702, |
|
"eval_runtime": 1846.165, |
|
"eval_samples_per_second": 1.826, |
|
"eval_steps_per_second": 0.457, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11600316372264698, |
|
"grad_norm": 0.4543227255344391, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 1.8193, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12654890587925124, |
|
"grad_norm": 0.4522910416126251, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.7737, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13709464803585553, |
|
"grad_norm": 0.4522746801376343, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.7885, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14764039019245978, |
|
"grad_norm": 0.44632086157798767, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.7669, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.15818613234906406, |
|
"grad_norm": 0.4553607404232025, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7605, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15818613234906406, |
|
"eval_loss": 1.7562175989151, |
|
"eval_runtime": 1836.9664, |
|
"eval_samples_per_second": 1.836, |
|
"eval_steps_per_second": 0.459, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16873187450566834, |
|
"grad_norm": 0.4852316975593567, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.7367, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1792776166622726, |
|
"grad_norm": 0.49629154801368713, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.7447, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18982335881887688, |
|
"grad_norm": 0.5253108739852905, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.7378, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.20036910097548116, |
|
"grad_norm": 0.6104539036750793, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.737, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.21091484313208542, |
|
"grad_norm": 0.564102828502655, |
|
"learning_rate": 4e-05, |
|
"loss": 1.7325, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21091484313208542, |
|
"eval_loss": 1.727364420890808, |
|
"eval_runtime": 1836.4408, |
|
"eval_samples_per_second": 1.836, |
|
"eval_steps_per_second": 0.459, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2214605852886897, |
|
"grad_norm": 0.6158842444419861, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.7168, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.23200632744529395, |
|
"grad_norm": 0.699800431728363, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 1.738, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24255206960189823, |
|
"grad_norm": 0.6008381843566895, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 1.7256, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2530978117585025, |
|
"grad_norm": 0.633844792842865, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.7033, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2636435539151068, |
|
"grad_norm": 0.6631755232810974, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6999, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2636435539151068, |
|
"eval_loss": 1.7032877206802368, |
|
"eval_runtime": 1836.0737, |
|
"eval_samples_per_second": 1.837, |
|
"eval_steps_per_second": 0.459, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.27418929607171105, |
|
"grad_norm": 0.6652688980102539, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 1.6866, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2847350382283153, |
|
"grad_norm": 0.6938503980636597, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.7029, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.29528078038491956, |
|
"grad_norm": 0.686392605304718, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.6989, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.30582652254152387, |
|
"grad_norm": 0.7344717979431152, |
|
"learning_rate": 5.8e-05, |
|
"loss": 1.6685, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3163722646981281, |
|
"grad_norm": 0.6960188150405884, |
|
"learning_rate": 6e-05, |
|
"loss": 1.6705, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3163722646981281, |
|
"eval_loss": 1.6844133138656616, |
|
"eval_runtime": 1836.0829, |
|
"eval_samples_per_second": 1.837, |
|
"eval_steps_per_second": 0.459, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3269180068547324, |
|
"grad_norm": 0.6903261542320251, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.6672, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3374637490113367, |
|
"grad_norm": 0.7028161883354187, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.6934, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.34800949116794094, |
|
"grad_norm": 0.7435672879219055, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.6712, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3585552333245452, |
|
"grad_norm": 0.7246424555778503, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.6608, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3691009754811495, |
|
"grad_norm": 0.6919660568237305, |
|
"learning_rate": 7e-05, |
|
"loss": 1.6568, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3691009754811495, |
|
"eval_loss": 1.6683924198150635, |
|
"eval_runtime": 1838.3493, |
|
"eval_samples_per_second": 1.834, |
|
"eval_steps_per_second": 0.459, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.37964671763775376, |
|
"grad_norm": 0.7590942978858948, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.6742, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.390192459794358, |
|
"grad_norm": 0.6980053186416626, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.6744, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4007382019509623, |
|
"grad_norm": 0.7112457752227783, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.648, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4112839441075666, |
|
"grad_norm": 0.7157771587371826, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.6634, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.42182968626417083, |
|
"grad_norm": 0.7440850138664246, |
|
"learning_rate": 8e-05, |
|
"loss": 1.6567, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.42182968626417083, |
|
"eval_loss": 1.6561506986618042, |
|
"eval_runtime": 1822.2187, |
|
"eval_samples_per_second": 1.85, |
|
"eval_steps_per_second": 0.463, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2844, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.274968196448256e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|