|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.2650056625141564, |
|
"eval_steps": 20, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09060022650056625, |
|
"grad_norm": 4.824601173400879, |
|
"learning_rate": 1.9393939393939395e-05, |
|
"loss": 6.3414, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09060022650056625, |
|
"eval_loss": 5.462944984436035, |
|
"eval_runtime": 169.7392, |
|
"eval_samples_per_second": 2.327, |
|
"eval_steps_per_second": 0.583, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1812004530011325, |
|
"grad_norm": 3.5273354053497314, |
|
"learning_rate": 1.8585858585858588e-05, |
|
"loss": 4.9204, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1812004530011325, |
|
"eval_loss": 4.40663480758667, |
|
"eval_runtime": 169.7659, |
|
"eval_samples_per_second": 2.327, |
|
"eval_steps_per_second": 0.583, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2718006795016987, |
|
"grad_norm": 2.5483546257019043, |
|
"learning_rate": 1.7777777777777777e-05, |
|
"loss": 4.0241, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2718006795016987, |
|
"eval_loss": 3.6545450687408447, |
|
"eval_runtime": 169.7082, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.583, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.362400906002265, |
|
"grad_norm": 1.4762500524520874, |
|
"learning_rate": 1.6969696969696972e-05, |
|
"loss": 3.4114, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.362400906002265, |
|
"eval_loss": 3.1997323036193848, |
|
"eval_runtime": 169.459, |
|
"eval_samples_per_second": 2.331, |
|
"eval_steps_per_second": 0.584, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.45300113250283125, |
|
"grad_norm": 1.2465909719467163, |
|
"learning_rate": 1.616161616161616e-05, |
|
"loss": 3.0527, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.45300113250283125, |
|
"eval_loss": 2.9332973957061768, |
|
"eval_runtime": 169.4702, |
|
"eval_samples_per_second": 2.331, |
|
"eval_steps_per_second": 0.584, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5436013590033975, |
|
"grad_norm": 1.2837079763412476, |
|
"learning_rate": 1.5353535353535354e-05, |
|
"loss": 2.8401, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5436013590033975, |
|
"eval_loss": 2.765261173248291, |
|
"eval_runtime": 169.5654, |
|
"eval_samples_per_second": 2.329, |
|
"eval_steps_per_second": 0.584, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6342015855039638, |
|
"grad_norm": 1.069353699684143, |
|
"learning_rate": 1.4545454545454546e-05, |
|
"loss": 2.7202, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6342015855039638, |
|
"eval_loss": 2.654095411300659, |
|
"eval_runtime": 169.5632, |
|
"eval_samples_per_second": 2.33, |
|
"eval_steps_per_second": 0.584, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.72480181200453, |
|
"grad_norm": 1.0665814876556396, |
|
"learning_rate": 1.3737373737373739e-05, |
|
"loss": 2.605, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.72480181200453, |
|
"eval_loss": 2.576014995574951, |
|
"eval_runtime": 169.9075, |
|
"eval_samples_per_second": 2.325, |
|
"eval_steps_per_second": 0.583, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8154020385050963, |
|
"grad_norm": 1.076709508895874, |
|
"learning_rate": 1.2929292929292931e-05, |
|
"loss": 2.5533, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8154020385050963, |
|
"eval_loss": 2.519667148590088, |
|
"eval_runtime": 169.6071, |
|
"eval_samples_per_second": 2.329, |
|
"eval_steps_per_second": 0.584, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9060022650056625, |
|
"grad_norm": 1.0686030387878418, |
|
"learning_rate": 1.2121212121212122e-05, |
|
"loss": 2.5004, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9060022650056625, |
|
"eval_loss": 2.4773340225219727, |
|
"eval_runtime": 169.6567, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.584, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9966024915062288, |
|
"grad_norm": 1.1253015995025635, |
|
"learning_rate": 1.1313131313131314e-05, |
|
"loss": 2.4613, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9966024915062288, |
|
"eval_loss": 2.444694995880127, |
|
"eval_runtime": 169.7657, |
|
"eval_samples_per_second": 2.327, |
|
"eval_steps_per_second": 0.583, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.087202718006795, |
|
"grad_norm": 1.1171083450317383, |
|
"learning_rate": 1.0505050505050507e-05, |
|
"loss": 2.4456, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.087202718006795, |
|
"eval_loss": 2.4184916019439697, |
|
"eval_runtime": 169.7027, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.583, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.1778029445073612, |
|
"grad_norm": 1.1789259910583496, |
|
"learning_rate": 9.696969696969698e-06, |
|
"loss": 2.4151, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.1778029445073612, |
|
"eval_loss": 2.397007465362549, |
|
"eval_runtime": 169.6356, |
|
"eval_samples_per_second": 2.329, |
|
"eval_steps_per_second": 0.584, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.2684031710079275, |
|
"grad_norm": 1.1507657766342163, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 2.3943, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.2684031710079275, |
|
"eval_loss": 2.3794679641723633, |
|
"eval_runtime": 169.6152, |
|
"eval_samples_per_second": 2.329, |
|
"eval_steps_per_second": 0.584, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.3590033975084936, |
|
"grad_norm": 1.1052231788635254, |
|
"learning_rate": 8.08080808080808e-06, |
|
"loss": 2.3621, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.3590033975084936, |
|
"eval_loss": 2.3650312423706055, |
|
"eval_runtime": 169.6247, |
|
"eval_samples_per_second": 2.329, |
|
"eval_steps_per_second": 0.584, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.44960362400906, |
|
"grad_norm": 1.156396508216858, |
|
"learning_rate": 7.272727272727273e-06, |
|
"loss": 2.3475, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.44960362400906, |
|
"eval_loss": 2.3532702922821045, |
|
"eval_runtime": 169.6216, |
|
"eval_samples_per_second": 2.329, |
|
"eval_steps_per_second": 0.584, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.5402038505096263, |
|
"grad_norm": 1.1324844360351562, |
|
"learning_rate": 6.464646464646466e-06, |
|
"loss": 2.339, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5402038505096263, |
|
"eval_loss": 2.342968702316284, |
|
"eval_runtime": 169.6943, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.583, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.6308040770101924, |
|
"grad_norm": 1.2476097345352173, |
|
"learning_rate": 5.656565656565657e-06, |
|
"loss": 2.3247, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.6308040770101924, |
|
"eval_loss": 2.334242343902588, |
|
"eval_runtime": 169.8944, |
|
"eval_samples_per_second": 2.325, |
|
"eval_steps_per_second": 0.583, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.721404303510759, |
|
"grad_norm": 1.1416445970535278, |
|
"learning_rate": 4.848484848484849e-06, |
|
"loss": 2.3335, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.721404303510759, |
|
"eval_loss": 2.326986789703369, |
|
"eval_runtime": 169.8042, |
|
"eval_samples_per_second": 2.326, |
|
"eval_steps_per_second": 0.583, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.812004530011325, |
|
"grad_norm": 1.1464892625808716, |
|
"learning_rate": 4.04040404040404e-06, |
|
"loss": 2.3007, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.812004530011325, |
|
"eval_loss": 2.321030378341675, |
|
"eval_runtime": 169.8884, |
|
"eval_samples_per_second": 2.325, |
|
"eval_steps_per_second": 0.583, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.9026047565118913, |
|
"grad_norm": 1.1699483394622803, |
|
"learning_rate": 3.232323232323233e-06, |
|
"loss": 2.3095, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.9026047565118913, |
|
"eval_loss": 2.3161513805389404, |
|
"eval_runtime": 169.7467, |
|
"eval_samples_per_second": 2.327, |
|
"eval_steps_per_second": 0.583, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.9932049830124576, |
|
"grad_norm": 1.1978620290756226, |
|
"learning_rate": 2.4242424242424244e-06, |
|
"loss": 2.3093, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.9932049830124576, |
|
"eval_loss": 2.312627077102661, |
|
"eval_runtime": 170.0514, |
|
"eval_samples_per_second": 2.323, |
|
"eval_steps_per_second": 0.582, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.0838052095130237, |
|
"grad_norm": 1.23793363571167, |
|
"learning_rate": 1.6161616161616164e-06, |
|
"loss": 2.327, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.0838052095130237, |
|
"eval_loss": 2.309922933578491, |
|
"eval_runtime": 169.6935, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.583, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.17440543601359, |
|
"grad_norm": 1.2800756692886353, |
|
"learning_rate": 8.080808080808082e-07, |
|
"loss": 2.3005, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.17440543601359, |
|
"eval_loss": 2.3085172176361084, |
|
"eval_runtime": 169.7305, |
|
"eval_samples_per_second": 2.327, |
|
"eval_steps_per_second": 0.583, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.2650056625141564, |
|
"grad_norm": 1.1457535028457642, |
|
"learning_rate": 0.0, |
|
"loss": 2.2871, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.2650056625141564, |
|
"eval_loss": 2.3081250190734863, |
|
"eval_runtime": 169.7694, |
|
"eval_samples_per_second": 2.327, |
|
"eval_steps_per_second": 0.583, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.75343643596161e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|