|
{
  "best_metric": 1.367624282836914,
  "best_model_checkpoint": "./results/checkpoint-11635",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 11635,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.21486892995272883,
      "grad_norm": 1.7943241596221924,
      "learning_rate": 4.7851310700472715e-05,
      "loss": 1.815,
      "step": 500
    },
    {
      "epoch": 0.42973785990545765,
      "grad_norm": 1.5547664165496826,
      "learning_rate": 4.570262140094543e-05,
      "loss": 1.7137,
      "step": 1000
    },
    {
      "epoch": 0.6446067898581865,
      "grad_norm": 1.829068899154663,
      "learning_rate": 4.355393210141814e-05,
      "loss": 1.6652,
      "step": 1500
    },
    {
      "epoch": 0.8594757198109153,
      "grad_norm": 1.6881290674209595,
      "learning_rate": 4.140524280189085e-05,
      "loss": 1.6365,
      "step": 2000
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.4293888807296753,
      "eval_runtime": 13.2899,
      "eval_samples_per_second": 352.072,
      "eval_steps_per_second": 2.784,
      "step": 2327
    },
    {
      "epoch": 1.0743446497636442,
      "grad_norm": 1.6054766178131104,
      "learning_rate": 3.9256553502363556e-05,
      "loss": 1.5857,
      "step": 2500
    },
    {
      "epoch": 1.289213579716373,
      "grad_norm": 1.550986647605896,
      "learning_rate": 3.7107864202836275e-05,
      "loss": 1.5097,
      "step": 3000
    },
    {
      "epoch": 1.504082509669102,
      "grad_norm": 1.5548337697982788,
      "learning_rate": 3.495917490330898e-05,
      "loss": 1.495,
      "step": 3500
    },
    {
      "epoch": 1.7189514396218306,
      "grad_norm": 1.5597646236419678,
      "learning_rate": 3.281048560378169e-05,
      "loss": 1.4887,
      "step": 4000
    },
    {
      "epoch": 1.9338203695745595,
      "grad_norm": 1.4021880626678467,
      "learning_rate": 3.066179630425441e-05,
      "loss": 1.4862,
      "step": 4500
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.390964150428772,
      "eval_runtime": 13.2972,
      "eval_samples_per_second": 351.878,
      "eval_steps_per_second": 2.783,
      "step": 4654
    },
    {
      "epoch": 2.1486892995272884,
      "grad_norm": 1.399411916732788,
      "learning_rate": 2.8513107004727117e-05,
      "loss": 1.4227,
      "step": 5000
    },
    {
      "epoch": 2.363558229480017,
      "grad_norm": 1.5368090867996216,
      "learning_rate": 2.636441770519983e-05,
      "loss": 1.3905,
      "step": 5500
    },
    {
      "epoch": 2.578427159432746,
      "grad_norm": 1.406023383140564,
      "learning_rate": 2.421572840567254e-05,
      "loss": 1.3967,
      "step": 6000
    },
    {
      "epoch": 2.793296089385475,
      "grad_norm": 1.437154769897461,
      "learning_rate": 2.206703910614525e-05,
      "loss": 1.4,
      "step": 6500
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.3757044076919556,
      "eval_runtime": 13.2522,
      "eval_samples_per_second": 353.074,
      "eval_steps_per_second": 2.792,
      "step": 6981
    },
    {
      "epoch": 3.008165019338204,
      "grad_norm": 1.37666916847229,
      "learning_rate": 1.9918349806617966e-05,
      "loss": 1.392,
      "step": 7000
    },
    {
      "epoch": 3.2230339492909326,
      "grad_norm": 1.4489548206329346,
      "learning_rate": 1.7769660507090678e-05,
      "loss": 1.3282,
      "step": 7500
    },
    {
      "epoch": 3.4379028792436612,
      "grad_norm": 1.3192425966262817,
      "learning_rate": 1.5620971207563386e-05,
      "loss": 1.333,
      "step": 8000
    },
    {
      "epoch": 3.6527718091963903,
      "grad_norm": 1.4072356224060059,
      "learning_rate": 1.34722819080361e-05,
      "loss": 1.3261,
      "step": 8500
    },
    {
      "epoch": 3.867640739149119,
      "grad_norm": 1.4121943712234497,
      "learning_rate": 1.1323592608508809e-05,
      "loss": 1.336,
      "step": 9000
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.368784785270691,
      "eval_runtime": 13.295,
      "eval_samples_per_second": 351.936,
      "eval_steps_per_second": 2.783,
      "step": 9308
    },
    {
      "epoch": 4.082509669101848,
      "grad_norm": 1.347611904144287,
      "learning_rate": 9.174903308981523e-06,
      "loss": 1.3144,
      "step": 9500
    },
    {
      "epoch": 4.297378599054577,
      "grad_norm": 1.3636679649353027,
      "learning_rate": 7.026214009454233e-06,
      "loss": 1.2937,
      "step": 10000
    },
    {
      "epoch": 4.512247529007306,
      "grad_norm": 1.3717917203903198,
      "learning_rate": 4.877524709926945e-06,
      "loss": 1.2912,
      "step": 10500
    },
    {
      "epoch": 4.727116458960034,
      "grad_norm": 1.5116732120513916,
      "learning_rate": 2.7288354103996563e-06,
      "loss": 1.2907,
      "step": 11000
    },
    {
      "epoch": 4.941985388912763,
      "grad_norm": 1.450828194618225,
      "learning_rate": 5.801461108723678e-07,
      "loss": 1.2871,
      "step": 11500
    },
    {
      "epoch": 5.0,
      "eval_loss": 1.367624282836914,
      "eval_runtime": 13.3043,
      "eval_samples_per_second": 351.69,
      "eval_steps_per_second": 2.781,
      "step": 11635
    }
  ],
  "logging_steps": 500,
  "max_steps": 11635,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 2,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.523724794298368e+16,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}
|
|