{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 2,
  "global_step": 56,
  "is_hyper_param_search": true,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 499.6013488769531,
      "learning_rate": 0.00035333138396736785,
      "loss": 5.7451,
      "step": 2
    },
    {
      "epoch": 0.07142857142857142,
      "eval_loss": 4.145218372344971,
      "eval_runtime": 115.5605,
      "eval_samples_per_second": 3.029,
      "eval_steps_per_second": 0.052,
      "step": 2
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 41.22399139404297,
      "learning_rate": 0.0003402450364130209,
      "loss": 5.3316,
      "step": 4
    },
    {
      "epoch": 0.14285714285714285,
      "eval_loss": 4.5622239112854,
      "eval_runtime": 159.5,
      "eval_samples_per_second": 2.194,
      "eval_steps_per_second": 0.038,
      "step": 4
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 2.202932596206665,
      "learning_rate": 0.00032715868885867396,
      "loss": 5.0703,
      "step": 6
    },
    {
      "epoch": 0.21428571428571427,
      "eval_loss": 4.1197967529296875,
      "eval_runtime": 110.2677,
      "eval_samples_per_second": 3.174,
      "eval_steps_per_second": 0.054,
      "step": 6
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.4056661128997803,
      "learning_rate": 0.00031407234130432696,
      "loss": 4.869,
      "step": 8
    },
    {
      "epoch": 0.2857142857142857,
      "eval_loss": 4.0951642990112305,
      "eval_runtime": 133.7942,
      "eval_samples_per_second": 2.616,
      "eval_steps_per_second": 0.045,
      "step": 8
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.29455429315567017,
      "learning_rate": 0.00030098599374998,
      "loss": 4.854,
      "step": 10
    },
    {
      "epoch": 0.35714285714285715,
      "eval_loss": 4.096081733703613,
      "eval_runtime": 117.4958,
      "eval_samples_per_second": 2.979,
      "eval_steps_per_second": 0.051,
      "step": 10
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.10492418706417084,
      "learning_rate": 0.00028789964619563307,
      "loss": 4.8537,
      "step": 12
    },
    {
      "epoch": 0.42857142857142855,
      "eval_loss": 4.0943403244018555,
      "eval_runtime": 134.1581,
      "eval_samples_per_second": 2.609,
      "eval_steps_per_second": 0.045,
      "step": 12
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.016932426020503044,
      "learning_rate": 0.0002748132986412861,
      "loss": 4.8523,
      "step": 14
    },
    {
      "epoch": 0.5,
      "eval_loss": 4.094045162200928,
      "eval_runtime": 151.7694,
      "eval_samples_per_second": 2.306,
      "eval_steps_per_second": 0.04,
      "step": 14
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.024902725592255592,
      "learning_rate": 0.0002617269510869391,
      "loss": 4.8522,
      "step": 16
    },
    {
      "epoch": 0.5714285714285714,
      "eval_loss": 4.094315528869629,
      "eval_runtime": 180.0495,
      "eval_samples_per_second": 1.944,
      "eval_steps_per_second": 0.033,
      "step": 16
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 0.03983244299888611,
      "learning_rate": 0.0002486406035325922,
      "loss": 4.8525,
      "step": 18
    },
    {
      "epoch": 0.6428571428571429,
      "eval_loss": 4.094301223754883,
      "eval_runtime": 111.1902,
      "eval_samples_per_second": 3.148,
      "eval_steps_per_second": 0.054,
      "step": 18
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.021751634776592255,
      "learning_rate": 0.00023555425597824523,
      "loss": 4.8524,
      "step": 20
    },
    {
      "epoch": 0.7142857142857143,
      "eval_loss": 4.094165802001953,
      "eval_runtime": 122.9871,
      "eval_samples_per_second": 2.846,
      "eval_steps_per_second": 0.049,
      "step": 20
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 0.01534917950630188,
      "learning_rate": 0.00022246790842389826,
      "loss": 4.8522,
      "step": 22
    },
    {
      "epoch": 0.7857142857142857,
      "eval_loss": 4.094093322753906,
      "eval_runtime": 155.9317,
      "eval_samples_per_second": 2.245,
      "eval_steps_per_second": 0.038,
      "step": 22
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.01246840413659811,
      "learning_rate": 0.0002093815608695513,
      "loss": 4.8522,
      "step": 24
    },
    {
      "epoch": 0.8571428571428571,
      "eval_loss": 4.094055652618408,
      "eval_runtime": 199.4288,
      "eval_samples_per_second": 1.755,
      "eval_steps_per_second": 0.03,
      "step": 24
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 0.00821756012737751,
      "learning_rate": 0.00019629521331520434,
      "loss": 4.8521,
      "step": 26
    },
    {
      "epoch": 0.9285714285714286,
      "eval_loss": 4.094019412994385,
      "eval_runtime": 110.2163,
      "eval_samples_per_second": 3.176,
      "eval_steps_per_second": 0.054,
      "step": 26
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.00944442953914404,
      "learning_rate": 0.0001832088657608574,
      "loss": 4.3067,
      "step": 28
    },
    {
      "epoch": 1.0,
      "eval_loss": 4.093985557556152,
      "eval_runtime": 109.3834,
      "eval_samples_per_second": 3.2,
      "eval_steps_per_second": 0.055,
      "step": 28
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.0032597698736935854,
      "learning_rate": 0.00017012251820651045,
      "loss": 4.8521,
      "step": 30
    },
    {
      "epoch": 1.0714285714285714,
      "eval_loss": 4.093966960906982,
      "eval_runtime": 110.1116,
      "eval_samples_per_second": 3.179,
      "eval_steps_per_second": 0.054,
      "step": 30
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.0018325659912079573,
      "learning_rate": 0.00015703617065216348,
      "loss": 4.8521,
      "step": 32
    },
    {
      "epoch": 1.1428571428571428,
      "eval_loss": 4.093964099884033,
      "eval_runtime": 110.0212,
      "eval_samples_per_second": 3.181,
      "eval_steps_per_second": 0.055,
      "step": 32
    },
    {
      "epoch": 1.2142857142857142,
      "grad_norm": 0.002593304729089141,
      "learning_rate": 0.00014394982309781654,
      "loss": 4.8521,
      "step": 34
    },
    {
      "epoch": 1.2142857142857142,
      "eval_loss": 4.093966007232666,
      "eval_runtime": 162.0241,
      "eval_samples_per_second": 2.16,
      "eval_steps_per_second": 0.037,
      "step": 34
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.003515275428071618,
      "learning_rate": 0.00013086347554346956,
      "loss": 4.8521,
      "step": 36
    },
    {
      "epoch": 1.2857142857142856,
      "eval_loss": 4.093967437744141,
      "eval_runtime": 113.1292,
      "eval_samples_per_second": 3.094,
      "eval_steps_per_second": 0.053,
      "step": 36
    },
    {
      "epoch": 1.3571428571428572,
      "grad_norm": 0.004078236408531666,
      "learning_rate": 0.00011777712798912262,
      "loss": 4.8521,
      "step": 38
    },
    {
      "epoch": 1.3571428571428572,
      "eval_loss": 4.09396505355835,
      "eval_runtime": 184.5537,
      "eval_samples_per_second": 1.896,
      "eval_steps_per_second": 0.033,
      "step": 38
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.003892536973580718,
      "learning_rate": 0.00010469078043477564,
      "loss": 4.8521,
      "step": 40
    },
    {
      "epoch": 1.4285714285714286,
      "eval_loss": 4.093959331512451,
      "eval_runtime": 113.0981,
      "eval_samples_per_second": 3.095,
      "eval_steps_per_second": 0.053,
      "step": 40
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.0023005367256700993,
      "learning_rate": 9.16044328804287e-05,
      "loss": 4.852,
      "step": 42
    },
    {
      "epoch": 1.5,
      "eval_loss": 4.093954563140869,
      "eval_runtime": 118.4819,
      "eval_samples_per_second": 2.954,
      "eval_steps_per_second": 0.051,
      "step": 42
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.0026939427480101585,
      "learning_rate": 7.851808532608174e-05,
      "loss": 4.852,
      "step": 44
    },
    {
      "epoch": 1.5714285714285714,
      "eval_loss": 4.09395170211792,
      "eval_runtime": 139.5353,
      "eval_samples_per_second": 2.508,
      "eval_steps_per_second": 0.043,
      "step": 44
    },
    {
      "epoch": 1.6428571428571428,
      "grad_norm": 0.0018068618373945355,
      "learning_rate": 6.543173777173478e-05,
      "loss": 4.852,
      "step": 46
    },
    {
      "epoch": 1.6428571428571428,
      "eval_loss": 4.0939507484436035,
      "eval_runtime": 119.3509,
      "eval_samples_per_second": 2.933,
      "eval_steps_per_second": 0.05,
      "step": 46
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.0018717749044299126,
      "learning_rate": 5.234539021738782e-05,
      "loss": 4.852,
      "step": 48
    },
    {
      "epoch": 1.7142857142857144,
      "eval_loss": 4.093950271606445,
      "eval_runtime": 127.677,
      "eval_samples_per_second": 2.741,
      "eval_steps_per_second": 0.047,
      "step": 48
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.001300295814871788,
      "learning_rate": 3.925904266304087e-05,
      "loss": 4.852,
      "step": 50
    },
    {
      "epoch": 1.7857142857142856,
      "eval_loss": 4.093948841094971,
      "eval_runtime": 140.3917,
      "eval_samples_per_second": 2.493,
      "eval_steps_per_second": 0.043,
      "step": 50
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.0016825235215947032,
      "learning_rate": 2.617269510869391e-05,
      "loss": 4.852,
      "step": 52
    },
    {
      "epoch": 1.8571428571428572,
      "eval_loss": 4.093948841094971,
      "eval_runtime": 132.6149,
      "eval_samples_per_second": 2.639,
      "eval_steps_per_second": 0.045,
      "step": 52
    },
    {
      "epoch": 1.9285714285714286,
      "grad_norm": 0.0014735407894477248,
      "learning_rate": 1.3086347554346956e-05,
      "loss": 4.852,
      "step": 54
    },
    {
      "epoch": 1.9285714285714286,
      "eval_loss": 4.093948841094971,
      "eval_runtime": 123.3755,
      "eval_samples_per_second": 2.837,
      "eval_steps_per_second": 0.049,
      "step": 54
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.0010560819646343589,
      "learning_rate": 0.0,
      "loss": 4.3066,
      "step": 56
    },
    {
      "epoch": 2.0,
      "eval_loss": 4.093948841094971,
      "eval_runtime": 107.7247,
      "eval_samples_per_second": 3.249,
      "eval_steps_per_second": 0.056,
      "step": 56
    }
  ],
  "logging_steps": 2,
  "max_steps": 56,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 407052651766068.0,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": {
    "_wandb": {},
    "assignments": {},
    "decay": 0.01,
    "learning_rate": 0.0003664177315217148,
    "metric": "eval/loss",
    "per_device_train_batch_size": 128
  }
}