|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.48, |
|
"eval_steps": 500, |
|
"global_step": 60, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 1.6839892864227295, |
|
"learning_rate": 4e-05, |
|
"loss": 1.301, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.386156678199768, |
|
"learning_rate": 8e-05, |
|
"loss": 1.3643, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 0.9964010715484619, |
|
"learning_rate": 0.00012, |
|
"loss": 1.1346, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.9237359166145325, |
|
"learning_rate": 0.00016, |
|
"loss": 1.2865, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.4482671618461609, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0312, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.395876407623291, |
|
"learning_rate": 0.00019636363636363636, |
|
"loss": 0.9728, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 0.6330752968788147, |
|
"learning_rate": 0.00019272727272727274, |
|
"loss": 1.0906, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.39497730135917664, |
|
"learning_rate": 0.0001890909090909091, |
|
"loss": 1.0165, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 0.3924015462398529, |
|
"learning_rate": 0.00018545454545454545, |
|
"loss": 1.0005, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3233727216720581, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 1.0234, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 0.5036228895187378, |
|
"learning_rate": 0.0001781818181818182, |
|
"loss": 0.8657, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.8032389283180237, |
|
"learning_rate": 0.00017454545454545454, |
|
"loss": 0.8278, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 0.3186076283454895, |
|
"learning_rate": 0.0001709090909090909, |
|
"loss": 0.877, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.3362283408641815, |
|
"learning_rate": 0.00016727272727272728, |
|
"loss": 0.8952, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.3292746841907501, |
|
"learning_rate": 0.00016363636363636366, |
|
"loss": 0.8007, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.5558376908302307, |
|
"learning_rate": 0.00016, |
|
"loss": 0.9349, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 0.3033958375453949, |
|
"learning_rate": 0.00015636363636363637, |
|
"loss": 0.7249, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.301546186208725, |
|
"learning_rate": 0.00015272727272727275, |
|
"loss": 0.8493, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 0.2765434980392456, |
|
"learning_rate": 0.0001490909090909091, |
|
"loss": 0.782, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2590007483959198, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.6518, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 0.3257710039615631, |
|
"learning_rate": 0.00014181818181818184, |
|
"loss": 0.956, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.2133684605360031, |
|
"learning_rate": 0.0001381818181818182, |
|
"loss": 0.5397, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 0.22465167939662933, |
|
"learning_rate": 0.00013454545454545455, |
|
"loss": 0.7032, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.24608904123306274, |
|
"learning_rate": 0.00013090909090909093, |
|
"loss": 0.8161, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.29115498065948486, |
|
"learning_rate": 0.00012727272727272728, |
|
"loss": 0.7039, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.3457951545715332, |
|
"learning_rate": 0.00012363636363636364, |
|
"loss": 0.6452, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 0.3534717857837677, |
|
"learning_rate": 0.00012, |
|
"loss": 0.7724, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.3112258315086365, |
|
"learning_rate": 0.00011636363636363636, |
|
"loss": 0.7029, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 0.32025501132011414, |
|
"learning_rate": 0.00011272727272727272, |
|
"loss": 0.9089, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.23741815984249115, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.5997, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 0.4178565442562103, |
|
"learning_rate": 0.00010545454545454545, |
|
"loss": 0.835, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.23426000773906708, |
|
"learning_rate": 0.00010181818181818181, |
|
"loss": 0.7768, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 0.28046417236328125, |
|
"learning_rate": 9.818181818181818e-05, |
|
"loss": 0.8841, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.37808024883270264, |
|
"learning_rate": 9.454545454545455e-05, |
|
"loss": 0.8584, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.2408115565776825, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.7474, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.24366675317287445, |
|
"learning_rate": 8.727272727272727e-05, |
|
"loss": 0.7107, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 0.2985966205596924, |
|
"learning_rate": 8.363636363636364e-05, |
|
"loss": 0.7617, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.23527264595031738, |
|
"learning_rate": 8e-05, |
|
"loss": 0.7948, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 0.268421471118927, |
|
"learning_rate": 7.636363636363637e-05, |
|
"loss": 0.5638, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.26604562997817993, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.7577, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 0.21873734891414642, |
|
"learning_rate": 6.90909090909091e-05, |
|
"loss": 0.758, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.27550163865089417, |
|
"learning_rate": 6.545454545454546e-05, |
|
"loss": 0.774, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 0.2736279368400574, |
|
"learning_rate": 6.181818181818182e-05, |
|
"loss": 0.7378, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.2714160978794098, |
|
"learning_rate": 5.818181818181818e-05, |
|
"loss": 0.6681, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.29298704862594604, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 0.5982, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.3377845585346222, |
|
"learning_rate": 5.090909090909091e-05, |
|
"loss": 0.6176, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 0.34884122014045715, |
|
"learning_rate": 4.7272727272727275e-05, |
|
"loss": 0.8299, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.2873862087726593, |
|
"learning_rate": 4.3636363636363636e-05, |
|
"loss": 0.6805, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 0.25929009914398193, |
|
"learning_rate": 4e-05, |
|
"loss": 0.7985, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2327166497707367, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.5968, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 0.27193504571914673, |
|
"learning_rate": 3.272727272727273e-05, |
|
"loss": 0.7195, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.2823130488395691, |
|
"learning_rate": 2.909090909090909e-05, |
|
"loss": 0.8598, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 0.30219078063964844, |
|
"learning_rate": 2.5454545454545454e-05, |
|
"loss": 0.763, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.2724802792072296, |
|
"learning_rate": 2.1818181818181818e-05, |
|
"loss": 0.7016, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.26255902647972107, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.6229, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.27147480845451355, |
|
"learning_rate": 1.4545454545454545e-05, |
|
"loss": 0.7846, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 0.2804729640483856, |
|
"learning_rate": 1.0909090909090909e-05, |
|
"loss": 0.777, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.22172842919826508, |
|
"learning_rate": 7.272727272727272e-06, |
|
"loss": 0.9042, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 0.33171340823173523, |
|
"learning_rate": 3.636363636363636e-06, |
|
"loss": 0.6419, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.21746467053890228, |
|
"learning_rate": 0.0, |
|
"loss": 0.6739, |
|
"step": 60 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 60, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2413435443197952.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|