{
"best_metric": 4.949131488800049,
"best_model_checkpoint": "./results/models/checkpoint-33700",
"epoch": 20.0,
"eval_steps": 500,
"global_step": 33700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.29673590504451036,
"grad_norm": 0.197265625,
"learning_rate": 0.0009940652818991099,
"loss": 5.6032,
"step": 500
},
{
"epoch": 0.5934718100890207,
"grad_norm": 0.1953125,
"learning_rate": 0.0009881305637982197,
"loss": 5.4199,
"step": 1000
},
{
"epoch": 0.8902077151335311,
"grad_norm": 0.1982421875,
"learning_rate": 0.0009821958456973294,
"loss": 5.3653,
"step": 1500
},
{
"epoch": 1.0,
"eval_loss": 5.317612171173096,
"eval_runtime": 0.4163,
"eval_samples_per_second": 2402.115,
"eval_steps_per_second": 4.804,
"step": 1685
},
{
"epoch": 1.1869436201780414,
"grad_norm": 0.1953125,
"learning_rate": 0.0009762611275964391,
"loss": 5.3193,
"step": 2000
},
{
"epoch": 1.4836795252225519,
"grad_norm": 0.208984375,
"learning_rate": 0.000970326409495549,
"loss": 5.2869,
"step": 2500
},
{
"epoch": 1.7804154302670623,
"grad_norm": 0.203125,
"learning_rate": 0.0009643916913946587,
"loss": 5.2619,
"step": 3000
},
{
"epoch": 2.0,
"eval_loss": 5.211864471435547,
"eval_runtime": 0.4845,
"eval_samples_per_second": 2063.824,
"eval_steps_per_second": 4.128,
"step": 3370
},
{
"epoch": 2.077151335311573,
"grad_norm": 0.197265625,
"learning_rate": 0.0009584569732937686,
"loss": 5.2314,
"step": 3500
},
{
"epoch": 2.373887240356083,
"grad_norm": 0.2197265625,
"learning_rate": 0.0009525222551928784,
"loss": 5.2019,
"step": 4000
},
{
"epoch": 2.6706231454005933,
"grad_norm": 0.21875,
"learning_rate": 0.0009465875370919882,
"loss": 5.1854,
"step": 4500
},
{
"epoch": 2.9673590504451037,
"grad_norm": 0.228515625,
"learning_rate": 0.0009406528189910979,
"loss": 5.1734,
"step": 5000
},
{
"epoch": 3.0,
"eval_loss": 5.1604228019714355,
"eval_runtime": 0.5733,
"eval_samples_per_second": 1744.297,
"eval_steps_per_second": 3.489,
"step": 5055
},
{
"epoch": 3.264094955489614,
"grad_norm": 0.2294921875,
"learning_rate": 0.0009347181008902077,
"loss": 5.1376,
"step": 5500
},
{
"epoch": 3.5608308605341246,
"grad_norm": 0.2314453125,
"learning_rate": 0.0009287833827893175,
"loss": 5.1259,
"step": 6000
},
{
"epoch": 3.857566765578635,
"grad_norm": 0.2265625,
"learning_rate": 0.0009228486646884273,
"loss": 5.1134,
"step": 6500
},
{
"epoch": 4.0,
"eval_loss": 5.109052658081055,
"eval_runtime": 0.6878,
"eval_samples_per_second": 1453.883,
"eval_steps_per_second": 2.908,
"step": 6740
},
{
"epoch": 4.154302670623146,
"grad_norm": 0.2470703125,
"learning_rate": 0.0009169139465875371,
"loss": 5.0915,
"step": 7000
},
{
"epoch": 4.451038575667655,
"grad_norm": 0.25,
"learning_rate": 0.0009109792284866469,
"loss": 5.0718,
"step": 7500
},
{
"epoch": 4.747774480712166,
"grad_norm": 0.25,
"learning_rate": 0.0009050445103857568,
"loss": 5.066,
"step": 8000
},
{
"epoch": 5.0,
"eval_loss": 5.074661731719971,
"eval_runtime": 0.5054,
"eval_samples_per_second": 1978.542,
"eval_steps_per_second": 3.957,
"step": 8425
},
{
"epoch": 5.044510385756676,
"grad_norm": 0.2470703125,
"learning_rate": 0.0008991097922848664,
"loss": 5.0542,
"step": 8500
},
{
"epoch": 5.341246290801187,
"grad_norm": 0.26171875,
"learning_rate": 0.0008931750741839763,
"loss": 5.023,
"step": 9000
},
{
"epoch": 5.637982195845697,
"grad_norm": 0.259765625,
"learning_rate": 0.0008872403560830861,
"loss": 5.0233,
"step": 9500
},
{
"epoch": 5.9347181008902075,
"grad_norm": 0.265625,
"learning_rate": 0.0008813056379821959,
"loss": 5.0197,
"step": 10000
},
{
"epoch": 6.0,
"eval_loss": 5.054934501647949,
"eval_runtime": 0.6462,
"eval_samples_per_second": 1547.607,
"eval_steps_per_second": 3.095,
"step": 10110
},
{
"epoch": 6.231454005934718,
"grad_norm": 0.271484375,
"learning_rate": 0.0008753709198813057,
"loss": 4.9826,
"step": 10500
},
{
"epoch": 6.528189910979228,
"grad_norm": 0.271484375,
"learning_rate": 0.0008694362017804155,
"loss": 4.984,
"step": 11000
},
{
"epoch": 6.824925816023739,
"grad_norm": 0.271484375,
"learning_rate": 0.0008635014836795252,
"loss": 4.982,
"step": 11500
},
{
"epoch": 7.0,
"eval_loss": 5.03799295425415,
"eval_runtime": 0.4792,
"eval_samples_per_second": 2086.994,
"eval_steps_per_second": 4.174,
"step": 11795
},
{
"epoch": 7.121661721068249,
"grad_norm": 0.287109375,
"learning_rate": 0.000857566765578635,
"loss": 4.9661,
"step": 12000
},
{
"epoch": 7.4183976261127595,
"grad_norm": 0.2890625,
"learning_rate": 0.0008516320474777448,
"loss": 4.9489,
"step": 12500
},
{
"epoch": 7.71513353115727,
"grad_norm": 0.294921875,
"learning_rate": 0.0008456973293768546,
"loss": 4.9486,
"step": 13000
},
{
"epoch": 8.0,
"eval_loss": 5.01511287689209,
"eval_runtime": 0.7849,
"eval_samples_per_second": 1273.986,
"eval_steps_per_second": 2.548,
"step": 13480
},
{
"epoch": 8.011869436201781,
"grad_norm": 0.294921875,
"learning_rate": 0.0008397626112759644,
"loss": 4.9437,
"step": 13500
},
{
"epoch": 8.308605341246292,
"grad_norm": 0.322265625,
"learning_rate": 0.0008338278931750742,
"loss": 4.9108,
"step": 14000
},
{
"epoch": 8.605341246290802,
"grad_norm": 0.310546875,
"learning_rate": 0.000827893175074184,
"loss": 4.9207,
"step": 14500
},
{
"epoch": 8.90207715133531,
"grad_norm": 0.30859375,
"learning_rate": 0.0008219584569732938,
"loss": 4.9149,
"step": 15000
},
{
"epoch": 9.0,
"eval_loss": 5.001404285430908,
"eval_runtime": 0.5146,
"eval_samples_per_second": 1943.422,
"eval_steps_per_second": 3.887,
"step": 15165
},
{
"epoch": 9.198813056379821,
"grad_norm": 0.310546875,
"learning_rate": 0.0008160237388724035,
"loss": 4.8919,
"step": 15500
},
{
"epoch": 9.495548961424332,
"grad_norm": 0.32421875,
"learning_rate": 0.0008100890207715134,
"loss": 4.8883,
"step": 16000
},
{
"epoch": 9.792284866468842,
"grad_norm": 0.330078125,
"learning_rate": 0.0008041543026706232,
"loss": 4.8952,
"step": 16500
},
{
"epoch": 10.0,
"eval_loss": 4.996912002563477,
"eval_runtime": 0.6039,
"eval_samples_per_second": 1655.995,
"eval_steps_per_second": 3.312,
"step": 16850
},
{
"epoch": 10.089020771513352,
"grad_norm": 0.3125,
"learning_rate": 0.000798219584569733,
"loss": 4.8771,
"step": 17000
},
{
"epoch": 10.385756676557863,
"grad_norm": 0.326171875,
"learning_rate": 0.0007922848664688428,
"loss": 4.8607,
"step": 17500
},
{
"epoch": 10.682492581602373,
"grad_norm": 0.34375,
"learning_rate": 0.0007863501483679525,
"loss": 4.87,
"step": 18000
},
{
"epoch": 10.979228486646884,
"grad_norm": 0.333984375,
"learning_rate": 0.0007804154302670623,
"loss": 4.868,
"step": 18500
},
{
"epoch": 11.0,
"eval_loss": 4.980271339416504,
"eval_runtime": 0.5082,
"eval_samples_per_second": 1967.91,
"eval_steps_per_second": 3.936,
"step": 18535
},
{
"epoch": 11.275964391691394,
"grad_norm": 0.341796875,
"learning_rate": 0.0007744807121661721,
"loss": 4.8319,
"step": 19000
},
{
"epoch": 11.572700296735905,
"grad_norm": 0.373046875,
"learning_rate": 0.000768545994065282,
"loss": 4.8425,
"step": 19500
},
{
"epoch": 11.869436201780415,
"grad_norm": 0.349609375,
"learning_rate": 0.0007626112759643917,
"loss": 4.8469,
"step": 20000
},
{
"epoch": 12.0,
"eval_loss": 4.969499111175537,
"eval_runtime": 0.5498,
"eval_samples_per_second": 1818.897,
"eval_steps_per_second": 3.638,
"step": 20220
},
{
"epoch": 12.166172106824925,
"grad_norm": 0.365234375,
"learning_rate": 0.0007566765578635016,
"loss": 4.8272,
"step": 20500
},
{
"epoch": 12.462908011869436,
"grad_norm": 0.3515625,
"learning_rate": 0.0007507418397626113,
"loss": 4.8171,
"step": 21000
},
{
"epoch": 12.759643916913946,
"grad_norm": 0.33984375,
"learning_rate": 0.0007448071216617211,
"loss": 4.8272,
"step": 21500
},
{
"epoch": 13.0,
"eval_loss": 4.971672058105469,
"eval_runtime": 0.5373,
"eval_samples_per_second": 1861.169,
"eval_steps_per_second": 3.722,
"step": 21905
},
{
"epoch": 13.056379821958457,
"grad_norm": 0.37109375,
"learning_rate": 0.0007388724035608308,
"loss": 4.8221,
"step": 22000
},
{
"epoch": 13.353115727002967,
"grad_norm": 0.375,
"learning_rate": 0.0007329376854599407,
"loss": 4.7935,
"step": 22500
},
{
"epoch": 13.649851632047477,
"grad_norm": 0.3671875,
"learning_rate": 0.0007270029673590504,
"loss": 4.8096,
"step": 23000
},
{
"epoch": 13.946587537091988,
"grad_norm": 0.384765625,
"learning_rate": 0.0007210682492581603,
"loss": 4.8121,
"step": 23500
},
{
"epoch": 14.0,
"eval_loss": 4.967980861663818,
"eval_runtime": 0.5226,
"eval_samples_per_second": 1913.569,
"eval_steps_per_second": 3.827,
"step": 23590
},
{
"epoch": 14.243323442136498,
"grad_norm": 0.3828125,
"learning_rate": 0.0007151335311572701,
"loss": 4.7805,
"step": 24000
},
{
"epoch": 14.540059347181009,
"grad_norm": 0.376953125,
"learning_rate": 0.0007091988130563798,
"loss": 4.7851,
"step": 24500
},
{
"epoch": 14.836795252225519,
"grad_norm": 0.388671875,
"learning_rate": 0.0007032640949554896,
"loss": 4.7942,
"step": 25000
},
{
"epoch": 15.0,
"eval_loss": 4.957317352294922,
"eval_runtime": 0.4875,
"eval_samples_per_second": 2051.234,
"eval_steps_per_second": 4.102,
"step": 25275
},
{
"epoch": 15.13353115727003,
"grad_norm": 0.388671875,
"learning_rate": 0.0006973293768545994,
"loss": 4.778,
"step": 25500
},
{
"epoch": 15.43026706231454,
"grad_norm": 0.416015625,
"learning_rate": 0.0006913946587537093,
"loss": 4.7668,
"step": 26000
},
{
"epoch": 15.72700296735905,
"grad_norm": 0.392578125,
"learning_rate": 0.000685459940652819,
"loss": 4.7775,
"step": 26500
},
{
"epoch": 16.0,
"eval_loss": 4.964081287384033,
"eval_runtime": 0.5374,
"eval_samples_per_second": 1860.926,
"eval_steps_per_second": 3.722,
"step": 26960
},
{
"epoch": 16.023738872403563,
"grad_norm": 0.40234375,
"learning_rate": 0.0006795252225519289,
"loss": 4.7739,
"step": 27000
},
{
"epoch": 16.320474777448073,
"grad_norm": 0.416015625,
"learning_rate": 0.0006735905044510386,
"loss": 4.7483,
"step": 27500
},
{
"epoch": 16.617210682492583,
"grad_norm": 0.396484375,
"learning_rate": 0.0006676557863501484,
"loss": 4.7586,
"step": 28000
},
{
"epoch": 16.91394658753709,
"grad_norm": 0.390625,
"learning_rate": 0.0006617210682492581,
"loss": 4.7701,
"step": 28500
},
{
"epoch": 17.0,
"eval_loss": 4.9522247314453125,
"eval_runtime": 0.4981,
"eval_samples_per_second": 2007.758,
"eval_steps_per_second": 4.016,
"step": 28645
},
{
"epoch": 17.2106824925816,
"grad_norm": 0.427734375,
"learning_rate": 0.000655786350148368,
"loss": 4.735,
"step": 29000
},
{
"epoch": 17.50741839762611,
"grad_norm": 0.408203125,
"learning_rate": 0.0006498516320474777,
"loss": 4.7469,
"step": 29500
},
{
"epoch": 17.80415430267062,
"grad_norm": 0.4140625,
"learning_rate": 0.0006439169139465876,
"loss": 4.7499,
"step": 30000
},
{
"epoch": 18.0,
"eval_loss": 4.961427688598633,
"eval_runtime": 0.4675,
"eval_samples_per_second": 2139.244,
"eval_steps_per_second": 4.278,
"step": 30330
},
{
"epoch": 18.100890207715132,
"grad_norm": 0.4140625,
"learning_rate": 0.0006379821958456973,
"loss": 4.7358,
"step": 30500
},
{
"epoch": 18.397626112759642,
"grad_norm": 0.396484375,
"learning_rate": 0.0006320474777448071,
"loss": 4.7299,
"step": 31000
},
{
"epoch": 18.694362017804153,
"grad_norm": 0.419921875,
"learning_rate": 0.0006261127596439168,
"loss": 4.735,
"step": 31500
},
{
"epoch": 18.991097922848663,
"grad_norm": 0.4375,
"learning_rate": 0.0006201780415430267,
"loss": 4.7428,
"step": 32000
},
{
"epoch": 19.0,
"eval_loss": 4.956191062927246,
"eval_runtime": 0.4912,
"eval_samples_per_second": 2035.99,
"eval_steps_per_second": 4.072,
"step": 32015
},
{
"epoch": 19.287833827893174,
"grad_norm": 0.447265625,
"learning_rate": 0.0006142433234421366,
"loss": 4.7161,
"step": 32500
},
{
"epoch": 19.584569732937684,
"grad_norm": 0.427734375,
"learning_rate": 0.0006083086053412463,
"loss": 4.7258,
"step": 33000
},
{
"epoch": 19.881305637982194,
"grad_norm": 0.4296875,
"learning_rate": 0.0006023738872403562,
"loss": 4.7254,
"step": 33500
},
{
"epoch": 20.0,
"eval_loss": 4.949131488800049,
"eval_runtime": 0.4629,
"eval_samples_per_second": 2160.419,
"eval_steps_per_second": 4.321,
"step": 33700
}
],
"logging_steps": 500,
"max_steps": 84250,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.76236489769554e+16,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}
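
For reference, a minimal sketch (not part of the Trainer output above) of one way to read this state with the Python standard library and summarise the per-epoch evaluation losses; it assumes the JSON has been saved locally as trainer_state.json.

# Minimal sketch: load trainer_state.json and summarise eval losses.
# Assumes the file above is saved as "trainer_state.json" in the working directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-epoch evaluation records (the entries carrying "eval_loss").
evals = [e for e in state["log_history"] if "eval_loss" in e]

for e in evals:
    print(f"epoch {e['epoch']:>4.0f}  step {e['step']:>6d}  eval_loss {e['eval_loss']:.4f}")

# The minimum should match best_metric / best_model_checkpoint recorded in the state.
best = min(evals, key=lambda e: e["eval_loss"])
print(f"best eval_loss {best['eval_loss']:.4f} at step {best['step']}")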