|
{ |
|
"best_metric": 4.949131488800049, |
|
"best_model_checkpoint": "./results/models/checkpoint-33700", |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 33700, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.29673590504451036, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0009940652818991099, |
|
"loss": 5.6032, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5934718100890207, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.0009881305637982197, |
|
"loss": 5.4199, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8902077151335311, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.0009821958456973294, |
|
"loss": 5.3653, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 5.317612171173096, |
|
"eval_runtime": 0.4163, |
|
"eval_samples_per_second": 2402.115, |
|
"eval_steps_per_second": 4.804, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.1869436201780414, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.0009762611275964391, |
|
"loss": 5.3193, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.4836795252225519, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.000970326409495549, |
|
"loss": 5.2869, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.7804154302670623, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0009643916913946587, |
|
"loss": 5.2619, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 5.211864471435547, |
|
"eval_runtime": 0.4845, |
|
"eval_samples_per_second": 2063.824, |
|
"eval_steps_per_second": 4.128, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.077151335311573, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0009584569732937686, |
|
"loss": 5.2314, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.373887240356083, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0009525222551928784, |
|
"loss": 5.2019, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.6706231454005933, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0009465875370919882, |
|
"loss": 5.1854, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.9673590504451037, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0009406528189910979, |
|
"loss": 5.1734, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 5.1604228019714355, |
|
"eval_runtime": 0.5733, |
|
"eval_samples_per_second": 1744.297, |
|
"eval_steps_per_second": 3.489, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 3.264094955489614, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.0009347181008902077, |
|
"loss": 5.1376, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.5608308605341246, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.0009287833827893175, |
|
"loss": 5.1259, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.857566765578635, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.0009228486646884273, |
|
"loss": 5.1134, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 5.109052658081055, |
|
"eval_runtime": 0.6878, |
|
"eval_samples_per_second": 1453.883, |
|
"eval_steps_per_second": 2.908, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 4.154302670623146, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0009169139465875371, |
|
"loss": 5.0915, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.451038575667655, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0009109792284866469, |
|
"loss": 5.0718, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 4.747774480712166, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0009050445103857568, |
|
"loss": 5.066, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 5.074661731719971, |
|
"eval_runtime": 0.5054, |
|
"eval_samples_per_second": 1978.542, |
|
"eval_steps_per_second": 3.957, |
|
"step": 8425 |
|
}, |
|
{ |
|
"epoch": 5.044510385756676, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0008991097922848664, |
|
"loss": 5.0542, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 5.341246290801187, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0008931750741839763, |
|
"loss": 5.023, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 5.637982195845697, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0008872403560830861, |
|
"loss": 5.0233, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 5.9347181008902075, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0008813056379821959, |
|
"loss": 5.0197, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 5.054934501647949, |
|
"eval_runtime": 0.6462, |
|
"eval_samples_per_second": 1547.607, |
|
"eval_steps_per_second": 3.095, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 6.231454005934718, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0008753709198813057, |
|
"loss": 4.9826, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 6.528189910979228, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0008694362017804155, |
|
"loss": 4.984, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 6.824925816023739, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0008635014836795252, |
|
"loss": 4.982, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 5.03799295425415, |
|
"eval_runtime": 0.4792, |
|
"eval_samples_per_second": 2086.994, |
|
"eval_steps_per_second": 4.174, |
|
"step": 11795 |
|
}, |
|
{ |
|
"epoch": 7.121661721068249, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.000857566765578635, |
|
"loss": 4.9661, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 7.4183976261127595, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0008516320474777448, |
|
"loss": 4.9489, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 7.71513353115727, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0008456973293768546, |
|
"loss": 4.9486, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 5.01511287689209, |
|
"eval_runtime": 0.7849, |
|
"eval_samples_per_second": 1273.986, |
|
"eval_steps_per_second": 2.548, |
|
"step": 13480 |
|
}, |
|
{ |
|
"epoch": 8.011869436201781, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0008397626112759644, |
|
"loss": 4.9437, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 8.308605341246292, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0008338278931750742, |
|
"loss": 4.9108, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 8.605341246290802, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.000827893175074184, |
|
"loss": 4.9207, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 8.90207715133531, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0008219584569732938, |
|
"loss": 4.9149, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 5.001404285430908, |
|
"eval_runtime": 0.5146, |
|
"eval_samples_per_second": 1943.422, |
|
"eval_steps_per_second": 3.887, |
|
"step": 15165 |
|
}, |
|
{ |
|
"epoch": 9.198813056379821, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0008160237388724035, |
|
"loss": 4.8919, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 9.495548961424332, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0008100890207715134, |
|
"loss": 4.8883, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 9.792284866468842, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0008041543026706232, |
|
"loss": 4.8952, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 4.996912002563477, |
|
"eval_runtime": 0.6039, |
|
"eval_samples_per_second": 1655.995, |
|
"eval_steps_per_second": 3.312, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 10.089020771513352, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.000798219584569733, |
|
"loss": 4.8771, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 10.385756676557863, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0007922848664688428, |
|
"loss": 4.8607, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 10.682492581602373, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0007863501483679525, |
|
"loss": 4.87, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 10.979228486646884, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.0007804154302670623, |
|
"loss": 4.868, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 4.980271339416504, |
|
"eval_runtime": 0.5082, |
|
"eval_samples_per_second": 1967.91, |
|
"eval_steps_per_second": 3.936, |
|
"step": 18535 |
|
}, |
|
{ |
|
"epoch": 11.275964391691394, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0007744807121661721, |
|
"loss": 4.8319, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 11.572700296735905, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.000768545994065282, |
|
"loss": 4.8425, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 11.869436201780415, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0007626112759643917, |
|
"loss": 4.8469, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 4.969499111175537, |
|
"eval_runtime": 0.5498, |
|
"eval_samples_per_second": 1818.897, |
|
"eval_steps_per_second": 3.638, |
|
"step": 20220 |
|
}, |
|
{ |
|
"epoch": 12.166172106824925, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0007566765578635016, |
|
"loss": 4.8272, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 12.462908011869436, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0007507418397626113, |
|
"loss": 4.8171, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 12.759643916913946, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0007448071216617211, |
|
"loss": 4.8272, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 4.971672058105469, |
|
"eval_runtime": 0.5373, |
|
"eval_samples_per_second": 1861.169, |
|
"eval_steps_per_second": 3.722, |
|
"step": 21905 |
|
}, |
|
{ |
|
"epoch": 13.056379821958457, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0007388724035608308, |
|
"loss": 4.8221, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 13.353115727002967, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0007329376854599407, |
|
"loss": 4.7935, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 13.649851632047477, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0007270029673590504, |
|
"loss": 4.8096, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 13.946587537091988, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.0007210682492581603, |
|
"loss": 4.8121, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 4.967980861663818, |
|
"eval_runtime": 0.5226, |
|
"eval_samples_per_second": 1913.569, |
|
"eval_steps_per_second": 3.827, |
|
"step": 23590 |
|
}, |
|
{ |
|
"epoch": 14.243323442136498, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0007151335311572701, |
|
"loss": 4.7805, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 14.540059347181009, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0007091988130563798, |
|
"loss": 4.7851, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 14.836795252225519, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0007032640949554896, |
|
"loss": 4.7942, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 4.957317352294922, |
|
"eval_runtime": 0.4875, |
|
"eval_samples_per_second": 2051.234, |
|
"eval_steps_per_second": 4.102, |
|
"step": 25275 |
|
}, |
|
{ |
|
"epoch": 15.13353115727003, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0006973293768545994, |
|
"loss": 4.778, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 15.43026706231454, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0006913946587537093, |
|
"loss": 4.7668, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 15.72700296735905, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.000685459940652819, |
|
"loss": 4.7775, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 4.964081287384033, |
|
"eval_runtime": 0.5374, |
|
"eval_samples_per_second": 1860.926, |
|
"eval_steps_per_second": 3.722, |
|
"step": 26960 |
|
}, |
|
{ |
|
"epoch": 16.023738872403563, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0006795252225519289, |
|
"loss": 4.7739, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 16.320474777448073, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0006735905044510386, |
|
"loss": 4.7483, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 16.617210682492583, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0006676557863501484, |
|
"loss": 4.7586, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 16.91394658753709, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.0006617210682492581, |
|
"loss": 4.7701, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 4.9522247314453125, |
|
"eval_runtime": 0.4981, |
|
"eval_samples_per_second": 2007.758, |
|
"eval_steps_per_second": 4.016, |
|
"step": 28645 |
|
}, |
|
{ |
|
"epoch": 17.2106824925816, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.000655786350148368, |
|
"loss": 4.735, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 17.50741839762611, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.0006498516320474777, |
|
"loss": 4.7469, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 17.80415430267062, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0006439169139465876, |
|
"loss": 4.7499, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 4.961427688598633, |
|
"eval_runtime": 0.4675, |
|
"eval_samples_per_second": 2139.244, |
|
"eval_steps_per_second": 4.278, |
|
"step": 30330 |
|
}, |
|
{ |
|
"epoch": 18.100890207715132, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0006379821958456973, |
|
"loss": 4.7358, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 18.397626112759642, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0006320474777448071, |
|
"loss": 4.7299, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 18.694362017804153, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.0006261127596439168, |
|
"loss": 4.735, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 18.991097922848663, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0006201780415430267, |
|
"loss": 4.7428, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 4.956191062927246, |
|
"eval_runtime": 0.4912, |
|
"eval_samples_per_second": 2035.99, |
|
"eval_steps_per_second": 4.072, |
|
"step": 32015 |
|
}, |
|
{ |
|
"epoch": 19.287833827893174, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0006142433234421366, |
|
"loss": 4.7161, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 19.584569732937684, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0006083086053412463, |
|
"loss": 4.7258, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 19.881305637982194, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0006023738872403562, |
|
"loss": 4.7254, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 4.949131488800049, |
|
"eval_runtime": 0.4629, |
|
"eval_samples_per_second": 2160.419, |
|
"eval_steps_per_second": 4.321, |
|
"step": 33700 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 84250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.76236489769554e+16, |
|
"train_batch_size": 512, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|