{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9982905982905983,
  "eval_steps": 500,
  "global_step": 2631,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011396011396011397,
      "grad_norm": 1.1062365284901086,
      "learning_rate": 5e-06,
      "loss": 0.7552,
      "step": 10
    },
    {
      "epoch": 0.022792022792022793,
      "grad_norm": 1.023752202077829,
      "learning_rate": 5e-06,
      "loss": 0.7051,
      "step": 20
    },
    {
      "epoch": 0.03418803418803419,
      "grad_norm": 1.0176219936883037,
      "learning_rate": 5e-06,
      "loss": 0.6821,
      "step": 30
    },
    {
      "epoch": 0.045584045584045586,
      "grad_norm": 0.7858042820189418,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 40
    },
    {
      "epoch": 0.05698005698005698,
      "grad_norm": 0.7933715158385674,
      "learning_rate": 5e-06,
      "loss": 0.6637,
      "step": 50
    },
    {
      "epoch": 0.06837606837606838,
      "grad_norm": 0.8447246301495516,
      "learning_rate": 5e-06,
      "loss": 0.6622,
      "step": 60
    },
    {
      "epoch": 0.07977207977207977,
      "grad_norm": 0.5473425796189046,
      "learning_rate": 5e-06,
      "loss": 0.6663,
      "step": 70
    },
    {
      "epoch": 0.09116809116809117,
      "grad_norm": 0.4937146235526688,
      "learning_rate": 5e-06,
      "loss": 0.6571,
      "step": 80
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 0.44446776727990156,
      "learning_rate": 5e-06,
      "loss": 0.6638,
      "step": 90
    },
    {
      "epoch": 0.11396011396011396,
      "grad_norm": 0.41849443527281166,
      "learning_rate": 5e-06,
      "loss": 0.6579,
      "step": 100
    },
    {
      "epoch": 0.12535612535612536,
      "grad_norm": 0.45940085033829986,
      "learning_rate": 5e-06,
      "loss": 0.6526,
      "step": 110
    },
    {
      "epoch": 0.13675213675213677,
      "grad_norm": 0.45931809899313636,
      "learning_rate": 5e-06,
      "loss": 0.6369,
      "step": 120
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 0.40591863341923856,
      "learning_rate": 5e-06,
      "loss": 0.6341,
      "step": 130
    },
    {
      "epoch": 0.15954415954415954,
      "grad_norm": 0.42649491292164343,
      "learning_rate": 5e-06,
      "loss": 0.6573,
      "step": 140
    },
    {
      "epoch": 0.17094017094017094,
      "grad_norm": 0.42419739622977437,
      "learning_rate": 5e-06,
      "loss": 0.6449,
      "step": 150
    },
    {
      "epoch": 0.18233618233618235,
      "grad_norm": 0.41999442196069786,
      "learning_rate": 5e-06,
      "loss": 0.6558,
      "step": 160
    },
    {
      "epoch": 0.19373219373219372,
      "grad_norm": 0.4637200181201795,
      "learning_rate": 5e-06,
      "loss": 0.6287,
      "step": 170
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 0.41547705456707573,
      "learning_rate": 5e-06,
      "loss": 0.6439,
      "step": 180
    },
    {
      "epoch": 0.21652421652421652,
      "grad_norm": 0.43068607713697277,
      "learning_rate": 5e-06,
      "loss": 0.6396,
      "step": 190
    },
    {
      "epoch": 0.22792022792022792,
      "grad_norm": 0.4469322057262852,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 200
    },
    {
      "epoch": 0.23931623931623933,
      "grad_norm": 0.4680911563203023,
      "learning_rate": 5e-06,
      "loss": 0.6306,
      "step": 210
    },
    {
      "epoch": 0.25071225071225073,
      "grad_norm": 0.4095294859092795,
      "learning_rate": 5e-06,
      "loss": 0.6277,
      "step": 220
    },
    {
      "epoch": 0.2621082621082621,
      "grad_norm": 0.45759740926828324,
      "learning_rate": 5e-06,
      "loss": 0.6339,
      "step": 230
    },
    {
      "epoch": 0.27350427350427353,
      "grad_norm": 0.4140379707131278,
      "learning_rate": 5e-06,
      "loss": 0.641,
      "step": 240
    },
    {
      "epoch": 0.2849002849002849,
      "grad_norm": 0.4150377896672994,
      "learning_rate": 5e-06,
      "loss": 0.6372,
      "step": 250
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.402341599576737,
      "learning_rate": 5e-06,
      "loss": 0.6403,
      "step": 260
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.42585340932157245,
      "learning_rate": 5e-06,
      "loss": 0.6415,
      "step": 270
    },
    {
      "epoch": 0.3190883190883191,
      "grad_norm": 0.45653778556147656,
      "learning_rate": 5e-06,
      "loss": 0.6399,
      "step": 280
    },
    {
      "epoch": 0.33048433048433046,
      "grad_norm": 0.4195393934267986,
      "learning_rate": 5e-06,
      "loss": 0.6336,
      "step": 290
    },
    {
      "epoch": 0.3418803418803419,
      "grad_norm": 0.46802670579447797,
      "learning_rate": 5e-06,
      "loss": 0.6337,
      "step": 300
    },
    {
      "epoch": 0.35327635327635326,
      "grad_norm": 0.4203687337846972,
      "learning_rate": 5e-06,
      "loss": 0.637,
      "step": 310
    },
    {
      "epoch": 0.3646723646723647,
      "grad_norm": 0.421822849143681,
      "learning_rate": 5e-06,
      "loss": 0.64,
      "step": 320
    },
    {
      "epoch": 0.37606837606837606,
      "grad_norm": 0.4283596513144174,
      "learning_rate": 5e-06,
      "loss": 0.6252,
      "step": 330
    },
    {
      "epoch": 0.38746438746438744,
      "grad_norm": 0.42324206057009117,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 340
    },
    {
      "epoch": 0.39886039886039887,
      "grad_norm": 0.4062746526152,
      "learning_rate": 5e-06,
      "loss": 0.6387,
      "step": 350
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.43787430045384385,
      "learning_rate": 5e-06,
      "loss": 0.6338,
      "step": 360
    },
    {
      "epoch": 0.42165242165242167,
      "grad_norm": 0.4067600081663935,
      "learning_rate": 5e-06,
      "loss": 0.6337,
      "step": 370
    },
    {
      "epoch": 0.43304843304843305,
      "grad_norm": 0.405651618692542,
      "learning_rate": 5e-06,
      "loss": 0.6235,
      "step": 380
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.39893273449497857,
      "learning_rate": 5e-06,
      "loss": 0.6305,
      "step": 390
    },
    {
      "epoch": 0.45584045584045585,
      "grad_norm": 0.4181843535226299,
      "learning_rate": 5e-06,
      "loss": 0.6424,
      "step": 400
    },
    {
      "epoch": 0.4672364672364672,
      "grad_norm": 0.42805555885189545,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 410
    },
    {
      "epoch": 0.47863247863247865,
      "grad_norm": 0.44895050300003103,
      "learning_rate": 5e-06,
      "loss": 0.6332,
      "step": 420
    },
    {
      "epoch": 0.49002849002849,
      "grad_norm": 0.3919946319959885,
      "learning_rate": 5e-06,
      "loss": 0.6328,
      "step": 430
    },
    {
      "epoch": 0.5014245014245015,
      "grad_norm": 0.415410598131448,
      "learning_rate": 5e-06,
      "loss": 0.6402,
      "step": 440
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.4393590856709396,
      "learning_rate": 5e-06,
      "loss": 0.6306,
      "step": 450
    },
    {
      "epoch": 0.5242165242165242,
      "grad_norm": 0.43301735874135633,
      "learning_rate": 5e-06,
      "loss": 0.6344,
      "step": 460
    },
    {
      "epoch": 0.5356125356125356,
      "grad_norm": 0.44449051507968934,
      "learning_rate": 5e-06,
      "loss": 0.6231,
      "step": 470
    },
    {
      "epoch": 0.5470085470085471,
      "grad_norm": 0.41409814672813067,
      "learning_rate": 5e-06,
      "loss": 0.6341,
      "step": 480
    },
    {
      "epoch": 0.5584045584045584,
      "grad_norm": 0.404549684025027,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 490
    },
    {
      "epoch": 0.5698005698005698,
      "grad_norm": 0.4180598818867765,
      "learning_rate": 5e-06,
      "loss": 0.6309,
      "step": 500
    },
    {
      "epoch": 0.5811965811965812,
      "grad_norm": 0.40656312646305987,
      "learning_rate": 5e-06,
      "loss": 0.6331,
      "step": 510
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.40247877044565616,
      "learning_rate": 5e-06,
      "loss": 0.6303,
      "step": 520
    },
    {
      "epoch": 0.603988603988604,
      "grad_norm": 0.4186724709073127,
      "learning_rate": 5e-06,
      "loss": 0.6295,
      "step": 530
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.3956067792496914,
      "learning_rate": 5e-06,
      "loss": 0.629,
      "step": 540
    },
    {
      "epoch": 0.6267806267806267,
      "grad_norm": 0.43010520803632213,
      "learning_rate": 5e-06,
      "loss": 0.6418,
      "step": 550
    },
    {
      "epoch": 0.6381766381766382,
      "grad_norm": 0.4242582783709579,
      "learning_rate": 5e-06,
      "loss": 0.6365,
      "step": 560
    },
    {
      "epoch": 0.6495726495726496,
      "grad_norm": 0.4574479642511814,
      "learning_rate": 5e-06,
      "loss": 0.6298,
      "step": 570
    },
    {
      "epoch": 0.6609686609686609,
      "grad_norm": 0.3999462091117723,
      "learning_rate": 5e-06,
      "loss": 0.6264,
      "step": 580
    },
    {
      "epoch": 0.6723646723646723,
      "grad_norm": 0.43650664891174007,
      "learning_rate": 5e-06,
      "loss": 0.6338,
      "step": 590
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.4209881207979195,
      "learning_rate": 5e-06,
      "loss": 0.6185,
      "step": 600
    },
    {
      "epoch": 0.6951566951566952,
      "grad_norm": 0.4356837089917804,
      "learning_rate": 5e-06,
      "loss": 0.6285,
      "step": 610
    },
    {
      "epoch": 0.7065527065527065,
      "grad_norm": 0.4267755900128707,
      "learning_rate": 5e-06,
      "loss": 0.6249,
      "step": 620
    },
    {
      "epoch": 0.717948717948718,
      "grad_norm": 0.4252749404036598,
      "learning_rate": 5e-06,
      "loss": 0.6297,
      "step": 630
    },
    {
      "epoch": 0.7293447293447294,
      "grad_norm": 0.43616986641525424,
      "learning_rate": 5e-06,
      "loss": 0.624,
      "step": 640
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.4164486549654651,
      "learning_rate": 5e-06,
      "loss": 0.629,
      "step": 650
    },
    {
      "epoch": 0.7521367521367521,
      "grad_norm": 0.476343190261518,
      "learning_rate": 5e-06,
      "loss": 0.6177,
      "step": 660
    },
    {
      "epoch": 0.7635327635327636,
      "grad_norm": 0.40486827396324065,
      "learning_rate": 5e-06,
      "loss": 0.6261,
      "step": 670
    },
    {
      "epoch": 0.7749287749287749,
      "grad_norm": 0.4212351136466915,
      "learning_rate": 5e-06,
      "loss": 0.6304,
      "step": 680
    },
    {
      "epoch": 0.7863247863247863,
      "grad_norm": 0.41575901401793347,
      "learning_rate": 5e-06,
      "loss": 0.6398,
      "step": 690
    },
    {
      "epoch": 0.7977207977207977,
      "grad_norm": 0.4285454155969582,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 700
    },
    {
      "epoch": 0.8091168091168092,
      "grad_norm": 0.40726171067131095,
      "learning_rate": 5e-06,
      "loss": 0.6314,
      "step": 710
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.41168149111216795,
      "learning_rate": 5e-06,
      "loss": 0.6243,
      "step": 720
    },
    {
      "epoch": 0.8319088319088319,
      "grad_norm": 0.435567753751087,
      "learning_rate": 5e-06,
      "loss": 0.6226,
      "step": 730
    },
    {
      "epoch": 0.8433048433048433,
      "grad_norm": 0.43940850789677355,
      "learning_rate": 5e-06,
      "loss": 0.6208,
      "step": 740
    },
    {
      "epoch": 0.8547008547008547,
      "grad_norm": 0.4188384621992378,
      "learning_rate": 5e-06,
      "loss": 0.6338,
      "step": 750
    },
    {
      "epoch": 0.8660968660968661,
      "grad_norm": 0.3960108041735021,
      "learning_rate": 5e-06,
      "loss": 0.6337,
      "step": 760
    },
    {
      "epoch": 0.8774928774928775,
      "grad_norm": 0.40675640823017994,
      "learning_rate": 5e-06,
      "loss": 0.6296,
      "step": 770
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.43353876595216656,
      "learning_rate": 5e-06,
      "loss": 0.6357,
      "step": 780
    },
    {
      "epoch": 0.9002849002849003,
      "grad_norm": 0.43992543662793077,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 790
    },
    {
      "epoch": 0.9116809116809117,
      "grad_norm": 0.41627535741522503,
      "learning_rate": 5e-06,
      "loss": 0.6384,
      "step": 800
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.4274496512159185,
      "learning_rate": 5e-06,
      "loss": 0.6309,
      "step": 810
    },
    {
      "epoch": 0.9344729344729344,
      "grad_norm": 0.5000942948514508,
      "learning_rate": 5e-06,
      "loss": 0.6323,
      "step": 820
    },
    {
      "epoch": 0.9458689458689459,
      "grad_norm": 0.39649163621370453,
      "learning_rate": 5e-06,
      "loss": 0.6117,
      "step": 830
    },
    {
      "epoch": 0.9572649572649573,
      "grad_norm": 0.45128894713654466,
      "learning_rate": 5e-06,
      "loss": 0.6258,
      "step": 840
    },
    {
      "epoch": 0.9686609686609686,
      "grad_norm": 0.4053334632337957,
      "learning_rate": 5e-06,
      "loss": 0.6334,
      "step": 850
    },
    {
      "epoch": 0.98005698005698,
      "grad_norm": 0.4570308695791834,
      "learning_rate": 5e-06,
      "loss": 0.6299,
      "step": 860
    },
    {
      "epoch": 0.9914529914529915,
      "grad_norm": 0.4142729888175128,
      "learning_rate": 5e-06,
      "loss": 0.6134,
      "step": 870
    },
    {
      "epoch": 0.9994301994301994,
      "eval_loss": 0.622437059879303,
      "eval_runtime": 442.3461,
      "eval_samples_per_second": 26.728,
      "eval_steps_per_second": 0.418,
      "step": 877
    },
    {
      "epoch": 1.002849002849003,
      "grad_norm": 0.467206811021719,
      "learning_rate": 5e-06,
      "loss": 0.6384,
      "step": 880
    },
    {
      "epoch": 1.0142450142450143,
      "grad_norm": 0.4575873633037112,
      "learning_rate": 5e-06,
      "loss": 0.5855,
      "step": 890
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.4094192073196508,
      "learning_rate": 5e-06,
      "loss": 0.5924,
      "step": 900
    },
    {
      "epoch": 1.037037037037037,
      "grad_norm": 0.41727147235729756,
      "learning_rate": 5e-06,
      "loss": 0.5882,
      "step": 910
    },
    {
      "epoch": 1.0484330484330484,
      "grad_norm": 0.40097390374474684,
      "learning_rate": 5e-06,
      "loss": 0.5834,
      "step": 920
    },
    {
      "epoch": 1.0598290598290598,
      "grad_norm": 0.3988722663272877,
      "learning_rate": 5e-06,
      "loss": 0.5875,
      "step": 930
    },
    {
      "epoch": 1.0712250712250713,
      "grad_norm": 0.409835543782938,
      "learning_rate": 5e-06,
      "loss": 0.578,
      "step": 940
    },
    {
      "epoch": 1.0826210826210827,
      "grad_norm": 0.4348656181993297,
      "learning_rate": 5e-06,
      "loss": 0.5945,
      "step": 950
    },
    {
      "epoch": 1.0940170940170941,
      "grad_norm": 0.4560769367527893,
      "learning_rate": 5e-06,
      "loss": 0.591,
      "step": 960
    },
    {
      "epoch": 1.1054131054131053,
      "grad_norm": 0.3987301391233058,
      "learning_rate": 5e-06,
      "loss": 0.5947,
      "step": 970
    },
    {
      "epoch": 1.1168091168091168,
      "grad_norm": 0.4310263093448157,
      "learning_rate": 5e-06,
      "loss": 0.5989,
      "step": 980
    },
    {
      "epoch": 1.1282051282051282,
      "grad_norm": 0.3988555704488419,
      "learning_rate": 5e-06,
      "loss": 0.5883,
      "step": 990
    },
    {
      "epoch": 1.1396011396011396,
      "grad_norm": 0.41694498325264395,
      "learning_rate": 5e-06,
      "loss": 0.5857,
      "step": 1000
    },
    {
      "epoch": 1.150997150997151,
      "grad_norm": 0.4261280155159663,
      "learning_rate": 5e-06,
      "loss": 0.5846,
      "step": 1010
    },
    {
      "epoch": 1.1623931623931625,
      "grad_norm": 0.4090258551630524,
      "learning_rate": 5e-06,
      "loss": 0.5862,
      "step": 1020
    },
    {
      "epoch": 1.173789173789174,
      "grad_norm": 0.39703392125897946,
      "learning_rate": 5e-06,
      "loss": 0.5828,
      "step": 1030
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 0.4171232168803472,
      "learning_rate": 5e-06,
      "loss": 0.5969,
      "step": 1040
    },
    {
      "epoch": 1.1965811965811965,
      "grad_norm": 0.3986677142839061,
      "learning_rate": 5e-06,
      "loss": 0.5849,
      "step": 1050
    },
    {
      "epoch": 1.207977207977208,
      "grad_norm": 0.4210046425391405,
      "learning_rate": 5e-06,
      "loss": 0.5866,
      "step": 1060
    },
    {
      "epoch": 1.2193732193732194,
      "grad_norm": 0.4497366233089093,
      "learning_rate": 5e-06,
      "loss": 0.5963,
      "step": 1070
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 0.43086405644231185,
      "learning_rate": 5e-06,
      "loss": 0.5872,
      "step": 1080
    },
    {
      "epoch": 1.242165242165242,
      "grad_norm": 0.4519037391850927,
      "learning_rate": 5e-06,
      "loss": 0.5952,
      "step": 1090
    },
    {
      "epoch": 1.2535612535612537,
      "grad_norm": 0.41349582244683747,
      "learning_rate": 5e-06,
      "loss": 0.5903,
      "step": 1100
    },
    {
      "epoch": 1.264957264957265,
      "grad_norm": 0.3837938001947666,
      "learning_rate": 5e-06,
      "loss": 0.5989,
      "step": 1110
    },
    {
      "epoch": 1.2763532763532763,
      "grad_norm": 0.38645298038964926,
      "learning_rate": 5e-06,
      "loss": 0.583,
      "step": 1120
    },
    {
      "epoch": 1.2877492877492878,
      "grad_norm": 0.39026828874261793,
      "learning_rate": 5e-06,
      "loss": 0.5938,
      "step": 1130
    },
    {
      "epoch": 1.2991452991452992,
      "grad_norm": 0.48601873116831096,
      "learning_rate": 5e-06,
      "loss": 0.5805,
      "step": 1140
    },
    {
      "epoch": 1.3105413105413106,
      "grad_norm": 0.4496341989317277,
      "learning_rate": 5e-06,
      "loss": 0.5862,
      "step": 1150
    },
    {
      "epoch": 1.3219373219373218,
      "grad_norm": 0.43314588815183497,
      "learning_rate": 5e-06,
      "loss": 0.5883,
      "step": 1160
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.4373497446033339,
      "learning_rate": 5e-06,
      "loss": 0.5833,
      "step": 1170
    },
    {
      "epoch": 1.3447293447293447,
      "grad_norm": 0.4061985333964508,
      "learning_rate": 5e-06,
      "loss": 0.5824,
      "step": 1180
    },
    {
      "epoch": 1.3561253561253561,
      "grad_norm": 0.4144234110159319,
      "learning_rate": 5e-06,
      "loss": 0.6004,
      "step": 1190
    },
    {
      "epoch": 1.3675213675213675,
      "grad_norm": 0.4373239103878606,
      "learning_rate": 5e-06,
      "loss": 0.5818,
      "step": 1200
    },
    {
      "epoch": 1.378917378917379,
      "grad_norm": 0.4210723366091624,
      "learning_rate": 5e-06,
      "loss": 0.5859,
      "step": 1210
    },
    {
      "epoch": 1.3903133903133904,
      "grad_norm": 0.4052006957338942,
      "learning_rate": 5e-06,
      "loss": 0.5906,
      "step": 1220
    },
    {
      "epoch": 1.4017094017094016,
      "grad_norm": 0.4292623892695985,
      "learning_rate": 5e-06,
      "loss": 0.5927,
      "step": 1230
    },
    {
      "epoch": 1.413105413105413,
      "grad_norm": 0.4232783608596394,
      "learning_rate": 5e-06,
      "loss": 0.5956,
      "step": 1240
    },
    {
      "epoch": 1.4245014245014245,
      "grad_norm": 0.43895695326546535,
      "learning_rate": 5e-06,
      "loss": 0.6033,
      "step": 1250
    },
    {
      "epoch": 1.435897435897436,
      "grad_norm": 0.4349281709940867,
      "learning_rate": 5e-06,
      "loss": 0.5825,
      "step": 1260
    },
    {
      "epoch": 1.4472934472934473,
      "grad_norm": 0.4124297881341476,
      "learning_rate": 5e-06,
      "loss": 0.5842,
      "step": 1270
    },
    {
      "epoch": 1.4586894586894588,
      "grad_norm": 0.4103899829789082,
      "learning_rate": 5e-06,
      "loss": 0.5851,
      "step": 1280
    },
    {
      "epoch": 1.4700854700854702,
      "grad_norm": 0.4187405725906187,
      "learning_rate": 5e-06,
      "loss": 0.5815,
      "step": 1290
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 0.4335514785672904,
      "learning_rate": 5e-06,
      "loss": 0.5893,
      "step": 1300
    },
    {
      "epoch": 1.4928774928774928,
      "grad_norm": 0.4095416189258966,
      "learning_rate": 5e-06,
      "loss": 0.5812,
      "step": 1310
    },
    {
      "epoch": 1.5042735042735043,
      "grad_norm": 0.4327159045023668,
      "learning_rate": 5e-06,
      "loss": 0.5872,
      "step": 1320
    },
    {
      "epoch": 1.5156695156695157,
      "grad_norm": 0.42071355477765043,
      "learning_rate": 5e-06,
      "loss": 0.5894,
      "step": 1330
    },
    {
      "epoch": 1.5270655270655271,
      "grad_norm": 0.41206489314047035,
      "learning_rate": 5e-06,
      "loss": 0.5948,
      "step": 1340
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.4743925797235051,
      "learning_rate": 5e-06,
      "loss": 0.5755,
      "step": 1350
    },
    {
      "epoch": 1.54985754985755,
      "grad_norm": 0.40595707601991954,
      "learning_rate": 5e-06,
      "loss": 0.5892,
      "step": 1360
    },
    {
      "epoch": 1.5612535612535612,
      "grad_norm": 0.3947481991815675,
      "learning_rate": 5e-06,
      "loss": 0.5784,
      "step": 1370
    },
    {
      "epoch": 1.5726495726495726,
      "grad_norm": 0.38785299206305784,
      "learning_rate": 5e-06,
      "loss": 0.5864,
      "step": 1380
    },
    {
      "epoch": 1.584045584045584,
      "grad_norm": 0.4149251097325744,
      "learning_rate": 5e-06,
      "loss": 0.6075,
      "step": 1390
    },
    {
      "epoch": 1.5954415954415955,
      "grad_norm": 0.43637706913229096,
      "learning_rate": 5e-06,
      "loss": 0.5944,
      "step": 1400
    },
    {
      "epoch": 1.606837606837607,
      "grad_norm": 0.4169030325172147,
      "learning_rate": 5e-06,
      "loss": 0.5915,
      "step": 1410
    },
    {
      "epoch": 1.618233618233618,
      "grad_norm": 0.40313516644365976,
      "learning_rate": 5e-06,
      "loss": 0.5817,
      "step": 1420
    },
    {
      "epoch": 1.6296296296296298,
      "grad_norm": 0.39503546583616356,
      "learning_rate": 5e-06,
      "loss": 0.5983,
      "step": 1430
    },
    {
      "epoch": 1.641025641025641,
      "grad_norm": 0.3985527253164314,
      "learning_rate": 5e-06,
      "loss": 0.5826,
      "step": 1440
    },
    {
      "epoch": 1.6524216524216524,
      "grad_norm": 0.4244488951950044,
      "learning_rate": 5e-06,
      "loss": 0.5889,
      "step": 1450
    },
    {
      "epoch": 1.6638176638176638,
      "grad_norm": 0.39693307887587553,
      "learning_rate": 5e-06,
      "loss": 0.5859,
      "step": 1460
    },
    {
      "epoch": 1.6752136752136753,
      "grad_norm": 0.3849679459506633,
      "learning_rate": 5e-06,
      "loss": 0.5806,
      "step": 1470
    },
    {
      "epoch": 1.6866096866096867,
      "grad_norm": 0.4201985578364686,
      "learning_rate": 5e-06,
      "loss": 0.5866,
      "step": 1480
    },
    {
      "epoch": 1.698005698005698,
      "grad_norm": 0.42432125023319545,
      "learning_rate": 5e-06,
      "loss": 0.5803,
      "step": 1490
    },
    {
      "epoch": 1.7094017094017095,
      "grad_norm": 0.40730542273295467,
      "learning_rate": 5e-06,
      "loss": 0.5913,
      "step": 1500
    },
    {
      "epoch": 1.7207977207977208,
      "grad_norm": 0.42837414750466624,
      "learning_rate": 5e-06,
      "loss": 0.5795,
      "step": 1510
    },
    {
      "epoch": 1.7321937321937322,
      "grad_norm": 0.44083872834956234,
      "learning_rate": 5e-06,
      "loss": 0.5888,
      "step": 1520
    },
    {
      "epoch": 1.7435897435897436,
      "grad_norm": 0.41133942102181764,
      "learning_rate": 5e-06,
      "loss": 0.5865,
      "step": 1530
    },
    {
      "epoch": 1.7549857549857548,
      "grad_norm": 0.40260223356507924,
      "learning_rate": 5e-06,
      "loss": 0.5816,
      "step": 1540
    },
    {
      "epoch": 1.7663817663817665,
      "grad_norm": 0.4054088563875919,
      "learning_rate": 5e-06,
      "loss": 0.5952,
      "step": 1550
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.4218451424068199,
      "learning_rate": 5e-06,
      "loss": 0.5879,
      "step": 1560
    },
    {
      "epoch": 1.7891737891737893,
      "grad_norm": 0.4423529568236007,
      "learning_rate": 5e-06,
      "loss": 0.5907,
      "step": 1570
    },
    {
      "epoch": 1.8005698005698005,
      "grad_norm": 0.41215301182035746,
      "learning_rate": 5e-06,
      "loss": 0.5841,
      "step": 1580
    },
    {
      "epoch": 1.811965811965812,
      "grad_norm": 0.4555696841177031,
      "learning_rate": 5e-06,
      "loss": 0.5849,
      "step": 1590
    },
    {
      "epoch": 1.8233618233618234,
      "grad_norm": 0.41997083905529,
      "learning_rate": 5e-06,
      "loss": 0.5712,
      "step": 1600
    },
    {
      "epoch": 1.8347578347578346,
      "grad_norm": 0.40350765403827904,
      "learning_rate": 5e-06,
      "loss": 0.5773,
      "step": 1610
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 0.41505233462990104,
      "learning_rate": 5e-06,
      "loss": 0.5828,
      "step": 1620
    },
    {
      "epoch": 1.8575498575498575,
      "grad_norm": 0.4094044224106121,
      "learning_rate": 5e-06,
      "loss": 0.577,
      "step": 1630
    },
    {
      "epoch": 1.868945868945869,
      "grad_norm": 0.3989458077194491,
      "learning_rate": 5e-06,
      "loss": 0.5852,
      "step": 1640
    },
    {
      "epoch": 1.8803418803418803,
      "grad_norm": 0.3968449176678109,
      "learning_rate": 5e-06,
      "loss": 0.5765,
      "step": 1650
    },
    {
      "epoch": 1.8917378917378918,
      "grad_norm": 0.3975827713442406,
      "learning_rate": 5e-06,
      "loss": 0.5941,
      "step": 1660
    },
    {
      "epoch": 1.9031339031339032,
      "grad_norm": 0.4591167052806216,
      "learning_rate": 5e-06,
      "loss": 0.5958,
      "step": 1670
    },
    {
      "epoch": 1.9145299145299144,
      "grad_norm": 0.4763985809192953,
      "learning_rate": 5e-06,
      "loss": 0.5822,
      "step": 1680
    },
    {
      "epoch": 1.925925925925926,
      "grad_norm": 0.40816873290685,
      "learning_rate": 5e-06,
      "loss": 0.591,
      "step": 1690
    },
    {
      "epoch": 1.9373219373219372,
      "grad_norm": 0.43451011164507114,
      "learning_rate": 5e-06,
      "loss": 0.5866,
      "step": 1700
    },
    {
      "epoch": 1.9487179487179487,
      "grad_norm": 0.42502005410583105,
      "learning_rate": 5e-06,
      "loss": 0.5812,
      "step": 1710
    },
    {
      "epoch": 1.96011396011396,
      "grad_norm": 0.3868140358085357,
      "learning_rate": 5e-06,
      "loss": 0.5952,
      "step": 1720
    },
    {
      "epoch": 1.9715099715099715,
      "grad_norm": 0.4233434645527226,
      "learning_rate": 5e-06,
      "loss": 0.5905,
      "step": 1730
    },
    {
      "epoch": 1.982905982905983,
      "grad_norm": 0.46128367957303146,
      "learning_rate": 5e-06,
      "loss": 0.5835,
      "step": 1740
    },
    {
      "epoch": 1.9943019943019942,
      "grad_norm": 0.41962900843595113,
      "learning_rate": 5e-06,
      "loss": 0.5823,
      "step": 1750
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6174917817115784,
      "eval_runtime": 442.5819,
      "eval_samples_per_second": 26.714,
      "eval_steps_per_second": 0.418,
      "step": 1755
    },
    {
      "epoch": 2.005698005698006,
      "grad_norm": 0.43638289381677664,
      "learning_rate": 5e-06,
      "loss": 0.6003,
      "step": 1760
    },
    {
      "epoch": 2.017094017094017,
      "grad_norm": 0.4032954694771035,
      "learning_rate": 5e-06,
      "loss": 0.5295,
      "step": 1770
    },
    {
      "epoch": 2.0284900284900287,
      "grad_norm": 0.3978342138531873,
      "learning_rate": 5e-06,
      "loss": 0.5396,
      "step": 1780
    },
    {
      "epoch": 2.03988603988604,
      "grad_norm": 0.3941941742542143,
      "learning_rate": 5e-06,
      "loss": 0.5498,
      "step": 1790
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 0.40614413388153375,
      "learning_rate": 5e-06,
      "loss": 0.5485,
      "step": 1800
    },
    {
      "epoch": 2.0626780626780628,
      "grad_norm": 0.4062005374187212,
      "learning_rate": 5e-06,
      "loss": 0.5443,
      "step": 1810
    },
    {
      "epoch": 2.074074074074074,
      "grad_norm": 0.38632662394247547,
      "learning_rate": 5e-06,
      "loss": 0.5499,
      "step": 1820
    },
    {
      "epoch": 2.0854700854700856,
      "grad_norm": 0.3877796238652637,
      "learning_rate": 5e-06,
      "loss": 0.5472,
      "step": 1830
    },
    {
      "epoch": 2.096866096866097,
      "grad_norm": 0.4031396151639763,
      "learning_rate": 5e-06,
      "loss": 0.5557,
      "step": 1840
    },
    {
      "epoch": 2.1082621082621085,
      "grad_norm": 0.3758020305089208,
      "learning_rate": 5e-06,
      "loss": 0.5423,
      "step": 1850
    },
    {
      "epoch": 2.1196581196581197,
      "grad_norm": 0.46333515136342907,
      "learning_rate": 5e-06,
      "loss": 0.5556,
      "step": 1860
    },
    {
      "epoch": 2.131054131054131,
      "grad_norm": 0.3990721210469113,
      "learning_rate": 5e-06,
      "loss": 0.5464,
      "step": 1870
    },
    {
      "epoch": 2.1424501424501425,
      "grad_norm": 0.41896529664740606,
      "learning_rate": 5e-06,
      "loss": 0.5459,
      "step": 1880
    },
    {
      "epoch": 2.1538461538461537,
      "grad_norm": 0.40224292638674486,
      "learning_rate": 5e-06,
      "loss": 0.5432,
      "step": 1890
    },
    {
      "epoch": 2.1652421652421654,
      "grad_norm": 0.3703829495333715,
      "learning_rate": 5e-06,
      "loss": 0.5434,
      "step": 1900
    },
    {
      "epoch": 2.1766381766381766,
      "grad_norm": 0.4195807512147461,
      "learning_rate": 5e-06,
      "loss": 0.548,
      "step": 1910
    },
    {
      "epoch": 2.1880341880341883,
      "grad_norm": 0.42078014349068604,
      "learning_rate": 5e-06,
      "loss": 0.551,
      "step": 1920
    },
    {
      "epoch": 2.1994301994301995,
      "grad_norm": 0.39550870444336733,
      "learning_rate": 5e-06,
      "loss": 0.5487,
      "step": 1930
    },
    {
      "epoch": 2.2108262108262107,
      "grad_norm": 0.403560752581769,
      "learning_rate": 5e-06,
      "loss": 0.5518,
      "step": 1940
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.4148295164570796,
      "learning_rate": 5e-06,
      "loss": 0.5455,
      "step": 1950
    },
    {
      "epoch": 2.2336182336182335,
      "grad_norm": 0.37681071283125916,
      "learning_rate": 5e-06,
      "loss": 0.5371,
      "step": 1960
    },
    {
      "epoch": 2.245014245014245,
      "grad_norm": 0.4085602540294654,
      "learning_rate": 5e-06,
      "loss": 0.548,
      "step": 1970
    },
    {
      "epoch": 2.2564102564102564,
      "grad_norm": 0.42666262080387535,
      "learning_rate": 5e-06,
      "loss": 0.5489,
      "step": 1980
    },
    {
      "epoch": 2.267806267806268,
      "grad_norm": 0.413370914720578,
      "learning_rate": 5e-06,
      "loss": 0.5452,
      "step": 1990
    },
    {
      "epoch": 2.2792022792022792,
      "grad_norm": 0.3924258676572947,
      "learning_rate": 5e-06,
      "loss": 0.5506,
      "step": 2000
    },
    {
      "epoch": 2.2905982905982905,
      "grad_norm": 0.4347195110430224,
      "learning_rate": 5e-06,
      "loss": 0.5495,
      "step": 2010
    },
    {
      "epoch": 2.301994301994302,
      "grad_norm": 0.40213883875930767,
      "learning_rate": 5e-06,
      "loss": 0.5447,
      "step": 2020
    },
    {
      "epoch": 2.3133903133903133,
      "grad_norm": 0.42546941310471453,
      "learning_rate": 5e-06,
      "loss": 0.5533,
      "step": 2030
    },
    {
      "epoch": 2.324786324786325,
      "grad_norm": 0.40042388002057316,
      "learning_rate": 5e-06,
      "loss": 0.5493,
      "step": 2040
    },
    {
      "epoch": 2.336182336182336,
      "grad_norm": 0.40985989196559397,
      "learning_rate": 5e-06,
      "loss": 0.5484,
      "step": 2050
    },
    {
      "epoch": 2.347578347578348,
      "grad_norm": 0.4262197347046128,
      "learning_rate": 5e-06,
      "loss": 0.5561,
      "step": 2060
    },
    {
      "epoch": 2.358974358974359,
      "grad_norm": 0.4079997903297647,
      "learning_rate": 5e-06,
      "loss": 0.5479,
      "step": 2070
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 0.4171995256710412,
      "learning_rate": 5e-06,
      "loss": 0.5482,
      "step": 2080
    },
    {
      "epoch": 2.381766381766382,
      "grad_norm": 0.4067288627883757,
      "learning_rate": 5e-06,
      "loss": 0.5495,
      "step": 2090
    },
    {
      "epoch": 2.393162393162393,
      "grad_norm": 0.39812759486187826,
      "learning_rate": 5e-06,
      "loss": 0.5475,
      "step": 2100
    },
    {
      "epoch": 2.4045584045584047,
      "grad_norm": 0.4252046487226247,
      "learning_rate": 5e-06,
      "loss": 0.564,
      "step": 2110
    },
    {
      "epoch": 2.415954415954416,
      "grad_norm": 0.385246050290494,
      "learning_rate": 5e-06,
      "loss": 0.5495,
      "step": 2120
    },
    {
      "epoch": 2.427350427350427,
      "grad_norm": 0.4086146276427414,
      "learning_rate": 5e-06,
      "loss": 0.56,
      "step": 2130
    },
    {
      "epoch": 2.438746438746439,
      "grad_norm": 0.40396684063143223,
      "learning_rate": 5e-06,
      "loss": 0.5592,
      "step": 2140
    },
    {
      "epoch": 2.45014245014245,
      "grad_norm": 0.40575491064321195,
      "learning_rate": 5e-06,
      "loss": 0.5633,
      "step": 2150
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 0.4073296395669543,
      "learning_rate": 5e-06,
      "loss": 0.5488,
      "step": 2160
    },
    {
      "epoch": 2.472934472934473,
      "grad_norm": 0.43882905338245753,
      "learning_rate": 5e-06,
      "loss": 0.5513,
      "step": 2170
    },
    {
      "epoch": 2.484330484330484,
      "grad_norm": 0.4031322481681622,
      "learning_rate": 5e-06,
      "loss": 0.5554,
      "step": 2180
    },
    {
      "epoch": 2.4957264957264957,
      "grad_norm": 0.42227630442588826,
      "learning_rate": 5e-06,
      "loss": 0.5574,
      "step": 2190
    },
    {
      "epoch": 2.5071225071225074,
      "grad_norm": 0.4277624308363176,
      "learning_rate": 5e-06,
      "loss": 0.5629,
      "step": 2200
    },
    {
      "epoch": 2.5185185185185186,
      "grad_norm": 0.40043500855114567,
      "learning_rate": 5e-06,
      "loss": 0.5444,
      "step": 2210
    },
    {
      "epoch": 2.52991452991453,
      "grad_norm": 0.427445344908136,
      "learning_rate": 5e-06,
      "loss": 0.5591,
      "step": 2220
    },
    {
      "epoch": 2.5413105413105415,
      "grad_norm": 0.4197028690010052,
      "learning_rate": 5e-06,
      "loss": 0.5513,
      "step": 2230
    },
    {
      "epoch": 2.5527065527065527,
      "grad_norm": 0.41806376493939207,
      "learning_rate": 5e-06,
      "loss": 0.5501,
      "step": 2240
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 0.4080384204790527,
      "learning_rate": 5e-06,
      "loss": 0.5488,
      "step": 2250
    },
    {
      "epoch": 2.5754985754985755,
      "grad_norm": 0.4339972064470789,
      "learning_rate": 5e-06,
      "loss": 0.5534,
      "step": 2260
    },
    {
      "epoch": 2.5868945868945867,
      "grad_norm": 0.4139873128656014,
      "learning_rate": 5e-06,
      "loss": 0.5509,
      "step": 2270
    },
    {
      "epoch": 2.5982905982905984,
      "grad_norm": 0.39593523779791756,
      "learning_rate": 5e-06,
      "loss": 0.5515,
      "step": 2280
    },
    {
      "epoch": 2.6096866096866096,
      "grad_norm": 0.3887745966959367,
      "learning_rate": 5e-06,
      "loss": 0.5567,
      "step": 2290
    },
    {
      "epoch": 2.6210826210826212,
      "grad_norm": 0.3899940114191536,
      "learning_rate": 5e-06,
      "loss": 0.5429,
      "step": 2300
    },
    {
      "epoch": 2.6324786324786325,
      "grad_norm": 0.4176311832860518,
      "learning_rate": 5e-06,
      "loss": 0.553,
      "step": 2310
    },
    {
      "epoch": 2.6438746438746437,
      "grad_norm": 0.46727727994302587,
      "learning_rate": 5e-06,
      "loss": 0.5524,
      "step": 2320
    },
    {
      "epoch": 2.6552706552706553,
      "grad_norm": 0.4368321834367039,
      "learning_rate": 5e-06,
      "loss": 0.5552,
      "step": 2330
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.4479324367839254,
      "learning_rate": 5e-06,
      "loss": 0.5534,
      "step": 2340
    },
    {
      "epoch": 2.678062678062678,
      "grad_norm": 0.41411545835899133,
      "learning_rate": 5e-06,
      "loss": 0.5467,
      "step": 2350
    },
    {
      "epoch": 2.6894586894586894,
      "grad_norm": 0.4201299885965421,
      "learning_rate": 5e-06,
      "loss": 0.5565,
      "step": 2360
    },
    {
      "epoch": 2.700854700854701,
      "grad_norm": 0.40978702073303064,
      "learning_rate": 5e-06,
      "loss": 0.5444,
      "step": 2370
    },
    {
      "epoch": 2.7122507122507122,
      "grad_norm": 0.4233459449335634,
      "learning_rate": 5e-06,
      "loss": 0.5563,
      "step": 2380
    },
    {
      "epoch": 2.7236467236467234,
      "grad_norm": 0.4159458912952842,
      "learning_rate": 5e-06,
      "loss": 0.5551,
      "step": 2390
    },
    {
      "epoch": 2.735042735042735,
      "grad_norm": 0.41425606346483057,
      "learning_rate": 5e-06,
      "loss": 0.5539,
      "step": 2400
    },
    {
      "epoch": 2.7464387464387463,
      "grad_norm": 0.4166133827092343,
      "learning_rate": 5e-06,
      "loss": 0.5588,
      "step": 2410
    },
    {
      "epoch": 2.757834757834758,
      "grad_norm": 0.4263688845736852,
      "learning_rate": 5e-06,
      "loss": 0.5575,
      "step": 2420
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 0.41269883049053624,
      "learning_rate": 5e-06,
      "loss": 0.5471,
      "step": 2430
    },
    {
      "epoch": 2.780626780626781,
      "grad_norm": 0.3894335667283599,
      "learning_rate": 5e-06,
      "loss": 0.5468,
      "step": 2440
    },
    {
      "epoch": 2.792022792022792,
      "grad_norm": 0.40933546113606567,
      "learning_rate": 5e-06,
      "loss": 0.5501,
      "step": 2450
    },
    {
      "epoch": 2.8034188034188032,
      "grad_norm": 0.39714648665213204,
      "learning_rate": 5e-06,
      "loss": 0.5444,
      "step": 2460
    },
    {
      "epoch": 2.814814814814815,
      "grad_norm": 0.40517136322070096,
      "learning_rate": 5e-06,
      "loss": 0.5601,
      "step": 2470
    },
    {
      "epoch": 2.826210826210826,
      "grad_norm": 0.44447910033491683,
      "learning_rate": 5e-06,
      "loss": 0.5623,
      "step": 2480
    },
    {
      "epoch": 2.8376068376068377,
      "grad_norm": 0.388103652560322,
      "learning_rate": 5e-06,
      "loss": 0.5543,
      "step": 2490
    },
    {
      "epoch": 2.849002849002849,
      "grad_norm": 0.40171877838716236,
      "learning_rate": 5e-06,
      "loss": 0.562,
      "step": 2500
    },
    {
      "epoch": 2.8603988603988606,
      "grad_norm": 0.41856657884436094,
      "learning_rate": 5e-06,
      "loss": 0.5536,
      "step": 2510
    },
    {
      "epoch": 2.871794871794872,
      "grad_norm": 0.4173395435456696,
      "learning_rate": 5e-06,
      "loss": 0.5539,
      "step": 2520
    },
    {
      "epoch": 2.883190883190883,
      "grad_norm": 0.39093712576995243,
      "learning_rate": 5e-06,
      "loss": 0.5601,
      "step": 2530
    },
    {
      "epoch": 2.8945868945868947,
      "grad_norm": 0.4255070470787294,
      "learning_rate": 5e-06,
      "loss": 0.5557,
      "step": 2540
    },
    {
      "epoch": 2.905982905982906,
      "grad_norm": 0.45247644117965885,
      "learning_rate": 5e-06,
      "loss": 0.5528,
      "step": 2550
    },
    {
      "epoch": 2.9173789173789175,
      "grad_norm": 0.41729192613775734,
      "learning_rate": 5e-06,
      "loss": 0.5416,
      "step": 2560
    },
    {
      "epoch": 2.9287749287749287,
      "grad_norm": 0.3959874387272076,
      "learning_rate": 5e-06,
      "loss": 0.5471,
      "step": 2570
    },
    {
      "epoch": 2.9401709401709404,
      "grad_norm": 0.40279780924522723,
      "learning_rate": 5e-06,
      "loss": 0.5438,
      "step": 2580
    },
    {
      "epoch": 2.9515669515669516,
      "grad_norm": 0.41492112649690777,
      "learning_rate": 5e-06,
      "loss": 0.5533,
      "step": 2590
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.4072236941032463,
      "learning_rate": 5e-06,
      "loss": 0.5446,
      "step": 2600
    },
    {
      "epoch": 2.9743589743589745,
      "grad_norm": 0.3967690970697916,
      "learning_rate": 5e-06,
      "loss": 0.556,
      "step": 2610
    },
    {
      "epoch": 2.9857549857549857,
      "grad_norm": 0.4004788690287786,
      "learning_rate": 5e-06,
      "loss": 0.5571,
      "step": 2620
    },
    {
      "epoch": 2.9971509971509973,
      "grad_norm": 0.39905352277311656,
      "learning_rate": 5e-06,
      "loss": 0.5443,
      "step": 2630
    },
    {
      "epoch": 2.9982905982905983,
      "eval_loss": 0.6213015913963318,
      "eval_runtime": 442.3632,
      "eval_samples_per_second": 26.727,
      "eval_steps_per_second": 0.418,
      "step": 2631
    },
    {
      "epoch": 2.9982905982905983,
      "step": 2631,
      "total_flos": 2758364765356032.0,
      "train_loss": 0.5923774672614808,
      "train_runtime": 70850.8498,
      "train_samples_per_second": 9.511,
      "train_steps_per_second": 0.037
    }
  ],
  "logging_steps": 10,
  "max_steps": 2631,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2758364765356032.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}