|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.914529914529915, |
|
"eval_steps": 500, |
|
"global_step": 870, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11396011396011396, |
|
"grad_norm": 0.158817857503891, |
|
"learning_rate": 0.0001999348095389677, |
|
"loss": 0.9924, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.22792022792022792, |
|
"grad_norm": 0.21280939877033234, |
|
"learning_rate": 0.000199739323151795, |
|
"loss": 0.819, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3418803418803419, |
|
"grad_norm": 0.22974510490894318, |
|
"learning_rate": 0.00019941379571543596, |
|
"loss": 0.767, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.45584045584045585, |
|
"grad_norm": 0.20720455050468445, |
|
"learning_rate": 0.00019895865165556377, |
|
"loss": 0.6948, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5698005698005698, |
|
"grad_norm": 0.1902514398097992, |
|
"learning_rate": 0.00019837448439320027, |
|
"loss": 0.6509, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6837606837606838, |
|
"grad_norm": 0.18335820734500885, |
|
"learning_rate": 0.00019766205557100868, |
|
"loss": 0.6344, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7977207977207977, |
|
"grad_norm": 0.17900069057941437, |
|
"learning_rate": 0.00019682229406025635, |
|
"loss": 0.6447, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9116809116809117, |
|
"grad_norm": 0.16915330290794373, |
|
"learning_rate": 0.00019585629474974415, |
|
"loss": 0.6335, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 0.16036000847816467, |
|
"learning_rate": 0.00019476531711828027, |
|
"loss": 0.634, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1396011396011396, |
|
"grad_norm": 0.16852639615535736, |
|
"learning_rate": 0.0001935507835925601, |
|
"loss": 0.6058, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.2535612535612537, |
|
"grad_norm": 0.15856905281543732, |
|
"learning_rate": 0.00019221427769259333, |
|
"loss": 0.5902, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.3675213675213675, |
|
"grad_norm": 0.16909192502498627, |
|
"learning_rate": 0.00019075754196709572, |
|
"loss": 0.6051, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 0.1899166703224182, |
|
"learning_rate": 0.00018918247572153823, |
|
"loss": 0.6098, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.5954415954415955, |
|
"grad_norm": 0.17596793174743652, |
|
"learning_rate": 0.00018749113254181498, |
|
"loss": 0.597, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.7094017094017095, |
|
"grad_norm": 0.16560517251491547, |
|
"learning_rate": 0.00018568571761675893, |
|
"loss": 0.5899, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.8233618233618234, |
|
"grad_norm": 0.16513986885547638, |
|
"learning_rate": 0.00018376858486299647, |
|
"loss": 0.5989, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.9373219373219372, |
|
"grad_norm": 0.20360782742500305, |
|
"learning_rate": 0.00018174223385588917, |
|
"loss": 0.5982, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"grad_norm": 0.16155321896076202, |
|
"learning_rate": 0.00017960930657056438, |
|
"loss": 0.593, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.1652421652421654, |
|
"grad_norm": 0.1811763048171997, |
|
"learning_rate": 0.00017737258393728364, |
|
"loss": 0.6077, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.2792022792022792, |
|
"grad_norm": 0.16952063143253326, |
|
"learning_rate": 0.00017503498221564025, |
|
"loss": 0.5749, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.393162393162393, |
|
"grad_norm": 0.17240603268146515, |
|
"learning_rate": 0.0001725995491923131, |
|
"loss": 0.5592, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.5071225071225074, |
|
"grad_norm": 0.1657334417104721, |
|
"learning_rate": 0.00017006946020733425, |
|
"loss": 0.5779, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.6210826210826212, |
|
"grad_norm": 0.16417497396469116, |
|
"learning_rate": 0.0001674480140140514, |
|
"loss": 0.5675, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.735042735042735, |
|
"grad_norm": 0.174308180809021, |
|
"learning_rate": 0.00016473862847818277, |
|
"loss": 0.5977, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.849002849002849, |
|
"grad_norm": 0.17116901278495789, |
|
"learning_rate": 0.0001619448361215723, |
|
"loss": 0.5582, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 0.16816489398479462, |
|
"learning_rate": 0.0001590702795164551, |
|
"loss": 0.5813, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 0.17530137300491333, |
|
"learning_rate": 0.00015611870653623825, |
|
"loss": 0.559, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.190883190883191, |
|
"grad_norm": 0.1744232326745987, |
|
"learning_rate": 0.0001530939654689887, |
|
"loss": 0.5668, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.304843304843305, |
|
"grad_norm": 0.1809006929397583, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.5754, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.4188034188034186, |
|
"grad_norm": 0.16484159231185913, |
|
"learning_rate": 0.00014684084406997903, |
|
"loss": 0.5731, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.532763532763533, |
|
"grad_norm": 0.19075918197631836, |
|
"learning_rate": 0.00014362061661555675, |
|
"loss": 0.5496, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.646723646723647, |
|
"grad_norm": 0.18451079726219177, |
|
"learning_rate": 0.00014034351619898088, |
|
"loss": 0.5463, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.7606837606837606, |
|
"grad_norm": 0.18566997349262238, |
|
"learning_rate": 0.00013701381553399145, |
|
"loss": 0.5768, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.8746438746438745, |
|
"grad_norm": 0.1669853925704956, |
|
"learning_rate": 0.0001336358559150175, |
|
"loss": 0.5606, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.9886039886039883, |
|
"grad_norm": 0.17847082018852234, |
|
"learning_rate": 0.00013021404155695725, |
|
"loss": 0.5756, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 0.16660483181476593, |
|
"learning_rate": 0.00012675283385292212, |
|
"loss": 0.5585, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.216524216524217, |
|
"grad_norm": 0.17163340747356415, |
|
"learning_rate": 0.00012325674555743106, |
|
"loss": 0.5434, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.330484330484331, |
|
"grad_norm": 0.16264410316944122, |
|
"learning_rate": 0.00011973033490264001, |
|
"loss": 0.5449, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.17614829540252686, |
|
"learning_rate": 0.0001161781996552765, |
|
"loss": 0.5574, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.5584045584045585, |
|
"grad_norm": 0.19437584280967712, |
|
"learning_rate": 0.00011260497112202895, |
|
"loss": 0.5448, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.672364672364672, |
|
"grad_norm": 0.19045701622962952, |
|
"learning_rate": 0.00010901530811120655, |
|
"loss": 0.5474, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.786324786324786, |
|
"grad_norm": 0.21330882608890533, |
|
"learning_rate": 0.00010541389085854176, |
|
"loss": 0.5552, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.9002849002849, |
|
"grad_norm": 0.17429402470588684, |
|
"learning_rate": 0.00010180541492505604, |
|
"loss": 0.5495, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.014245014245014, |
|
"grad_norm": 0.17785826325416565, |
|
"learning_rate": 9.819458507494394e-05, |
|
"loss": 0.5583, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"grad_norm": 0.19076977670192719, |
|
"learning_rate": 9.458610914145826e-05, |
|
"loss": 0.5291, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.2421652421652425, |
|
"grad_norm": 0.19988471269607544, |
|
"learning_rate": 9.098469188879349e-05, |
|
"loss": 0.5311, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.356125356125356, |
|
"grad_norm": 0.19638335704803467, |
|
"learning_rate": 8.739502887797107e-05, |
|
"loss": 0.5684, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.47008547008547, |
|
"grad_norm": 0.2043437659740448, |
|
"learning_rate": 8.382180034472353e-05, |
|
"loss": 0.5371, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.584045584045584, |
|
"grad_norm": 0.2045976221561432, |
|
"learning_rate": 8.026966509736001e-05, |
|
"loss": 0.5307, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.698005698005698, |
|
"grad_norm": 0.21237310767173767, |
|
"learning_rate": 7.674325444256899e-05, |
|
"loss": 0.5483, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.811965811965812, |
|
"grad_norm": 0.22306476533412933, |
|
"learning_rate": 7.324716614707793e-05, |
|
"loss": 0.5572, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 5.925925925925926, |
|
"grad_norm": 0.20065273344516754, |
|
"learning_rate": 6.978595844304271e-05, |
|
"loss": 0.5363, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.0398860398860394, |
|
"grad_norm": 0.21213628351688385, |
|
"learning_rate": 6.636414408498249e-05, |
|
"loss": 0.521, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 0.1936779022216797, |
|
"learning_rate": 6.298618446600856e-05, |
|
"loss": 0.5283, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.267806267806268, |
|
"grad_norm": 0.19564631581306458, |
|
"learning_rate": 5.965648380101916e-05, |
|
"loss": 0.5301, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.381766381766382, |
|
"grad_norm": 0.20069913566112518, |
|
"learning_rate": 5.6379383384443255e-05, |
|
"loss": 0.5204, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.495726495726496, |
|
"grad_norm": 0.21325626969337463, |
|
"learning_rate": 5.3159155930021e-05, |
|
"loss": 0.5419, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.60968660968661, |
|
"grad_norm": 0.21303197741508484, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.543, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.7236467236467234, |
|
"grad_norm": 0.21136346459388733, |
|
"learning_rate": 4.6906034531011346e-05, |
|
"loss": 0.5217, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 6.837606837606837, |
|
"grad_norm": 0.21392931044101715, |
|
"learning_rate": 4.388129346376178e-05, |
|
"loss": 0.5288, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 6.951566951566951, |
|
"grad_norm": 0.22880437970161438, |
|
"learning_rate": 4.092972048354491e-05, |
|
"loss": 0.5273, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.065527065527066, |
|
"grad_norm": 0.21491903066635132, |
|
"learning_rate": 3.80551638784277e-05, |
|
"loss": 0.5332, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.17948717948718, |
|
"grad_norm": 0.26633119583129883, |
|
"learning_rate": 3.5261371521817244e-05, |
|
"loss": 0.5239, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 7.293447293447294, |
|
"grad_norm": 0.23685385286808014, |
|
"learning_rate": 3.2551985985948616e-05, |
|
"loss": 0.5309, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 7.407407407407407, |
|
"grad_norm": 0.22292840480804443, |
|
"learning_rate": 2.993053979266577e-05, |
|
"loss": 0.5372, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 7.521367521367521, |
|
"grad_norm": 0.2220107614994049, |
|
"learning_rate": 2.7400450807686938e-05, |
|
"loss": 0.5083, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 7.635327635327635, |
|
"grad_norm": 0.2191537618637085, |
|
"learning_rate": 2.496501778435977e-05, |
|
"loss": 0.5164, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 7.749287749287749, |
|
"grad_norm": 0.22593119740486145, |
|
"learning_rate": 2.2627416062716366e-05, |
|
"loss": 0.5152, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 7.863247863247864, |
|
"grad_norm": 0.23532789945602417, |
|
"learning_rate": 2.0390693429435627e-05, |
|
"loss": 0.5269, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 7.977207977207978, |
|
"grad_norm": 0.25111591815948486, |
|
"learning_rate": 1.825776614411082e-05, |
|
"loss": 0.5335, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.091168091168091, |
|
"grad_norm": 0.21956747770309448, |
|
"learning_rate": 1.6231415137003537e-05, |
|
"loss": 0.5144, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 0.23355403542518616, |
|
"learning_rate": 1.4314282383241096e-05, |
|
"loss": 0.5294, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 8.31908831908832, |
|
"grad_norm": 0.23712006211280823, |
|
"learning_rate": 1.2508867458185037e-05, |
|
"loss": 0.5229, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 8.433048433048434, |
|
"grad_norm": 0.22506175935268402, |
|
"learning_rate": 1.0817524278461776e-05, |
|
"loss": 0.5212, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 8.547008547008547, |
|
"grad_norm": 0.21853385865688324, |
|
"learning_rate": 9.242458032904311e-06, |
|
"loss": 0.5193, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 8.660968660968662, |
|
"grad_norm": 0.23257511854171753, |
|
"learning_rate": 7.785722307406684e-06, |
|
"loss": 0.5039, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 8.774928774928775, |
|
"grad_norm": 0.21563945710659027, |
|
"learning_rate": 6.4492164074399065e-06, |
|
"loss": 0.5232, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 0.22108329832553864, |
|
"learning_rate": 5.2346828817197655e-06, |
|
"loss": 0.5309, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 9.002849002849002, |
|
"grad_norm": 0.22330021858215332, |
|
"learning_rate": 4.143705250255869e-06, |
|
"loss": 0.5287, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 9.116809116809117, |
|
"grad_norm": 0.22394247353076935, |
|
"learning_rate": 3.1777059397436692e-06, |
|
"loss": 0.5007, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 0.2144930511713028, |
|
"learning_rate": 2.3379444289913342e-06, |
|
"loss": 0.5277, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 9.344729344729345, |
|
"grad_norm": 0.2214236557483673, |
|
"learning_rate": 1.6255156067997323e-06, |
|
"loss": 0.5173, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 9.45868945868946, |
|
"grad_norm": 0.2192196100950241, |
|
"learning_rate": 1.0413483444362771e-06, |
|
"loss": 0.5123, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 9.572649572649572, |
|
"grad_norm": 0.22837017476558685, |
|
"learning_rate": 5.862042845640403e-07, |
|
"loss": 0.5279, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 9.686609686609687, |
|
"grad_norm": 0.21172335743904114, |
|
"learning_rate": 2.606768482050215e-07, |
|
"loss": 0.5263, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 9.8005698005698, |
|
"grad_norm": 0.23530949652194977, |
|
"learning_rate": 6.519046103230508e-08, |
|
"loss": 0.5202, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 9.914529914529915, |
|
"grad_norm": 0.23058444261550903, |
|
"learning_rate": 0.0, |
|
"loss": 0.5243, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 9.914529914529915, |
|
"step": 870, |
|
"total_flos": 5.67984355540992e+16, |
|
"train_loss": 0.5655439464525245, |
|
"train_runtime": 2716.1071, |
|
"train_samples_per_second": 1.292, |
|
"train_steps_per_second": 0.32 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 870, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 5.67984355540992e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|