|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9981447124304266,
  "eval_steps": 500,
  "global_step": 1212,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024737167594310452,
      "grad_norm": 3.4830790839772257,
      "learning_rate": 5e-06,
      "loss": 0.8916,
      "step": 10
    },
    {
      "epoch": 0.049474335188620905,
      "grad_norm": 6.942725060810088,
      "learning_rate": 5e-06,
      "loss": 0.7677,
      "step": 20
    },
    {
      "epoch": 0.07421150278293136,
      "grad_norm": 1.8951062871081399,
      "learning_rate": 5e-06,
      "loss": 0.7288,
      "step": 30
    },
    {
      "epoch": 0.09894867037724181,
      "grad_norm": 1.9357008010716068,
      "learning_rate": 5e-06,
      "loss": 0.7031,
      "step": 40
    },
    {
      "epoch": 0.12368583797155226,
      "grad_norm": 0.9499312221395673,
      "learning_rate": 5e-06,
      "loss": 0.6844,
      "step": 50
    },
    {
      "epoch": 0.14842300556586271,
      "grad_norm": 0.7455987403828673,
      "learning_rate": 5e-06,
      "loss": 0.6598,
      "step": 60
    },
    {
      "epoch": 0.17316017316017315,
      "grad_norm": 0.6470160824911549,
      "learning_rate": 5e-06,
      "loss": 0.6485,
      "step": 70
    },
    {
      "epoch": 0.19789734075448362,
      "grad_norm": 0.6635217517468123,
      "learning_rate": 5e-06,
      "loss": 0.6398,
      "step": 80
    },
    {
      "epoch": 0.22263450834879406,
      "grad_norm": 0.5563048334006576,
      "learning_rate": 5e-06,
      "loss": 0.6378,
      "step": 90
    },
    {
      "epoch": 0.24737167594310452,
      "grad_norm": 0.685513800699662,
      "learning_rate": 5e-06,
      "loss": 0.6307,
      "step": 100
    },
    {
      "epoch": 0.272108843537415,
      "grad_norm": 0.8408848874695213,
      "learning_rate": 5e-06,
      "loss": 0.6176,
      "step": 110
    },
    {
      "epoch": 0.29684601113172543,
      "grad_norm": 0.6773751764933612,
      "learning_rate": 5e-06,
      "loss": 0.6178,
      "step": 120
    },
    {
      "epoch": 0.32158317872603587,
      "grad_norm": 0.7027587036277353,
      "learning_rate": 5e-06,
      "loss": 0.6133,
      "step": 130
    },
    {
      "epoch": 0.3463203463203463,
      "grad_norm": 0.6157245730205899,
      "learning_rate": 5e-06,
      "loss": 0.6092,
      "step": 140
    },
    {
      "epoch": 0.37105751391465674,
      "grad_norm": 0.5802260294344969,
      "learning_rate": 5e-06,
      "loss": 0.6046,
      "step": 150
    },
    {
      "epoch": 0.39579468150896724,
      "grad_norm": 0.6344807037013861,
      "learning_rate": 5e-06,
      "loss": 0.6009,
      "step": 160
    },
    {
      "epoch": 0.4205318491032777,
      "grad_norm": 0.48480545471576164,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 170
    },
    {
      "epoch": 0.4452690166975881,
      "grad_norm": 0.6380631405566793,
      "learning_rate": 5e-06,
      "loss": 0.6004,
      "step": 180
    },
    {
      "epoch": 0.47000618429189855,
      "grad_norm": 0.5505169198792789,
      "learning_rate": 5e-06,
      "loss": 0.5994,
      "step": 190
    },
    {
      "epoch": 0.49474335188620905,
      "grad_norm": 0.6242896844382176,
      "learning_rate": 5e-06,
      "loss": 0.5985,
      "step": 200
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 0.5255950166858494,
      "learning_rate": 5e-06,
      "loss": 0.6022,
      "step": 210
    },
    {
      "epoch": 0.54421768707483,
      "grad_norm": 0.5516723643293726,
      "learning_rate": 5e-06,
      "loss": 0.5927,
      "step": 220
    },
    {
      "epoch": 0.5689548546691404,
      "grad_norm": 0.584042711614131,
      "learning_rate": 5e-06,
      "loss": 0.5925,
      "step": 230
    },
    {
      "epoch": 0.5936920222634509,
      "grad_norm": 0.6260676537261808,
      "learning_rate": 5e-06,
      "loss": 0.5928,
      "step": 240
    },
    {
      "epoch": 0.6184291898577613,
      "grad_norm": 0.5176619963131881,
      "learning_rate": 5e-06,
      "loss": 0.5858,
      "step": 250
    },
    {
      "epoch": 0.6431663574520717,
      "grad_norm": 0.5024236252235582,
      "learning_rate": 5e-06,
      "loss": 0.5879,
      "step": 260
    },
    {
      "epoch": 0.6679035250463822,
      "grad_norm": 0.6803247050243845,
      "learning_rate": 5e-06,
      "loss": 0.5844,
      "step": 270
    },
    {
      "epoch": 0.6926406926406926,
      "grad_norm": 0.6015791834486387,
      "learning_rate": 5e-06,
      "loss": 0.5755,
      "step": 280
    },
    {
      "epoch": 0.717377860235003,
      "grad_norm": 0.5299909658526826,
      "learning_rate": 5e-06,
      "loss": 0.5843,
      "step": 290
    },
    {
      "epoch": 0.7421150278293135,
      "grad_norm": 0.5088932556423367,
      "learning_rate": 5e-06,
      "loss": 0.5864,
      "step": 300
    },
    {
      "epoch": 0.766852195423624,
      "grad_norm": 0.5964832923195739,
      "learning_rate": 5e-06,
      "loss": 0.5822,
      "step": 310
    },
    {
      "epoch": 0.7915893630179345,
      "grad_norm": 0.5860489284099033,
      "learning_rate": 5e-06,
      "loss": 0.5727,
      "step": 320
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.5219580208519626,
      "learning_rate": 5e-06,
      "loss": 0.5794,
      "step": 330
    },
    {
      "epoch": 0.8410636982065554,
      "grad_norm": 0.5269259293045121,
      "learning_rate": 5e-06,
      "loss": 0.5793,
      "step": 340
    },
    {
      "epoch": 0.8658008658008658,
      "grad_norm": 0.5291953882291388,
      "learning_rate": 5e-06,
      "loss": 0.5744,
      "step": 350
    },
    {
      "epoch": 0.8905380333951762,
      "grad_norm": 0.521358302515632,
      "learning_rate": 5e-06,
      "loss": 0.5707,
      "step": 360
    },
    {
      "epoch": 0.9152752009894867,
      "grad_norm": 0.5774675957700983,
      "learning_rate": 5e-06,
      "loss": 0.5765,
      "step": 370
    },
    {
      "epoch": 0.9400123685837971,
      "grad_norm": 0.5172232609124243,
      "learning_rate": 5e-06,
      "loss": 0.5714,
      "step": 380
    },
    {
      "epoch": 0.9647495361781077,
      "grad_norm": 0.5240273423167071,
      "learning_rate": 5e-06,
      "loss": 0.5682,
      "step": 390
    },
    {
      "epoch": 0.9894867037724181,
      "grad_norm": 0.6149272559928857,
      "learning_rate": 5e-06,
      "loss": 0.5719,
      "step": 400
    },
    {
      "epoch": 0.9993815708101422,
      "eval_loss": 0.5683358907699585,
      "eval_runtime": 219.0047,
      "eval_samples_per_second": 49.734,
      "eval_steps_per_second": 0.393,
      "step": 404
    },
    {
      "epoch": 1.0142238713667284,
      "grad_norm": 0.5073078170608074,
      "learning_rate": 5e-06,
      "loss": 0.547,
      "step": 410
    },
    {
      "epoch": 1.0389610389610389,
      "grad_norm": 0.6437533127258526,
      "learning_rate": 5e-06,
      "loss": 0.5316,
      "step": 420
    },
    {
      "epoch": 1.0636982065553493,
      "grad_norm": 0.6100879209869073,
      "learning_rate": 5e-06,
      "loss": 0.5301,
      "step": 430
    },
    {
      "epoch": 1.08843537414966,
      "grad_norm": 0.48108127450705374,
      "learning_rate": 5e-06,
      "loss": 0.5294,
      "step": 440
    },
    {
      "epoch": 1.1131725417439704,
      "grad_norm": 0.5868928526612365,
      "learning_rate": 5e-06,
      "loss": 0.5332,
      "step": 450
    },
    {
      "epoch": 1.1379097093382808,
      "grad_norm": 0.6202895625151273,
      "learning_rate": 5e-06,
      "loss": 0.535,
      "step": 460
    },
    {
      "epoch": 1.1626468769325913,
      "grad_norm": 0.5075954994852542,
      "learning_rate": 5e-06,
      "loss": 0.5326,
      "step": 470
    },
    {
      "epoch": 1.1873840445269017,
      "grad_norm": 0.4728741075680983,
      "learning_rate": 5e-06,
      "loss": 0.532,
      "step": 480
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 0.5081500756875807,
      "learning_rate": 5e-06,
      "loss": 0.5348,
      "step": 490
    },
    {
      "epoch": 1.2368583797155226,
      "grad_norm": 0.6910919006687529,
      "learning_rate": 5e-06,
      "loss": 0.5197,
      "step": 500
    },
    {
      "epoch": 1.261595547309833,
      "grad_norm": 0.5501274010525514,
      "learning_rate": 5e-06,
      "loss": 0.5222,
      "step": 510
    },
    {
      "epoch": 1.2863327149041435,
      "grad_norm": 0.7316007600357546,
      "learning_rate": 5e-06,
      "loss": 0.5335,
      "step": 520
    },
    {
      "epoch": 1.311069882498454,
      "grad_norm": 0.5057200127850379,
      "learning_rate": 5e-06,
      "loss": 0.5288,
      "step": 530
    },
    {
      "epoch": 1.3358070500927643,
      "grad_norm": 0.5719065403986006,
      "learning_rate": 5e-06,
      "loss": 0.529,
      "step": 540
    },
    {
      "epoch": 1.3605442176870748,
      "grad_norm": 0.5660341689340015,
      "learning_rate": 5e-06,
      "loss": 0.5282,
      "step": 550
    },
    {
      "epoch": 1.3852813852813852,
      "grad_norm": 0.7015838028229988,
      "learning_rate": 5e-06,
      "loss": 0.5296,
      "step": 560
    },
    {
      "epoch": 1.4100185528756957,
      "grad_norm": 0.5640160639184895,
      "learning_rate": 5e-06,
      "loss": 0.531,
      "step": 570
    },
    {
      "epoch": 1.434755720470006,
      "grad_norm": 0.5241187267275058,
      "learning_rate": 5e-06,
      "loss": 0.5277,
      "step": 580
    },
    {
      "epoch": 1.4594928880643168,
      "grad_norm": 0.5322661450321486,
      "learning_rate": 5e-06,
      "loss": 0.5223,
      "step": 590
    },
    {
      "epoch": 1.4842300556586272,
      "grad_norm": 0.5414175662054105,
      "learning_rate": 5e-06,
      "loss": 0.5253,
      "step": 600
    },
    {
      "epoch": 1.5089672232529376,
      "grad_norm": 0.5120018621643256,
      "learning_rate": 5e-06,
      "loss": 0.5171,
      "step": 610
    },
    {
      "epoch": 1.533704390847248,
      "grad_norm": 0.5872164754716582,
      "learning_rate": 5e-06,
      "loss": 0.5263,
      "step": 620
    },
    {
      "epoch": 1.5584415584415585,
      "grad_norm": 0.4661784659324605,
      "learning_rate": 5e-06,
      "loss": 0.5217,
      "step": 630
    },
    {
      "epoch": 1.583178726035869,
      "grad_norm": 0.464230352567236,
      "learning_rate": 5e-06,
      "loss": 0.5204,
      "step": 640
    },
    {
      "epoch": 1.6079158936301794,
      "grad_norm": 0.5032827937357744,
      "learning_rate": 5e-06,
      "loss": 0.5216,
      "step": 650
    },
    {
      "epoch": 1.6326530612244898,
      "grad_norm": 0.4996792160145028,
      "learning_rate": 5e-06,
      "loss": 0.5218,
      "step": 660
    },
    {
      "epoch": 1.6573902288188003,
      "grad_norm": 0.5271632514608294,
      "learning_rate": 5e-06,
      "loss": 0.5226,
      "step": 670
    },
    {
      "epoch": 1.6821273964131107,
      "grad_norm": 0.49347623420693476,
      "learning_rate": 5e-06,
      "loss": 0.5193,
      "step": 680
    },
    {
      "epoch": 1.7068645640074211,
      "grad_norm": 0.6049309413421875,
      "learning_rate": 5e-06,
      "loss": 0.5247,
      "step": 690
    },
    {
      "epoch": 1.7316017316017316,
      "grad_norm": 0.5973360853484692,
      "learning_rate": 5e-06,
      "loss": 0.516,
      "step": 700
    },
    {
      "epoch": 1.756338899196042,
      "grad_norm": 0.49072766464621603,
      "learning_rate": 5e-06,
      "loss": 0.5179,
      "step": 710
    },
    {
      "epoch": 1.7810760667903525,
      "grad_norm": 0.5091642966772914,
      "learning_rate": 5e-06,
      "loss": 0.5181,
      "step": 720
    },
    {
      "epoch": 1.805813234384663,
      "grad_norm": 0.5801588473122157,
      "learning_rate": 5e-06,
      "loss": 0.5177,
      "step": 730
    },
    {
      "epoch": 1.8305504019789733,
      "grad_norm": 0.48516680968832,
      "learning_rate": 5e-06,
      "loss": 0.5231,
      "step": 740
    },
    {
      "epoch": 1.8552875695732838,
      "grad_norm": 0.46039086459013556,
      "learning_rate": 5e-06,
      "loss": 0.5249,
      "step": 750
    },
    {
      "epoch": 1.8800247371675942,
      "grad_norm": 0.5083902833654592,
      "learning_rate": 5e-06,
      "loss": 0.518,
      "step": 760
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.5109759169478353,
      "learning_rate": 5e-06,
      "loss": 0.5177,
      "step": 770
    },
    {
      "epoch": 1.929499072356215,
      "grad_norm": 0.4767006976535441,
      "learning_rate": 5e-06,
      "loss": 0.5132,
      "step": 780
    },
    {
      "epoch": 1.9542362399505255,
      "grad_norm": 0.5035389849175771,
      "learning_rate": 5e-06,
      "loss": 0.5181,
      "step": 790
    },
    {
      "epoch": 1.978973407544836,
      "grad_norm": 0.5072097192727141,
      "learning_rate": 5e-06,
      "loss": 0.5166,
      "step": 800
    },
    {
      "epoch": 1.9987631416202845,
      "eval_loss": 0.5430302023887634,
      "eval_runtime": 219.3095,
      "eval_samples_per_second": 49.665,
      "eval_steps_per_second": 0.392,
      "step": 808
    },
    {
      "epoch": 2.0037105751391464,
      "grad_norm": 0.5742594976510665,
      "learning_rate": 5e-06,
      "loss": 0.5105,
      "step": 810
    },
    {
      "epoch": 2.028447742733457,
      "grad_norm": 0.5679263759068595,
      "learning_rate": 5e-06,
      "loss": 0.4801,
      "step": 820
    },
    {
      "epoch": 2.0531849103277673,
      "grad_norm": 0.5672377634558515,
      "learning_rate": 5e-06,
      "loss": 0.4818,
      "step": 830
    },
    {
      "epoch": 2.0779220779220777,
      "grad_norm": 0.6793815683511014,
      "learning_rate": 5e-06,
      "loss": 0.4756,
      "step": 840
    },
    {
      "epoch": 2.102659245516388,
      "grad_norm": 0.5158458141043201,
      "learning_rate": 5e-06,
      "loss": 0.4768,
      "step": 850
    },
    {
      "epoch": 2.1273964131106986,
      "grad_norm": 0.6722479592255648,
      "learning_rate": 5e-06,
      "loss": 0.481,
      "step": 860
    },
    {
      "epoch": 2.1521335807050095,
      "grad_norm": 0.622090680752648,
      "learning_rate": 5e-06,
      "loss": 0.4806,
      "step": 870
    },
    {
      "epoch": 2.17687074829932,
      "grad_norm": 0.5446538279562969,
      "learning_rate": 5e-06,
      "loss": 0.4792,
      "step": 880
    },
    {
      "epoch": 2.2016079158936304,
      "grad_norm": 0.9897157820333273,
      "learning_rate": 5e-06,
      "loss": 0.4743,
      "step": 890
    },
    {
      "epoch": 2.226345083487941,
      "grad_norm": 0.5709797478235871,
      "learning_rate": 5e-06,
      "loss": 0.4745,
      "step": 900
    },
    {
      "epoch": 2.2510822510822512,
      "grad_norm": 0.5508207898306552,
      "learning_rate": 5e-06,
      "loss": 0.4825,
      "step": 910
    },
    {
      "epoch": 2.2758194186765617,
      "grad_norm": 0.4968712039591529,
      "learning_rate": 5e-06,
      "loss": 0.4805,
      "step": 920
    },
    {
      "epoch": 2.300556586270872,
      "grad_norm": 0.5588550182736718,
      "learning_rate": 5e-06,
      "loss": 0.4816,
      "step": 930
    },
    {
      "epoch": 2.3252937538651826,
      "grad_norm": 0.6560301930501052,
      "learning_rate": 5e-06,
      "loss": 0.4834,
      "step": 940
    },
    {
      "epoch": 2.350030921459493,
      "grad_norm": 0.47583877726868273,
      "learning_rate": 5e-06,
      "loss": 0.4785,
      "step": 950
    },
    {
      "epoch": 2.3747680890538034,
      "grad_norm": 0.5212062104260379,
      "learning_rate": 5e-06,
      "loss": 0.4787,
      "step": 960
    },
    {
      "epoch": 2.399505256648114,
      "grad_norm": 0.47451320868129626,
      "learning_rate": 5e-06,
      "loss": 0.4829,
      "step": 970
    },
    {
      "epoch": 2.4242424242424243,
      "grad_norm": 0.4830738101162483,
      "learning_rate": 5e-06,
      "loss": 0.4843,
      "step": 980
    },
    {
      "epoch": 2.4489795918367347,
      "grad_norm": 0.5182106761363315,
      "learning_rate": 5e-06,
      "loss": 0.4757,
      "step": 990
    },
    {
      "epoch": 2.473716759431045,
      "grad_norm": 0.5153536365130167,
      "learning_rate": 5e-06,
      "loss": 0.4768,
      "step": 1000
    },
    {
      "epoch": 2.4984539270253556,
      "grad_norm": 0.5417873882437457,
      "learning_rate": 5e-06,
      "loss": 0.4782,
      "step": 1010
    },
    {
      "epoch": 2.523191094619666,
      "grad_norm": 0.5527273801359924,
      "learning_rate": 5e-06,
      "loss": 0.475,
      "step": 1020
    },
    {
      "epoch": 2.5479282622139765,
      "grad_norm": 0.5335187564602286,
      "learning_rate": 5e-06,
      "loss": 0.4792,
      "step": 1030
    },
    {
      "epoch": 2.572665429808287,
      "grad_norm": 0.5000885945895702,
      "learning_rate": 5e-06,
      "loss": 0.4769,
      "step": 1040
    },
    {
      "epoch": 2.5974025974025974,
      "grad_norm": 0.518335000088379,
      "learning_rate": 5e-06,
      "loss": 0.4771,
      "step": 1050
    },
    {
      "epoch": 2.622139764996908,
      "grad_norm": 0.5506780400189991,
      "learning_rate": 5e-06,
      "loss": 0.4813,
      "step": 1060
    },
    {
      "epoch": 2.6468769325912183,
      "grad_norm": 0.5009935325893752,
      "learning_rate": 5e-06,
      "loss": 0.479,
      "step": 1070
    },
    {
      "epoch": 2.6716141001855287,
      "grad_norm": 0.5265944850039301,
      "learning_rate": 5e-06,
      "loss": 0.4808,
      "step": 1080
    },
    {
      "epoch": 2.696351267779839,
      "grad_norm": 0.5426869881704958,
      "learning_rate": 5e-06,
      "loss": 0.4798,
      "step": 1090
    },
    {
      "epoch": 2.7210884353741496,
      "grad_norm": 0.6372802273548948,
      "learning_rate": 5e-06,
      "loss": 0.4824,
      "step": 1100
    },
    {
      "epoch": 2.74582560296846,
      "grad_norm": 0.5506322753225094,
      "learning_rate": 5e-06,
      "loss": 0.4748,
      "step": 1110
    },
    {
      "epoch": 2.7705627705627704,
      "grad_norm": 0.6432775550069307,
      "learning_rate": 5e-06,
      "loss": 0.4785,
      "step": 1120
    },
    {
      "epoch": 2.795299938157081,
      "grad_norm": 0.5422596013158468,
      "learning_rate": 5e-06,
      "loss": 0.4756,
      "step": 1130
    },
    {
      "epoch": 2.8200371057513913,
      "grad_norm": 0.5766402488022242,
      "learning_rate": 5e-06,
      "loss": 0.4813,
      "step": 1140
    },
    {
      "epoch": 2.8447742733457018,
      "grad_norm": 0.6098697051701109,
      "learning_rate": 5e-06,
      "loss": 0.4813,
      "step": 1150
    },
    {
      "epoch": 2.869511440940012,
      "grad_norm": 0.6756833035066511,
      "learning_rate": 5e-06,
      "loss": 0.4774,
      "step": 1160
    },
    {
      "epoch": 2.8942486085343226,
      "grad_norm": 0.6408060819703033,
      "learning_rate": 5e-06,
      "loss": 0.4762,
      "step": 1170
    },
    {
      "epoch": 2.9189857761286335,
      "grad_norm": 0.5130121997574635,
      "learning_rate": 5e-06,
      "loss": 0.4764,
      "step": 1180
    },
    {
      "epoch": 2.9437229437229435,
      "grad_norm": 0.5162312463500585,
      "learning_rate": 5e-06,
      "loss": 0.4777,
      "step": 1190
    },
    {
      "epoch": 2.9684601113172544,
      "grad_norm": 0.5924512448077986,
      "learning_rate": 5e-06,
      "loss": 0.4791,
      "step": 1200
    },
    {
      "epoch": 2.9931972789115644,
      "grad_norm": 0.502845968511808,
      "learning_rate": 5e-06,
      "loss": 0.4761,
      "step": 1210
    },
    {
      "epoch": 2.9981447124304266,
      "eval_loss": 0.537095308303833,
      "eval_runtime": 219.5171,
      "eval_samples_per_second": 49.618,
      "eval_steps_per_second": 0.392,
      "step": 1212
    },
    {
      "epoch": 2.9981447124304266,
      "step": 1212,
      "total_flos": 2029726382161920.0,
      "train_loss": 0.5400018749063952,
      "train_runtime": 36478.2162,
      "train_samples_per_second": 17.019,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1212,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2029726382161920.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}