{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.0520784509785541,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002603922548927705,
      "grad_norm": 0.8521247506141663,
      "learning_rate": 5.194805194805195e-06,
      "loss": 0.7412,
      "step": 10
    },
    {
      "epoch": 0.000520784509785541,
      "grad_norm": 0.6229312419891357,
      "learning_rate": 1.038961038961039e-05,
      "loss": 0.7138,
      "step": 20
    },
    {
      "epoch": 0.0007811767646783114,
      "grad_norm": 0.4566498100757599,
      "learning_rate": 1.5584415584415583e-05,
      "loss": 0.7079,
      "step": 30
    },
    {
      "epoch": 0.001041569019571082,
      "grad_norm": 0.4316692650318146,
      "learning_rate": 2.077922077922078e-05,
      "loss": 0.6988,
      "step": 40
    },
    {
      "epoch": 0.0013019612744638524,
      "grad_norm": 0.615436315536499,
      "learning_rate": 2.5974025974025972e-05,
      "loss": 0.6937,
      "step": 50
    },
    {
      "epoch": 0.0015623535293566228,
      "grad_norm": 0.48698583245277405,
      "learning_rate": 3.1168831168831166e-05,
      "loss": 0.7043,
      "step": 60
    },
    {
      "epoch": 0.0018227457842493933,
      "grad_norm": 0.3984021544456482,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.6563,
      "step": 70
    },
    {
      "epoch": 0.002083138039142164,
      "grad_norm": 0.37576180696487427,
      "learning_rate": 4.155844155844156e-05,
      "loss": 0.6462,
      "step": 80
    },
    {
      "epoch": 0.0023435302940349343,
      "grad_norm": 0.35269680619239807,
      "learning_rate": 4.675324675324675e-05,
      "loss": 0.6656,
      "step": 90
    },
    {
      "epoch": 0.0026039225489277048,
      "grad_norm": 0.31541451811790466,
      "learning_rate": 5.1948051948051944e-05,
      "loss": 0.6547,
      "step": 100
    },
    {
      "epoch": 0.002864314803820475,
      "grad_norm": 0.3462330400943756,
      "learning_rate": 5.714285714285714e-05,
      "loss": 0.6621,
      "step": 110
    },
    {
      "epoch": 0.0031247070587132456,
      "grad_norm": 0.3465985953807831,
      "learning_rate": 6.233766233766233e-05,
      "loss": 0.6273,
      "step": 120
    },
    {
      "epoch": 0.003385099313606016,
      "grad_norm": 0.3297797441482544,
      "learning_rate": 6.753246753246754e-05,
      "loss": 0.6559,
      "step": 130
    },
    {
      "epoch": 0.0036454915684987865,
      "grad_norm": 0.3888818621635437,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.6756,
      "step": 140
    },
    {
      "epoch": 0.003905883823391557,
      "grad_norm": 0.3542368710041046,
      "learning_rate": 7.792207792207793e-05,
      "loss": 0.6506,
      "step": 150
    },
    {
      "epoch": 0.004166276078284328,
      "grad_norm": 0.37369370460510254,
      "learning_rate": 8.311688311688312e-05,
      "loss": 0.6645,
      "step": 160
    },
    {
      "epoch": 0.004426668333177098,
      "grad_norm": 0.3700549900531769,
      "learning_rate": 8.831168831168831e-05,
      "loss": 0.6727,
      "step": 170
    },
    {
      "epoch": 0.004687060588069869,
      "grad_norm": 0.32032889127731323,
      "learning_rate": 9.35064935064935e-05,
      "loss": 0.6529,
      "step": 180
    },
    {
      "epoch": 0.004947452842962639,
      "grad_norm": 0.3331650495529175,
      "learning_rate": 9.870129870129871e-05,
      "loss": 0.6627,
      "step": 190
    },
    {
      "epoch": 0.0052078450978554095,
      "grad_norm": 0.3300645351409912,
      "learning_rate": 0.00010389610389610389,
      "loss": 0.676,
      "step": 200
    },
    {
      "epoch": 0.0054682373527481795,
      "grad_norm": 0.350356787443161,
      "learning_rate": 0.00010909090909090909,
      "loss": 0.6564,
      "step": 210
    },
    {
      "epoch": 0.00572862960764095,
      "grad_norm": 0.382756769657135,
      "learning_rate": 0.00011428571428571428,
      "loss": 0.6243,
      "step": 220
    },
    {
      "epoch": 0.00598902186253372,
      "grad_norm": 0.34450188279151917,
      "learning_rate": 0.00011948051948051949,
      "loss": 0.6611,
      "step": 230
    },
    {
      "epoch": 0.006249414117426491,
      "grad_norm": 0.3705821633338928,
      "learning_rate": 0.00012467532467532467,
      "loss": 0.6384,
      "step": 240
    },
    {
      "epoch": 0.006509806372319262,
      "grad_norm": 0.36822304129600525,
      "learning_rate": 0.00012987012987012987,
      "loss": 0.6415,
      "step": 250
    },
    {
      "epoch": 0.006770198627212032,
      "grad_norm": 0.32358303666114807,
      "learning_rate": 0.00013506493506493507,
      "loss": 0.6584,
      "step": 260
    },
    {
      "epoch": 0.007030590882104803,
      "grad_norm": 0.33386844396591187,
      "learning_rate": 0.00014025974025974028,
      "loss": 0.6702,
      "step": 270
    },
    {
      "epoch": 0.007290983136997573,
      "grad_norm": 0.32447949051856995,
      "learning_rate": 0.00014545454545454546,
      "loss": 0.6519,
      "step": 280
    },
    {
      "epoch": 0.007551375391890344,
      "grad_norm": 0.3388073146343231,
      "learning_rate": 0.00015064935064935066,
      "loss": 0.6735,
      "step": 290
    },
    {
      "epoch": 0.007811767646783114,
      "grad_norm": 0.39655518531799316,
      "learning_rate": 0.00015584415584415587,
      "loss": 0.672,
      "step": 300
    },
    {
      "epoch": 0.008072159901675884,
      "grad_norm": 0.41258928179740906,
      "learning_rate": 0.00016103896103896104,
      "loss": 0.6626,
      "step": 310
    },
    {
      "epoch": 0.008332552156568656,
      "grad_norm": 0.3963010311126709,
      "learning_rate": 0.00016623376623376625,
      "loss": 0.6653,
      "step": 320
    },
    {
      "epoch": 0.008592944411461426,
      "grad_norm": 0.3641106188297272,
      "learning_rate": 0.00017142857142857143,
      "loss": 0.6389,
      "step": 330
    },
    {
      "epoch": 0.008853336666354196,
      "grad_norm": 0.38745763897895813,
      "learning_rate": 0.00017662337662337663,
      "loss": 0.6928,
      "step": 340
    },
    {
      "epoch": 0.009113728921246966,
      "grad_norm": 0.4573372006416321,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.6679,
      "step": 350
    },
    {
      "epoch": 0.009374121176139737,
      "grad_norm": 0.45714282989501953,
      "learning_rate": 0.000187012987012987,
      "loss": 0.6453,
      "step": 360
    },
    {
      "epoch": 0.009634513431032507,
      "grad_norm": 0.37631818652153015,
      "learning_rate": 0.00019220779220779222,
      "loss": 0.6467,
      "step": 370
    },
    {
      "epoch": 0.009894905685925277,
      "grad_norm": 0.3658345639705658,
      "learning_rate": 0.00019740259740259742,
      "loss": 0.6631,
      "step": 380
    },
    {
      "epoch": 0.010155297940818049,
      "grad_norm": 0.3953540623188019,
      "learning_rate": 0.00019999996515752773,
      "loss": 0.6573,
      "step": 390
    },
    {
      "epoch": 0.010415690195710819,
      "grad_norm": 0.377763569355011,
      "learning_rate": 0.00019999968641789507,
      "loss": 0.6664,
      "step": 400
    },
    {
      "epoch": 0.010676082450603589,
      "grad_norm": 0.37128835916519165,
      "learning_rate": 0.0001999991289394067,
      "loss": 0.6342,
      "step": 410
    },
    {
      "epoch": 0.010936474705496359,
      "grad_norm": 0.33881694078445435,
      "learning_rate": 0.00019999829272361654,
      "loss": 0.6476,
      "step": 420
    },
    {
      "epoch": 0.01119686696038913,
      "grad_norm": 0.39774075150489807,
      "learning_rate": 0.00019999717777285545,
      "loss": 0.633,
      "step": 430
    },
    {
      "epoch": 0.0114572592152819,
      "grad_norm": 0.41350051760673523,
      "learning_rate": 0.00019999578409023126,
      "loss": 0.6541,
      "step": 440
    },
    {
      "epoch": 0.01171765147017467,
      "grad_norm": 0.47954171895980835,
      "learning_rate": 0.00019999411167962868,
      "loss": 0.6545,
      "step": 450
    },
    {
      "epoch": 0.01197804372506744,
      "grad_norm": 0.46860000491142273,
      "learning_rate": 0.00019999216054570942,
      "loss": 0.6512,
      "step": 460
    },
    {
      "epoch": 0.012238435979960213,
      "grad_norm": 0.4395809471607208,
      "learning_rate": 0.00019998993069391205,
      "loss": 0.6587,
      "step": 470
    },
    {
      "epoch": 0.012498828234852983,
      "grad_norm": 0.43222516775131226,
      "learning_rate": 0.00019998742213045206,
      "loss": 0.6292,
      "step": 480
    },
    {
      "epoch": 0.012759220489745753,
      "grad_norm": 0.39363613724708557,
      "learning_rate": 0.00019998463486232179,
      "loss": 0.6319,
      "step": 490
    },
    {
      "epoch": 0.013019612744638524,
      "grad_norm": 0.4984697699546814,
      "learning_rate": 0.0001999815688972905,
      "loss": 0.6488,
      "step": 500
    },
    {
      "epoch": 0.013280004999531294,
      "grad_norm": 0.4710462689399719,
      "learning_rate": 0.00019997822424390422,
      "loss": 0.6633,
      "step": 510
    },
    {
      "epoch": 0.013540397254424064,
      "grad_norm": 0.4141169786453247,
      "learning_rate": 0.00019997460091148586,
      "loss": 0.6471,
      "step": 520
    },
    {
      "epoch": 0.013800789509316834,
      "grad_norm": 0.39957430958747864,
      "learning_rate": 0.00019997069891013503,
      "loss": 0.6226,
      "step": 530
    },
    {
      "epoch": 0.014061181764209606,
      "grad_norm": 0.4508794844150543,
      "learning_rate": 0.00019996651825072826,
      "loss": 0.6559,
      "step": 540
    },
    {
      "epoch": 0.014321574019102376,
      "grad_norm": 0.4256739020347595,
      "learning_rate": 0.00019996205894491856,
      "loss": 0.6551,
      "step": 550
    },
    {
      "epoch": 0.014581966273995146,
      "grad_norm": 0.43204987049102783,
      "learning_rate": 0.00019995732100513592,
      "loss": 0.6254,
      "step": 560
    },
    {
      "epoch": 0.014842358528887916,
      "grad_norm": 0.37589946389198303,
      "learning_rate": 0.00019995230444458682,
      "loss": 0.6543,
      "step": 570
    },
    {
      "epoch": 0.015102750783780688,
      "grad_norm": 0.40850168466567993,
      "learning_rate": 0.0001999470092772544,
      "loss": 0.6474,
      "step": 580
    },
    {
      "epoch": 0.015363143038673458,
      "grad_norm": 0.3754895031452179,
      "learning_rate": 0.00019994143551789839,
      "loss": 0.6502,
      "step": 590
    },
    {
      "epoch": 0.015623535293566228,
      "grad_norm": 0.3857438266277313,
      "learning_rate": 0.00019993558318205507,
      "loss": 0.6544,
      "step": 600
    },
    {
      "epoch": 0.015883927548459,
      "grad_norm": 0.4063841998577118,
      "learning_rate": 0.00019992945228603724,
      "loss": 0.639,
      "step": 610
    },
    {
      "epoch": 0.016144319803351768,
      "grad_norm": 0.35183581709861755,
      "learning_rate": 0.0001999230428469341,
      "loss": 0.6442,
      "step": 620
    },
    {
      "epoch": 0.01640471205824454,
      "grad_norm": 0.4158167243003845,
      "learning_rate": 0.00019991635488261138,
      "loss": 0.6586,
      "step": 630
    },
    {
      "epoch": 0.01666510431313731,
      "grad_norm": 0.45118188858032227,
      "learning_rate": 0.00019990938841171104,
      "loss": 0.6581,
      "step": 640
    },
    {
      "epoch": 0.01692549656803008,
      "grad_norm": 0.39950400590896606,
      "learning_rate": 0.0001999021434536514,
      "loss": 0.6712,
      "step": 650
    },
    {
      "epoch": 0.01718588882292285,
      "grad_norm": 0.35208678245544434,
      "learning_rate": 0.00019989462002862704,
      "loss": 0.6398,
      "step": 660
    },
    {
      "epoch": 0.017446281077815623,
      "grad_norm": 0.38008975982666016,
      "learning_rate": 0.0001998868181576088,
      "loss": 0.6479,
      "step": 670
    },
    {
      "epoch": 0.01770667333270839,
      "grad_norm": 0.4314909875392914,
      "learning_rate": 0.00019987873786234348,
      "loss": 0.6358,
      "step": 680
    },
    {
      "epoch": 0.017967065587601163,
      "grad_norm": 0.3982577323913574,
      "learning_rate": 0.00019987037916535417,
      "loss": 0.6361,
      "step": 690
    },
    {
      "epoch": 0.01822745784249393,
      "grad_norm": 0.3529202342033386,
      "learning_rate": 0.0001998617420899398,
      "loss": 0.64,
      "step": 700
    },
    {
      "epoch": 0.018487850097386703,
      "grad_norm": 0.41149991750717163,
      "learning_rate": 0.0001998528266601754,
      "loss": 0.6684,
      "step": 710
    },
    {
      "epoch": 0.018748242352279475,
      "grad_norm": 0.42630311846733093,
      "learning_rate": 0.0001998436329009118,
      "loss": 0.6429,
      "step": 720
    },
    {
      "epoch": 0.019008634607172243,
      "grad_norm": 0.4028918147087097,
      "learning_rate": 0.00019983416083777563,
      "loss": 0.6573,
      "step": 730
    },
    {
      "epoch": 0.019269026862065015,
      "grad_norm": 0.3785901963710785,
      "learning_rate": 0.0001998244104971693,
      "loss": 0.6132,
      "step": 740
    },
    {
      "epoch": 0.019529419116957786,
      "grad_norm": 0.39018985629081726,
      "learning_rate": 0.0001998143819062709,
      "loss": 0.6287,
      "step": 750
    },
    {
      "epoch": 0.019789811371850555,
      "grad_norm": 0.4268128573894501,
      "learning_rate": 0.00019980407509303413,
      "loss": 0.6585,
      "step": 760
    },
    {
      "epoch": 0.020050203626743326,
      "grad_norm": 0.4293033480644226,
      "learning_rate": 0.00019979349008618808,
      "loss": 0.6843,
      "step": 770
    },
    {
      "epoch": 0.020310595881636098,
      "grad_norm": 0.38943207263946533,
      "learning_rate": 0.00019978262691523743,
      "loss": 0.6265,
      "step": 780
    },
    {
      "epoch": 0.020570988136528866,
      "grad_norm": 0.40528395771980286,
      "learning_rate": 0.00019977148561046217,
      "loss": 0.6392,
      "step": 790
    },
    {
      "epoch": 0.020831380391421638,
      "grad_norm": 0.4273380935192108,
      "learning_rate": 0.0001997600662029175,
      "loss": 0.6615,
      "step": 800
    },
    {
      "epoch": 0.021091772646314406,
      "grad_norm": 0.4269028306007385,
      "learning_rate": 0.00019974836872443388,
      "loss": 0.6412,
      "step": 810
    },
    {
      "epoch": 0.021352164901207178,
      "grad_norm": 0.3542031943798065,
      "learning_rate": 0.0001997363932076168,
      "loss": 0.6606,
      "step": 820
    },
    {
      "epoch": 0.02161255715609995,
      "grad_norm": 0.36826202273368835,
      "learning_rate": 0.00019972413968584682,
      "loss": 0.6387,
      "step": 830
    },
    {
      "epoch": 0.021872949410992718,
      "grad_norm": 0.4278506338596344,
      "learning_rate": 0.0001997116081932793,
      "loss": 0.6544,
      "step": 840
    },
    {
      "epoch": 0.02213334166588549,
      "grad_norm": 0.467886358499527,
      "learning_rate": 0.0001996987987648446,
      "loss": 0.6524,
      "step": 850
    },
    {
      "epoch": 0.02239373392077826,
      "grad_norm": 0.36823606491088867,
      "learning_rate": 0.0001996857114362476,
      "loss": 0.6553,
      "step": 860
    },
    {
      "epoch": 0.02265412617567103,
      "grad_norm": 0.42569059133529663,
      "learning_rate": 0.00019967234624396793,
      "loss": 0.6484,
      "step": 870
    },
    {
      "epoch": 0.0229145184305638,
      "grad_norm": 0.36995476484298706,
      "learning_rate": 0.00019965870322525965,
      "loss": 0.6626,
      "step": 880
    },
    {
      "epoch": 0.023174910685456573,
      "grad_norm": 0.4284444749355316,
      "learning_rate": 0.0001996447824181513,
      "loss": 0.6579,
      "step": 890
    },
    {
      "epoch": 0.02343530294034934,
      "grad_norm": 0.36263275146484375,
      "learning_rate": 0.0001996305838614457,
      "loss": 0.6466,
      "step": 900
    },
    {
      "epoch": 0.023695695195242113,
      "grad_norm": 0.43936702609062195,
      "learning_rate": 0.00019961610759471984,
      "loss": 0.6534,
      "step": 910
    },
    {
      "epoch": 0.02395608745013488,
      "grad_norm": 0.37757524847984314,
      "learning_rate": 0.00019960135365832486,
      "loss": 0.6344,
      "step": 920
    },
    {
      "epoch": 0.024216479705027653,
      "grad_norm": 0.40086570382118225,
      "learning_rate": 0.00019958632209338587,
      "loss": 0.6265,
      "step": 930
    },
    {
      "epoch": 0.024476871959920425,
      "grad_norm": 0.3435315489768982,
      "learning_rate": 0.00019957101294180174,
      "loss": 0.6479,
      "step": 940
    },
    {
      "epoch": 0.024737264214813193,
      "grad_norm": 0.34466204047203064,
      "learning_rate": 0.00019955542624624522,
      "loss": 0.641,
      "step": 950
    },
    {
      "epoch": 0.024997656469705965,
      "grad_norm": 0.46282994747161865,
      "learning_rate": 0.00019953956205016256,
      "loss": 0.6389,
      "step": 960
    },
    {
      "epoch": 0.025258048724598737,
      "grad_norm": 0.3815780580043793,
      "learning_rate": 0.00019952342039777362,
      "loss": 0.6472,
      "step": 970
    },
    {
      "epoch": 0.025518440979491505,
      "grad_norm": 0.43121904134750366,
      "learning_rate": 0.00019950700133407163,
      "loss": 0.6314,
      "step": 980
    },
    {
      "epoch": 0.025778833234384277,
      "grad_norm": 0.41635170578956604,
      "learning_rate": 0.00019949030490482296,
      "loss": 0.6483,
      "step": 990
    },
    {
      "epoch": 0.02603922548927705,
      "grad_norm": 0.3946804106235504,
      "learning_rate": 0.0001994733311565673,
      "loss": 0.6383,
      "step": 1000
    },
    {
      "epoch": 0.026299617744169817,
      "grad_norm": 0.48494285345077515,
      "learning_rate": 0.0001994560801366171,
      "loss": 0.6617,
      "step": 1010
    },
    {
      "epoch": 0.02656000999906259,
      "grad_norm": 0.4007907807826996,
      "learning_rate": 0.00019943855189305792,
      "loss": 0.6187,
      "step": 1020
    },
    {
      "epoch": 0.026820402253955357,
      "grad_norm": 0.4674074649810791,
      "learning_rate": 0.00019942074647474786,
      "loss": 0.6629,
      "step": 1030
    },
    {
      "epoch": 0.02708079450884813,
      "grad_norm": 0.3703964650630951,
      "learning_rate": 0.00019940266393131775,
      "loss": 0.6606,
      "step": 1040
    },
    {
      "epoch": 0.0273411867637409,
      "grad_norm": 0.4177350401878357,
      "learning_rate": 0.00019938430431317081,
      "loss": 0.6285,
      "step": 1050
    },
    {
      "epoch": 0.02760157901863367,
      "grad_norm": 0.391641765832901,
      "learning_rate": 0.00019936566767148257,
      "loss": 0.6448,
      "step": 1060
    },
    {
      "epoch": 0.02786197127352644,
      "grad_norm": 0.38827773928642273,
      "learning_rate": 0.00019934675405820077,
      "loss": 0.6272,
      "step": 1070
    },
    {
      "epoch": 0.028122363528419212,
      "grad_norm": 0.41332709789276123,
      "learning_rate": 0.00019932756352604515,
      "loss": 0.6316,
      "step": 1080
    },
    {
      "epoch": 0.02838275578331198,
      "grad_norm": 0.38579726219177246,
      "learning_rate": 0.00019930809612850735,
      "loss": 0.6357,
      "step": 1090
    },
    {
      "epoch": 0.028643148038204752,
      "grad_norm": 0.4541114568710327,
      "learning_rate": 0.00019928835191985076,
      "loss": 0.6546,
      "step": 1100
    },
    {
      "epoch": 0.028903540293097524,
      "grad_norm": 0.37385833263397217,
      "learning_rate": 0.0001992683309551103,
      "loss": 0.6378,
      "step": 1110
    },
    {
      "epoch": 0.029163932547990292,
      "grad_norm": 0.39442044496536255,
      "learning_rate": 0.00019924803329009243,
      "loss": 0.6549,
      "step": 1120
    },
    {
      "epoch": 0.029424324802883064,
      "grad_norm": 0.3960839509963989,
      "learning_rate": 0.00019922745898137473,
      "loss": 0.6304,
      "step": 1130
    },
    {
      "epoch": 0.029684717057775832,
      "grad_norm": 0.4159034192562103,
      "learning_rate": 0.00019920660808630598,
      "loss": 0.6503,
      "step": 1140
    },
    {
      "epoch": 0.029945109312668604,
      "grad_norm": 0.4242476522922516,
      "learning_rate": 0.00019918548066300592,
      "loss": 0.6305,
      "step": 1150
    },
    {
      "epoch": 0.030205501567561376,
      "grad_norm": 0.4142429530620575,
      "learning_rate": 0.0001991640767703651,
      "loss": 0.6246,
      "step": 1160
    },
    {
      "epoch": 0.030465893822454144,
      "grad_norm": 0.4049033522605896,
      "learning_rate": 0.00019914239646804462,
      "loss": 0.6315,
      "step": 1170
    },
    {
      "epoch": 0.030726286077346916,
      "grad_norm": 0.4325621426105499,
      "learning_rate": 0.00019912043981647616,
      "loss": 0.6467,
      "step": 1180
    },
    {
      "epoch": 0.030986678332239687,
      "grad_norm": 0.35380443930625916,
      "learning_rate": 0.00019909820687686157,
      "loss": 0.6416,
      "step": 1190
    },
    {
      "epoch": 0.031247070587132456,
      "grad_norm": 0.3798046410083771,
      "learning_rate": 0.0001990756977111729,
      "loss": 0.6367,
      "step": 1200
    },
    {
      "epoch": 0.03150746284202523,
      "grad_norm": 0.4257236123085022,
      "learning_rate": 0.0001990529123821522,
      "loss": 0.6414,
      "step": 1210
    },
    {
      "epoch": 0.031767855096918,
      "grad_norm": 0.4575822055339813,
      "learning_rate": 0.00019902985095331113,
      "loss": 0.6647,
      "step": 1220
    },
    {
      "epoch": 0.03202824735181077,
      "grad_norm": 0.34732112288475037,
      "learning_rate": 0.00019900651348893114,
      "loss": 0.6446,
      "step": 1230
    },
    {
      "epoch": 0.032288639606703536,
      "grad_norm": 0.4493260979652405,
      "learning_rate": 0.00019898290005406296,
      "loss": 0.6672,
      "step": 1240
    },
    {
      "epoch": 0.03254903186159631,
      "grad_norm": 0.39185160398483276,
      "learning_rate": 0.00019895901071452667,
      "loss": 0.6581,
      "step": 1250
    },
    {
      "epoch": 0.03280942411648908,
      "grad_norm": 0.37691834568977356,
      "learning_rate": 0.0001989348455369113,
      "loss": 0.644,
      "step": 1260
    },
    {
      "epoch": 0.03306981637138185,
      "grad_norm": 0.378093421459198,
      "learning_rate": 0.0001989104045885748,
      "loss": 0.6515,
      "step": 1270
    },
    {
      "epoch": 0.03333020862627462,
      "grad_norm": 0.37683796882629395,
      "learning_rate": 0.00019888568793764385,
      "loss": 0.6281,
      "step": 1280
    },
    {
      "epoch": 0.03359060088116739,
      "grad_norm": 0.37529483437538147,
      "learning_rate": 0.00019886069565301355,
      "loss": 0.6606,
      "step": 1290
    },
    {
      "epoch": 0.03385099313606016,
      "grad_norm": 0.3849285840988159,
      "learning_rate": 0.00019883542780434733,
      "loss": 0.6388,
      "step": 1300
    },
    {
      "epoch": 0.03411138539095293,
      "grad_norm": 0.3860384523868561,
      "learning_rate": 0.0001988098844620767,
      "loss": 0.6612,
      "step": 1310
    },
    {
      "epoch": 0.0343717776458457,
      "grad_norm": 0.4840448200702667,
      "learning_rate": 0.0001987840656974011,
      "loss": 0.6432,
      "step": 1320
    },
    {
      "epoch": 0.034632169900738474,
      "grad_norm": 0.3508262038230896,
      "learning_rate": 0.00019875797158228775,
      "loss": 0.6549,
      "step": 1330
    },
    {
      "epoch": 0.034892562155631246,
      "grad_norm": 0.4253254234790802,
      "learning_rate": 0.00019873160218947125,
      "loss": 0.6303,
      "step": 1340
    },
    {
      "epoch": 0.03515295441052401,
      "grad_norm": 0.37659895420074463,
      "learning_rate": 0.00019870495759245362,
      "loss": 0.6278,
      "step": 1350
    },
    {
      "epoch": 0.03541334666541678,
      "grad_norm": 0.36914440989494324,
      "learning_rate": 0.0001986780378655039,
      "loss": 0.6614,
      "step": 1360
    },
    {
      "epoch": 0.035673738920309554,
      "grad_norm": 0.40397894382476807,
      "learning_rate": 0.0001986508430836581,
      "loss": 0.6295,
      "step": 1370
    },
    {
      "epoch": 0.035934131175202326,
      "grad_norm": 0.3998821973800659,
      "learning_rate": 0.0001986233733227188,
      "loss": 0.6705,
      "step": 1380
    },
    {
      "epoch": 0.0361945234300951,
      "grad_norm": 0.37330886721611023,
      "learning_rate": 0.00019859562865925525,
      "loss": 0.6537,
      "step": 1390
    },
    {
      "epoch": 0.03645491568498786,
      "grad_norm": 0.3862515091896057,
      "learning_rate": 0.00019856760917060277,
      "loss": 0.6576,
      "step": 1400
    },
    {
      "epoch": 0.036715307939880634,
      "grad_norm": 0.39040204882621765,
      "learning_rate": 0.00019853931493486287,
      "loss": 0.6697,
      "step": 1410
    },
    {
      "epoch": 0.036975700194773406,
      "grad_norm": 0.3295992910861969,
      "learning_rate": 0.00019851074603090277,
      "loss": 0.6175,
      "step": 1420
    },
    {
      "epoch": 0.03723609244966618,
      "grad_norm": 0.33969369530677795,
      "learning_rate": 0.00019848190253835536,
      "loss": 0.6453,
      "step": 1430
    },
    {
      "epoch": 0.03749648470455895,
      "grad_norm": 0.456320196390152,
      "learning_rate": 0.00019845278453761896,
      "loss": 0.6392,
      "step": 1440
    },
    {
      "epoch": 0.03775687695945172,
      "grad_norm": 0.3699491024017334,
      "learning_rate": 0.00019842339210985696,
      "loss": 0.636,
      "step": 1450
    },
    {
      "epoch": 0.038017269214344486,
      "grad_norm": 0.41601112484931946,
      "learning_rate": 0.00019839372533699774,
      "loss": 0.6566,
      "step": 1460
    },
    {
      "epoch": 0.03827766146923726,
      "grad_norm": 0.39745938777923584,
      "learning_rate": 0.00019836378430173438,
      "loss": 0.6421,
      "step": 1470
    },
    {
      "epoch": 0.03853805372413003,
      "grad_norm": 0.38357457518577576,
      "learning_rate": 0.0001983335690875245,
      "loss": 0.6355,
      "step": 1480
    },
    {
      "epoch": 0.0387984459790228,
      "grad_norm": 0.3879673182964325,
      "learning_rate": 0.00019830307977858984,
      "loss": 0.6295,
      "step": 1490
    },
    {
      "epoch": 0.03905883823391557,
      "grad_norm": 0.42652568221092224,
      "learning_rate": 0.00019827231645991623,
      "loss": 0.6374,
      "step": 1500
    },
    {
      "epoch": 0.03931923048880834,
      "grad_norm": 0.3830074369907379,
      "learning_rate": 0.00019824127921725326,
      "loss": 0.6292,
      "step": 1510
    },
    {
      "epoch": 0.03957962274370111,
      "grad_norm": 0.39314061403274536,
      "learning_rate": 0.00019820996813711407,
      "loss": 0.6416,
      "step": 1520
    },
    {
      "epoch": 0.03984001499859388,
      "grad_norm": 0.3321419060230255,
      "learning_rate": 0.0001981783833067751,
      "loss": 0.6206,
      "step": 1530
    },
    {
      "epoch": 0.04010040725348665,
      "grad_norm": 0.41209813952445984,
      "learning_rate": 0.0001981465248142758,
      "loss": 0.6576,
      "step": 1540
    },
    {
      "epoch": 0.040360799508379425,
      "grad_norm": 0.4043482542037964,
      "learning_rate": 0.00019811439274841842,
      "loss": 0.6588,
      "step": 1550
    },
    {
      "epoch": 0.040621191763272196,
      "grad_norm": 0.4470541179180145,
      "learning_rate": 0.00019808198719876782,
      "loss": 0.6595,
      "step": 1560
    },
    {
      "epoch": 0.04088158401816496,
      "grad_norm": 0.3442763090133667,
      "learning_rate": 0.00019804930825565112,
      "loss": 0.6584,
      "step": 1570
    },
    {
      "epoch": 0.04114197627305773,
      "grad_norm": 0.4013935923576355,
      "learning_rate": 0.00019801635601015752,
      "loss": 0.6315,
      "step": 1580
    },
    {
      "epoch": 0.041402368527950505,
      "grad_norm": 0.36532357335090637,
      "learning_rate": 0.00019798313055413808,
      "loss": 0.6453,
      "step": 1590
    },
    {
      "epoch": 0.041662760782843276,
      "grad_norm": 0.4390687644481659,
      "learning_rate": 0.00019794963198020525,
      "loss": 0.6375,
      "step": 1600
    },
    {
      "epoch": 0.04192315303773605,
      "grad_norm": 0.3687056601047516,
      "learning_rate": 0.00019791586038173296,
      "loss": 0.637,
      "step": 1610
    },
    {
      "epoch": 0.04218354529262881,
      "grad_norm": 0.372841477394104,
      "learning_rate": 0.00019788181585285602,
      "loss": 0.6322,
      "step": 1620
    },
    {
      "epoch": 0.042443937547521585,
      "grad_norm": 0.3459762632846832,
      "learning_rate": 0.00019784749848847003,
      "loss": 0.62,
      "step": 1630
    },
    {
      "epoch": 0.042704329802414356,
      "grad_norm": 0.4031515121459961,
      "learning_rate": 0.0001978129083842312,
      "loss": 0.6438,
      "step": 1640
    },
    {
      "epoch": 0.04296472205730713,
      "grad_norm": 0.39984458684921265,
      "learning_rate": 0.00019777804563655583,
      "loss": 0.6224,
      "step": 1650
    },
    {
      "epoch": 0.0432251143121999,
      "grad_norm": 0.37194013595581055,
      "learning_rate": 0.00019774291034262026,
      "loss": 0.6258,
      "step": 1660
    },
    {
      "epoch": 0.04348550656709267,
      "grad_norm": 0.3989511728286743,
      "learning_rate": 0.00019770750260036054,
      "loss": 0.6385,
      "step": 1670
    },
    {
      "epoch": 0.043745898821985436,
      "grad_norm": 0.3801423907279968,
      "learning_rate": 0.00019767182250847207,
      "loss": 0.6234,
      "step": 1680
    },
    {
      "epoch": 0.04400629107687821,
      "grad_norm": 0.3838658034801483,
      "learning_rate": 0.00019763587016640948,
      "loss": 0.656,
      "step": 1690
    },
    {
      "epoch": 0.04426668333177098,
      "grad_norm": 0.5071051716804504,
      "learning_rate": 0.00019759964567438623,
      "loss": 0.6385,
      "step": 1700
    },
    {
      "epoch": 0.04452707558666375,
      "grad_norm": 0.3741011321544647,
      "learning_rate": 0.00019756314913337432,
      "loss": 0.6452,
      "step": 1710
    },
    {
      "epoch": 0.04478746784155652,
      "grad_norm": 0.41739609837532043,
      "learning_rate": 0.00019752638064510415,
      "loss": 0.627,
      "step": 1720
    },
    {
      "epoch": 0.04504786009644929,
      "grad_norm": 0.38942453265190125,
      "learning_rate": 0.00019748934031206414,
      "loss": 0.6486,
      "step": 1730
    },
    {
      "epoch": 0.04530825235134206,
      "grad_norm": 0.40764692425727844,
      "learning_rate": 0.00019745202823750034,
      "loss": 0.6311,
      "step": 1740
    },
    {
      "epoch": 0.04556864460623483,
      "grad_norm": 0.4089398682117462,
      "learning_rate": 0.0001974144445254164,
      "loss": 0.6262,
      "step": 1750
    },
    {
      "epoch": 0.0458290368611276,
      "grad_norm": 0.4223162531852722,
      "learning_rate": 0.00019737658928057302,
      "loss": 0.6633,
      "step": 1760
    },
    {
      "epoch": 0.046089429116020375,
      "grad_norm": 0.4696766436100006,
      "learning_rate": 0.00019733846260848776,
      "loss": 0.6448,
      "step": 1770
    },
    {
      "epoch": 0.04634982137091315,
      "grad_norm": 0.34561800956726074,
      "learning_rate": 0.0001973000646154349,
      "loss": 0.6629,
      "step": 1780
    },
    {
      "epoch": 0.04661021362580591,
      "grad_norm": 0.3809750974178314,
      "learning_rate": 0.00019726139540844484,
      "loss": 0.6261,
      "step": 1790
    },
    {
      "epoch": 0.04687060588069868,
      "grad_norm": 0.37188807129859924,
      "learning_rate": 0.00019722245509530401,
      "loss": 0.6392,
      "step": 1800
    },
    {
      "epoch": 0.047130998135591455,
      "grad_norm": 0.36847737431526184,
      "learning_rate": 0.00019718324378455458,
      "loss": 0.6238,
      "step": 1810
    },
    {
      "epoch": 0.04739139039048423,
      "grad_norm": 0.34314194321632385,
      "learning_rate": 0.00019714376158549404,
      "loss": 0.6512,
      "step": 1820
    },
    {
      "epoch": 0.047651782645377,
      "grad_norm": 0.3639289140701294,
      "learning_rate": 0.00019710400860817494,
      "loss": 0.6481,
      "step": 1830
    },
    {
      "epoch": 0.04791217490026976,
      "grad_norm": 0.34774431586265564,
      "learning_rate": 0.00019706398496340463,
      "loss": 0.6583,
      "step": 1840
    },
    {
      "epoch": 0.048172567155162535,
      "grad_norm": 0.37768319249153137,
      "learning_rate": 0.00019702369076274494,
      "loss": 0.6241,
      "step": 1850
    },
    {
      "epoch": 0.04843295941005531,
      "grad_norm": 0.3546730875968933,
      "learning_rate": 0.0001969831261185118,
      "loss": 0.6222,
      "step": 1860
    },
    {
      "epoch": 0.04869335166494808,
      "grad_norm": 0.3773512840270996,
      "learning_rate": 0.00019694229114377494,
      "loss": 0.6201,
      "step": 1870
    },
    {
      "epoch": 0.04895374391984085,
      "grad_norm": 0.3787965774536133,
      "learning_rate": 0.00019690118595235774,
      "loss": 0.6339,
      "step": 1880
    },
    {
      "epoch": 0.04921413617473362,
      "grad_norm": 0.3667986989021301,
      "learning_rate": 0.00019685981065883663,
      "loss": 0.6253,
      "step": 1890
    },
    {
      "epoch": 0.04947452842962639,
      "grad_norm": 0.39258262515068054,
      "learning_rate": 0.00019681816537854102,
      "loss": 0.6417,
      "step": 1900
    },
    {
      "epoch": 0.04973492068451916,
      "grad_norm": 0.3514678478240967,
      "learning_rate": 0.00019677625022755289,
      "loss": 0.6473,
      "step": 1910
    },
    {
      "epoch": 0.04999531293941193,
      "grad_norm": 0.38365432620048523,
      "learning_rate": 0.00019673406532270634,
      "loss": 0.6363,
      "step": 1920
    },
    {
      "epoch": 0.0502557051943047,
      "grad_norm": 0.34043630957603455,
      "learning_rate": 0.00019669161078158753,
      "loss": 0.6249,
      "step": 1930
    },
    {
      "epoch": 0.050516097449197474,
      "grad_norm": 0.41065657138824463,
      "learning_rate": 0.0001966488867225341,
      "loss": 0.6479,
      "step": 1940
    },
    {
      "epoch": 0.05077648970409024,
      "grad_norm": 0.3435451090335846,
      "learning_rate": 0.00019660589326463498,
      "loss": 0.6498,
      "step": 1950
    },
    {
      "epoch": 0.05103688195898301,
      "grad_norm": 0.3457126021385193,
      "learning_rate": 0.00019656263052773002,
      "loss": 0.6188,
      "step": 1960
    },
    {
      "epoch": 0.05129727421387578,
      "grad_norm": 0.34488430619239807,
      "learning_rate": 0.00019651909863240965,
      "loss": 0.6352,
      "step": 1970
    },
    {
      "epoch": 0.051557666468768554,
      "grad_norm": 0.34936293959617615,
      "learning_rate": 0.00019647529770001456,
      "loss": 0.6331,
      "step": 1980
    },
    {
      "epoch": 0.051818058723661325,
      "grad_norm": 0.34119752049446106,
      "learning_rate": 0.00019643122785263536,
      "loss": 0.6188,
      "step": 1990
    },
    {
      "epoch": 0.0520784509785541,
      "grad_norm": 0.35101839900016785,
      "learning_rate": 0.00019638688921311224,
      "loss": 0.6339,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 19202,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.13160820359168e+18,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}