|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.05207836057320916, |
|
"eval_steps": 500, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0002603918028660458, |
|
"grad_norm": 0.4500846266746521, |
|
"learning_rate": 5.194805194805195e-06, |
|
"loss": 1.0381, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0005207836057320916, |
|
"grad_norm": 0.35188010334968567, |
|
"learning_rate": 1.038961038961039e-05, |
|
"loss": 1.0108, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0007811754085981374, |
|
"grad_norm": 0.2300374060869217, |
|
"learning_rate": 1.5584415584415583e-05, |
|
"loss": 0.9668, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0010415672114641832, |
|
"grad_norm": 0.16189467906951904, |
|
"learning_rate": 2.077922077922078e-05, |
|
"loss": 0.918, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.001301959014330229, |
|
"grad_norm": 0.18843211233615875, |
|
"learning_rate": 2.5974025974025972e-05, |
|
"loss": 0.9265, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0015623508171962747, |
|
"grad_norm": 0.20334510505199432, |
|
"learning_rate": 3.1168831168831166e-05, |
|
"loss": 0.9234, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0018227426200623205, |
|
"grad_norm": 0.1745327115058899, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.881, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0020831344229283663, |
|
"grad_norm": 0.18667331337928772, |
|
"learning_rate": 4.155844155844156e-05, |
|
"loss": 0.8592, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.002343526225794412, |
|
"grad_norm": 0.1848158985376358, |
|
"learning_rate": 4.675324675324675e-05, |
|
"loss": 0.8537, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.002603918028660458, |
|
"grad_norm": 0.17589879035949707, |
|
"learning_rate": 5.1948051948051944e-05, |
|
"loss": 0.8518, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0028643098315265037, |
|
"grad_norm": 0.2132624089717865, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.8511, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0031247016343925495, |
|
"grad_norm": 0.23070092499256134, |
|
"learning_rate": 6.233766233766233e-05, |
|
"loss": 0.7975, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0033850934372585953, |
|
"grad_norm": 0.25368157029151917, |
|
"learning_rate": 6.753246753246754e-05, |
|
"loss": 0.8134, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.003645485240124641, |
|
"grad_norm": 0.22897231578826904, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.8322, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.003905877042990687, |
|
"grad_norm": 0.19932536780834198, |
|
"learning_rate": 7.792207792207793e-05, |
|
"loss": 0.7959, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.004166268845856733, |
|
"grad_norm": 0.21011792123317719, |
|
"learning_rate": 8.311688311688312e-05, |
|
"loss": 0.8102, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.004426660648722778, |
|
"grad_norm": 0.20594824850559235, |
|
"learning_rate": 8.831168831168831e-05, |
|
"loss": 0.8128, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.004687052451588824, |
|
"grad_norm": 0.20465536415576935, |
|
"learning_rate": 9.35064935064935e-05, |
|
"loss": 0.7989, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.00494744425445487, |
|
"grad_norm": 0.4109392762184143, |
|
"learning_rate": 9.870129870129871e-05, |
|
"loss": 0.8108, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.005207836057320916, |
|
"grad_norm": 0.4293076694011688, |
|
"learning_rate": 0.00010389610389610389, |
|
"loss": 0.8101, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.005468227860186962, |
|
"grad_norm": 0.31628963351249695, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.7989, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.005728619663053007, |
|
"grad_norm": 0.24642810225486755, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 0.7751, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.005989011465919053, |
|
"grad_norm": 0.3599106967449188, |
|
"learning_rate": 0.00011948051948051949, |
|
"loss": 0.8063, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.006249403268785099, |
|
"grad_norm": 0.17053447663784027, |
|
"learning_rate": 0.00012467532467532467, |
|
"loss": 0.7751, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.006509795071651145, |
|
"grad_norm": 0.17303769290447235, |
|
"learning_rate": 0.00012987012987012987, |
|
"loss": 0.7883, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0067701868745171905, |
|
"grad_norm": 0.1815861016511917, |
|
"learning_rate": 0.00013506493506493507, |
|
"loss": 0.788, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.007030578677383236, |
|
"grad_norm": 0.24125365912914276, |
|
"learning_rate": 0.00014025974025974028, |
|
"loss": 0.8018, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.007290970480249282, |
|
"grad_norm": 0.19443446397781372, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.7908, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.007551362283115328, |
|
"grad_norm": 0.17829768359661102, |
|
"learning_rate": 0.00015064935064935066, |
|
"loss": 0.8033, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.007811754085981374, |
|
"grad_norm": 0.19535653293132782, |
|
"learning_rate": 0.00015584415584415587, |
|
"loss": 0.7997, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.008072145888847419, |
|
"grad_norm": 0.19930541515350342, |
|
"learning_rate": 0.00016103896103896104, |
|
"loss": 0.7945, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.008332537691713465, |
|
"grad_norm": 0.2156297266483307, |
|
"learning_rate": 0.00016623376623376625, |
|
"loss": 0.8018, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.00859292949457951, |
|
"grad_norm": 0.1924206018447876, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 0.7746, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.008853321297445557, |
|
"grad_norm": 0.2294880747795105, |
|
"learning_rate": 0.00017662337662337663, |
|
"loss": 0.8152, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.009113713100311602, |
|
"grad_norm": 0.16817067563533783, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.7972, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.009374104903177648, |
|
"grad_norm": 0.18544812500476837, |
|
"learning_rate": 0.000187012987012987, |
|
"loss": 0.7801, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.009634496706043693, |
|
"grad_norm": 0.19597066938877106, |
|
"learning_rate": 0.00019220779220779222, |
|
"loss": 0.7706, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.00989488850890974, |
|
"grad_norm": 0.40291881561279297, |
|
"learning_rate": 0.00019740259740259742, |
|
"loss": 0.7911, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.010155280311775785, |
|
"grad_norm": 0.23841074109077454, |
|
"learning_rate": 0.00019999996515752773, |
|
"loss": 0.7861, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.010415672114641832, |
|
"grad_norm": 0.1675388514995575, |
|
"learning_rate": 0.00019999968641789507, |
|
"loss": 0.788, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.010676063917507876, |
|
"grad_norm": 1.8860758543014526, |
|
"learning_rate": 0.0001999991289394067, |
|
"loss": 0.7632, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.010936455720373923, |
|
"grad_norm": 0.17022117972373962, |
|
"learning_rate": 0.00019999829272361654, |
|
"loss": 0.784, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.011196847523239968, |
|
"grad_norm": 0.21460269391536713, |
|
"learning_rate": 0.00019999717777285545, |
|
"loss": 0.761, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.011457239326106015, |
|
"grad_norm": 0.19413785636425018, |
|
"learning_rate": 0.00019999578409023126, |
|
"loss": 0.7772, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.01171763112897206, |
|
"grad_norm": 0.20223405957221985, |
|
"learning_rate": 0.00019999411167962868, |
|
"loss": 0.7811, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.011978022931838106, |
|
"grad_norm": 0.15166303515434265, |
|
"learning_rate": 0.00019999216054570942, |
|
"loss": 0.7709, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.012238414734704151, |
|
"grad_norm": 0.16307081282138824, |
|
"learning_rate": 0.00019998993069391205, |
|
"loss": 0.7811, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.012498806537570198, |
|
"grad_norm": 0.15996049344539642, |
|
"learning_rate": 0.00019998742213045206, |
|
"loss": 0.7599, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.012759198340436243, |
|
"grad_norm": 0.17560279369354248, |
|
"learning_rate": 0.00019998463486232179, |
|
"loss": 0.7572, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.01301959014330229, |
|
"grad_norm": 0.17571642994880676, |
|
"learning_rate": 0.0001999815688972905, |
|
"loss": 0.7643, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.013279981946168334, |
|
"grad_norm": 0.17719799280166626, |
|
"learning_rate": 0.00019997822424390422, |
|
"loss": 0.7923, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.013540373749034381, |
|
"grad_norm": 0.19846616685390472, |
|
"learning_rate": 0.00019997460091148586, |
|
"loss": 0.7674, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.013800765551900426, |
|
"grad_norm": 0.2715558111667633, |
|
"learning_rate": 0.00019997069891013503, |
|
"loss": 0.7421, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.014061157354766473, |
|
"grad_norm": 0.1725197583436966, |
|
"learning_rate": 0.00019996651825072826, |
|
"loss": 0.7663, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.014321549157632518, |
|
"grad_norm": 0.15060502290725708, |
|
"learning_rate": 0.00019996205894491856, |
|
"loss": 0.7794, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.014581940960498564, |
|
"grad_norm": 0.16645808517932892, |
|
"learning_rate": 0.00019995732100513592, |
|
"loss": 0.752, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.014842332763364609, |
|
"grad_norm": 0.1736789345741272, |
|
"learning_rate": 0.00019995230444458682, |
|
"loss": 0.7788, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.015102724566230656, |
|
"grad_norm": 0.15416319668293, |
|
"learning_rate": 0.0001999470092772544, |
|
"loss": 0.7656, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.0153631163690967, |
|
"grad_norm": 0.16610187292099, |
|
"learning_rate": 0.00019994143551789839, |
|
"loss": 0.7676, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.015623508171962747, |
|
"grad_norm": 0.15843011438846588, |
|
"learning_rate": 0.00019993558318205507, |
|
"loss": 0.7746, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.015883899974828794, |
|
"grad_norm": 0.26837801933288574, |
|
"learning_rate": 0.00019992945228603724, |
|
"loss": 0.7617, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.016144291777694837, |
|
"grad_norm": 0.15099173784255981, |
|
"learning_rate": 0.0001999230428469341, |
|
"loss": 0.7601, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.016404683580560884, |
|
"grad_norm": 0.15511856973171234, |
|
"learning_rate": 0.00019991635488261138, |
|
"loss": 0.7647, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.01666507538342693, |
|
"grad_norm": 0.14919579029083252, |
|
"learning_rate": 0.00019990938841171104, |
|
"loss": 0.7692, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.016925467186292977, |
|
"grad_norm": 0.15838642418384552, |
|
"learning_rate": 0.0001999021434536514, |
|
"loss": 0.7763, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.01718585898915902, |
|
"grad_norm": 0.15956635773181915, |
|
"learning_rate": 0.00019989462002862704, |
|
"loss": 0.7598, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.017446250792025067, |
|
"grad_norm": 0.1499069333076477, |
|
"learning_rate": 0.0001998868181576088, |
|
"loss": 0.7626, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.017706642594891114, |
|
"grad_norm": 0.2170073390007019, |
|
"learning_rate": 0.00019987873786234348, |
|
"loss": 0.7569, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.01796703439775716, |
|
"grad_norm": 0.17841948568820953, |
|
"learning_rate": 0.00019987037916535417, |
|
"loss": 0.7494, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.018227426200623204, |
|
"grad_norm": 0.2066909372806549, |
|
"learning_rate": 0.0001998617420899398, |
|
"loss": 0.7609, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.01848781800348925, |
|
"grad_norm": 0.17015361785888672, |
|
"learning_rate": 0.0001998528266601754, |
|
"loss": 0.7761, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.018748209806355297, |
|
"grad_norm": 0.22166290879249573, |
|
"learning_rate": 0.0001998436329009118, |
|
"loss": 0.7573, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.01900860160922134, |
|
"grad_norm": 0.15084640681743622, |
|
"learning_rate": 0.00019983416083777563, |
|
"loss": 0.7775, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.019268993412087387, |
|
"grad_norm": 0.17800921201705933, |
|
"learning_rate": 0.0001998244104971693, |
|
"loss": 0.7359, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.019529385214953433, |
|
"grad_norm": 0.17354707419872284, |
|
"learning_rate": 0.0001998143819062709, |
|
"loss": 0.7415, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.01978977701781948, |
|
"grad_norm": 0.16408118605613708, |
|
"learning_rate": 0.00019980407509303413, |
|
"loss": 0.7708, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.020050168820685523, |
|
"grad_norm": 0.16820089519023895, |
|
"learning_rate": 0.00019979349008618808, |
|
"loss": 0.791, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.02031056062355157, |
|
"grad_norm": 0.15958388149738312, |
|
"learning_rate": 0.00019978262691523743, |
|
"loss": 0.7412, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.020570952426417616, |
|
"grad_norm": 0.1646542251110077, |
|
"learning_rate": 0.00019977148561046217, |
|
"loss": 0.7529, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.020831344229283663, |
|
"grad_norm": 0.17032025754451752, |
|
"learning_rate": 0.0001997600662029175, |
|
"loss": 0.7656, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.021091736032149706, |
|
"grad_norm": 0.17189227044582367, |
|
"learning_rate": 0.00019974836872443388, |
|
"loss": 0.7433, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.021352127835015753, |
|
"grad_norm": 0.16334249079227448, |
|
"learning_rate": 0.0001997363932076168, |
|
"loss": 0.7703, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.0216125196378818, |
|
"grad_norm": 0.1676424890756607, |
|
"learning_rate": 0.00019972413968584682, |
|
"loss": 0.7603, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.021872911440747846, |
|
"grad_norm": 0.16826209425926208, |
|
"learning_rate": 0.0001997116081932793, |
|
"loss": 0.7569, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.02213330324361389, |
|
"grad_norm": 0.1876436173915863, |
|
"learning_rate": 0.0001996987987648446, |
|
"loss": 0.7553, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.022393695046479936, |
|
"grad_norm": 0.17252250015735626, |
|
"learning_rate": 0.0001996857114362476, |
|
"loss": 0.7644, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.022654086849345983, |
|
"grad_norm": 0.1632252335548401, |
|
"learning_rate": 0.00019967234624396793, |
|
"loss": 0.7568, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.02291447865221203, |
|
"grad_norm": 0.1818259060382843, |
|
"learning_rate": 0.00019965870322525965, |
|
"loss": 0.7672, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.023174870455078073, |
|
"grad_norm": 0.15418195724487305, |
|
"learning_rate": 0.0001996447824181513, |
|
"loss": 0.7642, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.02343526225794412, |
|
"grad_norm": 0.17383505403995514, |
|
"learning_rate": 0.0001996305838614457, |
|
"loss": 0.7607, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.023695654060810166, |
|
"grad_norm": 0.17794272303581238, |
|
"learning_rate": 0.00019961610759471984, |
|
"loss": 0.7588, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.023956045863676213, |
|
"grad_norm": 0.1909121572971344, |
|
"learning_rate": 0.00019960135365832486, |
|
"loss": 0.7438, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.024216437666542256, |
|
"grad_norm": 0.17758873105049133, |
|
"learning_rate": 0.00019958632209338587, |
|
"loss": 0.7323, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.024476829469408302, |
|
"grad_norm": 0.15553662180900574, |
|
"learning_rate": 0.00019957101294180174, |
|
"loss": 0.7508, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.02473722127227435, |
|
"grad_norm": 0.15310749411582947, |
|
"learning_rate": 0.00019955542624624522, |
|
"loss": 0.7563, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.024997613075140396, |
|
"grad_norm": 0.1628728210926056, |
|
"learning_rate": 0.00019953956205016256, |
|
"loss": 0.7524, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.02525800487800644, |
|
"grad_norm": 0.16211454570293427, |
|
"learning_rate": 0.00019952342039777362, |
|
"loss": 0.7564, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.025518396680872486, |
|
"grad_norm": 0.15663012862205505, |
|
"learning_rate": 0.00019950700133407163, |
|
"loss": 0.7395, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.025778788483738532, |
|
"grad_norm": 0.1684863567352295, |
|
"learning_rate": 0.00019949030490482296, |
|
"loss": 0.753, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.02603918028660458, |
|
"grad_norm": 0.1561436653137207, |
|
"learning_rate": 0.0001994733311565673, |
|
"loss": 0.7409, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.026299572089470622, |
|
"grad_norm": 0.1781485229730606, |
|
"learning_rate": 0.0001994560801366171, |
|
"loss": 0.762, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.02655996389233667, |
|
"grad_norm": 0.15422071516513824, |
|
"learning_rate": 0.00019943855189305792, |
|
"loss": 0.7291, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.026820355695202715, |
|
"grad_norm": 0.17980527877807617, |
|
"learning_rate": 0.00019942074647474786, |
|
"loss": 0.7732, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.027080747498068762, |
|
"grad_norm": 0.15810626745224, |
|
"learning_rate": 0.00019940266393131775, |
|
"loss": 0.7764, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.027341139300934805, |
|
"grad_norm": 0.16385480761528015, |
|
"learning_rate": 0.00019938430431317081, |
|
"loss": 0.7404, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.027601531103800852, |
|
"grad_norm": 0.15134255588054657, |
|
"learning_rate": 0.00019936566767148257, |
|
"loss": 0.7506, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.0278619229066669, |
|
"grad_norm": 0.1592187136411667, |
|
"learning_rate": 0.00019934675405820077, |
|
"loss": 0.73, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.028122314709532945, |
|
"grad_norm": 0.16852422058582306, |
|
"learning_rate": 0.00019932756352604515, |
|
"loss": 0.7443, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.02838270651239899, |
|
"grad_norm": 0.15741507709026337, |
|
"learning_rate": 0.00019930809612850735, |
|
"loss": 0.7377, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.028643098315265035, |
|
"grad_norm": 0.22424879670143127, |
|
"learning_rate": 0.00019928835191985076, |
|
"loss": 0.7544, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.028903490118131082, |
|
"grad_norm": 0.2047310769557953, |
|
"learning_rate": 0.0001992683309551103, |
|
"loss": 0.7441, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.02916388192099713, |
|
"grad_norm": 0.16392463445663452, |
|
"learning_rate": 0.00019924803329009243, |
|
"loss": 0.7606, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.02942427372386317, |
|
"grad_norm": 0.16227149963378906, |
|
"learning_rate": 0.00019922745898137473, |
|
"loss": 0.736, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.029684665526729218, |
|
"grad_norm": 0.15652808547019958, |
|
"learning_rate": 0.00019920660808630598, |
|
"loss": 0.7513, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.029945057329595265, |
|
"grad_norm": 0.15162768959999084, |
|
"learning_rate": 0.00019918548066300592, |
|
"loss": 0.7303, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.03020544913246131, |
|
"grad_norm": 0.17650415003299713, |
|
"learning_rate": 0.0001991640767703651, |
|
"loss": 0.7254, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.030465840935327355, |
|
"grad_norm": 0.1594468355178833, |
|
"learning_rate": 0.00019914239646804462, |
|
"loss": 0.741, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.0307262327381934, |
|
"grad_norm": 0.17928367853164673, |
|
"learning_rate": 0.00019912043981647616, |
|
"loss": 0.7515, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.030986624541059448, |
|
"grad_norm": 0.17009998857975006, |
|
"learning_rate": 0.00019909820687686157, |
|
"loss": 0.7539, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.031247016343925495, |
|
"grad_norm": 0.16556763648986816, |
|
"learning_rate": 0.0001990756977111729, |
|
"loss": 0.7418, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.03150740814679154, |
|
"grad_norm": 0.1561640352010727, |
|
"learning_rate": 0.0001990529123821522, |
|
"loss": 0.7465, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.03176779994965759, |
|
"grad_norm": 0.15182287991046906, |
|
"learning_rate": 0.00019902985095331113, |
|
"loss": 0.7694, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.03202819175252363, |
|
"grad_norm": 0.15173685550689697, |
|
"learning_rate": 0.00019900651348893114, |
|
"loss": 0.7519, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.032288583555389674, |
|
"grad_norm": 0.16535787284374237, |
|
"learning_rate": 0.00019898290005406296, |
|
"loss": 0.7646, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.032548975358255725, |
|
"grad_norm": 0.19272534549236298, |
|
"learning_rate": 0.00019895901071452667, |
|
"loss": 0.7655, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.03280936716112177, |
|
"grad_norm": 0.1672705113887787, |
|
"learning_rate": 0.0001989348455369113, |
|
"loss": 0.7486, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.03306975896398781, |
|
"grad_norm": 0.1525493860244751, |
|
"learning_rate": 0.0001989104045885748, |
|
"loss": 0.7546, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.03333015076685386, |
|
"grad_norm": 0.16333037614822388, |
|
"learning_rate": 0.00019888568793764385, |
|
"loss": 0.7299, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.033590542569719904, |
|
"grad_norm": 0.1590205729007721, |
|
"learning_rate": 0.00019886069565301355, |
|
"loss": 0.762, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.033850934372585954, |
|
"grad_norm": 0.15006420016288757, |
|
"learning_rate": 0.00019883542780434733, |
|
"loss": 0.7531, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.034111326175452, |
|
"grad_norm": 0.18390792608261108, |
|
"learning_rate": 0.0001988098844620767, |
|
"loss": 0.7621, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.03437171797831804, |
|
"grad_norm": 0.17046166956424713, |
|
"learning_rate": 0.0001987840656974011, |
|
"loss": 0.7422, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.03463210978118409, |
|
"grad_norm": 0.15121813118457794, |
|
"learning_rate": 0.00019875797158228775, |
|
"loss": 0.7555, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.034892501584050134, |
|
"grad_norm": 0.16219307482242584, |
|
"learning_rate": 0.00019873160218947125, |
|
"loss": 0.7301, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.03515289338691618, |
|
"grad_norm": 0.1779986321926117, |
|
"learning_rate": 0.00019870495759245362, |
|
"loss": 0.7356, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.03541328518978223, |
|
"grad_norm": 0.16951359808444977, |
|
"learning_rate": 0.0001986780378655039, |
|
"loss": 0.7645, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.03567367699264827, |
|
"grad_norm": 0.16620802879333496, |
|
"learning_rate": 0.0001986508430836581, |
|
"loss": 0.7331, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.03593406879551432, |
|
"grad_norm": 0.1577858328819275, |
|
"learning_rate": 0.0001986233733227188, |
|
"loss": 0.7667, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.036194460598380364, |
|
"grad_norm": 0.1637091338634491, |
|
"learning_rate": 0.00019859562865925525, |
|
"loss": 0.7521, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.03645485240124641, |
|
"grad_norm": 0.15061691403388977, |
|
"learning_rate": 0.00019856760917060277, |
|
"loss": 0.744, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.03671524420411246, |
|
"grad_norm": 0.15373477339744568, |
|
"learning_rate": 0.00019853931493486287, |
|
"loss": 0.7677, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.0369756360069785, |
|
"grad_norm": 0.16468606889247894, |
|
"learning_rate": 0.00019851074603090277, |
|
"loss": 0.7179, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.037236027809844544, |
|
"grad_norm": 0.16084876656532288, |
|
"learning_rate": 0.00019848190253835536, |
|
"loss": 0.749, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.037496419612710594, |
|
"grad_norm": 0.16743004322052002, |
|
"learning_rate": 0.00019845278453761896, |
|
"loss": 0.7483, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.03775681141557664, |
|
"grad_norm": 0.17335088551044464, |
|
"learning_rate": 0.00019842339210985696, |
|
"loss": 0.735, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.03801720321844268, |
|
"grad_norm": 0.1546197235584259, |
|
"learning_rate": 0.00019839372533699774, |
|
"loss": 0.7549, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.03827759502130873, |
|
"grad_norm": 0.16218656301498413, |
|
"learning_rate": 0.00019836378430173438, |
|
"loss": 0.7425, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.03853798682417477, |
|
"grad_norm": 0.1712743639945984, |
|
"learning_rate": 0.0001983335690875245, |
|
"loss": 0.733, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.03879837862704082, |
|
"grad_norm": 0.15490613877773285, |
|
"learning_rate": 0.00019830307977858984, |
|
"loss": 0.7265, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.03905877042990687, |
|
"grad_norm": 0.1646670252084732, |
|
"learning_rate": 0.00019827231645991623, |
|
"loss": 0.7315, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.03931916223277291, |
|
"grad_norm": 0.1599082201719284, |
|
"learning_rate": 0.00019824127921725326, |
|
"loss": 0.7293, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.03957955403563896, |
|
"grad_norm": 0.1565747708082199, |
|
"learning_rate": 0.00019820996813711407, |
|
"loss": 0.7396, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.039839945838505, |
|
"grad_norm": 0.154826357960701, |
|
"learning_rate": 0.0001981783833067751, |
|
"loss": 0.7217, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.040100337641371046, |
|
"grad_norm": 0.16705222427845, |
|
"learning_rate": 0.0001981465248142758, |
|
"loss": 0.761, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.040360729444237096, |
|
"grad_norm": 0.15651623904705048, |
|
"learning_rate": 0.00019811439274841842, |
|
"loss": 0.7565, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.04062112124710314, |
|
"grad_norm": 0.16211090981960297, |
|
"learning_rate": 0.00019808198719876782, |
|
"loss": 0.7555, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.04088151304996919, |
|
"grad_norm": 0.16856881976127625, |
|
"learning_rate": 0.00019804930825565112, |
|
"loss": 0.7567, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.04114190485283523, |
|
"grad_norm": 0.1588718593120575, |
|
"learning_rate": 0.00019801635601015752, |
|
"loss": 0.729, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.041402296655701276, |
|
"grad_norm": 0.17078711092472076, |
|
"learning_rate": 0.00019798313055413808, |
|
"loss": 0.7418, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.041662688458567326, |
|
"grad_norm": 0.16652734577655792, |
|
"learning_rate": 0.00019794963198020525, |
|
"loss": 0.7341, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.04192308026143337, |
|
"grad_norm": 0.15535488724708557, |
|
"learning_rate": 0.00019791586038173296, |
|
"loss": 0.7396, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.04218347206429941, |
|
"grad_norm": 0.3506317734718323, |
|
"learning_rate": 0.00019788181585285602, |
|
"loss": 0.7345, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.04244386386716546, |
|
"grad_norm": 0.16875872015953064, |
|
"learning_rate": 0.00019784749848847003, |
|
"loss": 0.7214, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.042704255670031506, |
|
"grad_norm": 0.17675861716270447, |
|
"learning_rate": 0.0001978129083842312, |
|
"loss": 0.7431, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.042964647472897556, |
|
"grad_norm": 0.15601837635040283, |
|
"learning_rate": 0.00019777804563655583, |
|
"loss": 0.7215, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.0432250392757636, |
|
"grad_norm": 0.1874823123216629, |
|
"learning_rate": 0.00019774291034262026, |
|
"loss": 0.727, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.04348543107862964, |
|
"grad_norm": 0.17005637288093567, |
|
"learning_rate": 0.00019770750260036054, |
|
"loss": 0.7446, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.04374582288149569, |
|
"grad_norm": 0.17069579660892487, |
|
"learning_rate": 0.00019767182250847207, |
|
"loss": 0.7266, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.044006214684361736, |
|
"grad_norm": 0.16133156418800354, |
|
"learning_rate": 0.00019763587016640948, |
|
"loss": 0.7568, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.04426660648722778, |
|
"grad_norm": 0.16229428350925446, |
|
"learning_rate": 0.00019759964567438623, |
|
"loss": 0.7402, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.04452699829009383, |
|
"grad_norm": 0.1622512936592102, |
|
"learning_rate": 0.00019756314913337432, |
|
"loss": 0.7536, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.04478739009295987, |
|
"grad_norm": 0.2161218672990799, |
|
"learning_rate": 0.00019752638064510415, |
|
"loss": 0.723, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.04504778189582592, |
|
"grad_norm": 0.154169961810112, |
|
"learning_rate": 0.00019748934031206414, |
|
"loss": 0.7441, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.045308173698691966, |
|
"grad_norm": 0.15468057990074158, |
|
"learning_rate": 0.00019745202823750034, |
|
"loss": 0.7349, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.04556856550155801, |
|
"grad_norm": 0.2015281468629837, |
|
"learning_rate": 0.0001974144445254164, |
|
"loss": 0.726, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.04582895730442406, |
|
"grad_norm": 0.1931644082069397, |
|
"learning_rate": 0.00019737658928057302, |
|
"loss": 0.7604, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.0460893491072901, |
|
"grad_norm": 0.1528482288122177, |
|
"learning_rate": 0.00019733846260848776, |
|
"loss": 0.7408, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.046349740910156145, |
|
"grad_norm": 0.16370061039924622, |
|
"learning_rate": 0.0001973000646154349, |
|
"loss": 0.7647, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.046610132713022195, |
|
"grad_norm": 0.16271348297595978, |
|
"learning_rate": 0.00019726139540844484, |
|
"loss": 0.7212, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.04687052451588824, |
|
"grad_norm": 0.16218173503875732, |
|
"learning_rate": 0.00019722245509530401, |
|
"loss": 0.735, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.04713091631875429, |
|
"grad_norm": 0.17063820362091064, |
|
"learning_rate": 0.00019718324378455458, |
|
"loss": 0.7311, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.04739130812162033, |
|
"grad_norm": 0.1678459346294403, |
|
"learning_rate": 0.00019714376158549404, |
|
"loss": 0.7486, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.047651699924486375, |
|
"grad_norm": 0.15926459431648254, |
|
"learning_rate": 0.00019710400860817494, |
|
"loss": 0.743, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.047912091727352425, |
|
"grad_norm": 0.1775251179933548, |
|
"learning_rate": 0.00019706398496340463, |
|
"loss": 0.7512, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.04817248353021847, |
|
"grad_norm": 0.1572408229112625, |
|
"learning_rate": 0.00019702369076274494, |
|
"loss": 0.733, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.04843287533308451, |
|
"grad_norm": 0.29658186435699463, |
|
"learning_rate": 0.0001969831261185118, |
|
"loss": 0.7297, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.04869326713595056, |
|
"grad_norm": 0.16520118713378906, |
|
"learning_rate": 0.00019694229114377494, |
|
"loss": 0.721, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.048953658938816605, |
|
"grad_norm": 0.17762574553489685, |
|
"learning_rate": 0.00019690118595235774, |
|
"loss": 0.7304, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.049214050741682655, |
|
"grad_norm": 0.16636615991592407, |
|
"learning_rate": 0.00019685981065883663, |
|
"loss": 0.7257, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.0494744425445487, |
|
"grad_norm": 0.1622323989868164, |
|
"learning_rate": 0.00019681816537854102, |
|
"loss": 0.7353, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.04973483434741474, |
|
"grad_norm": 0.17419832944869995, |
|
"learning_rate": 0.00019677625022755289, |
|
"loss": 0.7452, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.04999522615028079, |
|
"grad_norm": 0.17460434138774872, |
|
"learning_rate": 0.00019673406532270634, |
|
"loss": 0.7391, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.050255617953146835, |
|
"grad_norm": 0.15844550728797913, |
|
"learning_rate": 0.00019669161078158753, |
|
"loss": 0.7327, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.05051600975601288, |
|
"grad_norm": 0.1638839989900589, |
|
"learning_rate": 0.0001966488867225341, |
|
"loss": 0.745, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.05077640155887893, |
|
"grad_norm": 0.1587786227464676, |
|
"learning_rate": 0.00019660589326463498, |
|
"loss": 0.7476, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.05103679336174497, |
|
"grad_norm": 0.15708380937576294, |
|
"learning_rate": 0.00019656263052773002, |
|
"loss": 0.7208, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.05129718516461102, |
|
"grad_norm": 0.15816234052181244, |
|
"learning_rate": 0.00019651909863240965, |
|
"loss": 0.7262, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.051557576967477065, |
|
"grad_norm": 0.16749270260334015, |
|
"learning_rate": 0.00019647529770001456, |
|
"loss": 0.7284, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.05181796877034311, |
|
"grad_norm": 0.16943767666816711, |
|
"learning_rate": 0.00019643122785263536, |
|
"loss": 0.7225, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.05207836057320916, |
|
"grad_norm": 0.42929205298423767, |
|
"learning_rate": 0.00019638688921311224, |
|
"loss": 0.7305, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 19202, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.3833467527168e+17, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|