{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.09113713100311602, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002603918028660458, "grad_norm": 0.4500846266746521, "learning_rate": 5.194805194805195e-06, "loss": 1.0381, "step": 10 }, { "epoch": 0.0005207836057320916, "grad_norm": 0.35188010334968567, "learning_rate": 1.038961038961039e-05, "loss": 1.0108, "step": 20 }, { "epoch": 0.0007811754085981374, "grad_norm": 0.2300374060869217, "learning_rate": 1.5584415584415583e-05, "loss": 0.9668, "step": 30 }, { "epoch": 0.0010415672114641832, "grad_norm": 0.16189467906951904, "learning_rate": 2.077922077922078e-05, "loss": 0.918, "step": 40 }, { "epoch": 0.001301959014330229, "grad_norm": 0.18843211233615875, "learning_rate": 2.5974025974025972e-05, "loss": 0.9265, "step": 50 }, { "epoch": 0.0015623508171962747, "grad_norm": 0.20334510505199432, "learning_rate": 3.1168831168831166e-05, "loss": 0.9234, "step": 60 }, { "epoch": 0.0018227426200623205, "grad_norm": 0.1745327115058899, "learning_rate": 3.6363636363636364e-05, "loss": 0.881, "step": 70 }, { "epoch": 0.0020831344229283663, "grad_norm": 0.18667331337928772, "learning_rate": 4.155844155844156e-05, "loss": 0.8592, "step": 80 }, { "epoch": 0.002343526225794412, "grad_norm": 0.1848158985376358, "learning_rate": 4.675324675324675e-05, "loss": 0.8537, "step": 90 }, { "epoch": 0.002603918028660458, "grad_norm": 0.17589879035949707, "learning_rate": 5.1948051948051944e-05, "loss": 0.8518, "step": 100 }, { "epoch": 0.0028643098315265037, "grad_norm": 0.2132624089717865, "learning_rate": 5.714285714285714e-05, "loss": 0.8511, "step": 110 }, { "epoch": 0.0031247016343925495, "grad_norm": 0.23070092499256134, "learning_rate": 6.233766233766233e-05, "loss": 0.7975, "step": 120 }, { "epoch": 0.0033850934372585953, "grad_norm": 0.25368157029151917, "learning_rate": 6.753246753246754e-05, "loss": 0.8134, "step": 130 }, { "epoch": 0.003645485240124641, "grad_norm": 0.22897231578826904, "learning_rate": 7.272727272727273e-05, "loss": 0.8322, "step": 140 }, { "epoch": 0.003905877042990687, "grad_norm": 0.19932536780834198, "learning_rate": 7.792207792207793e-05, "loss": 0.7959, "step": 150 }, { "epoch": 0.004166268845856733, "grad_norm": 0.21011792123317719, "learning_rate": 8.311688311688312e-05, "loss": 0.8102, "step": 160 }, { "epoch": 0.004426660648722778, "grad_norm": 0.20594824850559235, "learning_rate": 8.831168831168831e-05, "loss": 0.8128, "step": 170 }, { "epoch": 0.004687052451588824, "grad_norm": 0.20465536415576935, "learning_rate": 9.35064935064935e-05, "loss": 0.7989, "step": 180 }, { "epoch": 0.00494744425445487, "grad_norm": 0.4109392762184143, "learning_rate": 9.870129870129871e-05, "loss": 0.8108, "step": 190 }, { "epoch": 0.005207836057320916, "grad_norm": 0.4293076694011688, "learning_rate": 0.00010389610389610389, "loss": 0.8101, "step": 200 }, { "epoch": 0.005468227860186962, "grad_norm": 0.31628963351249695, "learning_rate": 0.00010909090909090909, "loss": 0.7989, "step": 210 }, { "epoch": 0.005728619663053007, "grad_norm": 0.24642810225486755, "learning_rate": 0.00011428571428571428, "loss": 0.7751, "step": 220 }, { "epoch": 0.005989011465919053, "grad_norm": 0.3599106967449188, "learning_rate": 0.00011948051948051949, "loss": 0.8063, "step": 230 }, { "epoch": 0.006249403268785099, "grad_norm": 0.17053447663784027, "learning_rate": 0.00012467532467532467, "loss": 0.7751, "step": 240 }, { "epoch": 0.006509795071651145, "grad_norm": 0.17303769290447235, "learning_rate": 0.00012987012987012987, "loss": 0.7883, "step": 250 }, { "epoch": 0.0067701868745171905, "grad_norm": 0.1815861016511917, "learning_rate": 0.00013506493506493507, "loss": 0.788, "step": 260 }, { "epoch": 0.007030578677383236, "grad_norm": 0.24125365912914276, "learning_rate": 0.00014025974025974028, "loss": 0.8018, "step": 270 }, { "epoch": 0.007290970480249282, "grad_norm": 0.19443446397781372, "learning_rate": 0.00014545454545454546, "loss": 0.7908, "step": 280 }, { "epoch": 0.007551362283115328, "grad_norm": 0.17829768359661102, "learning_rate": 0.00015064935064935066, "loss": 0.8033, "step": 290 }, { "epoch": 0.007811754085981374, "grad_norm": 0.19535653293132782, "learning_rate": 0.00015584415584415587, "loss": 0.7997, "step": 300 }, { "epoch": 0.008072145888847419, "grad_norm": 0.19930541515350342, "learning_rate": 0.00016103896103896104, "loss": 0.7945, "step": 310 }, { "epoch": 0.008332537691713465, "grad_norm": 0.2156297266483307, "learning_rate": 0.00016623376623376625, "loss": 0.8018, "step": 320 }, { "epoch": 0.00859292949457951, "grad_norm": 0.1924206018447876, "learning_rate": 0.00017142857142857143, "loss": 0.7746, "step": 330 }, { "epoch": 0.008853321297445557, "grad_norm": 0.2294880747795105, "learning_rate": 0.00017662337662337663, "loss": 0.8152, "step": 340 }, { "epoch": 0.009113713100311602, "grad_norm": 0.16817067563533783, "learning_rate": 0.00018181818181818183, "loss": 0.7972, "step": 350 }, { "epoch": 0.009374104903177648, "grad_norm": 0.18544812500476837, "learning_rate": 0.000187012987012987, "loss": 0.7801, "step": 360 }, { "epoch": 0.009634496706043693, "grad_norm": 0.19597066938877106, "learning_rate": 0.00019220779220779222, "loss": 0.7706, "step": 370 }, { "epoch": 0.00989488850890974, "grad_norm": 0.40291881561279297, "learning_rate": 0.00019740259740259742, "loss": 0.7911, "step": 380 }, { "epoch": 0.010155280311775785, "grad_norm": 0.23841074109077454, "learning_rate": 0.00019999996515752773, "loss": 0.7861, "step": 390 }, { "epoch": 0.010415672114641832, "grad_norm": 0.1675388514995575, "learning_rate": 0.00019999968641789507, "loss": 0.788, "step": 400 }, { "epoch": 0.010676063917507876, "grad_norm": 1.8860758543014526, "learning_rate": 0.0001999991289394067, "loss": 0.7632, "step": 410 }, { "epoch": 0.010936455720373923, "grad_norm": 0.17022117972373962, "learning_rate": 0.00019999829272361654, "loss": 0.784, "step": 420 }, { "epoch": 0.011196847523239968, "grad_norm": 0.21460269391536713, "learning_rate": 0.00019999717777285545, "loss": 0.761, "step": 430 }, { "epoch": 0.011457239326106015, "grad_norm": 0.19413785636425018, "learning_rate": 0.00019999578409023126, "loss": 0.7772, "step": 440 }, { "epoch": 0.01171763112897206, "grad_norm": 0.20223405957221985, "learning_rate": 0.00019999411167962868, "loss": 0.7811, "step": 450 }, { "epoch": 0.011978022931838106, "grad_norm": 0.15166303515434265, "learning_rate": 0.00019999216054570942, "loss": 0.7709, "step": 460 }, { "epoch": 0.012238414734704151, "grad_norm": 0.16307081282138824, "learning_rate": 0.00019998993069391205, "loss": 0.7811, "step": 470 }, { "epoch": 0.012498806537570198, "grad_norm": 0.15996049344539642, "learning_rate": 0.00019998742213045206, "loss": 0.7599, "step": 480 }, { "epoch": 0.012759198340436243, "grad_norm": 0.17560279369354248, "learning_rate": 0.00019998463486232179, "loss": 0.7572, "step": 490 }, { "epoch": 0.01301959014330229, "grad_norm": 0.17571642994880676, "learning_rate": 0.0001999815688972905, "loss": 0.7643, "step": 500 }, { "epoch": 0.013279981946168334, "grad_norm": 0.17719799280166626, "learning_rate": 0.00019997822424390422, "loss": 0.7923, "step": 510 }, { "epoch": 0.013540373749034381, "grad_norm": 0.19846616685390472, "learning_rate": 0.00019997460091148586, "loss": 0.7674, "step": 520 }, { "epoch": 0.013800765551900426, "grad_norm": 0.2715558111667633, "learning_rate": 0.00019997069891013503, "loss": 0.7421, "step": 530 }, { "epoch": 0.014061157354766473, "grad_norm": 0.1725197583436966, "learning_rate": 0.00019996651825072826, "loss": 0.7663, "step": 540 }, { "epoch": 0.014321549157632518, "grad_norm": 0.15060502290725708, "learning_rate": 0.00019996205894491856, "loss": 0.7794, "step": 550 }, { "epoch": 0.014581940960498564, "grad_norm": 0.16645808517932892, "learning_rate": 0.00019995732100513592, "loss": 0.752, "step": 560 }, { "epoch": 0.014842332763364609, "grad_norm": 0.1736789345741272, "learning_rate": 0.00019995230444458682, "loss": 0.7788, "step": 570 }, { "epoch": 0.015102724566230656, "grad_norm": 0.15416319668293, "learning_rate": 0.0001999470092772544, "loss": 0.7656, "step": 580 }, { "epoch": 0.0153631163690967, "grad_norm": 0.16610187292099, "learning_rate": 0.00019994143551789839, "loss": 0.7676, "step": 590 }, { "epoch": 0.015623508171962747, "grad_norm": 0.15843011438846588, "learning_rate": 0.00019993558318205507, "loss": 0.7746, "step": 600 }, { "epoch": 0.015883899974828794, "grad_norm": 0.26837801933288574, "learning_rate": 0.00019992945228603724, "loss": 0.7617, "step": 610 }, { "epoch": 0.016144291777694837, "grad_norm": 0.15099173784255981, "learning_rate": 0.0001999230428469341, "loss": 0.7601, "step": 620 }, { "epoch": 0.016404683580560884, "grad_norm": 0.15511856973171234, "learning_rate": 0.00019991635488261138, "loss": 0.7647, "step": 630 }, { "epoch": 0.01666507538342693, "grad_norm": 0.14919579029083252, "learning_rate": 0.00019990938841171104, "loss": 0.7692, "step": 640 }, { "epoch": 0.016925467186292977, "grad_norm": 0.15838642418384552, "learning_rate": 0.0001999021434536514, "loss": 0.7763, "step": 650 }, { "epoch": 0.01718585898915902, "grad_norm": 0.15956635773181915, "learning_rate": 0.00019989462002862704, "loss": 0.7598, "step": 660 }, { "epoch": 0.017446250792025067, "grad_norm": 0.1499069333076477, "learning_rate": 0.0001998868181576088, "loss": 0.7626, "step": 670 }, { "epoch": 0.017706642594891114, "grad_norm": 0.2170073390007019, "learning_rate": 0.00019987873786234348, "loss": 0.7569, "step": 680 }, { "epoch": 0.01796703439775716, "grad_norm": 0.17841948568820953, "learning_rate": 0.00019987037916535417, "loss": 0.7494, "step": 690 }, { "epoch": 0.018227426200623204, "grad_norm": 0.2066909372806549, "learning_rate": 0.0001998617420899398, "loss": 0.7609, "step": 700 }, { "epoch": 0.01848781800348925, "grad_norm": 0.17015361785888672, "learning_rate": 0.0001998528266601754, "loss": 0.7761, "step": 710 }, { "epoch": 0.018748209806355297, "grad_norm": 0.22166290879249573, "learning_rate": 0.0001998436329009118, "loss": 0.7573, "step": 720 }, { "epoch": 0.01900860160922134, "grad_norm": 0.15084640681743622, "learning_rate": 0.00019983416083777563, "loss": 0.7775, "step": 730 }, { "epoch": 0.019268993412087387, "grad_norm": 0.17800921201705933, "learning_rate": 0.0001998244104971693, "loss": 0.7359, "step": 740 }, { "epoch": 0.019529385214953433, "grad_norm": 0.17354707419872284, "learning_rate": 0.0001998143819062709, "loss": 0.7415, "step": 750 }, { "epoch": 0.01978977701781948, "grad_norm": 0.16408118605613708, "learning_rate": 0.00019980407509303413, "loss": 0.7708, "step": 760 }, { "epoch": 0.020050168820685523, "grad_norm": 0.16820089519023895, "learning_rate": 0.00019979349008618808, "loss": 0.791, "step": 770 }, { "epoch": 0.02031056062355157, "grad_norm": 0.15958388149738312, "learning_rate": 0.00019978262691523743, "loss": 0.7412, "step": 780 }, { "epoch": 0.020570952426417616, "grad_norm": 0.1646542251110077, "learning_rate": 0.00019977148561046217, "loss": 0.7529, "step": 790 }, { "epoch": 0.020831344229283663, "grad_norm": 0.17032025754451752, "learning_rate": 0.0001997600662029175, "loss": 0.7656, "step": 800 }, { "epoch": 0.021091736032149706, "grad_norm": 0.17189227044582367, "learning_rate": 0.00019974836872443388, "loss": 0.7433, "step": 810 }, { "epoch": 0.021352127835015753, "grad_norm": 0.16334249079227448, "learning_rate": 0.0001997363932076168, "loss": 0.7703, "step": 820 }, { "epoch": 0.0216125196378818, "grad_norm": 0.1676424890756607, "learning_rate": 0.00019972413968584682, "loss": 0.7603, "step": 830 }, { "epoch": 0.021872911440747846, "grad_norm": 0.16826209425926208, "learning_rate": 0.0001997116081932793, "loss": 0.7569, "step": 840 }, { "epoch": 0.02213330324361389, "grad_norm": 0.1876436173915863, "learning_rate": 0.0001996987987648446, "loss": 0.7553, "step": 850 }, { "epoch": 0.022393695046479936, "grad_norm": 0.17252250015735626, "learning_rate": 0.0001996857114362476, "loss": 0.7644, "step": 860 }, { "epoch": 0.022654086849345983, "grad_norm": 0.1632252335548401, "learning_rate": 0.00019967234624396793, "loss": 0.7568, "step": 870 }, { "epoch": 0.02291447865221203, "grad_norm": 0.1818259060382843, "learning_rate": 0.00019965870322525965, "loss": 0.7672, "step": 880 }, { "epoch": 0.023174870455078073, "grad_norm": 0.15418195724487305, "learning_rate": 0.0001996447824181513, "loss": 0.7642, "step": 890 }, { "epoch": 0.02343526225794412, "grad_norm": 0.17383505403995514, "learning_rate": 0.0001996305838614457, "loss": 0.7607, "step": 900 }, { "epoch": 0.023695654060810166, "grad_norm": 0.17794272303581238, "learning_rate": 0.00019961610759471984, "loss": 0.7588, "step": 910 }, { "epoch": 0.023956045863676213, "grad_norm": 0.1909121572971344, "learning_rate": 0.00019960135365832486, "loss": 0.7438, "step": 920 }, { "epoch": 0.024216437666542256, "grad_norm": 0.17758873105049133, "learning_rate": 0.00019958632209338587, "loss": 0.7323, "step": 930 }, { "epoch": 0.024476829469408302, "grad_norm": 0.15553662180900574, "learning_rate": 0.00019957101294180174, "loss": 0.7508, "step": 940 }, { "epoch": 0.02473722127227435, "grad_norm": 0.15310749411582947, "learning_rate": 0.00019955542624624522, "loss": 0.7563, "step": 950 }, { "epoch": 0.024997613075140396, "grad_norm": 0.1628728210926056, "learning_rate": 0.00019953956205016256, "loss": 0.7524, "step": 960 }, { "epoch": 0.02525800487800644, "grad_norm": 0.16211454570293427, "learning_rate": 0.00019952342039777362, "loss": 0.7564, "step": 970 }, { "epoch": 0.025518396680872486, "grad_norm": 0.15663012862205505, "learning_rate": 0.00019950700133407163, "loss": 0.7395, "step": 980 }, { "epoch": 0.025778788483738532, "grad_norm": 0.1684863567352295, "learning_rate": 0.00019949030490482296, "loss": 0.753, "step": 990 }, { "epoch": 0.02603918028660458, "grad_norm": 0.1561436653137207, "learning_rate": 0.0001994733311565673, "loss": 0.7409, "step": 1000 }, { "epoch": 0.026299572089470622, "grad_norm": 0.1781485229730606, "learning_rate": 0.0001994560801366171, "loss": 0.762, "step": 1010 }, { "epoch": 0.02655996389233667, "grad_norm": 0.15422071516513824, "learning_rate": 0.00019943855189305792, "loss": 0.7291, "step": 1020 }, { "epoch": 0.026820355695202715, "grad_norm": 0.17980527877807617, "learning_rate": 0.00019942074647474786, "loss": 0.7732, "step": 1030 }, { "epoch": 0.027080747498068762, "grad_norm": 0.15810626745224, "learning_rate": 0.00019940266393131775, "loss": 0.7764, "step": 1040 }, { "epoch": 0.027341139300934805, "grad_norm": 0.16385480761528015, "learning_rate": 0.00019938430431317081, "loss": 0.7404, "step": 1050 }, { "epoch": 0.027601531103800852, "grad_norm": 0.15134255588054657, "learning_rate": 0.00019936566767148257, "loss": 0.7506, "step": 1060 }, { "epoch": 0.0278619229066669, "grad_norm": 0.1592187136411667, "learning_rate": 0.00019934675405820077, "loss": 0.73, "step": 1070 }, { "epoch": 0.028122314709532945, "grad_norm": 0.16852422058582306, "learning_rate": 0.00019932756352604515, "loss": 0.7443, "step": 1080 }, { "epoch": 0.02838270651239899, "grad_norm": 0.15741507709026337, "learning_rate": 0.00019930809612850735, "loss": 0.7377, "step": 1090 }, { "epoch": 0.028643098315265035, "grad_norm": 0.22424879670143127, "learning_rate": 0.00019928835191985076, "loss": 0.7544, "step": 1100 }, { "epoch": 0.028903490118131082, "grad_norm": 0.2047310769557953, "learning_rate": 0.0001992683309551103, "loss": 0.7441, "step": 1110 }, { "epoch": 0.02916388192099713, "grad_norm": 0.16392463445663452, "learning_rate": 0.00019924803329009243, "loss": 0.7606, "step": 1120 }, { "epoch": 0.02942427372386317, "grad_norm": 0.16227149963378906, "learning_rate": 0.00019922745898137473, "loss": 0.736, "step": 1130 }, { "epoch": 0.029684665526729218, "grad_norm": 0.15652808547019958, "learning_rate": 0.00019920660808630598, "loss": 0.7513, "step": 1140 }, { "epoch": 0.029945057329595265, "grad_norm": 0.15162768959999084, "learning_rate": 0.00019918548066300592, "loss": 0.7303, "step": 1150 }, { "epoch": 0.03020544913246131, "grad_norm": 0.17650415003299713, "learning_rate": 0.0001991640767703651, "loss": 0.7254, "step": 1160 }, { "epoch": 0.030465840935327355, "grad_norm": 0.1594468355178833, "learning_rate": 0.00019914239646804462, "loss": 0.741, "step": 1170 }, { "epoch": 0.0307262327381934, "grad_norm": 0.17928367853164673, "learning_rate": 0.00019912043981647616, "loss": 0.7515, "step": 1180 }, { "epoch": 0.030986624541059448, "grad_norm": 0.17009998857975006, "learning_rate": 0.00019909820687686157, "loss": 0.7539, "step": 1190 }, { "epoch": 0.031247016343925495, "grad_norm": 0.16556763648986816, "learning_rate": 0.0001990756977111729, "loss": 0.7418, "step": 1200 }, { "epoch": 0.03150740814679154, "grad_norm": 0.1561640352010727, "learning_rate": 0.0001990529123821522, "loss": 0.7465, "step": 1210 }, { "epoch": 0.03176779994965759, "grad_norm": 0.15182287991046906, "learning_rate": 0.00019902985095331113, "loss": 0.7694, "step": 1220 }, { "epoch": 0.03202819175252363, "grad_norm": 0.15173685550689697, "learning_rate": 0.00019900651348893114, "loss": 0.7519, "step": 1230 }, { "epoch": 0.032288583555389674, "grad_norm": 0.16535787284374237, "learning_rate": 0.00019898290005406296, "loss": 0.7646, "step": 1240 }, { "epoch": 0.032548975358255725, "grad_norm": 0.19272534549236298, "learning_rate": 0.00019895901071452667, "loss": 0.7655, "step": 1250 }, { "epoch": 0.03280936716112177, "grad_norm": 0.1672705113887787, "learning_rate": 0.0001989348455369113, "loss": 0.7486, "step": 1260 }, { "epoch": 0.03306975896398781, "grad_norm": 0.1525493860244751, "learning_rate": 0.0001989104045885748, "loss": 0.7546, "step": 1270 }, { "epoch": 0.03333015076685386, "grad_norm": 0.16333037614822388, "learning_rate": 0.00019888568793764385, "loss": 0.7299, "step": 1280 }, { "epoch": 0.033590542569719904, "grad_norm": 0.1590205729007721, "learning_rate": 0.00019886069565301355, "loss": 0.762, "step": 1290 }, { "epoch": 0.033850934372585954, "grad_norm": 0.15006420016288757, "learning_rate": 0.00019883542780434733, "loss": 0.7531, "step": 1300 }, { "epoch": 0.034111326175452, "grad_norm": 0.18390792608261108, "learning_rate": 0.0001988098844620767, "loss": 0.7621, "step": 1310 }, { "epoch": 0.03437171797831804, "grad_norm": 0.17046166956424713, "learning_rate": 0.0001987840656974011, "loss": 0.7422, "step": 1320 }, { "epoch": 0.03463210978118409, "grad_norm": 0.15121813118457794, "learning_rate": 0.00019875797158228775, "loss": 0.7555, "step": 1330 }, { "epoch": 0.034892501584050134, "grad_norm": 0.16219307482242584, "learning_rate": 0.00019873160218947125, "loss": 0.7301, "step": 1340 }, { "epoch": 0.03515289338691618, "grad_norm": 0.1779986321926117, "learning_rate": 0.00019870495759245362, "loss": 0.7356, "step": 1350 }, { "epoch": 0.03541328518978223, "grad_norm": 0.16951359808444977, "learning_rate": 0.0001986780378655039, "loss": 0.7645, "step": 1360 }, { "epoch": 0.03567367699264827, "grad_norm": 0.16620802879333496, "learning_rate": 0.0001986508430836581, "loss": 0.7331, "step": 1370 }, { "epoch": 0.03593406879551432, "grad_norm": 0.1577858328819275, "learning_rate": 0.0001986233733227188, "loss": 0.7667, "step": 1380 }, { "epoch": 0.036194460598380364, "grad_norm": 0.1637091338634491, "learning_rate": 0.00019859562865925525, "loss": 0.7521, "step": 1390 }, { "epoch": 0.03645485240124641, "grad_norm": 0.15061691403388977, "learning_rate": 0.00019856760917060277, "loss": 0.744, "step": 1400 }, { "epoch": 0.03671524420411246, "grad_norm": 0.15373477339744568, "learning_rate": 0.00019853931493486287, "loss": 0.7677, "step": 1410 }, { "epoch": 0.0369756360069785, "grad_norm": 0.16468606889247894, "learning_rate": 0.00019851074603090277, "loss": 0.7179, "step": 1420 }, { "epoch": 0.037236027809844544, "grad_norm": 0.16084876656532288, "learning_rate": 0.00019848190253835536, "loss": 0.749, "step": 1430 }, { "epoch": 0.037496419612710594, "grad_norm": 0.16743004322052002, "learning_rate": 0.00019845278453761896, "loss": 0.7483, "step": 1440 }, { "epoch": 0.03775681141557664, "grad_norm": 0.17335088551044464, "learning_rate": 0.00019842339210985696, "loss": 0.735, "step": 1450 }, { "epoch": 0.03801720321844268, "grad_norm": 0.1546197235584259, "learning_rate": 0.00019839372533699774, "loss": 0.7549, "step": 1460 }, { "epoch": 0.03827759502130873, "grad_norm": 0.16218656301498413, "learning_rate": 0.00019836378430173438, "loss": 0.7425, "step": 1470 }, { "epoch": 0.03853798682417477, "grad_norm": 0.1712743639945984, "learning_rate": 0.0001983335690875245, "loss": 0.733, "step": 1480 }, { "epoch": 0.03879837862704082, "grad_norm": 0.15490613877773285, "learning_rate": 0.00019830307977858984, "loss": 0.7265, "step": 1490 }, { "epoch": 0.03905877042990687, "grad_norm": 0.1646670252084732, "learning_rate": 0.00019827231645991623, "loss": 0.7315, "step": 1500 }, { "epoch": 0.03931916223277291, "grad_norm": 0.1599082201719284, "learning_rate": 0.00019824127921725326, "loss": 0.7293, "step": 1510 }, { "epoch": 0.03957955403563896, "grad_norm": 0.1565747708082199, "learning_rate": 0.00019820996813711407, "loss": 0.7396, "step": 1520 }, { "epoch": 0.039839945838505, "grad_norm": 0.154826357960701, "learning_rate": 0.0001981783833067751, "loss": 0.7217, "step": 1530 }, { "epoch": 0.040100337641371046, "grad_norm": 0.16705222427845, "learning_rate": 0.0001981465248142758, "loss": 0.761, "step": 1540 }, { "epoch": 0.040360729444237096, "grad_norm": 0.15651623904705048, "learning_rate": 0.00019811439274841842, "loss": 0.7565, "step": 1550 }, { "epoch": 0.04062112124710314, "grad_norm": 0.16211090981960297, "learning_rate": 0.00019808198719876782, "loss": 0.7555, "step": 1560 }, { "epoch": 0.04088151304996919, "grad_norm": 0.16856881976127625, "learning_rate": 0.00019804930825565112, "loss": 0.7567, "step": 1570 }, { "epoch": 0.04114190485283523, "grad_norm": 0.1588718593120575, "learning_rate": 0.00019801635601015752, "loss": 0.729, "step": 1580 }, { "epoch": 0.041402296655701276, "grad_norm": 0.17078711092472076, "learning_rate": 0.00019798313055413808, "loss": 0.7418, "step": 1590 }, { "epoch": 0.041662688458567326, "grad_norm": 0.16652734577655792, "learning_rate": 0.00019794963198020525, "loss": 0.7341, "step": 1600 }, { "epoch": 0.04192308026143337, "grad_norm": 0.15535488724708557, "learning_rate": 0.00019791586038173296, "loss": 0.7396, "step": 1610 }, { "epoch": 0.04218347206429941, "grad_norm": 0.3506317734718323, "learning_rate": 0.00019788181585285602, "loss": 0.7345, "step": 1620 }, { "epoch": 0.04244386386716546, "grad_norm": 0.16875872015953064, "learning_rate": 0.00019784749848847003, "loss": 0.7214, "step": 1630 }, { "epoch": 0.042704255670031506, "grad_norm": 0.17675861716270447, "learning_rate": 0.0001978129083842312, "loss": 0.7431, "step": 1640 }, { "epoch": 0.042964647472897556, "grad_norm": 0.15601837635040283, "learning_rate": 0.00019777804563655583, "loss": 0.7215, "step": 1650 }, { "epoch": 0.0432250392757636, "grad_norm": 0.1874823123216629, "learning_rate": 0.00019774291034262026, "loss": 0.727, "step": 1660 }, { "epoch": 0.04348543107862964, "grad_norm": 0.17005637288093567, "learning_rate": 0.00019770750260036054, "loss": 0.7446, "step": 1670 }, { "epoch": 0.04374582288149569, "grad_norm": 0.17069579660892487, "learning_rate": 0.00019767182250847207, "loss": 0.7266, "step": 1680 }, { "epoch": 0.044006214684361736, "grad_norm": 0.16133156418800354, "learning_rate": 0.00019763587016640948, "loss": 0.7568, "step": 1690 }, { "epoch": 0.04426660648722778, "grad_norm": 0.16229428350925446, "learning_rate": 0.00019759964567438623, "loss": 0.7402, "step": 1700 }, { "epoch": 0.04452699829009383, "grad_norm": 0.1622512936592102, "learning_rate": 0.00019756314913337432, "loss": 0.7536, "step": 1710 }, { "epoch": 0.04478739009295987, "grad_norm": 0.2161218672990799, "learning_rate": 0.00019752638064510415, "loss": 0.723, "step": 1720 }, { "epoch": 0.04504778189582592, "grad_norm": 0.154169961810112, "learning_rate": 0.00019748934031206414, "loss": 0.7441, "step": 1730 }, { "epoch": 0.045308173698691966, "grad_norm": 0.15468057990074158, "learning_rate": 0.00019745202823750034, "loss": 0.7349, "step": 1740 }, { "epoch": 0.04556856550155801, "grad_norm": 0.2015281468629837, "learning_rate": 0.0001974144445254164, "loss": 0.726, "step": 1750 }, { "epoch": 0.04582895730442406, "grad_norm": 0.1931644082069397, "learning_rate": 0.00019737658928057302, "loss": 0.7604, "step": 1760 }, { "epoch": 0.0460893491072901, "grad_norm": 0.1528482288122177, "learning_rate": 0.00019733846260848776, "loss": 0.7408, "step": 1770 }, { "epoch": 0.046349740910156145, "grad_norm": 0.16370061039924622, "learning_rate": 0.0001973000646154349, "loss": 0.7647, "step": 1780 }, { "epoch": 0.046610132713022195, "grad_norm": 0.16271348297595978, "learning_rate": 0.00019726139540844484, "loss": 0.7212, "step": 1790 }, { "epoch": 0.04687052451588824, "grad_norm": 0.16218173503875732, "learning_rate": 0.00019722245509530401, "loss": 0.735, "step": 1800 }, { "epoch": 0.04713091631875429, "grad_norm": 0.17063820362091064, "learning_rate": 0.00019718324378455458, "loss": 0.7311, "step": 1810 }, { "epoch": 0.04739130812162033, "grad_norm": 0.1678459346294403, "learning_rate": 0.00019714376158549404, "loss": 0.7486, "step": 1820 }, { "epoch": 0.047651699924486375, "grad_norm": 0.15926459431648254, "learning_rate": 0.00019710400860817494, "loss": 0.743, "step": 1830 }, { "epoch": 0.047912091727352425, "grad_norm": 0.1775251179933548, "learning_rate": 0.00019706398496340463, "loss": 0.7512, "step": 1840 }, { "epoch": 0.04817248353021847, "grad_norm": 0.1572408229112625, "learning_rate": 0.00019702369076274494, "loss": 0.733, "step": 1850 }, { "epoch": 0.04843287533308451, "grad_norm": 0.29658186435699463, "learning_rate": 0.0001969831261185118, "loss": 0.7297, "step": 1860 }, { "epoch": 0.04869326713595056, "grad_norm": 0.16520118713378906, "learning_rate": 0.00019694229114377494, "loss": 0.721, "step": 1870 }, { "epoch": 0.048953658938816605, "grad_norm": 0.17762574553489685, "learning_rate": 0.00019690118595235774, "loss": 0.7304, "step": 1880 }, { "epoch": 0.049214050741682655, "grad_norm": 0.16636615991592407, "learning_rate": 0.00019685981065883663, "loss": 0.7257, "step": 1890 }, { "epoch": 0.0494744425445487, "grad_norm": 0.1622323989868164, "learning_rate": 0.00019681816537854102, "loss": 0.7353, "step": 1900 }, { "epoch": 0.04973483434741474, "grad_norm": 0.17419832944869995, "learning_rate": 0.00019677625022755289, "loss": 0.7452, "step": 1910 }, { "epoch": 0.04999522615028079, "grad_norm": 0.17460434138774872, "learning_rate": 0.00019673406532270634, "loss": 0.7391, "step": 1920 }, { "epoch": 0.050255617953146835, "grad_norm": 0.15844550728797913, "learning_rate": 0.00019669161078158753, "loss": 0.7327, "step": 1930 }, { "epoch": 0.05051600975601288, "grad_norm": 0.1638839989900589, "learning_rate": 0.0001966488867225341, "loss": 0.745, "step": 1940 }, { "epoch": 0.05077640155887893, "grad_norm": 0.1587786227464676, "learning_rate": 0.00019660589326463498, "loss": 0.7476, "step": 1950 }, { "epoch": 0.05103679336174497, "grad_norm": 0.15708380937576294, "learning_rate": 0.00019656263052773002, "loss": 0.7208, "step": 1960 }, { "epoch": 0.05129718516461102, "grad_norm": 0.15816234052181244, "learning_rate": 0.00019651909863240965, "loss": 0.7262, "step": 1970 }, { "epoch": 0.051557576967477065, "grad_norm": 0.16749270260334015, "learning_rate": 0.00019647529770001456, "loss": 0.7284, "step": 1980 }, { "epoch": 0.05181796877034311, "grad_norm": 0.16943767666816711, "learning_rate": 0.00019643122785263536, "loss": 0.7225, "step": 1990 }, { "epoch": 0.05207836057320916, "grad_norm": 0.42929205298423767, "learning_rate": 0.00019638688921311224, "loss": 0.7305, "step": 2000 }, { "epoch": 0.0523387523760752, "grad_norm": 0.15851692855358124, "learning_rate": 0.00019634228190503454, "loss": 0.7344, "step": 2010 }, { "epoch": 0.052599144178941244, "grad_norm": 0.16053883731365204, "learning_rate": 0.00019629740605274062, "loss": 0.7468, "step": 2020 }, { "epoch": 0.052859535981807294, "grad_norm": 0.16504009068012238, "learning_rate": 0.00019625226178131728, "loss": 0.7375, "step": 2030 }, { "epoch": 0.05311992778467334, "grad_norm": 0.1618044674396515, "learning_rate": 0.00019620684921659953, "loss": 0.7201, "step": 2040 }, { "epoch": 0.05338031958753939, "grad_norm": 0.15512776374816895, "learning_rate": 0.00019616116848517027, "loss": 0.7355, "step": 2050 }, { "epoch": 0.05364071139040543, "grad_norm": 0.17377036809921265, "learning_rate": 0.00019611521971435979, "loss": 0.7226, "step": 2060 }, { "epoch": 0.053901103193271474, "grad_norm": 0.1685250997543335, "learning_rate": 0.0001960690030322456, "loss": 0.7483, "step": 2070 }, { "epoch": 0.054161494996137524, "grad_norm": 0.18394522368907928, "learning_rate": 0.00019602251856765194, "loss": 0.7385, "step": 2080 }, { "epoch": 0.05442188679900357, "grad_norm": 0.1753673106431961, "learning_rate": 0.0001959757664501495, "loss": 0.7378, "step": 2090 }, { "epoch": 0.05468227860186961, "grad_norm": 0.1795465052127838, "learning_rate": 0.000195928746810055, "loss": 0.748, "step": 2100 }, { "epoch": 0.05494267040473566, "grad_norm": 0.16327305138111115, "learning_rate": 0.0001958814597784309, "loss": 0.7306, "step": 2110 }, { "epoch": 0.055203062207601704, "grad_norm": 0.15880291163921356, "learning_rate": 0.00019583390548708486, "loss": 0.7281, "step": 2120 }, { "epoch": 0.05546345401046775, "grad_norm": 0.1702323853969574, "learning_rate": 0.0001957860840685696, "loss": 0.7407, "step": 2130 }, { "epoch": 0.0557238458133338, "grad_norm": 0.16931670904159546, "learning_rate": 0.0001957379956561825, "loss": 0.7272, "step": 2140 }, { "epoch": 0.05598423761619984, "grad_norm": 0.15455976128578186, "learning_rate": 0.000195689640383965, "loss": 0.7398, "step": 2150 }, { "epoch": 0.05624462941906589, "grad_norm": 0.16061417758464813, "learning_rate": 0.0001956410183867024, "loss": 0.749, "step": 2160 }, { "epoch": 0.056505021221931934, "grad_norm": 0.14933143556118011, "learning_rate": 0.00019559212979992365, "loss": 0.7418, "step": 2170 }, { "epoch": 0.05676541302479798, "grad_norm": 0.1592816412448883, "learning_rate": 0.00019554297475990058, "loss": 0.7423, "step": 2180 }, { "epoch": 0.05702580482766403, "grad_norm": 0.1677238792181015, "learning_rate": 0.00019549355340364787, "loss": 0.7101, "step": 2190 }, { "epoch": 0.05728619663053007, "grad_norm": 0.3558599054813385, "learning_rate": 0.00019544386586892238, "loss": 0.725, "step": 2200 }, { "epoch": 0.05754658843339611, "grad_norm": 0.1746376007795334, "learning_rate": 0.00019539391229422313, "loss": 0.7479, "step": 2210 }, { "epoch": 0.057806980236262163, "grad_norm": 0.15979182720184326, "learning_rate": 0.00019534369281879049, "loss": 0.7352, "step": 2220 }, { "epoch": 0.05806737203912821, "grad_norm": 0.16173166036605835, "learning_rate": 0.0001952932075826061, "loss": 0.7364, "step": 2230 }, { "epoch": 0.05832776384199426, "grad_norm": 0.1514744907617569, "learning_rate": 0.00019524245672639245, "loss": 0.734, "step": 2240 }, { "epoch": 0.0585881556448603, "grad_norm": 0.16860373318195343, "learning_rate": 0.00019519144039161222, "loss": 0.7098, "step": 2250 }, { "epoch": 0.05884854744772634, "grad_norm": 0.16847743093967438, "learning_rate": 0.00019514015872046833, "loss": 0.7103, "step": 2260 }, { "epoch": 0.05910893925059239, "grad_norm": 0.16181516647338867, "learning_rate": 0.00019508861185590307, "loss": 0.7561, "step": 2270 }, { "epoch": 0.059369331053458436, "grad_norm": 0.16594484448432922, "learning_rate": 0.0001950367999415981, "loss": 0.7308, "step": 2280 }, { "epoch": 0.05962972285632448, "grad_norm": 0.166441410779953, "learning_rate": 0.00019498472312197375, "loss": 0.735, "step": 2290 }, { "epoch": 0.05989011465919053, "grad_norm": 0.16273920238018036, "learning_rate": 0.00019493238154218886, "loss": 0.7458, "step": 2300 }, { "epoch": 0.06015050646205657, "grad_norm": 0.16227276623249054, "learning_rate": 0.00019487977534814012, "loss": 0.7143, "step": 2310 }, { "epoch": 0.06041089826492262, "grad_norm": 0.1619606912136078, "learning_rate": 0.000194826904686462, "loss": 0.7285, "step": 2320 }, { "epoch": 0.060671290067788666, "grad_norm": 0.1596045345067978, "learning_rate": 0.00019477376970452603, "loss": 0.7513, "step": 2330 }, { "epoch": 0.06093168187065471, "grad_norm": 0.17504757642745972, "learning_rate": 0.00019472037055044044, "loss": 0.7376, "step": 2340 }, { "epoch": 0.06119207367352076, "grad_norm": 0.1559167355298996, "learning_rate": 0.00019466670737304992, "loss": 0.7339, "step": 2350 }, { "epoch": 0.0614524654763868, "grad_norm": 0.1624836027622223, "learning_rate": 0.0001946127803219351, "loss": 0.7258, "step": 2360 }, { "epoch": 0.061712857279252846, "grad_norm": 0.17907138168811798, "learning_rate": 0.00019455858954741206, "loss": 0.72, "step": 2370 }, { "epoch": 0.061973249082118896, "grad_norm": 0.15922705829143524, "learning_rate": 0.00019450413520053202, "loss": 0.7187, "step": 2380 }, { "epoch": 0.06223364088498494, "grad_norm": 0.1552513986825943, "learning_rate": 0.0001944494174330809, "loss": 0.7183, "step": 2390 }, { "epoch": 0.06249403268785099, "grad_norm": 0.16838514804840088, "learning_rate": 0.00019439443639757885, "loss": 0.7286, "step": 2400 }, { "epoch": 0.06275442449071703, "grad_norm": 0.17352423071861267, "learning_rate": 0.00019433919224727986, "loss": 0.7436, "step": 2410 }, { "epoch": 0.06301481629358308, "grad_norm": 0.17366603016853333, "learning_rate": 0.0001942836851361713, "loss": 0.7265, "step": 2420 }, { "epoch": 0.06327520809644913, "grad_norm": 0.14833413064479828, "learning_rate": 0.00019422791521897357, "loss": 0.7234, "step": 2430 }, { "epoch": 0.06353559989931518, "grad_norm": 0.16602723300457, "learning_rate": 0.00019417188265113958, "loss": 0.725, "step": 2440 }, { "epoch": 0.06379599170218121, "grad_norm": 0.17290353775024414, "learning_rate": 0.00019411558758885438, "loss": 0.7174, "step": 2450 }, { "epoch": 0.06405638350504726, "grad_norm": 0.16486665606498718, "learning_rate": 0.0001940590301890346, "loss": 0.7301, "step": 2460 }, { "epoch": 0.06431677530791331, "grad_norm": 0.16255232691764832, "learning_rate": 0.00019400221060932827, "loss": 0.7462, "step": 2470 }, { "epoch": 0.06457716711077935, "grad_norm": 0.16139757633209229, "learning_rate": 0.0001939451290081141, "loss": 0.7424, "step": 2480 }, { "epoch": 0.0648375589136454, "grad_norm": 0.165597602725029, "learning_rate": 0.00019388778554450117, "loss": 0.7426, "step": 2490 }, { "epoch": 0.06509795071651145, "grad_norm": 0.19819000363349915, "learning_rate": 0.00019383018037832854, "loss": 0.7356, "step": 2500 }, { "epoch": 0.06535834251937749, "grad_norm": 0.16469696164131165, "learning_rate": 0.00019377231367016467, "loss": 0.718, "step": 2510 }, { "epoch": 0.06561873432224354, "grad_norm": 0.1644965261220932, "learning_rate": 0.00019371418558130702, "loss": 0.7253, "step": 2520 }, { "epoch": 0.06587912612510959, "grad_norm": 0.15347526967525482, "learning_rate": 0.00019365579627378174, "loss": 0.7214, "step": 2530 }, { "epoch": 0.06613951792797562, "grad_norm": 0.1618672013282776, "learning_rate": 0.00019359714591034302, "loss": 0.7204, "step": 2540 }, { "epoch": 0.06639990973084167, "grad_norm": 0.17043665051460266, "learning_rate": 0.00019353823465447268, "loss": 0.7278, "step": 2550 }, { "epoch": 0.06666030153370772, "grad_norm": 0.15762579441070557, "learning_rate": 0.00019347906267037983, "loss": 0.7283, "step": 2560 }, { "epoch": 0.06692069333657376, "grad_norm": 0.1622801572084427, "learning_rate": 0.00019341963012300029, "loss": 0.7193, "step": 2570 }, { "epoch": 0.06718108513943981, "grad_norm": 0.16705769300460815, "learning_rate": 0.00019335993717799617, "loss": 0.7414, "step": 2580 }, { "epoch": 0.06744147694230586, "grad_norm": 0.15886452794075012, "learning_rate": 0.00019329998400175545, "loss": 0.7242, "step": 2590 }, { "epoch": 0.06770186874517191, "grad_norm": 0.17994090914726257, "learning_rate": 0.00019323977076139142, "loss": 0.7017, "step": 2600 }, { "epoch": 0.06796226054803794, "grad_norm": 0.1609068214893341, "learning_rate": 0.00019317929762474232, "loss": 0.7352, "step": 2610 }, { "epoch": 0.068222652350904, "grad_norm": 0.15605950355529785, "learning_rate": 0.0001931185647603708, "loss": 0.7249, "step": 2620 }, { "epoch": 0.06848304415377005, "grad_norm": 0.16057750582695007, "learning_rate": 0.00019305757233756352, "loss": 0.7521, "step": 2630 }, { "epoch": 0.06874343595663608, "grad_norm": 0.1703862100839615, "learning_rate": 0.00019299632052633054, "loss": 0.7245, "step": 2640 }, { "epoch": 0.06900382775950213, "grad_norm": 0.16324444115161896, "learning_rate": 0.00019293480949740505, "loss": 0.7395, "step": 2650 }, { "epoch": 0.06926421956236818, "grad_norm": 0.15283791720867157, "learning_rate": 0.00019287303942224266, "loss": 0.7158, "step": 2660 }, { "epoch": 0.06952461136523422, "grad_norm": 0.1882282942533493, "learning_rate": 0.00019281101047302114, "loss": 0.724, "step": 2670 }, { "epoch": 0.06978500316810027, "grad_norm": 0.16147953271865845, "learning_rate": 0.00019274872282263984, "loss": 0.7365, "step": 2680 }, { "epoch": 0.07004539497096632, "grad_norm": 0.1614103466272354, "learning_rate": 0.00019268617664471916, "loss": 0.7206, "step": 2690 }, { "epoch": 0.07030578677383235, "grad_norm": 0.16784432530403137, "learning_rate": 0.00019262337211360016, "loss": 0.7279, "step": 2700 }, { "epoch": 0.0705661785766984, "grad_norm": 0.15966112911701202, "learning_rate": 0.000192560309404344, "loss": 0.7274, "step": 2710 }, { "epoch": 0.07082657037956445, "grad_norm": 0.16970521211624146, "learning_rate": 0.0001924969886927315, "loss": 0.7038, "step": 2720 }, { "epoch": 0.07108696218243049, "grad_norm": 0.16143856942653656, "learning_rate": 0.00019243341015526272, "loss": 0.7097, "step": 2730 }, { "epoch": 0.07134735398529654, "grad_norm": 0.16041269898414612, "learning_rate": 0.00019236957396915623, "loss": 0.722, "step": 2740 }, { "epoch": 0.07160774578816259, "grad_norm": 0.15845969319343567, "learning_rate": 0.00019230548031234882, "loss": 0.7238, "step": 2750 }, { "epoch": 0.07186813759102864, "grad_norm": 0.14966030418872833, "learning_rate": 0.00019224112936349502, "loss": 0.7182, "step": 2760 }, { "epoch": 0.07212852939389468, "grad_norm": 0.16525116562843323, "learning_rate": 0.00019217652130196653, "loss": 0.7397, "step": 2770 }, { "epoch": 0.07238892119676073, "grad_norm": 0.18119119107723236, "learning_rate": 0.0001921116563078516, "loss": 0.7222, "step": 2780 }, { "epoch": 0.07264931299962678, "grad_norm": 0.1709197610616684, "learning_rate": 0.00019204653456195478, "loss": 0.7068, "step": 2790 }, { "epoch": 0.07290970480249281, "grad_norm": 0.16309161484241486, "learning_rate": 0.00019198115624579625, "loss": 0.7349, "step": 2800 }, { "epoch": 0.07317009660535886, "grad_norm": 0.1736750453710556, "learning_rate": 0.00019191552154161135, "loss": 0.7445, "step": 2810 }, { "epoch": 0.07343048840822491, "grad_norm": 0.15009112656116486, "learning_rate": 0.00019184963063235006, "loss": 0.7034, "step": 2820 }, { "epoch": 0.07369088021109095, "grad_norm": 0.17244628071784973, "learning_rate": 0.0001917834837016766, "loss": 0.7285, "step": 2830 }, { "epoch": 0.073951272013957, "grad_norm": 0.15991820394992828, "learning_rate": 0.00019171708093396861, "loss": 0.7096, "step": 2840 }, { "epoch": 0.07421166381682305, "grad_norm": 0.17037667334079742, "learning_rate": 0.0001916504225143171, "loss": 0.7177, "step": 2850 }, { "epoch": 0.07447205561968909, "grad_norm": 0.16700348258018494, "learning_rate": 0.00019158350862852553, "loss": 0.7453, "step": 2860 }, { "epoch": 0.07473244742255514, "grad_norm": 0.17683659493923187, "learning_rate": 0.00019151633946310948, "loss": 0.7331, "step": 2870 }, { "epoch": 0.07499283922542119, "grad_norm": 0.16364306211471558, "learning_rate": 0.00019144891520529608, "loss": 0.7347, "step": 2880 }, { "epoch": 0.07525323102828722, "grad_norm": 0.1781424731016159, "learning_rate": 0.00019138123604302355, "loss": 0.7169, "step": 2890 }, { "epoch": 0.07551362283115327, "grad_norm": 0.16007259488105774, "learning_rate": 0.00019131330216494064, "loss": 0.7269, "step": 2900 }, { "epoch": 0.07577401463401932, "grad_norm": 0.1604921519756317, "learning_rate": 0.00019124511376040598, "loss": 0.7094, "step": 2910 }, { "epoch": 0.07603440643688536, "grad_norm": 0.16649965941905975, "learning_rate": 0.00019117667101948782, "loss": 0.7271, "step": 2920 }, { "epoch": 0.07629479823975141, "grad_norm": 0.16084066033363342, "learning_rate": 0.0001911079741329632, "loss": 0.7239, "step": 2930 }, { "epoch": 0.07655519004261746, "grad_norm": 0.1651066243648529, "learning_rate": 0.0001910390232923177, "loss": 0.7304, "step": 2940 }, { "epoch": 0.07681558184548351, "grad_norm": 0.1528957635164261, "learning_rate": 0.00019096981868974467, "loss": 0.7068, "step": 2950 }, { "epoch": 0.07707597364834955, "grad_norm": 0.172830730676651, "learning_rate": 0.00019090036051814483, "loss": 0.7277, "step": 2960 }, { "epoch": 0.0773363654512156, "grad_norm": 0.15909147262573242, "learning_rate": 0.00019083064897112571, "loss": 0.7135, "step": 2970 }, { "epoch": 0.07759675725408165, "grad_norm": 0.16273066401481628, "learning_rate": 0.0001907606842430011, "loss": 0.7346, "step": 2980 }, { "epoch": 0.07785714905694768, "grad_norm": 0.1595291793346405, "learning_rate": 0.00019069046652879049, "loss": 0.7377, "step": 2990 }, { "epoch": 0.07811754085981373, "grad_norm": 0.15573470294475555, "learning_rate": 0.0001906199960242185, "loss": 0.7026, "step": 3000 }, { "epoch": 0.07837793266267978, "grad_norm": 0.1670667678117752, "learning_rate": 0.0001905492729257145, "loss": 0.7231, "step": 3010 }, { "epoch": 0.07863832446554582, "grad_norm": 0.17074571549892426, "learning_rate": 0.00019047829743041184, "loss": 0.7003, "step": 3020 }, { "epoch": 0.07889871626841187, "grad_norm": 0.16979442536830902, "learning_rate": 0.00019040706973614738, "loss": 0.7217, "step": 3030 }, { "epoch": 0.07915910807127792, "grad_norm": 0.15843816101551056, "learning_rate": 0.00019033559004146103, "loss": 0.7334, "step": 3040 }, { "epoch": 0.07941949987414396, "grad_norm": 0.1607016921043396, "learning_rate": 0.0001902638585455951, "loss": 0.7271, "step": 3050 }, { "epoch": 0.07967989167701, "grad_norm": 0.1619115173816681, "learning_rate": 0.0001901918754484938, "loss": 0.7144, "step": 3060 }, { "epoch": 0.07994028347987606, "grad_norm": 0.1638360172510147, "learning_rate": 0.00019011964095080254, "loss": 0.7149, "step": 3070 }, { "epoch": 0.08020067528274209, "grad_norm": 0.16503652930259705, "learning_rate": 0.00019004715525386764, "loss": 0.7011, "step": 3080 }, { "epoch": 0.08046106708560814, "grad_norm": 0.16763822734355927, "learning_rate": 0.00018997441855973552, "loss": 0.7145, "step": 3090 }, { "epoch": 0.08072145888847419, "grad_norm": 0.1621125340461731, "learning_rate": 0.0001899014310711522, "loss": 0.7318, "step": 3100 }, { "epoch": 0.08098185069134024, "grad_norm": 0.16480112075805664, "learning_rate": 0.0001898281929915629, "loss": 0.7145, "step": 3110 }, { "epoch": 0.08124224249420628, "grad_norm": 0.1805388629436493, "learning_rate": 0.00018975470452511112, "loss": 0.7102, "step": 3120 }, { "epoch": 0.08150263429707233, "grad_norm": 0.1902652531862259, "learning_rate": 0.00018968096587663853, "loss": 0.7281, "step": 3130 }, { "epoch": 0.08176302609993838, "grad_norm": 0.1732487976551056, "learning_rate": 0.00018960697725168397, "loss": 0.7434, "step": 3140 }, { "epoch": 0.08202341790280442, "grad_norm": 0.1662171334028244, "learning_rate": 0.00018953273885648314, "loss": 0.716, "step": 3150 }, { "epoch": 0.08228380970567047, "grad_norm": 0.16129222512245178, "learning_rate": 0.00018945825089796797, "loss": 0.7318, "step": 3160 }, { "epoch": 0.08254420150853652, "grad_norm": 0.16837772727012634, "learning_rate": 0.00018938351358376596, "loss": 0.7137, "step": 3170 }, { "epoch": 0.08280459331140255, "grad_norm": 0.1618524193763733, "learning_rate": 0.00018930852712219974, "loss": 0.7079, "step": 3180 }, { "epoch": 0.0830649851142686, "grad_norm": 0.16333432495594025, "learning_rate": 0.00018923329172228632, "loss": 0.7062, "step": 3190 }, { "epoch": 0.08332537691713465, "grad_norm": 0.15985700488090515, "learning_rate": 0.00018915780759373672, "loss": 0.7277, "step": 3200 }, { "epoch": 0.08358576872000069, "grad_norm": 0.16181236505508423, "learning_rate": 0.0001890820749469551, "loss": 0.7048, "step": 3210 }, { "epoch": 0.08384616052286674, "grad_norm": 0.1679672747850418, "learning_rate": 0.00018900609399303853, "loss": 0.73, "step": 3220 }, { "epoch": 0.08410655232573279, "grad_norm": 0.16680286824703217, "learning_rate": 0.00018892986494377606, "loss": 0.7169, "step": 3230 }, { "epoch": 0.08436694412859883, "grad_norm": 0.15980315208435059, "learning_rate": 0.00018885338801164834, "loss": 0.7346, "step": 3240 }, { "epoch": 0.08462733593146488, "grad_norm": 0.16863352060317993, "learning_rate": 0.00018877666340982695, "loss": 0.7256, "step": 3250 }, { "epoch": 0.08488772773433093, "grad_norm": 0.1657836139202118, "learning_rate": 0.0001886996913521739, "loss": 0.7219, "step": 3260 }, { "epoch": 0.08514811953719698, "grad_norm": 0.1683470606803894, "learning_rate": 0.00018862247205324087, "loss": 0.7279, "step": 3270 }, { "epoch": 0.08540851134006301, "grad_norm": 0.1686122715473175, "learning_rate": 0.00018854500572826867, "loss": 0.7178, "step": 3280 }, { "epoch": 0.08566890314292906, "grad_norm": 0.16122782230377197, "learning_rate": 0.00018846729259318682, "loss": 0.7289, "step": 3290 }, { "epoch": 0.08592929494579511, "grad_norm": 0.1920589804649353, "learning_rate": 0.0001883893328646126, "loss": 0.7264, "step": 3300 }, { "epoch": 0.08618968674866115, "grad_norm": 0.17415335774421692, "learning_rate": 0.00018831112675985083, "loss": 0.7378, "step": 3310 }, { "epoch": 0.0864500785515272, "grad_norm": 0.16903561353683472, "learning_rate": 0.00018823267449689292, "loss": 0.702, "step": 3320 }, { "epoch": 0.08671047035439325, "grad_norm": 0.17158570885658264, "learning_rate": 0.00018815397629441658, "loss": 0.7264, "step": 3330 }, { "epoch": 0.08697086215725928, "grad_norm": 0.17094087600708008, "learning_rate": 0.0001880750323717849, "loss": 0.741, "step": 3340 }, { "epoch": 0.08723125396012534, "grad_norm": 0.1618223935365677, "learning_rate": 0.000187995842949046, "loss": 0.7084, "step": 3350 }, { "epoch": 0.08749164576299139, "grad_norm": 0.1671626716852188, "learning_rate": 0.0001879164082469322, "loss": 0.7236, "step": 3360 }, { "epoch": 0.08775203756585742, "grad_norm": 0.1681569665670395, "learning_rate": 0.00018783672848685966, "loss": 0.7382, "step": 3370 }, { "epoch": 0.08801242936872347, "grad_norm": 0.16421955823898315, "learning_rate": 0.00018775680389092748, "loss": 0.717, "step": 3380 }, { "epoch": 0.08827282117158952, "grad_norm": 0.174809530377388, "learning_rate": 0.00018767663468191725, "loss": 0.7225, "step": 3390 }, { "epoch": 0.08853321297445556, "grad_norm": 0.16181902587413788, "learning_rate": 0.00018759622108329243, "loss": 0.7014, "step": 3400 }, { "epoch": 0.08879360477732161, "grad_norm": 0.15579254925251007, "learning_rate": 0.0001875155633191977, "loss": 0.7125, "step": 3410 }, { "epoch": 0.08905399658018766, "grad_norm": 0.16342496871948242, "learning_rate": 0.00018743466161445823, "loss": 0.7075, "step": 3420 }, { "epoch": 0.08931438838305371, "grad_norm": 0.17215611040592194, "learning_rate": 0.00018735351619457923, "loss": 0.7331, "step": 3430 }, { "epoch": 0.08957478018591974, "grad_norm": 0.1682904213666916, "learning_rate": 0.00018727212728574522, "loss": 0.734, "step": 3440 }, { "epoch": 0.0898351719887858, "grad_norm": 0.16969889402389526, "learning_rate": 0.00018719049511481948, "loss": 0.7224, "step": 3450 }, { "epoch": 0.09009556379165184, "grad_norm": 0.16607950627803802, "learning_rate": 0.00018710861990934324, "loss": 0.7218, "step": 3460 }, { "epoch": 0.09035595559451788, "grad_norm": 0.16665585339069366, "learning_rate": 0.00018702650189753525, "loss": 0.7152, "step": 3470 }, { "epoch": 0.09061634739738393, "grad_norm": 0.16812992095947266, "learning_rate": 0.00018694414130829103, "loss": 0.7097, "step": 3480 }, { "epoch": 0.09087673920024998, "grad_norm": 0.16855508089065552, "learning_rate": 0.00018686153837118224, "loss": 0.7268, "step": 3490 }, { "epoch": 0.09113713100311602, "grad_norm": 0.1634734570980072, "learning_rate": 0.00018677869331645613, "loss": 0.7485, "step": 3500 } ], "logging_steps": 10, "max_steps": 19202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.64208568172544e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }