{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03905877042990687, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002603918028660458, "grad_norm": 0.4500846266746521, "learning_rate": 5.194805194805195e-06, "loss": 1.0381, "step": 10 }, { "epoch": 0.0005207836057320916, "grad_norm": 0.35188010334968567, "learning_rate": 1.038961038961039e-05, "loss": 1.0108, "step": 20 }, { "epoch": 0.0007811754085981374, "grad_norm": 0.2300374060869217, "learning_rate": 1.5584415584415583e-05, "loss": 0.9668, "step": 30 }, { "epoch": 0.0010415672114641832, "grad_norm": 0.16189467906951904, "learning_rate": 2.077922077922078e-05, "loss": 0.918, "step": 40 }, { "epoch": 0.001301959014330229, "grad_norm": 0.18843211233615875, "learning_rate": 2.5974025974025972e-05, "loss": 0.9265, "step": 50 }, { "epoch": 0.0015623508171962747, "grad_norm": 0.20334510505199432, "learning_rate": 3.1168831168831166e-05, "loss": 0.9234, "step": 60 }, { "epoch": 0.0018227426200623205, "grad_norm": 0.1745327115058899, "learning_rate": 3.6363636363636364e-05, "loss": 0.881, "step": 70 }, { "epoch": 0.0020831344229283663, "grad_norm": 0.18667331337928772, "learning_rate": 4.155844155844156e-05, "loss": 0.8592, "step": 80 }, { "epoch": 0.002343526225794412, "grad_norm": 0.1848158985376358, "learning_rate": 4.675324675324675e-05, "loss": 0.8537, "step": 90 }, { "epoch": 0.002603918028660458, "grad_norm": 0.17589879035949707, "learning_rate": 5.1948051948051944e-05, "loss": 0.8518, "step": 100 }, { "epoch": 0.0028643098315265037, "grad_norm": 0.2132624089717865, "learning_rate": 5.714285714285714e-05, "loss": 0.8511, "step": 110 }, { "epoch": 0.0031247016343925495, "grad_norm": 0.23070092499256134, "learning_rate": 6.233766233766233e-05, "loss": 0.7975, "step": 120 }, { "epoch": 0.0033850934372585953, "grad_norm": 0.25368157029151917, "learning_rate": 6.753246753246754e-05, "loss": 0.8134, "step": 130 }, { "epoch": 0.003645485240124641, "grad_norm": 0.22897231578826904, "learning_rate": 7.272727272727273e-05, "loss": 0.8322, "step": 140 }, { "epoch": 0.003905877042990687, "grad_norm": 0.19932536780834198, "learning_rate": 7.792207792207793e-05, "loss": 0.7959, "step": 150 }, { "epoch": 0.004166268845856733, "grad_norm": 0.21011792123317719, "learning_rate": 8.311688311688312e-05, "loss": 0.8102, "step": 160 }, { "epoch": 0.004426660648722778, "grad_norm": 0.20594824850559235, "learning_rate": 8.831168831168831e-05, "loss": 0.8128, "step": 170 }, { "epoch": 0.004687052451588824, "grad_norm": 0.20465536415576935, "learning_rate": 9.35064935064935e-05, "loss": 0.7989, "step": 180 }, { "epoch": 0.00494744425445487, "grad_norm": 0.4109392762184143, "learning_rate": 9.870129870129871e-05, "loss": 0.8108, "step": 190 }, { "epoch": 0.005207836057320916, "grad_norm": 0.4293076694011688, "learning_rate": 0.00010389610389610389, "loss": 0.8101, "step": 200 }, { "epoch": 0.005468227860186962, "grad_norm": 0.31628963351249695, "learning_rate": 0.00010909090909090909, "loss": 0.7989, "step": 210 }, { "epoch": 0.005728619663053007, "grad_norm": 0.24642810225486755, "learning_rate": 0.00011428571428571428, "loss": 0.7751, "step": 220 }, { "epoch": 0.005989011465919053, "grad_norm": 0.3599106967449188, "learning_rate": 0.00011948051948051949, "loss": 0.8063, "step": 230 }, { "epoch": 0.006249403268785099, "grad_norm": 0.17053447663784027, "learning_rate": 0.00012467532467532467, "loss": 0.7751, "step": 240 }, { "epoch": 0.006509795071651145, "grad_norm": 0.17303769290447235, "learning_rate": 0.00012987012987012987, "loss": 0.7883, "step": 250 }, { "epoch": 0.0067701868745171905, "grad_norm": 0.1815861016511917, "learning_rate": 0.00013506493506493507, "loss": 0.788, "step": 260 }, { "epoch": 0.007030578677383236, "grad_norm": 0.24125365912914276, "learning_rate": 0.00014025974025974028, "loss": 0.8018, "step": 270 }, { "epoch": 0.007290970480249282, "grad_norm": 0.19443446397781372, "learning_rate": 0.00014545454545454546, "loss": 0.7908, "step": 280 }, { "epoch": 0.007551362283115328, "grad_norm": 0.17829768359661102, "learning_rate": 0.00015064935064935066, "loss": 0.8033, "step": 290 }, { "epoch": 0.007811754085981374, "grad_norm": 0.19535653293132782, "learning_rate": 0.00015584415584415587, "loss": 0.7997, "step": 300 }, { "epoch": 0.008072145888847419, "grad_norm": 0.19930541515350342, "learning_rate": 0.00016103896103896104, "loss": 0.7945, "step": 310 }, { "epoch": 0.008332537691713465, "grad_norm": 0.2156297266483307, "learning_rate": 0.00016623376623376625, "loss": 0.8018, "step": 320 }, { "epoch": 0.00859292949457951, "grad_norm": 0.1924206018447876, "learning_rate": 0.00017142857142857143, "loss": 0.7746, "step": 330 }, { "epoch": 0.008853321297445557, "grad_norm": 0.2294880747795105, "learning_rate": 0.00017662337662337663, "loss": 0.8152, "step": 340 }, { "epoch": 0.009113713100311602, "grad_norm": 0.16817067563533783, "learning_rate": 0.00018181818181818183, "loss": 0.7972, "step": 350 }, { "epoch": 0.009374104903177648, "grad_norm": 0.18544812500476837, "learning_rate": 0.000187012987012987, "loss": 0.7801, "step": 360 }, { "epoch": 0.009634496706043693, "grad_norm": 0.19597066938877106, "learning_rate": 0.00019220779220779222, "loss": 0.7706, "step": 370 }, { "epoch": 0.00989488850890974, "grad_norm": 0.40291881561279297, "learning_rate": 0.00019740259740259742, "loss": 0.7911, "step": 380 }, { "epoch": 0.010155280311775785, "grad_norm": 0.23841074109077454, "learning_rate": 0.00019999996515752773, "loss": 0.7861, "step": 390 }, { "epoch": 0.010415672114641832, "grad_norm": 0.1675388514995575, "learning_rate": 0.00019999968641789507, "loss": 0.788, "step": 400 }, { "epoch": 0.010676063917507876, "grad_norm": 1.8860758543014526, "learning_rate": 0.0001999991289394067, "loss": 0.7632, "step": 410 }, { "epoch": 0.010936455720373923, "grad_norm": 0.17022117972373962, "learning_rate": 0.00019999829272361654, "loss": 0.784, "step": 420 }, { "epoch": 0.011196847523239968, "grad_norm": 0.21460269391536713, "learning_rate": 0.00019999717777285545, "loss": 0.761, "step": 430 }, { "epoch": 0.011457239326106015, "grad_norm": 0.19413785636425018, "learning_rate": 0.00019999578409023126, "loss": 0.7772, "step": 440 }, { "epoch": 0.01171763112897206, "grad_norm": 0.20223405957221985, "learning_rate": 0.00019999411167962868, "loss": 0.7811, "step": 450 }, { "epoch": 0.011978022931838106, "grad_norm": 0.15166303515434265, "learning_rate": 0.00019999216054570942, "loss": 0.7709, "step": 460 }, { "epoch": 0.012238414734704151, "grad_norm": 0.16307081282138824, "learning_rate": 0.00019998993069391205, "loss": 0.7811, "step": 470 }, { "epoch": 0.012498806537570198, "grad_norm": 0.15996049344539642, "learning_rate": 0.00019998742213045206, "loss": 0.7599, "step": 480 }, { "epoch": 0.012759198340436243, "grad_norm": 0.17560279369354248, "learning_rate": 0.00019998463486232179, "loss": 0.7572, "step": 490 }, { "epoch": 0.01301959014330229, "grad_norm": 0.17571642994880676, "learning_rate": 0.0001999815688972905, "loss": 0.7643, "step": 500 }, { "epoch": 0.013279981946168334, "grad_norm": 0.17719799280166626, "learning_rate": 0.00019997822424390422, "loss": 0.7923, "step": 510 }, { "epoch": 0.013540373749034381, "grad_norm": 0.19846616685390472, "learning_rate": 0.00019997460091148586, "loss": 0.7674, "step": 520 }, { "epoch": 0.013800765551900426, "grad_norm": 0.2715558111667633, "learning_rate": 0.00019997069891013503, "loss": 0.7421, "step": 530 }, { "epoch": 0.014061157354766473, "grad_norm": 0.1725197583436966, "learning_rate": 0.00019996651825072826, "loss": 0.7663, "step": 540 }, { "epoch": 0.014321549157632518, "grad_norm": 0.15060502290725708, "learning_rate": 0.00019996205894491856, "loss": 0.7794, "step": 550 }, { "epoch": 0.014581940960498564, "grad_norm": 0.16645808517932892, "learning_rate": 0.00019995732100513592, "loss": 0.752, "step": 560 }, { "epoch": 0.014842332763364609, "grad_norm": 0.1736789345741272, "learning_rate": 0.00019995230444458682, "loss": 0.7788, "step": 570 }, { "epoch": 0.015102724566230656, "grad_norm": 0.15416319668293, "learning_rate": 0.0001999470092772544, "loss": 0.7656, "step": 580 }, { "epoch": 0.0153631163690967, "grad_norm": 0.16610187292099, "learning_rate": 0.00019994143551789839, "loss": 0.7676, "step": 590 }, { "epoch": 0.015623508171962747, "grad_norm": 0.15843011438846588, "learning_rate": 0.00019993558318205507, "loss": 0.7746, "step": 600 }, { "epoch": 0.015883899974828794, "grad_norm": 0.26837801933288574, "learning_rate": 0.00019992945228603724, "loss": 0.7617, "step": 610 }, { "epoch": 0.016144291777694837, "grad_norm": 0.15099173784255981, "learning_rate": 0.0001999230428469341, "loss": 0.7601, "step": 620 }, { "epoch": 0.016404683580560884, "grad_norm": 0.15511856973171234, "learning_rate": 0.00019991635488261138, "loss": 0.7647, "step": 630 }, { "epoch": 0.01666507538342693, "grad_norm": 0.14919579029083252, "learning_rate": 0.00019990938841171104, "loss": 0.7692, "step": 640 }, { "epoch": 0.016925467186292977, "grad_norm": 0.15838642418384552, "learning_rate": 0.0001999021434536514, "loss": 0.7763, "step": 650 }, { "epoch": 0.01718585898915902, "grad_norm": 0.15956635773181915, "learning_rate": 0.00019989462002862704, "loss": 0.7598, "step": 660 }, { "epoch": 0.017446250792025067, "grad_norm": 0.1499069333076477, "learning_rate": 0.0001998868181576088, "loss": 0.7626, "step": 670 }, { "epoch": 0.017706642594891114, "grad_norm": 0.2170073390007019, "learning_rate": 0.00019987873786234348, "loss": 0.7569, "step": 680 }, { "epoch": 0.01796703439775716, "grad_norm": 0.17841948568820953, "learning_rate": 0.00019987037916535417, "loss": 0.7494, "step": 690 }, { "epoch": 0.018227426200623204, "grad_norm": 0.2066909372806549, "learning_rate": 0.0001998617420899398, "loss": 0.7609, "step": 700 }, { "epoch": 0.01848781800348925, "grad_norm": 0.17015361785888672, "learning_rate": 0.0001998528266601754, "loss": 0.7761, "step": 710 }, { "epoch": 0.018748209806355297, "grad_norm": 0.22166290879249573, "learning_rate": 0.0001998436329009118, "loss": 0.7573, "step": 720 }, { "epoch": 0.01900860160922134, "grad_norm": 0.15084640681743622, "learning_rate": 0.00019983416083777563, "loss": 0.7775, "step": 730 }, { "epoch": 0.019268993412087387, "grad_norm": 0.17800921201705933, "learning_rate": 0.0001998244104971693, "loss": 0.7359, "step": 740 }, { "epoch": 0.019529385214953433, "grad_norm": 0.17354707419872284, "learning_rate": 0.0001998143819062709, "loss": 0.7415, "step": 750 }, { "epoch": 0.01978977701781948, "grad_norm": 0.16408118605613708, "learning_rate": 0.00019980407509303413, "loss": 0.7708, "step": 760 }, { "epoch": 0.020050168820685523, "grad_norm": 0.16820089519023895, "learning_rate": 0.00019979349008618808, "loss": 0.791, "step": 770 }, { "epoch": 0.02031056062355157, "grad_norm": 0.15958388149738312, "learning_rate": 0.00019978262691523743, "loss": 0.7412, "step": 780 }, { "epoch": 0.020570952426417616, "grad_norm": 0.1646542251110077, "learning_rate": 0.00019977148561046217, "loss": 0.7529, "step": 790 }, { "epoch": 0.020831344229283663, "grad_norm": 0.17032025754451752, "learning_rate": 0.0001997600662029175, "loss": 0.7656, "step": 800 }, { "epoch": 0.021091736032149706, "grad_norm": 0.17189227044582367, "learning_rate": 0.00019974836872443388, "loss": 0.7433, "step": 810 }, { "epoch": 0.021352127835015753, "grad_norm": 0.16334249079227448, "learning_rate": 0.0001997363932076168, "loss": 0.7703, "step": 820 }, { "epoch": 0.0216125196378818, "grad_norm": 0.1676424890756607, "learning_rate": 0.00019972413968584682, "loss": 0.7603, "step": 830 }, { "epoch": 0.021872911440747846, "grad_norm": 0.16826209425926208, "learning_rate": 0.0001997116081932793, "loss": 0.7569, "step": 840 }, { "epoch": 0.02213330324361389, "grad_norm": 0.1876436173915863, "learning_rate": 0.0001996987987648446, "loss": 0.7553, "step": 850 }, { "epoch": 0.022393695046479936, "grad_norm": 0.17252250015735626, "learning_rate": 0.0001996857114362476, "loss": 0.7644, "step": 860 }, { "epoch": 0.022654086849345983, "grad_norm": 0.1632252335548401, "learning_rate": 0.00019967234624396793, "loss": 0.7568, "step": 870 }, { "epoch": 0.02291447865221203, "grad_norm": 0.1818259060382843, "learning_rate": 0.00019965870322525965, "loss": 0.7672, "step": 880 }, { "epoch": 0.023174870455078073, "grad_norm": 0.15418195724487305, "learning_rate": 0.0001996447824181513, "loss": 0.7642, "step": 890 }, { "epoch": 0.02343526225794412, "grad_norm": 0.17383505403995514, "learning_rate": 0.0001996305838614457, "loss": 0.7607, "step": 900 }, { "epoch": 0.023695654060810166, "grad_norm": 0.17794272303581238, "learning_rate": 0.00019961610759471984, "loss": 0.7588, "step": 910 }, { "epoch": 0.023956045863676213, "grad_norm": 0.1909121572971344, "learning_rate": 0.00019960135365832486, "loss": 0.7438, "step": 920 }, { "epoch": 0.024216437666542256, "grad_norm": 0.17758873105049133, "learning_rate": 0.00019958632209338587, "loss": 0.7323, "step": 930 }, { "epoch": 0.024476829469408302, "grad_norm": 0.15553662180900574, "learning_rate": 0.00019957101294180174, "loss": 0.7508, "step": 940 }, { "epoch": 0.02473722127227435, "grad_norm": 0.15310749411582947, "learning_rate": 0.00019955542624624522, "loss": 0.7563, "step": 950 }, { "epoch": 0.024997613075140396, "grad_norm": 0.1628728210926056, "learning_rate": 0.00019953956205016256, "loss": 0.7524, "step": 960 }, { "epoch": 0.02525800487800644, "grad_norm": 0.16211454570293427, "learning_rate": 0.00019952342039777362, "loss": 0.7564, "step": 970 }, { "epoch": 0.025518396680872486, "grad_norm": 0.15663012862205505, "learning_rate": 0.00019950700133407163, "loss": 0.7395, "step": 980 }, { "epoch": 0.025778788483738532, "grad_norm": 0.1684863567352295, "learning_rate": 0.00019949030490482296, "loss": 0.753, "step": 990 }, { "epoch": 0.02603918028660458, "grad_norm": 0.1561436653137207, "learning_rate": 0.0001994733311565673, "loss": 0.7409, "step": 1000 }, { "epoch": 0.026299572089470622, "grad_norm": 0.1781485229730606, "learning_rate": 0.0001994560801366171, "loss": 0.762, "step": 1010 }, { "epoch": 0.02655996389233667, "grad_norm": 0.15422071516513824, "learning_rate": 0.00019943855189305792, "loss": 0.7291, "step": 1020 }, { "epoch": 0.026820355695202715, "grad_norm": 0.17980527877807617, "learning_rate": 0.00019942074647474786, "loss": 0.7732, "step": 1030 }, { "epoch": 0.027080747498068762, "grad_norm": 0.15810626745224, "learning_rate": 0.00019940266393131775, "loss": 0.7764, "step": 1040 }, { "epoch": 0.027341139300934805, "grad_norm": 0.16385480761528015, "learning_rate": 0.00019938430431317081, "loss": 0.7404, "step": 1050 }, { "epoch": 0.027601531103800852, "grad_norm": 0.15134255588054657, "learning_rate": 0.00019936566767148257, "loss": 0.7506, "step": 1060 }, { "epoch": 0.0278619229066669, "grad_norm": 0.1592187136411667, "learning_rate": 0.00019934675405820077, "loss": 0.73, "step": 1070 }, { "epoch": 0.028122314709532945, "grad_norm": 0.16852422058582306, "learning_rate": 0.00019932756352604515, "loss": 0.7443, "step": 1080 }, { "epoch": 0.02838270651239899, "grad_norm": 0.15741507709026337, "learning_rate": 0.00019930809612850735, "loss": 0.7377, "step": 1090 }, { "epoch": 0.028643098315265035, "grad_norm": 0.22424879670143127, "learning_rate": 0.00019928835191985076, "loss": 0.7544, "step": 1100 }, { "epoch": 0.028903490118131082, "grad_norm": 0.2047310769557953, "learning_rate": 0.0001992683309551103, "loss": 0.7441, "step": 1110 }, { "epoch": 0.02916388192099713, "grad_norm": 0.16392463445663452, "learning_rate": 0.00019924803329009243, "loss": 0.7606, "step": 1120 }, { "epoch": 0.02942427372386317, "grad_norm": 0.16227149963378906, "learning_rate": 0.00019922745898137473, "loss": 0.736, "step": 1130 }, { "epoch": 0.029684665526729218, "grad_norm": 0.15652808547019958, "learning_rate": 0.00019920660808630598, "loss": 0.7513, "step": 1140 }, { "epoch": 0.029945057329595265, "grad_norm": 0.15162768959999084, "learning_rate": 0.00019918548066300592, "loss": 0.7303, "step": 1150 }, { "epoch": 0.03020544913246131, "grad_norm": 0.17650415003299713, "learning_rate": 0.0001991640767703651, "loss": 0.7254, "step": 1160 }, { "epoch": 0.030465840935327355, "grad_norm": 0.1594468355178833, "learning_rate": 0.00019914239646804462, "loss": 0.741, "step": 1170 }, { "epoch": 0.0307262327381934, "grad_norm": 0.17928367853164673, "learning_rate": 0.00019912043981647616, "loss": 0.7515, "step": 1180 }, { "epoch": 0.030986624541059448, "grad_norm": 0.17009998857975006, "learning_rate": 0.00019909820687686157, "loss": 0.7539, "step": 1190 }, { "epoch": 0.031247016343925495, "grad_norm": 0.16556763648986816, "learning_rate": 0.0001990756977111729, "loss": 0.7418, "step": 1200 }, { "epoch": 0.03150740814679154, "grad_norm": 0.1561640352010727, "learning_rate": 0.0001990529123821522, "loss": 0.7465, "step": 1210 }, { "epoch": 0.03176779994965759, "grad_norm": 0.15182287991046906, "learning_rate": 0.00019902985095331113, "loss": 0.7694, "step": 1220 }, { "epoch": 0.03202819175252363, "grad_norm": 0.15173685550689697, "learning_rate": 0.00019900651348893114, "loss": 0.7519, "step": 1230 }, { "epoch": 0.032288583555389674, "grad_norm": 0.16535787284374237, "learning_rate": 0.00019898290005406296, "loss": 0.7646, "step": 1240 }, { "epoch": 0.032548975358255725, "grad_norm": 0.19272534549236298, "learning_rate": 0.00019895901071452667, "loss": 0.7655, "step": 1250 }, { "epoch": 0.03280936716112177, "grad_norm": 0.1672705113887787, "learning_rate": 0.0001989348455369113, "loss": 0.7486, "step": 1260 }, { "epoch": 0.03306975896398781, "grad_norm": 0.1525493860244751, "learning_rate": 0.0001989104045885748, "loss": 0.7546, "step": 1270 }, { "epoch": 0.03333015076685386, "grad_norm": 0.16333037614822388, "learning_rate": 0.00019888568793764385, "loss": 0.7299, "step": 1280 }, { "epoch": 0.033590542569719904, "grad_norm": 0.1590205729007721, "learning_rate": 0.00019886069565301355, "loss": 0.762, "step": 1290 }, { "epoch": 0.033850934372585954, "grad_norm": 0.15006420016288757, "learning_rate": 0.00019883542780434733, "loss": 0.7531, "step": 1300 }, { "epoch": 0.034111326175452, "grad_norm": 0.18390792608261108, "learning_rate": 0.0001988098844620767, "loss": 0.7621, "step": 1310 }, { "epoch": 0.03437171797831804, "grad_norm": 0.17046166956424713, "learning_rate": 0.0001987840656974011, "loss": 0.7422, "step": 1320 }, { "epoch": 0.03463210978118409, "grad_norm": 0.15121813118457794, "learning_rate": 0.00019875797158228775, "loss": 0.7555, "step": 1330 }, { "epoch": 0.034892501584050134, "grad_norm": 0.16219307482242584, "learning_rate": 0.00019873160218947125, "loss": 0.7301, "step": 1340 }, { "epoch": 0.03515289338691618, "grad_norm": 0.1779986321926117, "learning_rate": 0.00019870495759245362, "loss": 0.7356, "step": 1350 }, { "epoch": 0.03541328518978223, "grad_norm": 0.16951359808444977, "learning_rate": 0.0001986780378655039, "loss": 0.7645, "step": 1360 }, { "epoch": 0.03567367699264827, "grad_norm": 0.16620802879333496, "learning_rate": 0.0001986508430836581, "loss": 0.7331, "step": 1370 }, { "epoch": 0.03593406879551432, "grad_norm": 0.1577858328819275, "learning_rate": 0.0001986233733227188, "loss": 0.7667, "step": 1380 }, { "epoch": 0.036194460598380364, "grad_norm": 0.1637091338634491, "learning_rate": 0.00019859562865925525, "loss": 0.7521, "step": 1390 }, { "epoch": 0.03645485240124641, "grad_norm": 0.15061691403388977, "learning_rate": 0.00019856760917060277, "loss": 0.744, "step": 1400 }, { "epoch": 0.03671524420411246, "grad_norm": 0.15373477339744568, "learning_rate": 0.00019853931493486287, "loss": 0.7677, "step": 1410 }, { "epoch": 0.0369756360069785, "grad_norm": 0.16468606889247894, "learning_rate": 0.00019851074603090277, "loss": 0.7179, "step": 1420 }, { "epoch": 0.037236027809844544, "grad_norm": 0.16084876656532288, "learning_rate": 0.00019848190253835536, "loss": 0.749, "step": 1430 }, { "epoch": 0.037496419612710594, "grad_norm": 0.16743004322052002, "learning_rate": 0.00019845278453761896, "loss": 0.7483, "step": 1440 }, { "epoch": 0.03775681141557664, "grad_norm": 0.17335088551044464, "learning_rate": 0.00019842339210985696, "loss": 0.735, "step": 1450 }, { "epoch": 0.03801720321844268, "grad_norm": 0.1546197235584259, "learning_rate": 0.00019839372533699774, "loss": 0.7549, "step": 1460 }, { "epoch": 0.03827759502130873, "grad_norm": 0.16218656301498413, "learning_rate": 0.00019836378430173438, "loss": 0.7425, "step": 1470 }, { "epoch": 0.03853798682417477, "grad_norm": 0.1712743639945984, "learning_rate": 0.0001983335690875245, "loss": 0.733, "step": 1480 }, { "epoch": 0.03879837862704082, "grad_norm": 0.15490613877773285, "learning_rate": 0.00019830307977858984, "loss": 0.7265, "step": 1490 }, { "epoch": 0.03905877042990687, "grad_norm": 0.1646670252084732, "learning_rate": 0.00019827231645991623, "loss": 0.7315, "step": 1500 } ], "logging_steps": 10, "max_steps": 19202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.0375100645376e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }