{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9293516810895164, "eval_steps": 500, "global_step": 6800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005674563767910342, "grad_norm": 1.8945719003677368, "learning_rate": 2.830188679245283e-06, "loss": 0.9878, "step": 20 }, { "epoch": 0.011349127535820683, "grad_norm": 0.8699278235435486, "learning_rate": 5.660377358490566e-06, "loss": 0.9338, "step": 40 }, { "epoch": 0.017023691303731027, "grad_norm": 0.9612842798233032, "learning_rate": 8.49056603773585e-06, "loss": 0.8992, "step": 60 }, { "epoch": 0.022698255071641367, "grad_norm": 1.0209581851959229, "learning_rate": 1.1320754716981132e-05, "loss": 0.8802, "step": 80 }, { "epoch": 0.02837281883955171, "grad_norm": 1.1397087574005127, "learning_rate": 1.4150943396226415e-05, "loss": 0.8636, "step": 100 }, { "epoch": 0.034047382607462054, "grad_norm": 1.0688011646270752, "learning_rate": 1.69811320754717e-05, "loss": 0.8589, "step": 120 }, { "epoch": 0.039721946375372394, "grad_norm": 1.0701323747634888, "learning_rate": 1.981132075471698e-05, "loss": 0.8445, "step": 140 }, { "epoch": 0.045396510143282734, "grad_norm": 1.0749995708465576, "learning_rate": 2.2641509433962265e-05, "loss": 0.8438, "step": 160 }, { "epoch": 0.051071073911193074, "grad_norm": 1.2973322868347168, "learning_rate": 2.547169811320755e-05, "loss": 0.8399, "step": 180 }, { "epoch": 0.05674563767910342, "grad_norm": 0.9941120743751526, "learning_rate": 2.830188679245283e-05, "loss": 0.8459, "step": 200 }, { "epoch": 0.06242020144701376, "grad_norm": 1.1092499494552612, "learning_rate": 2.9999898623711896e-05, "loss": 0.8396, "step": 220 }, { "epoch": 0.06809476521492411, "grad_norm": 1.10667085647583, "learning_rate": 2.999875815620755e-05, "loss": 0.8403, "step": 240 }, { "epoch": 0.07376932898283445, "grad_norm": 1.0986227989196777, "learning_rate": 2.999635059750628e-05, "loss": 0.8296, "step": 260 }, { "epoch": 0.07944389275074479, "grad_norm": 0.9648028612136841, "learning_rate": 2.9992676150998032e-05, "loss": 0.8187, "step": 280 }, { "epoch": 0.08511845651865513, "grad_norm": 0.8029258251190186, "learning_rate": 2.998773512709909e-05, "loss": 0.8224, "step": 300 }, { "epoch": 0.09079302028656547, "grad_norm": 0.888502299785614, "learning_rate": 2.9981527943225862e-05, "loss": 0.8178, "step": 320 }, { "epoch": 0.09646758405447581, "grad_norm": 0.7894881963729858, "learning_rate": 2.997405512375964e-05, "loss": 0.8153, "step": 340 }, { "epoch": 0.10214214782238615, "grad_norm": 0.8492247462272644, "learning_rate": 2.996531730000227e-05, "loss": 0.8105, "step": 360 }, { "epoch": 0.1078167115902965, "grad_norm": 0.8247759938240051, "learning_rate": 2.9955315210122842e-05, "loss": 0.8, "step": 380 }, { "epoch": 0.11349127535820684, "grad_norm": 0.8270812034606934, "learning_rate": 2.99440496990953e-05, "loss": 0.802, "step": 400 }, { "epoch": 0.11916583912611718, "grad_norm": 0.8336136937141418, "learning_rate": 2.9931521718627107e-05, "loss": 0.7932, "step": 420 }, { "epoch": 0.12484040289402752, "grad_norm": 0.7927630543708801, "learning_rate": 2.991773232707879e-05, "loss": 0.7903, "step": 440 }, { "epoch": 0.13051496666193788, "grad_norm": 0.8075955510139465, "learning_rate": 2.9902682689374578e-05, "loss": 0.7897, "step": 460 }, { "epoch": 0.13618953042984822, "grad_norm": 0.7381598353385925, "learning_rate": 2.9886374076903945e-05, "loss": 0.785, "step": 480 }, { "epoch": 0.14186409419775856, "grad_norm": 0.799022912979126, "learning_rate": 2.986880786741426e-05, "loss": 0.7862, "step": 500 }, { "epoch": 0.1475386579656689, "grad_norm": 0.7515665292739868, "learning_rate": 2.9849985544894333e-05, "loss": 0.7845, "step": 520 }, { "epoch": 0.15321322173357924, "grad_norm": 0.8161646723747253, "learning_rate": 2.982990869944908e-05, "loss": 0.7745, "step": 540 }, { "epoch": 0.15888778550148958, "grad_norm": 0.671816885471344, "learning_rate": 2.9808579027165204e-05, "loss": 0.7786, "step": 560 }, { "epoch": 0.16456234926939992, "grad_norm": 0.7310769557952881, "learning_rate": 2.978599832996788e-05, "loss": 0.7742, "step": 580 }, { "epoch": 0.17023691303731026, "grad_norm": 0.7568747401237488, "learning_rate": 2.9762168515468548e-05, "loss": 0.7691, "step": 600 }, { "epoch": 0.1759114768052206, "grad_norm": 0.6345218420028687, "learning_rate": 2.973709159680375e-05, "loss": 0.7695, "step": 620 }, { "epoch": 0.18158604057313094, "grad_norm": 0.7218050360679626, "learning_rate": 2.9710769692465073e-05, "loss": 0.7681, "step": 640 }, { "epoch": 0.18726060434104128, "grad_norm": 0.7665095925331116, "learning_rate": 2.9683205026120163e-05, "loss": 0.7667, "step": 660 }, { "epoch": 0.19293516810895162, "grad_norm": 0.6717973947525024, "learning_rate": 2.9654399926424884e-05, "loss": 0.7684, "step": 680 }, { "epoch": 0.19860973187686196, "grad_norm": 0.7454754114151001, "learning_rate": 2.9624356826826577e-05, "loss": 0.7622, "step": 700 }, { "epoch": 0.2042842956447723, "grad_norm": 0.6865426898002625, "learning_rate": 2.9593078265358498e-05, "loss": 0.761, "step": 720 }, { "epoch": 0.20995885941268266, "grad_norm": 0.7075285315513611, "learning_rate": 2.956056688442541e-05, "loss": 0.7578, "step": 740 }, { "epoch": 0.215633423180593, "grad_norm": 0.7438149452209473, "learning_rate": 2.9526825430580337e-05, "loss": 0.7571, "step": 760 }, { "epoch": 0.22130798694850334, "grad_norm": 0.6830400228500366, "learning_rate": 2.949185675429254e-05, "loss": 0.759, "step": 780 }, { "epoch": 0.22698255071641368, "grad_norm": 0.7147162556648254, "learning_rate": 2.9455663809706725e-05, "loss": 0.756, "step": 800 }, { "epoch": 0.23265711448432402, "grad_norm": 0.7116013765335083, "learning_rate": 2.9418249654393443e-05, "loss": 0.7538, "step": 820 }, { "epoch": 0.23833167825223436, "grad_norm": 0.64736407995224, "learning_rate": 2.9379617449090847e-05, "loss": 0.7513, "step": 840 }, { "epoch": 0.2440062420201447, "grad_norm": 0.6453843116760254, "learning_rate": 2.93397704574376e-05, "loss": 0.7538, "step": 860 }, { "epoch": 0.24968080578805504, "grad_norm": 0.6253499388694763, "learning_rate": 2.929871204569722e-05, "loss": 0.7463, "step": 880 }, { "epoch": 0.2553553695559654, "grad_norm": 0.6677010655403137, "learning_rate": 2.9256445682473683e-05, "loss": 0.7419, "step": 900 }, { "epoch": 0.26102993332387575, "grad_norm": 0.7070403695106506, "learning_rate": 2.9212974938418385e-05, "loss": 0.7449, "step": 920 }, { "epoch": 0.26670449709178606, "grad_norm": 0.6784743070602417, "learning_rate": 2.9168303485928495e-05, "loss": 0.7453, "step": 940 }, { "epoch": 0.27237906085969643, "grad_norm": 0.6076740026473999, "learning_rate": 2.912243509883673e-05, "loss": 0.7457, "step": 960 }, { "epoch": 0.27805362462760674, "grad_norm": 0.6722409129142761, "learning_rate": 2.9075373652092535e-05, "loss": 0.7373, "step": 980 }, { "epoch": 0.2837281883955171, "grad_norm": 0.7188818454742432, "learning_rate": 2.9027123121434714e-05, "loss": 0.7343, "step": 1000 }, { "epoch": 0.2894027521634274, "grad_norm": 0.657289981842041, "learning_rate": 2.897768758305558e-05, "loss": 0.7336, "step": 1020 }, { "epoch": 0.2950773159313378, "grad_norm": 0.6076385378837585, "learning_rate": 2.892707121325658e-05, "loss": 0.7331, "step": 1040 }, { "epoch": 0.3007518796992481, "grad_norm": 0.6217896342277527, "learning_rate": 2.8875278288095507e-05, "loss": 0.7339, "step": 1060 }, { "epoch": 0.30642644346715847, "grad_norm": 0.6453694701194763, "learning_rate": 2.882231318302523e-05, "loss": 0.7334, "step": 1080 }, { "epoch": 0.3121010072350688, "grad_norm": 0.6069263219833374, "learning_rate": 2.8768180372524093e-05, "loss": 0.734, "step": 1100 }, { "epoch": 0.31777557100297915, "grad_norm": 0.6342785358428955, "learning_rate": 2.8712884429717873e-05, "loss": 0.7254, "step": 1120 }, { "epoch": 0.32345013477088946, "grad_norm": 0.5936433672904968, "learning_rate": 2.8656430025993464e-05, "loss": 0.7232, "step": 1140 }, { "epoch": 0.32912469853879983, "grad_norm": 0.5988269448280334, "learning_rate": 2.8598821930604252e-05, "loss": 0.726, "step": 1160 }, { "epoch": 0.3347992623067102, "grad_norm": 0.6247944235801697, "learning_rate": 2.8540065010267183e-05, "loss": 0.729, "step": 1180 }, { "epoch": 0.3404738260746205, "grad_norm": 0.6017037034034729, "learning_rate": 2.848016422875164e-05, "loss": 0.7216, "step": 1200 }, { "epoch": 0.3461483898425309, "grad_norm": 0.7368952631950378, "learning_rate": 2.84191246464601e-05, "loss": 0.7331, "step": 1220 }, { "epoch": 0.3518229536104412, "grad_norm": 0.6655734777450562, "learning_rate": 2.835695142000064e-05, "loss": 0.7233, "step": 1240 }, { "epoch": 0.35749751737835156, "grad_norm": 0.6325275301933289, "learning_rate": 2.8293649801751288e-05, "loss": 0.7208, "step": 1260 }, { "epoch": 0.36317208114626187, "grad_norm": 0.6046157479286194, "learning_rate": 2.822922513941634e-05, "loss": 0.7156, "step": 1280 }, { "epoch": 0.36884664491417224, "grad_norm": 0.6081031560897827, "learning_rate": 2.816368287557454e-05, "loss": 0.722, "step": 1300 }, { "epoch": 0.37452120868208255, "grad_norm": 0.6153631806373596, "learning_rate": 2.809702854721934e-05, "loss": 0.7171, "step": 1320 }, { "epoch": 0.3801957724499929, "grad_norm": 0.6361656188964844, "learning_rate": 2.8029267785291092e-05, "loss": 0.7134, "step": 1340 }, { "epoch": 0.38587033621790323, "grad_norm": 0.6033869981765747, "learning_rate": 2.796040631420139e-05, "loss": 0.7171, "step": 1360 }, { "epoch": 0.3915448999858136, "grad_norm": 0.6300106644630432, "learning_rate": 2.789044995134944e-05, "loss": 0.7139, "step": 1380 }, { "epoch": 0.3972194637537239, "grad_norm": 0.5989068150520325, "learning_rate": 2.781940460663062e-05, "loss": 0.7142, "step": 1400 }, { "epoch": 0.4028940275216343, "grad_norm": 0.5790150761604309, "learning_rate": 2.774727628193721e-05, "loss": 0.7126, "step": 1420 }, { "epoch": 0.4085685912895446, "grad_norm": 0.5948804616928101, "learning_rate": 2.7674071070651378e-05, "loss": 0.7103, "step": 1440 }, { "epoch": 0.41424315505745496, "grad_norm": 0.6838712096214294, "learning_rate": 2.7599795157130364e-05, "loss": 0.7169, "step": 1460 }, { "epoch": 0.4199177188253653, "grad_norm": 0.6502018570899963, "learning_rate": 2.7524454816184076e-05, "loss": 0.7094, "step": 1480 }, { "epoch": 0.42559228259327564, "grad_norm": 0.6322967410087585, "learning_rate": 2.7448056412544956e-05, "loss": 0.7134, "step": 1500 }, { "epoch": 0.431266846361186, "grad_norm": 0.5761287212371826, "learning_rate": 2.7370606400330334e-05, "loss": 0.7067, "step": 1520 }, { "epoch": 0.4369414101290963, "grad_norm": 0.6147580742835999, "learning_rate": 2.729211132249713e-05, "loss": 0.7078, "step": 1540 }, { "epoch": 0.4426159738970067, "grad_norm": 0.6231666207313538, "learning_rate": 2.7212577810289157e-05, "loss": 0.7066, "step": 1560 }, { "epoch": 0.448290537664917, "grad_norm": 0.5739862322807312, "learning_rate": 2.713201258267689e-05, "loss": 0.708, "step": 1580 }, { "epoch": 0.45396510143282737, "grad_norm": 0.7059602737426758, "learning_rate": 2.7050422445789843e-05, "loss": 0.7043, "step": 1600 }, { "epoch": 0.4596396652007377, "grad_norm": 0.6156895160675049, "learning_rate": 2.696781429234162e-05, "loss": 0.7118, "step": 1620 }, { "epoch": 0.46531422896864805, "grad_norm": 0.5444714426994324, "learning_rate": 2.6884195101047567e-05, "loss": 0.7031, "step": 1640 }, { "epoch": 0.47098879273655836, "grad_norm": 0.6431369185447693, "learning_rate": 2.6799571936035284e-05, "loss": 0.7056, "step": 1660 }, { "epoch": 0.4766633565044687, "grad_norm": 0.6375367641448975, "learning_rate": 2.671395194624779e-05, "loss": 0.6991, "step": 1680 }, { "epoch": 0.48233792027237904, "grad_norm": 0.6311667561531067, "learning_rate": 2.6627342364839604e-05, "loss": 0.6991, "step": 1700 }, { "epoch": 0.4880124840402894, "grad_norm": 0.580328643321991, "learning_rate": 2.6539750508565683e-05, "loss": 0.7027, "step": 1720 }, { "epoch": 0.4936870478081997, "grad_norm": 0.6254743933677673, "learning_rate": 2.6451183777163316e-05, "loss": 0.6977, "step": 1740 }, { "epoch": 0.4993616115761101, "grad_norm": 0.8747753500938416, "learning_rate": 2.636164965272699e-05, "loss": 0.6974, "step": 1760 }, { "epoch": 0.5050361753440205, "grad_norm": 0.5931680798530579, "learning_rate": 2.6271155699076305e-05, "loss": 0.7001, "step": 1780 }, { "epoch": 0.5107107391119308, "grad_norm": 0.5763223767280579, "learning_rate": 2.6179709561116983e-05, "loss": 0.7023, "step": 1800 }, { "epoch": 0.5163853028798411, "grad_norm": 0.5211492776870728, "learning_rate": 2.6087318964195032e-05, "loss": 0.6957, "step": 1820 }, { "epoch": 0.5220598666477515, "grad_norm": 0.5684000253677368, "learning_rate": 2.59939917134441e-05, "loss": 0.6916, "step": 1840 }, { "epoch": 0.5277344304156618, "grad_norm": 0.6029589176177979, "learning_rate": 2.5899735693126113e-05, "loss": 0.6942, "step": 1860 }, { "epoch": 0.5334089941835721, "grad_norm": 0.5765926837921143, "learning_rate": 2.5804558865965206e-05, "loss": 0.6973, "step": 1880 }, { "epoch": 0.5390835579514824, "grad_norm": 0.5227144956588745, "learning_rate": 2.5708469272475044e-05, "loss": 0.6929, "step": 1900 }, { "epoch": 0.5447581217193929, "grad_norm": 0.6175386309623718, "learning_rate": 2.5611475030279546e-05, "loss": 0.6908, "step": 1920 }, { "epoch": 0.5504326854873032, "grad_norm": 0.5724866986274719, "learning_rate": 2.5513584333427125e-05, "loss": 0.6893, "step": 1940 }, { "epoch": 0.5561072492552135, "grad_norm": 0.5964395403862, "learning_rate": 2.541480545169846e-05, "loss": 0.6944, "step": 1960 }, { "epoch": 0.5617818130231238, "grad_norm": 0.6019209027290344, "learning_rate": 2.5315146729907827e-05, "loss": 0.6899, "step": 1980 }, { "epoch": 0.5674563767910342, "grad_norm": 0.6371375918388367, "learning_rate": 2.521461658719819e-05, "loss": 0.6904, "step": 2000 }, { "epoch": 0.5731309405589445, "grad_norm": 0.5762882232666016, "learning_rate": 2.5113223516329924e-05, "loss": 0.6887, "step": 2020 }, { "epoch": 0.5788055043268548, "grad_norm": 0.591663122177124, "learning_rate": 2.501097608296334e-05, "loss": 0.6894, "step": 2040 }, { "epoch": 0.5844800680947652, "grad_norm": 0.5833630561828613, "learning_rate": 2.4907882924935072e-05, "loss": 0.6866, "step": 2060 }, { "epoch": 0.5901546318626756, "grad_norm": 0.5615355968475342, "learning_rate": 2.4803952751528363e-05, "loss": 0.6927, "step": 2080 }, { "epoch": 0.5958291956305859, "grad_norm": 0.5507014989852905, "learning_rate": 2.4699194342737295e-05, "loss": 0.6934, "step": 2100 }, { "epoch": 0.6015037593984962, "grad_norm": 0.5132161974906921, "learning_rate": 2.459361654852505e-05, "loss": 0.688, "step": 2120 }, { "epoch": 0.6071783231664066, "grad_norm": 0.5238850116729736, "learning_rate": 2.4487228288076293e-05, "loss": 0.6804, "step": 2140 }, { "epoch": 0.6128528869343169, "grad_norm": 0.5849164724349976, "learning_rate": 2.438003854904366e-05, "loss": 0.6911, "step": 2160 }, { "epoch": 0.6185274507022273, "grad_norm": 0.5290674567222595, "learning_rate": 2.4272056386788485e-05, "loss": 0.6838, "step": 2180 }, { "epoch": 0.6242020144701376, "grad_norm": 0.5804121494293213, "learning_rate": 2.4163290923615814e-05, "loss": 0.6894, "step": 2200 }, { "epoch": 0.629876578238048, "grad_norm": 0.5559779405593872, "learning_rate": 2.4053751348003757e-05, "loss": 0.6859, "step": 2220 }, { "epoch": 0.6355511420059583, "grad_norm": 0.5486791133880615, "learning_rate": 2.394344691382723e-05, "loss": 0.6836, "step": 2240 }, { "epoch": 0.6412257057738686, "grad_norm": 0.5544127225875854, "learning_rate": 2.3832386939576214e-05, "loss": 0.681, "step": 2260 }, { "epoch": 0.6469002695417789, "grad_norm": 0.5256103277206421, "learning_rate": 2.3720580807568513e-05, "loss": 0.6823, "step": 2280 }, { "epoch": 0.6525748333096894, "grad_norm": 0.5488288402557373, "learning_rate": 2.3608037963157142e-05, "loss": 0.6818, "step": 2300 }, { "epoch": 0.6582493970775997, "grad_norm": 0.5254908204078674, "learning_rate": 2.3494767913932393e-05, "loss": 0.6774, "step": 2320 }, { "epoch": 0.66392396084551, "grad_norm": 0.5880591869354248, "learning_rate": 2.338078022891864e-05, "loss": 0.6795, "step": 2340 }, { "epoch": 0.6695985246134204, "grad_norm": 0.5331950783729553, "learning_rate": 2.3266084537765924e-05, "loss": 0.6777, "step": 2360 }, { "epoch": 0.6752730883813307, "grad_norm": 0.5736955404281616, "learning_rate": 2.3150690529936475e-05, "loss": 0.6792, "step": 2380 }, { "epoch": 0.680947652149241, "grad_norm": 0.5705032348632812, "learning_rate": 2.303460795388613e-05, "loss": 0.6736, "step": 2400 }, { "epoch": 0.6866222159171513, "grad_norm": 0.569355845451355, "learning_rate": 2.2917846616240784e-05, "loss": 0.6767, "step": 2420 }, { "epoch": 0.6922967796850618, "grad_norm": 1.2819143533706665, "learning_rate": 2.2800416380967952e-05, "loss": 0.6772, "step": 2440 }, { "epoch": 0.6979713434529721, "grad_norm": 0.5238373279571533, "learning_rate": 2.268232716854343e-05, "loss": 0.674, "step": 2460 }, { "epoch": 0.7036459072208824, "grad_norm": 0.5886688828468323, "learning_rate": 2.2563588955113246e-05, "loss": 0.6757, "step": 2480 }, { "epoch": 0.7093204709887927, "grad_norm": 0.5450348854064941, "learning_rate": 2.244421177165085e-05, "loss": 0.6691, "step": 2500 }, { "epoch": 0.7149950347567031, "grad_norm": 0.5553733706474304, "learning_rate": 2.232420570310974e-05, "loss": 0.6751, "step": 2520 }, { "epoch": 0.7206695985246134, "grad_norm": 0.5076789259910583, "learning_rate": 2.2203580887571423e-05, "loss": 0.6739, "step": 2540 }, { "epoch": 0.7263441622925237, "grad_norm": 0.5153952240943909, "learning_rate": 2.2082347515389027e-05, "loss": 0.6734, "step": 2560 }, { "epoch": 0.732018726060434, "grad_norm": 0.5176730155944824, "learning_rate": 2.1960515828326372e-05, "loss": 0.6706, "step": 2580 }, { "epoch": 0.7376932898283445, "grad_norm": 0.526030421257019, "learning_rate": 2.1838096118692768e-05, "loss": 0.6694, "step": 2600 }, { "epoch": 0.7433678535962548, "grad_norm": 0.6030652523040771, "learning_rate": 2.1715098728473518e-05, "loss": 0.6707, "step": 2620 }, { "epoch": 0.7490424173641651, "grad_norm": 0.6607082486152649, "learning_rate": 2.1591534048456225e-05, "loss": 0.6668, "step": 2640 }, { "epoch": 0.7547169811320755, "grad_norm": 0.5300272107124329, "learning_rate": 2.1467412517352996e-05, "loss": 0.6696, "step": 2660 }, { "epoch": 0.7603915448999858, "grad_norm": 0.5344169735908508, "learning_rate": 2.1342744620918568e-05, "loss": 0.6736, "step": 2680 }, { "epoch": 0.7660661086678962, "grad_norm": 0.5058417916297913, "learning_rate": 2.121754089106448e-05, "loss": 0.6681, "step": 2700 }, { "epoch": 0.7717406724358065, "grad_norm": 0.5440433621406555, "learning_rate": 2.1091811904969344e-05, "loss": 0.6702, "step": 2720 }, { "epoch": 0.7774152362037169, "grad_norm": 0.5361486077308655, "learning_rate": 2.096556828418528e-05, "loss": 0.6686, "step": 2740 }, { "epoch": 0.7830897999716272, "grad_norm": 0.6350403428077698, "learning_rate": 2.0838820693740603e-05, "loss": 0.6678, "step": 2760 }, { "epoch": 0.7887643637395375, "grad_norm": 0.5326098203659058, "learning_rate": 2.0711579841238875e-05, "loss": 0.6711, "step": 2780 }, { "epoch": 0.7944389275074478, "grad_norm": 0.540676474571228, "learning_rate": 2.058385647595429e-05, "loss": 0.6705, "step": 2800 }, { "epoch": 0.8001134912753582, "grad_norm": 0.4930702745914459, "learning_rate": 2.045566138792361e-05, "loss": 0.6683, "step": 2820 }, { "epoch": 0.8057880550432686, "grad_norm": 0.5729920268058777, "learning_rate": 2.032700540703459e-05, "loss": 0.6646, "step": 2840 }, { "epoch": 0.8114626188111789, "grad_norm": 0.5179927945137024, "learning_rate": 2.0197899402111127e-05, "loss": 0.6632, "step": 2860 }, { "epoch": 0.8171371825790892, "grad_norm": 0.5147942900657654, "learning_rate": 2.0068354279995008e-05, "loss": 0.6558, "step": 2880 }, { "epoch": 0.8228117463469996, "grad_norm": 0.5044906735420227, "learning_rate": 1.9938380984624533e-05, "loss": 0.6634, "step": 2900 }, { "epoch": 0.8284863101149099, "grad_norm": 0.5231923460960388, "learning_rate": 1.9807990496109965e-05, "loss": 0.6698, "step": 2920 }, { "epoch": 0.8341608738828202, "grad_norm": 0.5322957634925842, "learning_rate": 1.967719382980594e-05, "loss": 0.6568, "step": 2940 }, { "epoch": 0.8398354376507307, "grad_norm": 0.512269139289856, "learning_rate": 1.9546002035380886e-05, "loss": 0.6654, "step": 2960 }, { "epoch": 0.845510001418641, "grad_norm": 0.508976399898529, "learning_rate": 1.9414426195883558e-05, "loss": 0.6552, "step": 2980 }, { "epoch": 0.8511845651865513, "grad_norm": 0.5061299204826355, "learning_rate": 1.9282477426806723e-05, "loss": 0.6599, "step": 3000 }, { "epoch": 0.8568591289544616, "grad_norm": 0.510822057723999, "learning_rate": 1.9150166875148155e-05, "loss": 0.6612, "step": 3020 }, { "epoch": 0.862533692722372, "grad_norm": 0.5578708648681641, "learning_rate": 1.9017505718468934e-05, "loss": 0.658, "step": 3040 }, { "epoch": 0.8682082564902823, "grad_norm": 0.5130868554115295, "learning_rate": 1.888450516394914e-05, "loss": 0.6541, "step": 3060 }, { "epoch": 0.8738828202581926, "grad_norm": 0.5147811770439148, "learning_rate": 1.8751176447441104e-05, "loss": 0.6586, "step": 3080 }, { "epoch": 0.879557384026103, "grad_norm": 0.5556140542030334, "learning_rate": 1.861753083252021e-05, "loss": 0.6535, "step": 3100 }, { "epoch": 0.8852319477940134, "grad_norm": 0.509611964225769, "learning_rate": 1.8483579609533318e-05, "loss": 0.6537, "step": 3120 }, { "epoch": 0.8909065115619237, "grad_norm": 0.5088684558868408, "learning_rate": 1.834933409464499e-05, "loss": 0.6562, "step": 3140 }, { "epoch": 0.896581075329834, "grad_norm": 0.48405396938323975, "learning_rate": 1.821480562888148e-05, "loss": 0.6583, "step": 3160 }, { "epoch": 0.9022556390977443, "grad_norm": 0.5087782144546509, "learning_rate": 1.808000557717268e-05, "loss": 0.6558, "step": 3180 }, { "epoch": 0.9079302028656547, "grad_norm": 0.5303909778594971, "learning_rate": 1.7944945327391957e-05, "loss": 0.6517, "step": 3200 }, { "epoch": 0.913604766633565, "grad_norm": 0.5164442658424377, "learning_rate": 1.7809636289394185e-05, "loss": 0.6529, "step": 3220 }, { "epoch": 0.9192793304014754, "grad_norm": 0.5162308216094971, "learning_rate": 1.7674089894051774e-05, "loss": 0.6542, "step": 3240 }, { "epoch": 0.9249538941693858, "grad_norm": 0.545396625995636, "learning_rate": 1.753831759228903e-05, "loss": 0.6527, "step": 3260 }, { "epoch": 0.9306284579372961, "grad_norm": 0.5134595632553101, "learning_rate": 1.740233085411477e-05, "loss": 0.6555, "step": 3280 }, { "epoch": 0.9363030217052064, "grad_norm": 0.48815637826919556, "learning_rate": 1.7266141167653353e-05, "loss": 0.6554, "step": 3300 }, { "epoch": 0.9419775854731167, "grad_norm": 0.5034410953521729, "learning_rate": 1.7129760038174146e-05, "loss": 0.6514, "step": 3320 }, { "epoch": 0.9476521492410271, "grad_norm": 0.5322323441505432, "learning_rate": 1.6993198987119576e-05, "loss": 0.6533, "step": 3340 }, { "epoch": 0.9533267130089375, "grad_norm": 0.48363253474235535, "learning_rate": 1.6856469551131805e-05, "loss": 0.6468, "step": 3360 }, { "epoch": 0.9590012767768478, "grad_norm": 0.4600164592266083, "learning_rate": 1.67195832810781e-05, "loss": 0.6472, "step": 3380 }, { "epoch": 0.9646758405447581, "grad_norm": 0.49600768089294434, "learning_rate": 1.6582551741075033e-05, "loss": 0.6467, "step": 3400 }, { "epoch": 0.9703504043126685, "grad_norm": 0.7202423810958862, "learning_rate": 1.6445386507511546e-05, "loss": 0.6502, "step": 3420 }, { "epoch": 0.9760249680805788, "grad_norm": 0.502703070640564, "learning_rate": 1.630809916807098e-05, "loss": 0.6424, "step": 3440 }, { "epoch": 0.9816995318484891, "grad_norm": 0.49266818165779114, "learning_rate": 1.617070132075214e-05, "loss": 0.6485, "step": 3460 }, { "epoch": 0.9873740956163994, "grad_norm": 0.5194821357727051, "learning_rate": 1.6033204572889516e-05, "loss": 0.6499, "step": 3480 }, { "epoch": 0.9930486593843099, "grad_norm": 0.49109163880348206, "learning_rate": 1.5895620540172682e-05, "loss": 0.6506, "step": 3500 }, { "epoch": 0.9987232231522202, "grad_norm": 0.5099320411682129, "learning_rate": 1.575796084566503e-05, "loss": 0.6466, "step": 3520 }, { "epoch": 1.0043977869201306, "grad_norm": 0.5476223230361938, "learning_rate": 1.562023711882182e-05, "loss": 0.5924, "step": 3540 }, { "epoch": 1.010072350688041, "grad_norm": 0.4934983551502228, "learning_rate": 1.548246099450776e-05, "loss": 0.5683, "step": 3560 }, { "epoch": 1.0157469144559512, "grad_norm": 0.5262681841850281, "learning_rate": 1.534464411201409e-05, "loss": 0.5733, "step": 3580 }, { "epoch": 1.0214214782238615, "grad_norm": 0.5271425843238831, "learning_rate": 1.520679811407526e-05, "loss": 0.5697, "step": 3600 }, { "epoch": 1.0270960419917718, "grad_norm": 0.5124356150627136, "learning_rate": 1.506893464588542e-05, "loss": 0.5653, "step": 3620 }, { "epoch": 1.0327706057596822, "grad_norm": 0.5131009817123413, "learning_rate": 1.4931065354114584e-05, "loss": 0.5669, "step": 3640 }, { "epoch": 1.0384451695275925, "grad_norm": 0.5003370046615601, "learning_rate": 1.4793201885924745e-05, "loss": 0.565, "step": 3660 }, { "epoch": 1.044119733295503, "grad_norm": 0.5440374612808228, "learning_rate": 1.465535588798592e-05, "loss": 0.5708, "step": 3680 }, { "epoch": 1.0497942970634133, "grad_norm": 0.5212259292602539, "learning_rate": 1.4517539005492237e-05, "loss": 0.57, "step": 3700 }, { "epoch": 1.0554688608313236, "grad_norm": 0.5004721879959106, "learning_rate": 1.4379762881178182e-05, "loss": 0.5692, "step": 3720 }, { "epoch": 1.061143424599234, "grad_norm": 0.5253936648368835, "learning_rate": 1.4242039154334973e-05, "loss": 0.5685, "step": 3740 }, { "epoch": 1.0668179883671443, "grad_norm": 0.5163034200668335, "learning_rate": 1.410437945982732e-05, "loss": 0.5706, "step": 3760 }, { "epoch": 1.0724925521350546, "grad_norm": 0.49630168080329895, "learning_rate": 1.3966795427110493e-05, "loss": 0.5725, "step": 3780 }, { "epoch": 1.0781671159029649, "grad_norm": 0.5117852091789246, "learning_rate": 1.3829298679247865e-05, "loss": 0.5646, "step": 3800 }, { "epoch": 1.0838416796708752, "grad_norm": 0.5082918405532837, "learning_rate": 1.369190083192902e-05, "loss": 0.5705, "step": 3820 }, { "epoch": 1.0895162434387857, "grad_norm": 0.5319990515708923, "learning_rate": 1.3554613492488453e-05, "loss": 0.5684, "step": 3840 }, { "epoch": 1.095190807206696, "grad_norm": 0.5344195365905762, "learning_rate": 1.3417448258924971e-05, "loss": 0.5658, "step": 3860 }, { "epoch": 1.1008653709746063, "grad_norm": 0.507433295249939, "learning_rate": 1.3280416718921902e-05, "loss": 0.5717, "step": 3880 }, { "epoch": 1.1065399347425167, "grad_norm": 0.5090216398239136, "learning_rate": 1.3143530448868198e-05, "loss": 0.5663, "step": 3900 }, { "epoch": 1.112214498510427, "grad_norm": 0.512146532535553, "learning_rate": 1.3006801012880425e-05, "loss": 0.5656, "step": 3920 }, { "epoch": 1.1178890622783373, "grad_norm": 0.5273200869560242, "learning_rate": 1.2870239961825853e-05, "loss": 0.5621, "step": 3940 }, { "epoch": 1.1235636260462476, "grad_norm": 0.5408139824867249, "learning_rate": 1.2733858832346648e-05, "loss": 0.5744, "step": 3960 }, { "epoch": 1.1292381898141581, "grad_norm": 0.4986436069011688, "learning_rate": 1.2597669145885231e-05, "loss": 0.5704, "step": 3980 }, { "epoch": 1.1349127535820684, "grad_norm": 0.5186699628829956, "learning_rate": 1.2461682407710973e-05, "loss": 0.5588, "step": 4000 }, { "epoch": 1.1405873173499788, "grad_norm": 0.5081115365028381, "learning_rate": 1.2325910105948229e-05, "loss": 0.5667, "step": 4020 }, { "epoch": 1.146261881117889, "grad_norm": 0.501616358757019, "learning_rate": 1.219036371060582e-05, "loss": 0.5628, "step": 4040 }, { "epoch": 1.1519364448857994, "grad_norm": 0.5288362503051758, "learning_rate": 1.2055054672608043e-05, "loss": 0.5642, "step": 4060 }, { "epoch": 1.1576110086537097, "grad_norm": 0.5392152070999146, "learning_rate": 1.1919994422827326e-05, "loss": 0.5606, "step": 4080 }, { "epoch": 1.16328557242162, "grad_norm": 0.514348030090332, "learning_rate": 1.1785194371118521e-05, "loss": 0.5653, "step": 4100 }, { "epoch": 1.1689601361895305, "grad_norm": 0.4942004978656769, "learning_rate": 1.1650665905355014e-05, "loss": 0.5622, "step": 4120 }, { "epoch": 1.1746346999574409, "grad_norm": 0.48802751302719116, "learning_rate": 1.1516420390466685e-05, "loss": 0.5613, "step": 4140 }, { "epoch": 1.1803092637253512, "grad_norm": 0.5025625228881836, "learning_rate": 1.1382469167479795e-05, "loss": 0.5656, "step": 4160 }, { "epoch": 1.1859838274932615, "grad_norm": 0.5276467204093933, "learning_rate": 1.1248823552558895e-05, "loss": 0.5639, "step": 4180 }, { "epoch": 1.1916583912611718, "grad_norm": 0.5035718083381653, "learning_rate": 1.1115494836050861e-05, "loss": 0.5612, "step": 4200 }, { "epoch": 1.197332955029082, "grad_norm": 0.5080997347831726, "learning_rate": 1.0982494281531069e-05, "loss": 0.5647, "step": 4220 }, { "epoch": 1.2030075187969924, "grad_norm": 0.505695104598999, "learning_rate": 1.0849833124851846e-05, "loss": 0.5681, "step": 4240 }, { "epoch": 1.2086820825649027, "grad_norm": 0.48905614018440247, "learning_rate": 1.0717522573193281e-05, "loss": 0.561, "step": 4260 }, { "epoch": 1.2143566463328133, "grad_norm": 0.49127668142318726, "learning_rate": 1.0585573804116448e-05, "loss": 0.5639, "step": 4280 }, { "epoch": 1.2200312101007236, "grad_norm": 0.5206524729728699, "learning_rate": 1.0453997964619112e-05, "loss": 0.5594, "step": 4300 }, { "epoch": 1.2257057738686339, "grad_norm": 0.48683062195777893, "learning_rate": 1.0322806170194061e-05, "loss": 0.5622, "step": 4320 }, { "epoch": 1.2313803376365442, "grad_norm": 0.532207190990448, "learning_rate": 1.0192009503890037e-05, "loss": 0.5581, "step": 4340 }, { "epoch": 1.2370549014044545, "grad_norm": 0.49200239777565, "learning_rate": 1.0061619015375473e-05, "loss": 0.5594, "step": 4360 }, { "epoch": 1.2427294651723648, "grad_norm": 0.504898190498352, "learning_rate": 9.931645720004995e-06, "loss": 0.5622, "step": 4380 }, { "epoch": 1.2484040289402751, "grad_norm": 0.5061923861503601, "learning_rate": 9.802100597888877e-06, "loss": 0.5572, "step": 4400 }, { "epoch": 1.2540785927081854, "grad_norm": 0.4961055815219879, "learning_rate": 9.672994592965409e-06, "loss": 0.5609, "step": 4420 }, { "epoch": 1.259753156476096, "grad_norm": 0.4930592477321625, "learning_rate": 9.544338612076396e-06, "loss": 0.5637, "step": 4440 }, { "epoch": 1.2654277202440063, "grad_norm": 0.4978179335594177, "learning_rate": 9.41614352404571e-06, "loss": 0.5615, "step": 4460 }, { "epoch": 1.2711022840119166, "grad_norm": 0.5112114548683167, "learning_rate": 9.288420158761127e-06, "loss": 0.558, "step": 4480 }, { "epoch": 1.276776847779827, "grad_norm": 0.5114573240280151, "learning_rate": 9.161179306259401e-06, "loss": 0.5561, "step": 4500 }, { "epoch": 1.2824514115477372, "grad_norm": 0.5023430585861206, "learning_rate": 9.034431715814726e-06, "loss": 0.5558, "step": 4520 }, { "epoch": 1.2881259753156475, "grad_norm": 0.503487765789032, "learning_rate": 8.908188095030655e-06, "loss": 0.5607, "step": 4540 }, { "epoch": 1.2938005390835579, "grad_norm": 0.5188455581665039, "learning_rate": 8.78245910893552e-06, "loss": 0.5639, "step": 4560 }, { "epoch": 1.2994751028514684, "grad_norm": 0.5216081738471985, "learning_rate": 8.657255379081438e-06, "loss": 0.5584, "step": 4580 }, { "epoch": 1.3051496666193787, "grad_norm": 0.5024508833885193, "learning_rate": 8.532587482647013e-06, "loss": 0.5604, "step": 4600 }, { "epoch": 1.310824230387289, "grad_norm": 0.5100445747375488, "learning_rate": 8.408465951543779e-06, "loss": 0.5596, "step": 4620 }, { "epoch": 1.3164987941551993, "grad_norm": 0.5005710124969482, "learning_rate": 8.284901271526481e-06, "loss": 0.5591, "step": 4640 }, { "epoch": 1.3221733579231096, "grad_norm": 0.5151055455207825, "learning_rate": 8.161903881307231e-06, "loss": 0.5462, "step": 4660 }, { "epoch": 1.32784792169102, "grad_norm": 0.4919968545436859, "learning_rate": 8.039484171673628e-06, "loss": 0.5523, "step": 4680 }, { "epoch": 1.3335224854589303, "grad_norm": 0.5007758140563965, "learning_rate": 7.917652484610975e-06, "loss": 0.5545, "step": 4700 }, { "epoch": 1.3391970492268408, "grad_norm": 0.4885912537574768, "learning_rate": 7.796419112428583e-06, "loss": 0.5582, "step": 4720 }, { "epoch": 1.344871612994751, "grad_norm": 0.4874049127101898, "learning_rate": 7.675794296890265e-06, "loss": 0.5505, "step": 4740 }, { "epoch": 1.3505461767626614, "grad_norm": 0.46998655796051025, "learning_rate": 7.555788228349143e-06, "loss": 0.554, "step": 4760 }, { "epoch": 1.3562207405305717, "grad_norm": 0.4996753931045532, "learning_rate": 7.436411044886753e-06, "loss": 0.5513, "step": 4780 }, { "epoch": 1.361895304298482, "grad_norm": 0.502571165561676, "learning_rate": 7.31767283145657e-06, "loss": 0.5547, "step": 4800 }, { "epoch": 1.3675698680663924, "grad_norm": 0.48792627453804016, "learning_rate": 7.199583619032052e-06, "loss": 0.5551, "step": 4820 }, { "epoch": 1.3732444318343027, "grad_norm": 0.48799988627433777, "learning_rate": 7.082153383759222e-06, "loss": 0.5524, "step": 4840 }, { "epoch": 1.3789189956022132, "grad_norm": 0.4976406991481781, "learning_rate": 6.9653920461138755e-06, "loss": 0.5548, "step": 4860 }, { "epoch": 1.3845935593701233, "grad_norm": 0.5006715655326843, "learning_rate": 6.849309470063529e-06, "loss": 0.5544, "step": 4880 }, { "epoch": 1.3902681231380338, "grad_norm": 0.4864628314971924, "learning_rate": 6.7339154622340754e-06, "loss": 0.5483, "step": 4900 }, { "epoch": 1.3959426869059441, "grad_norm": 0.48580724000930786, "learning_rate": 6.619219771081361e-06, "loss": 0.5544, "step": 4920 }, { "epoch": 1.4016172506738545, "grad_norm": 0.5042415857315063, "learning_rate": 6.505232086067607e-06, "loss": 0.5504, "step": 4940 }, { "epoch": 1.4072918144417648, "grad_norm": 0.4970082640647888, "learning_rate": 6.391962036842863e-06, "loss": 0.547, "step": 4960 }, { "epoch": 1.412966378209675, "grad_norm": 0.47866857051849365, "learning_rate": 6.279419192431494e-06, "loss": 0.5548, "step": 4980 }, { "epoch": 1.4186409419775854, "grad_norm": 0.4664076566696167, "learning_rate": 6.167613060423789e-06, "loss": 0.5454, "step": 5000 }, { "epoch": 1.4243155057454957, "grad_norm": 0.49711087346076965, "learning_rate": 6.0565530861727685e-06, "loss": 0.5519, "step": 5020 }, { "epoch": 1.4299900695134062, "grad_norm": 0.46965324878692627, "learning_rate": 5.946248651996244e-06, "loss": 0.5519, "step": 5040 }, { "epoch": 1.4356646332813165, "grad_norm": 0.505743145942688, "learning_rate": 5.836709076384188e-06, "loss": 0.5482, "step": 5060 }, { "epoch": 1.4413391970492269, "grad_norm": 0.5078002214431763, "learning_rate": 5.727943613211521e-06, "loss": 0.5575, "step": 5080 }, { "epoch": 1.4470137608171372, "grad_norm": 0.48647207021713257, "learning_rate": 5.619961450956347e-06, "loss": 0.5461, "step": 5100 }, { "epoch": 1.4526883245850475, "grad_norm": 0.4711668789386749, "learning_rate": 5.5127717119237084e-06, "loss": 0.5472, "step": 5120 }, { "epoch": 1.4583628883529578, "grad_norm": 0.518395721912384, "learning_rate": 5.406383451474948e-06, "loss": 0.5483, "step": 5140 }, { "epoch": 1.464037452120868, "grad_norm": 0.4849320948123932, "learning_rate": 5.300805657262706e-06, "loss": 0.5459, "step": 5160 }, { "epoch": 1.4697120158887786, "grad_norm": 0.501943826675415, "learning_rate": 5.1960472484716374e-06, "loss": 0.5482, "step": 5180 }, { "epoch": 1.475386579656689, "grad_norm": 0.48699691891670227, "learning_rate": 5.092117075064931e-06, "loss": 0.5522, "step": 5200 }, { "epoch": 1.4810611434245993, "grad_norm": 0.48894861340522766, "learning_rate": 4.989023917036667e-06, "loss": 0.5502, "step": 5220 }, { "epoch": 1.4867357071925096, "grad_norm": 0.49131521582603455, "learning_rate": 4.886776483670077e-06, "loss": 0.5466, "step": 5240 }, { "epoch": 1.49241027096042, "grad_norm": 0.47139400243759155, "learning_rate": 4.78538341280181e-06, "loss": 0.5473, "step": 5260 }, { "epoch": 1.4980848347283302, "grad_norm": 0.49604731798171997, "learning_rate": 4.684853270092173e-06, "loss": 0.5498, "step": 5280 }, { "epoch": 1.5037593984962405, "grad_norm": 0.4864351749420166, "learning_rate": 4.585194548301545e-06, "loss": 0.5448, "step": 5300 }, { "epoch": 1.509433962264151, "grad_norm": 0.48130905628204346, "learning_rate": 4.486415666572874e-06, "loss": 0.5469, "step": 5320 }, { "epoch": 1.5151085260320611, "grad_norm": 0.4783124625682831, "learning_rate": 4.388524969720458e-06, "loss": 0.546, "step": 5340 }, { "epoch": 1.5207830897999717, "grad_norm": 0.4969868063926697, "learning_rate": 4.2915307275249585e-06, "loss": 0.5453, "step": 5360 }, { "epoch": 1.526457653567882, "grad_norm": 0.4832542836666107, "learning_rate": 4.195441134034799e-06, "loss": 0.5463, "step": 5380 }, { "epoch": 1.5321322173357923, "grad_norm": 0.4712090790271759, "learning_rate": 4.10026430687389e-06, "loss": 0.5449, "step": 5400 }, { "epoch": 1.5378067811037026, "grad_norm": 0.4822421967983246, "learning_rate": 4.0060082865559035e-06, "loss": 0.5465, "step": 5420 }, { "epoch": 1.543481344871613, "grad_norm": 0.4809670150279999, "learning_rate": 3.912681035804971e-06, "loss": 0.5406, "step": 5440 }, { "epoch": 1.5491559086395235, "grad_norm": 0.4631410539150238, "learning_rate": 3.820290438883018e-06, "loss": 0.5461, "step": 5460 }, { "epoch": 1.5548304724074336, "grad_norm": 0.46498140692710876, "learning_rate": 3.728844300923694e-06, "loss": 0.5419, "step": 5480 }, { "epoch": 1.560505036175344, "grad_norm": 0.4786704480648041, "learning_rate": 3.6383503472730116e-06, "loss": 0.5476, "step": 5500 }, { "epoch": 1.5661795999432544, "grad_norm": 0.4655323624610901, "learning_rate": 3.548816222836688e-06, "loss": 0.5406, "step": 5520 }, { "epoch": 1.5718541637111647, "grad_norm": 0.46424925327301025, "learning_rate": 3.460249491434319e-06, "loss": 0.5415, "step": 5540 }, { "epoch": 1.577528727479075, "grad_norm": 0.45783787965774536, "learning_rate": 3.3726576351603985e-06, "loss": 0.5503, "step": 5560 }, { "epoch": 1.5832032912469853, "grad_norm": 0.49086692929267883, "learning_rate": 3.2860480537522103e-06, "loss": 0.543, "step": 5580 }, { "epoch": 1.5888778550148959, "grad_norm": 0.48474520444869995, "learning_rate": 3.2004280639647122e-06, "loss": 0.539, "step": 5600 }, { "epoch": 1.594552418782806, "grad_norm": 0.5037649869918823, "learning_rate": 3.115804898952434e-06, "loss": 0.5415, "step": 5620 }, { "epoch": 1.6002269825507165, "grad_norm": 0.4954313337802887, "learning_rate": 3.032185707658389e-06, "loss": 0.5487, "step": 5640 }, { "epoch": 1.6059015463186268, "grad_norm": 0.4597771465778351, "learning_rate": 2.949577554210157e-06, "loss": 0.5445, "step": 5660 }, { "epoch": 1.6115761100865371, "grad_norm": 0.4839852750301361, "learning_rate": 2.8679874173231137e-06, "loss": 0.5499, "step": 5680 }, { "epoch": 1.6172506738544474, "grad_norm": 0.4653310179710388, "learning_rate": 2.787422189710844e-06, "loss": 0.5453, "step": 5700 }, { "epoch": 1.6229252376223577, "grad_norm": 0.485579252243042, "learning_rate": 2.7078886775028693e-06, "loss": 0.5383, "step": 5720 }, { "epoch": 1.6285998013902683, "grad_norm": 0.4727838337421417, "learning_rate": 2.629393599669667e-06, "loss": 0.5421, "step": 5740 }, { "epoch": 1.6342743651581784, "grad_norm": 0.45239365100860596, "learning_rate": 2.5519435874550434e-06, "loss": 0.5357, "step": 5760 }, { "epoch": 1.639948928926089, "grad_norm": 0.4669874310493469, "learning_rate": 2.475545183815926e-06, "loss": 0.5385, "step": 5780 }, { "epoch": 1.645623492693999, "grad_norm": 0.4859563410282135, "learning_rate": 2.400204842869637e-06, "loss": 0.5446, "step": 5800 }, { "epoch": 1.6512980564619095, "grad_norm": 0.4492729902267456, "learning_rate": 2.3259289293486246e-06, "loss": 0.5418, "step": 5820 }, { "epoch": 1.6569726202298198, "grad_norm": 0.46383896470069885, "learning_rate": 2.252723718062787e-06, "loss": 0.5401, "step": 5840 }, { "epoch": 1.6626471839977301, "grad_norm": 0.48168492317199707, "learning_rate": 2.1805953933693835e-06, "loss": 0.5423, "step": 5860 }, { "epoch": 1.6683217477656405, "grad_norm": 0.46742239594459534, "learning_rate": 2.109550048650563e-06, "loss": 0.542, "step": 5880 }, { "epoch": 1.6739963115335508, "grad_norm": 0.46751725673675537, "learning_rate": 2.0395936857986125e-06, "loss": 0.5402, "step": 5900 }, { "epoch": 1.6796708753014613, "grad_norm": 0.49627310037612915, "learning_rate": 1.970732214708908e-06, "loss": 0.5461, "step": 5920 }, { "epoch": 1.6853454390693714, "grad_norm": 0.46826520562171936, "learning_rate": 1.9029714527806652e-06, "loss": 0.5385, "step": 5940 }, { "epoch": 1.691020002837282, "grad_norm": 0.4701858162879944, "learning_rate": 1.8363171244254606e-06, "loss": 0.5376, "step": 5960 }, { "epoch": 1.6966945666051922, "grad_norm": 0.4635229706764221, "learning_rate": 1.7707748605836632e-06, "loss": 0.5378, "step": 5980 }, { "epoch": 1.7023691303731026, "grad_norm": 0.4729613661766052, "learning_rate": 1.7063501982487135e-06, "loss": 0.5437, "step": 6000 }, { "epoch": 1.7080436941410129, "grad_norm": 0.4672451913356781, "learning_rate": 1.6430485799993673e-06, "loss": 0.5428, "step": 6020 }, { "epoch": 1.7137182579089232, "grad_norm": 0.46772390604019165, "learning_rate": 1.5808753535399022e-06, "loss": 0.5392, "step": 6040 }, { "epoch": 1.7193928216768337, "grad_norm": 0.46337825059890747, "learning_rate": 1.5198357712483629e-06, "loss": 0.5413, "step": 6060 }, { "epoch": 1.7250673854447438, "grad_norm": 0.48103076219558716, "learning_rate": 1.459934989732818e-06, "loss": 0.5416, "step": 6080 }, { "epoch": 1.7307419492126543, "grad_norm": 0.45769959688186646, "learning_rate": 1.4011780693957492e-06, "loss": 0.5436, "step": 6100 }, { "epoch": 1.7364165129805647, "grad_norm": 0.4552821218967438, "learning_rate": 1.3435699740065377e-06, "loss": 0.5425, "step": 6120 }, { "epoch": 1.742091076748475, "grad_norm": 0.48623600602149963, "learning_rate": 1.2871155702821324e-06, "loss": 0.5427, "step": 6140 }, { "epoch": 1.7477656405163853, "grad_norm": 0.5024483799934387, "learning_rate": 1.231819627475911e-06, "loss": 0.5384, "step": 6160 }, { "epoch": 1.7534402042842956, "grad_norm": 0.4556623101234436, "learning_rate": 1.1776868169747702e-06, "loss": 0.5393, "step": 6180 }, { "epoch": 1.7591147680522061, "grad_norm": 0.4748471677303314, "learning_rate": 1.1247217119044951e-06, "loss": 0.5385, "step": 6200 }, { "epoch": 1.7647893318201162, "grad_norm": 0.4622340500354767, "learning_rate": 1.07292878674342e-06, "loss": 0.5377, "step": 6220 }, { "epoch": 1.7704638955880267, "grad_norm": 0.4581329822540283, "learning_rate": 1.0223124169444236e-06, "loss": 0.5366, "step": 6240 }, { "epoch": 1.776138459355937, "grad_norm": 0.4667391777038574, "learning_rate": 9.72876878565287e-07, "loss": 0.539, "step": 6260 }, { "epoch": 1.7818130231238474, "grad_norm": 0.4563803970813751, "learning_rate": 9.246263479074663e-07, "loss": 0.5403, "step": 6280 }, { "epoch": 1.7874875868917577, "grad_norm": 0.44948819279670715, "learning_rate": 8.775649011632703e-07, "loss": 0.5392, "step": 6300 }, { "epoch": 1.793162150659668, "grad_norm": 0.4829549193382263, "learning_rate": 8.316965140715071e-07, "loss": 0.5373, "step": 6320 }, { "epoch": 1.7988367144275785, "grad_norm": 0.4718981683254242, "learning_rate": 7.870250615816182e-07, "loss": 0.5383, "step": 6340 }, { "epoch": 1.8045112781954886, "grad_norm": 0.4641667306423187, "learning_rate": 7.435543175263166e-07, "loss": 0.543, "step": 6360 }, { "epoch": 1.8101858419633992, "grad_norm": 0.45884087681770325, "learning_rate": 7.012879543027801e-07, "loss": 0.538, "step": 6380 }, { "epoch": 1.8158604057313092, "grad_norm": 0.4888609051704407, "learning_rate": 6.602295425624033e-07, "loss": 0.5366, "step": 6400 }, { "epoch": 1.8215349694992198, "grad_norm": 0.46243107318878174, "learning_rate": 6.20382550909157e-07, "loss": 0.5365, "step": 6420 }, { "epoch": 1.82720953326713, "grad_norm": 0.46520647406578064, "learning_rate": 5.817503456065559e-07, "loss": 0.5339, "step": 6440 }, { "epoch": 1.8328840970350404, "grad_norm": 0.47549664974212646, "learning_rate": 5.443361902932792e-07, "loss": 0.5361, "step": 6460 }, { "epoch": 1.838558660802951, "grad_norm": 0.4677965044975281, "learning_rate": 5.081432457074614e-07, "loss": 0.5394, "step": 6480 }, { "epoch": 1.844233224570861, "grad_norm": 0.46250638365745544, "learning_rate": 4.7317456941966597e-07, "loss": 0.5388, "step": 6500 }, { "epoch": 1.8499077883387716, "grad_norm": 0.4758864641189575, "learning_rate": 4.3943311557459177e-07, "loss": 0.534, "step": 6520 }, { "epoch": 1.8555823521066817, "grad_norm": 0.4370381832122803, "learning_rate": 4.069217346415027e-07, "loss": 0.5339, "step": 6540 }, { "epoch": 1.8612569158745922, "grad_norm": 0.4617324769496918, "learning_rate": 3.756431731734272e-07, "loss": 0.5396, "step": 6560 }, { "epoch": 1.8669314796425025, "grad_norm": 0.4532717168331146, "learning_rate": 3.4560007357511856e-07, "loss": 0.5393, "step": 6580 }, { "epoch": 1.8726060434104128, "grad_norm": 0.46486184000968933, "learning_rate": 3.16794973879837e-07, "loss": 0.5367, "step": 6600 }, { "epoch": 1.8782806071783231, "grad_norm": 0.44514200091362, "learning_rate": 2.8923030753492783e-07, "loss": 0.5384, "step": 6620 }, { "epoch": 1.8839551709462334, "grad_norm": 0.4737865924835205, "learning_rate": 2.6290840319625255e-07, "loss": 0.5355, "step": 6640 }, { "epoch": 1.889629734714144, "grad_norm": 0.45271801948547363, "learning_rate": 2.378314845314561e-07, "loss": 0.5451, "step": 6660 }, { "epoch": 1.895304298482054, "grad_norm": 0.46050384640693665, "learning_rate": 2.14001670032124e-07, "loss": 0.5347, "step": 6680 }, { "epoch": 1.9009788622499646, "grad_norm": 0.4726841151714325, "learning_rate": 1.9142097283479876e-07, "loss": 0.5428, "step": 6700 }, { "epoch": 1.906653426017875, "grad_norm": 0.4662003815174103, "learning_rate": 1.700913005509208e-07, "loss": 0.5407, "step": 6720 }, { "epoch": 1.9123279897857852, "grad_norm": 0.44422999024391174, "learning_rate": 1.500144551056709e-07, "loss": 0.535, "step": 6740 }, { "epoch": 1.9180025535536955, "grad_norm": 0.4599597752094269, "learning_rate": 1.3119213258574015e-07, "loss": 0.5376, "step": 6760 }, { "epoch": 1.9236771173216058, "grad_norm": 0.4735456705093384, "learning_rate": 1.1362592309605291e-07, "loss": 0.5392, "step": 6780 }, { "epoch": 1.9293516810895164, "grad_norm": 0.4692912995815277, "learning_rate": 9.731731062542604e-08, "loss": 0.5398, "step": 6800 } ], "logging_steps": 20, "max_steps": 7048, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5124467391135325e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }