{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6666666666666666, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.2757867574691772, "learning_rate": 0.000499986292341378, "loss": 5.9015, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.7096672654151917, "learning_rate": 0.0004999451708687113, "loss": 3.9877, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.6350783109664917, "learning_rate": 0.0004998766400914329, "loss": 3.7232, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.5088619589805603, "learning_rate": 0.0004997807075247146, "loss": 3.552, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.4739762842655182, "learning_rate": 0.0004996573836886434, "loss": 3.5127, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.628942608833313, "learning_rate": 0.0004995066821070679, "loss": 3.3798, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.5002753138542175, "learning_rate": 0.0004993286193061145, "loss": 3.3999, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.4983285963535309, "learning_rate": 0.0004991232148123761, "loss": 3.3398, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.4371359050273895, "learning_rate": 0.00049889049115077, "loss": 3.3405, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.5125981569290161, "learning_rate": 0.0004986304738420684, "loss": 3.3053, "step": 100 }, { "epoch": 0.04, "grad_norm": 0.47835057973861694, "learning_rate": 0.000498343191400099, "loss": 3.2388, "step": 110 }, { "epoch": 0.04, "grad_norm": 0.4769553244113922, "learning_rate": 0.0004980286753286195, "loss": 3.2004, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.44102832674980164, "learning_rate": 0.0004976869601178609, "loss": 3.2301, "step": 130 }, { "epoch": 0.05, "grad_norm": 0.44598281383514404, "learning_rate": 0.0004973180832407472, "loss": 3.2284, "step": 140 }, { "epoch": 0.05, "grad_norm": 0.5305857062339783, "learning_rate": 0.0004969220851487844, "loss": 3.1978, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.5015985369682312, "learning_rate": 0.0004964990092676262, "loss": 3.198, "step": 160 }, { "epoch": 0.06, "grad_norm": 0.49595481157302856, "learning_rate": 0.0004960489019923105, "loss": 3.2582, "step": 170 }, { "epoch": 0.06, "grad_norm": 0.4686504602432251, "learning_rate": 0.0004955718126821722, "loss": 3.1974, "step": 180 }, { "epoch": 0.06, "grad_norm": 0.4878421425819397, "learning_rate": 0.0004950677936554305, "loss": 3.1467, "step": 190 }, { "epoch": 0.07, "grad_norm": 0.476806640625, "learning_rate": 0.0004945369001834514, "loss": 3.1321, "step": 200 }, { "epoch": 0.07, "grad_norm": 0.5151681900024414, "learning_rate": 0.0004939791904846869, "loss": 3.1478, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.4726090133190155, "learning_rate": 0.0004933947257182901, "loss": 3.1757, "step": 220 }, { "epoch": 0.08, "grad_norm": 0.4868369698524475, "learning_rate": 0.000492783569977409, "loss": 3.1573, "step": 230 }, { "epoch": 0.08, "grad_norm": 0.4568879306316376, "learning_rate": 0.0004921457902821578, "loss": 3.2129, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.45109352469444275, "learning_rate": 0.0004914814565722671, "loss": 3.1417, "step": 250 }, { "epoch": 0.09, "grad_norm": 0.5096346139907837, "learning_rate": 0.0004907906416994146, "loss": 3.1397, "step": 260 }, { "epoch": 0.09, "grad_norm": 0.44307172298431396, "learning_rate": 0.0004900734214192358, "loss": 3.1851, "step": 270 }, { "epoch": 0.09, "grad_norm": 0.469777375459671, "learning_rate": 0.0004893298743830168, "loss": 3.1117, "step": 280 }, { "epoch": 0.1, "grad_norm": 0.4518013596534729, "learning_rate": 0.0004885600821290692, "loss": 3.0941, "step": 290 }, { "epoch": 0.1, "grad_norm": 0.4657439589500427, "learning_rate": 0.0004877641290737884, "loss": 3.1936, "step": 300 }, { "epoch": 0.1, "grad_norm": 0.4461677372455597, "learning_rate": 0.00048694210250239646, "loss": 3.1158, "step": 310 }, { "epoch": 0.11, "grad_norm": 0.454722136259079, "learning_rate": 0.0004860940925593703, "loss": 3.0921, "step": 320 }, { "epoch": 0.11, "grad_norm": 0.4858105480670929, "learning_rate": 0.0004852201922385564, "loss": 3.1601, "step": 330 }, { "epoch": 0.11, "grad_norm": 0.4848730266094208, "learning_rate": 0.0004843204973729729, "loss": 3.1165, "step": 340 }, { "epoch": 0.12, "grad_norm": 0.5170342922210693, "learning_rate": 0.00048339510662430044, "loss": 3.0897, "step": 350 }, { "epoch": 0.12, "grad_norm": 0.4945329427719116, "learning_rate": 0.00048244412147206283, "loss": 3.0792, "step": 360 }, { "epoch": 0.12, "grad_norm": 0.4752379357814789, "learning_rate": 0.0004814676462024987, "loss": 3.035, "step": 370 }, { "epoch": 0.13, "grad_norm": 0.4935299754142761, "learning_rate": 0.00048046578789712516, "loss": 3.1129, "step": 380 }, { "epoch": 0.13, "grad_norm": 0.47018569707870483, "learning_rate": 0.00047943865642099525, "loss": 3.1255, "step": 390 }, { "epoch": 0.13, "grad_norm": 0.45771998167037964, "learning_rate": 0.0004783863644106502, "loss": 3.0546, "step": 400 }, { "epoch": 0.14, "grad_norm": 0.4871847927570343, "learning_rate": 0.00047730902726176715, "loss": 3.1176, "step": 410 }, { "epoch": 0.14, "grad_norm": 0.5023790001869202, "learning_rate": 0.0004762067631165049, "loss": 3.1396, "step": 420 }, { "epoch": 0.14, "grad_norm": 0.5331537127494812, "learning_rate": 0.0004750796928505484, "loss": 3.0541, "step": 430 }, { "epoch": 0.15, "grad_norm": 0.46206673979759216, "learning_rate": 0.0004739279400598532, "loss": 3.1429, "step": 440 }, { "epoch": 0.15, "grad_norm": 0.5062265992164612, "learning_rate": 0.00047275163104709196, "loss": 3.0668, "step": 450 }, { "epoch": 0.15, "grad_norm": 0.7376482486724854, "learning_rate": 0.00047155089480780364, "loss": 3.1377, "step": 460 }, { "epoch": 0.16, "grad_norm": 0.5349658727645874, "learning_rate": 0.00047032586301624804, "loss": 3.0838, "step": 470 }, { "epoch": 0.16, "grad_norm": 0.49428579211235046, "learning_rate": 0.0004690766700109659, "loss": 3.1257, "step": 480 }, { "epoch": 0.16, "grad_norm": 0.4835917353630066, "learning_rate": 0.0004678034527800474, "loss": 3.0709, "step": 490 }, { "epoch": 0.17, "grad_norm": 0.5094427466392517, "learning_rate": 0.00046650635094610973, "loss": 3.0692, "step": 500 }, { "epoch": 0.17, "grad_norm": 0.5574087500572205, "learning_rate": 0.0004651855067509859, "loss": 3.0667, "step": 510 }, { "epoch": 0.17, "grad_norm": 0.5003999471664429, "learning_rate": 0.0004638410650401267, "loss": 3.0311, "step": 520 }, { "epoch": 0.18, "grad_norm": 0.5170697569847107, "learning_rate": 0.000462473173246716, "loss": 3.1082, "step": 530 }, { "epoch": 0.18, "grad_norm": 0.5004311800003052, "learning_rate": 0.00046108198137550377, "loss": 3.0825, "step": 540 }, { "epoch": 0.18, "grad_norm": 0.49480965733528137, "learning_rate": 0.00045966764198635603, "loss": 3.0888, "step": 550 }, { "epoch": 0.19, "grad_norm": 0.49261385202407837, "learning_rate": 0.00045823031017752484, "loss": 3.1568, "step": 560 }, { "epoch": 0.19, "grad_norm": 0.4577986001968384, "learning_rate": 0.00045677014356864043, "loss": 3.0953, "step": 570 }, { "epoch": 0.19, "grad_norm": 0.4814681112766266, "learning_rate": 0.000455287302283426, "loss": 3.0801, "step": 580 }, { "epoch": 0.2, "grad_norm": 0.563042402267456, "learning_rate": 0.00045378194893213854, "loss": 3.0719, "step": 590 }, { "epoch": 0.2, "grad_norm": 0.5507716536521912, "learning_rate": 0.0004522542485937369, "loss": 3.0724, "step": 600 }, { "epoch": 0.2, "grad_norm": 0.5448073148727417, "learning_rate": 0.0004507043687977787, "loss": 3.0294, "step": 610 }, { "epoch": 0.21, "grad_norm": 0.47773197293281555, "learning_rate": 0.00044913247950604905, "loss": 3.0682, "step": 620 }, { "epoch": 0.21, "grad_norm": 0.4662880301475525, "learning_rate": 0.0004475387530939226, "loss": 3.0106, "step": 630 }, { "epoch": 0.21, "grad_norm": 0.4735540747642517, "learning_rate": 0.00044592336433145995, "loss": 3.1345, "step": 640 }, { "epoch": 0.22, "grad_norm": 0.4746765196323395, "learning_rate": 0.0004442864903642427, "loss": 3.0772, "step": 650 }, { "epoch": 0.22, "grad_norm": 0.45668378472328186, "learning_rate": 0.0004426283106939473, "loss": 3.1174, "step": 660 }, { "epoch": 0.22, "grad_norm": 0.4704893231391907, "learning_rate": 0.0004409490071586606, "loss": 3.0816, "step": 670 }, { "epoch": 0.23, "grad_norm": 0.5290801525115967, "learning_rate": 0.0004392487639129391, "loss": 3.1089, "step": 680 }, { "epoch": 0.23, "grad_norm": 0.47564446926116943, "learning_rate": 0.0004375277674076149, "loss": 3.082, "step": 690 }, { "epoch": 0.23, "grad_norm": 0.4995107352733612, "learning_rate": 0.00043578620636934855, "loss": 3.1992, "step": 700 }, { "epoch": 0.24, "grad_norm": 0.48856520652770996, "learning_rate": 0.00043402427177993366, "loss": 3.0598, "step": 710 }, { "epoch": 0.24, "grad_norm": 0.4874037802219391, "learning_rate": 0.00043224215685535287, "loss": 3.0753, "step": 720 }, { "epoch": 0.24, "grad_norm": 0.45193690061569214, "learning_rate": 0.00043044005702459054, "loss": 2.9601, "step": 730 }, { "epoch": 0.25, "grad_norm": 0.46975770592689514, "learning_rate": 0.00042861816990820087, "loss": 3.0511, "step": 740 }, { "epoch": 0.25, "grad_norm": 0.5442278981208801, "learning_rate": 0.00042677669529663686, "loss": 3.0898, "step": 750 }, { "epoch": 0.25, "grad_norm": 0.5437372922897339, "learning_rate": 0.00042491583512834137, "loss": 3.0768, "step": 760 }, { "epoch": 0.26, "grad_norm": 0.4911404848098755, "learning_rate": 0.0004230357934676017, "loss": 3.1239, "step": 770 }, { "epoch": 0.26, "grad_norm": 0.4861234128475189, "learning_rate": 0.0004211367764821722, "loss": 3.0473, "step": 780 }, { "epoch": 0.26, "grad_norm": 0.49302661418914795, "learning_rate": 0.0004192189924206652, "loss": 3.0874, "step": 790 }, { "epoch": 0.27, "grad_norm": 0.4864264726638794, "learning_rate": 0.0004172826515897146, "loss": 3.0753, "step": 800 }, { "epoch": 0.27, "grad_norm": 0.48708102107048035, "learning_rate": 0.00041532796633091297, "loss": 3.1403, "step": 810 }, { "epoch": 0.27, "grad_norm": 0.5046274662017822, "learning_rate": 0.0004133551509975264, "loss": 3.0785, "step": 820 }, { "epoch": 0.28, "grad_norm": 0.5404755473136902, "learning_rate": 0.00041136442193098765, "loss": 3.0104, "step": 830 }, { "epoch": 0.28, "grad_norm": 0.5012314915657043, "learning_rate": 0.00040935599743717243, "loss": 3.0538, "step": 840 }, { "epoch": 0.28, "grad_norm": 0.5044212937355042, "learning_rate": 0.0004073300977624594, "loss": 3.0975, "step": 850 }, { "epoch": 0.29, "grad_norm": 0.5059216618537903, "learning_rate": 0.0004052869450695776, "loss": 3.1057, "step": 860 }, { "epoch": 0.29, "grad_norm": 0.4804486930370331, "learning_rate": 0.00040322676341324415, "loss": 3.0492, "step": 870 }, { "epoch": 0.29, "grad_norm": 0.5673746466636658, "learning_rate": 0.00040114977871559375, "loss": 2.9521, "step": 880 }, { "epoch": 0.3, "grad_norm": 0.4696684181690216, "learning_rate": 0.00039905621874140396, "loss": 3.0136, "step": 890 }, { "epoch": 0.3, "grad_norm": 0.5002209544181824, "learning_rate": 0.0003969463130731183, "loss": 3.0378, "step": 900 }, { "epoch": 0.3, "grad_norm": 0.48504123091697693, "learning_rate": 0.0003948202930856697, "loss": 3.0458, "step": 910 }, { "epoch": 0.31, "grad_norm": 0.48299267888069153, "learning_rate": 0.000392678391921108, "loss": 3.0699, "step": 920 }, { "epoch": 0.31, "grad_norm": 0.5440784096717834, "learning_rate": 0.00039052084446303264, "loss": 3.044, "step": 930 }, { "epoch": 0.31, "grad_norm": 0.498516321182251, "learning_rate": 0.000388347887310836, "loss": 3.0524, "step": 940 }, { "epoch": 0.32, "grad_norm": 0.49369409680366516, "learning_rate": 0.00038615975875375683, "loss": 3.0511, "step": 950 }, { "epoch": 0.32, "grad_norm": 0.5007414817810059, "learning_rate": 0.00038395669874474915, "loss": 3.0369, "step": 960 }, { "epoch": 0.32, "grad_norm": 0.4814367890357971, "learning_rate": 0.00038173894887416946, "loss": 3.0965, "step": 970 }, { "epoch": 0.33, "grad_norm": 0.5295039415359497, "learning_rate": 0.00037950675234328256, "loss": 3.0571, "step": 980 }, { "epoch": 0.33, "grad_norm": 0.5096908211708069, "learning_rate": 0.00037726035393759286, "loss": 2.9924, "step": 990 }, { "epoch": 0.33, "grad_norm": 0.49976101517677307, "learning_rate": 0.000375, "loss": 3.0523, "step": 1000 }, { "epoch": 0.34, "grad_norm": 0.486150860786438, "learning_rate": 0.0003727259384037852, "loss": 3.0453, "step": 1010 }, { "epoch": 0.34, "grad_norm": 0.4886365532875061, "learning_rate": 0.0003704384185254288, "loss": 3.0791, "step": 1020 }, { "epoch": 0.34, "grad_norm": 0.4869343638420105, "learning_rate": 0.00036813769121726354, "loss": 3.0962, "step": 1030 }, { "epoch": 0.35, "grad_norm": 0.47605258226394653, "learning_rate": 0.00036582400877996547, "loss": 3.087, "step": 1040 }, { "epoch": 0.35, "grad_norm": 0.49207133054733276, "learning_rate": 0.00036349762493488667, "loss": 3.0406, "step": 1050 }, { "epoch": 0.35, "grad_norm": 0.5010517835617065, "learning_rate": 0.00036115879479623185, "loss": 3.0638, "step": 1060 }, { "epoch": 0.36, "grad_norm": 0.48435714840888977, "learning_rate": 0.0003588077748430819, "loss": 3.0325, "step": 1070 }, { "epoch": 0.36, "grad_norm": 0.5021737813949585, "learning_rate": 0.0003564448228912682, "loss": 3.0115, "step": 1080 }, { "epoch": 0.36, "grad_norm": 0.46703141927719116, "learning_rate": 0.00035407019806510035, "loss": 2.9539, "step": 1090 }, { "epoch": 0.37, "grad_norm": 0.521730363368988, "learning_rate": 0.0003516841607689501, "loss": 3.0114, "step": 1100 }, { "epoch": 0.37, "grad_norm": 0.4748888611793518, "learning_rate": 0.00034928697265869515, "loss": 3.0226, "step": 1110 }, { "epoch": 0.37, "grad_norm": 0.4812580645084381, "learning_rate": 0.00034687889661302575, "loss": 3.0065, "step": 1120 }, { "epoch": 0.38, "grad_norm": 0.5008811354637146, "learning_rate": 0.00034446019670461683, "loss": 3.027, "step": 1130 }, { "epoch": 0.38, "grad_norm": 0.49032077193260193, "learning_rate": 0.00034203113817116957, "loss": 2.9933, "step": 1140 }, { "epoch": 0.38, "grad_norm": 0.47988900542259216, "learning_rate": 0.00033959198738632503, "loss": 3.0474, "step": 1150 }, { "epoch": 0.39, "grad_norm": 0.49220654368400574, "learning_rate": 0.0003371430118304538, "loss": 3.0226, "step": 1160 }, { "epoch": 0.39, "grad_norm": 0.49873360991477966, "learning_rate": 0.0003346844800613229, "loss": 3.0082, "step": 1170 }, { "epoch": 0.39, "grad_norm": 0.5125187039375305, "learning_rate": 0.0003322166616846458, "loss": 3.0468, "step": 1180 }, { "epoch": 0.4, "grad_norm": 0.509082555770874, "learning_rate": 0.0003297398273245175, "loss": 3.1134, "step": 1190 }, { "epoch": 0.4, "grad_norm": 0.45824429392814636, "learning_rate": 0.00032725424859373687, "loss": 3.0516, "step": 1200 }, { "epoch": 0.4, "grad_norm": 0.49980753660202026, "learning_rate": 0.0003247601980640217, "loss": 2.9742, "step": 1210 }, { "epoch": 0.41, "grad_norm": 0.48861539363861084, "learning_rate": 0.0003222579492361179, "loss": 3.004, "step": 1220 }, { "epoch": 0.41, "grad_norm": 0.5454481840133667, "learning_rate": 0.00031974777650980735, "loss": 2.9608, "step": 1230 }, { "epoch": 0.41, "grad_norm": 0.4744034707546234, "learning_rate": 0.0003172299551538164, "loss": 3.0384, "step": 1240 }, { "epoch": 0.42, "grad_norm": 0.5317044854164124, "learning_rate": 0.00031470476127563017, "loss": 3.0209, "step": 1250 }, { "epoch": 0.42, "grad_norm": 0.49222180247306824, "learning_rate": 0.0003121724717912138, "loss": 3.0135, "step": 1260 }, { "epoch": 0.42, "grad_norm": 0.4853774309158325, "learning_rate": 0.00030963336439464523, "loss": 3.0235, "step": 1270 }, { "epoch": 0.43, "grad_norm": 0.4858599603176117, "learning_rate": 0.00030708771752766396, "loss": 3.0256, "step": 1280 }, { "epoch": 0.43, "grad_norm": 0.4980061948299408, "learning_rate": 0.0003045358103491357, "loss": 2.9718, "step": 1290 }, { "epoch": 0.43, "grad_norm": 0.49256446957588196, "learning_rate": 0.0003019779227044398, "loss": 3.0288, "step": 1300 }, { "epoch": 0.44, "grad_norm": 0.5088287591934204, "learning_rate": 0.00029941433509478153, "loss": 3.0268, "step": 1310 }, { "epoch": 0.44, "grad_norm": 0.5106491446495056, "learning_rate": 0.0002968453286464312, "loss": 3.0032, "step": 1320 }, { "epoch": 0.44, "grad_norm": 0.4999896287918091, "learning_rate": 0.0002942711850798959, "loss": 2.9407, "step": 1330 }, { "epoch": 0.45, "grad_norm": 0.4718291163444519, "learning_rate": 0.00029169218667902556, "loss": 2.9313, "step": 1340 }, { "epoch": 0.45, "grad_norm": 0.485334187746048, "learning_rate": 0.00028910861626005774, "loss": 2.9993, "step": 1350 }, { "epoch": 0.45, "grad_norm": 0.47278451919555664, "learning_rate": 0.00028652075714060294, "loss": 3.0594, "step": 1360 }, { "epoch": 0.46, "grad_norm": 0.4961351454257965, "learning_rate": 0.0002839288931085761, "loss": 2.9736, "step": 1370 }, { "epoch": 0.46, "grad_norm": 0.48107197880744934, "learning_rate": 0.00028133330839107606, "loss": 2.9818, "step": 1380 }, { "epoch": 0.46, "grad_norm": 0.5018859505653381, "learning_rate": 0.0002787342876232167, "loss": 2.9993, "step": 1390 }, { "epoch": 0.47, "grad_norm": 0.5005733966827393, "learning_rate": 0.0002761321158169134, "loss": 2.971, "step": 1400 }, { "epoch": 0.47, "grad_norm": 0.45412546396255493, "learning_rate": 0.0002735270783296286, "loss": 2.9343, "step": 1410 }, { "epoch": 0.47, "grad_norm": 0.49526095390319824, "learning_rate": 0.00027091946083307894, "loss": 3.0247, "step": 1420 }, { "epoch": 0.48, "grad_norm": 0.5004629492759705, "learning_rate": 0.00026830954928190793, "loss": 2.9386, "step": 1430 }, { "epoch": 0.48, "grad_norm": 0.5008641481399536, "learning_rate": 0.0002656976298823284, "loss": 2.9768, "step": 1440 }, { "epoch": 0.48, "grad_norm": 0.4870932698249817, "learning_rate": 0.000263083989060736, "loss": 2.9825, "step": 1450 }, { "epoch": 0.49, "grad_norm": 0.47083091735839844, "learning_rate": 0.0002604689134322999, "loss": 2.9694, "step": 1460 }, { "epoch": 0.49, "grad_norm": 0.4780152440071106, "learning_rate": 0.00025785268976953206, "loss": 3.0139, "step": 1470 }, { "epoch": 0.49, "grad_norm": 0.47919341921806335, "learning_rate": 0.00025523560497083924, "loss": 2.9558, "step": 1480 }, { "epoch": 0.5, "grad_norm": 0.4744253158569336, "learning_rate": 0.00025261794602906147, "loss": 2.9823, "step": 1490 }, { "epoch": 0.5, "grad_norm": 0.47770825028419495, "learning_rate": 0.00025, "loss": 2.976, "step": 1500 }, { "epoch": 0.5, "grad_norm": 0.535241961479187, "learning_rate": 0.00024738205397093865, "loss": 2.9404, "step": 1510 }, { "epoch": 0.51, "grad_norm": 0.5206143260002136, "learning_rate": 0.00024476439502916077, "loss": 2.9697, "step": 1520 }, { "epoch": 0.51, "grad_norm": 0.4685532748699188, "learning_rate": 0.00024214731023046793, "loss": 3.031, "step": 1530 }, { "epoch": 0.51, "grad_norm": 0.45759207010269165, "learning_rate": 0.00023953108656770016, "loss": 2.9371, "step": 1540 }, { "epoch": 0.52, "grad_norm": 0.47409725189208984, "learning_rate": 0.00023691601093926405, "loss": 2.9492, "step": 1550 }, { "epoch": 0.52, "grad_norm": 0.4723811149597168, "learning_rate": 0.00023430237011767165, "loss": 3.0055, "step": 1560 }, { "epoch": 0.52, "grad_norm": 0.5121801495552063, "learning_rate": 0.00023169045071809213, "loss": 2.9654, "step": 1570 }, { "epoch": 0.53, "grad_norm": 0.47900766134262085, "learning_rate": 0.00022908053916692117, "loss": 2.9665, "step": 1580 }, { "epoch": 0.53, "grad_norm": 0.47504156827926636, "learning_rate": 0.00022647292167037142, "loss": 2.9543, "step": 1590 }, { "epoch": 0.53, "grad_norm": 0.48130717873573303, "learning_rate": 0.00022386788418308668, "loss": 2.9948, "step": 1600 }, { "epoch": 0.54, "grad_norm": 0.4968855381011963, "learning_rate": 0.00022126571237678338, "loss": 3.0497, "step": 1610 }, { "epoch": 0.54, "grad_norm": 0.45493414998054504, "learning_rate": 0.00021866669160892392, "loss": 3.0211, "step": 1620 }, { "epoch": 0.54, "grad_norm": 0.46341362595558167, "learning_rate": 0.00021607110689142393, "loss": 2.9249, "step": 1630 }, { "epoch": 0.55, "grad_norm": 0.4598373472690582, "learning_rate": 0.00021347924285939712, "loss": 2.9306, "step": 1640 }, { "epoch": 0.55, "grad_norm": 0.5339600443840027, "learning_rate": 0.00021089138373994224, "loss": 2.9457, "step": 1650 }, { "epoch": 0.55, "grad_norm": 0.4766974449157715, "learning_rate": 0.00020830781332097445, "loss": 2.9573, "step": 1660 }, { "epoch": 0.56, "grad_norm": 0.45860227942466736, "learning_rate": 0.0002057288149201042, "loss": 2.9067, "step": 1670 }, { "epoch": 0.56, "grad_norm": 0.4715401530265808, "learning_rate": 0.0002031546713535688, "loss": 2.9373, "step": 1680 }, { "epoch": 0.56, "grad_norm": 0.4715852439403534, "learning_rate": 0.00020058566490521845, "loss": 2.9486, "step": 1690 }, { "epoch": 0.57, "grad_norm": 0.4740593731403351, "learning_rate": 0.0001980220772955602, "loss": 2.9672, "step": 1700 }, { "epoch": 0.57, "grad_norm": 0.5071026086807251, "learning_rate": 0.00019546418965086444, "loss": 2.9791, "step": 1710 }, { "epoch": 0.57, "grad_norm": 0.46667274832725525, "learning_rate": 0.00019291228247233605, "loss": 2.9517, "step": 1720 }, { "epoch": 0.58, "grad_norm": 0.5175219178199768, "learning_rate": 0.00019036663560535483, "loss": 2.9765, "step": 1730 }, { "epoch": 0.58, "grad_norm": 0.4796634018421173, "learning_rate": 0.00018782752820878634, "loss": 2.9258, "step": 1740 }, { "epoch": 0.58, "grad_norm": 0.4871932566165924, "learning_rate": 0.0001852952387243698, "loss": 2.9403, "step": 1750 }, { "epoch": 0.59, "grad_norm": 0.4641266167163849, "learning_rate": 0.00018277004484618358, "loss": 2.9058, "step": 1760 }, { "epoch": 0.59, "grad_norm": 0.4585517346858978, "learning_rate": 0.0001802522234901927, "loss": 2.9802, "step": 1770 }, { "epoch": 0.59, "grad_norm": 0.4525165855884552, "learning_rate": 0.00017774205076388205, "loss": 3.0241, "step": 1780 }, { "epoch": 0.6, "grad_norm": 0.4651874303817749, "learning_rate": 0.00017523980193597836, "loss": 2.9783, "step": 1790 }, { "epoch": 0.6, "grad_norm": 0.4655090868473053, "learning_rate": 0.00017274575140626317, "loss": 2.9115, "step": 1800 }, { "epoch": 0.6, "grad_norm": 0.44796276092529297, "learning_rate": 0.0001702601726754825, "loss": 2.9357, "step": 1810 }, { "epoch": 0.61, "grad_norm": 0.5139617323875427, "learning_rate": 0.00016778333831535418, "loss": 2.9124, "step": 1820 }, { "epoch": 0.61, "grad_norm": 0.4756507873535156, "learning_rate": 0.00016531551993867715, "loss": 2.9252, "step": 1830 }, { "epoch": 0.61, "grad_norm": 0.47321391105651855, "learning_rate": 0.00016285698816954626, "loss": 2.9801, "step": 1840 }, { "epoch": 0.62, "grad_norm": 0.4512488842010498, "learning_rate": 0.00016040801261367493, "loss": 3.0134, "step": 1850 }, { "epoch": 0.62, "grad_norm": 0.5047795176506042, "learning_rate": 0.00015796886182883053, "loss": 2.9549, "step": 1860 }, { "epoch": 0.62, "grad_norm": 0.4670908451080322, "learning_rate": 0.00015553980329538326, "loss": 2.9534, "step": 1870 }, { "epoch": 0.63, "grad_norm": 0.4596426784992218, "learning_rate": 0.00015312110338697426, "loss": 2.9802, "step": 1880 }, { "epoch": 0.63, "grad_norm": 0.45904773473739624, "learning_rate": 0.00015071302734130488, "loss": 2.9435, "step": 1890 }, { "epoch": 0.63, "grad_norm": 0.48870381712913513, "learning_rate": 0.00014831583923105, "loss": 2.9378, "step": 1900 }, { "epoch": 0.64, "grad_norm": 0.4705340564250946, "learning_rate": 0.00014592980193489974, "loss": 2.9716, "step": 1910 }, { "epoch": 0.64, "grad_norm": 0.45324671268463135, "learning_rate": 0.00014355517710873183, "loss": 2.9729, "step": 1920 }, { "epoch": 0.64, "grad_norm": 0.4573574662208557, "learning_rate": 0.00014119222515691815, "loss": 2.9286, "step": 1930 }, { "epoch": 0.65, "grad_norm": 0.48345670104026794, "learning_rate": 0.00013884120520376818, "loss": 3.0016, "step": 1940 }, { "epoch": 0.65, "grad_norm": 0.4590473175048828, "learning_rate": 0.00013650237506511331, "loss": 2.8464, "step": 1950 }, { "epoch": 0.65, "grad_norm": 0.7705239057540894, "learning_rate": 0.00013417599122003462, "loss": 3.007, "step": 1960 }, { "epoch": 0.66, "grad_norm": 0.4770728647708893, "learning_rate": 0.00013186230878273653, "loss": 2.9381, "step": 1970 }, { "epoch": 0.66, "grad_norm": 0.44512978196144104, "learning_rate": 0.00012956158147457115, "loss": 2.9813, "step": 1980 }, { "epoch": 0.66, "grad_norm": 0.4624728262424469, "learning_rate": 0.00012727406159621478, "loss": 2.9069, "step": 1990 }, { "epoch": 0.67, "grad_norm": 0.47390344738960266, "learning_rate": 0.00012500000000000006, "loss": 2.9868, "step": 2000 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "total_flos": 2.85480780300288e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }