{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 5592, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00894134477825465, "grad_norm": 0.5095388845429333, "learning_rate": 4.4642857142857145e-08, "loss": 1.8582, "step": 25 }, { "epoch": 0.0178826895565093, "grad_norm": 0.41826013911071735, "learning_rate": 8.928571428571429e-08, "loss": 1.8696, "step": 50 }, { "epoch": 0.02682403433476395, "grad_norm": 0.5293401027825467, "learning_rate": 1.3392857142857142e-07, "loss": 1.863, "step": 75 }, { "epoch": 0.0357653791130186, "grad_norm": 0.5080324323077997, "learning_rate": 1.7857142857142858e-07, "loss": 1.8639, "step": 100 }, { "epoch": 0.044706723891273246, "grad_norm": 0.5015276110428923, "learning_rate": 2.232142857142857e-07, "loss": 1.8466, "step": 125 }, { "epoch": 0.0536480686695279, "grad_norm": 0.5192534238774392, "learning_rate": 2.6785714285714284e-07, "loss": 1.8646, "step": 150 }, { "epoch": 0.06258941344778254, "grad_norm": 0.5036209286188108, "learning_rate": 3.1249999999999997e-07, "loss": 1.8602, "step": 175 }, { "epoch": 0.0715307582260372, "grad_norm": 0.4912106467940169, "learning_rate": 3.5714285714285716e-07, "loss": 1.8529, "step": 200 }, { "epoch": 0.08047210300429185, "grad_norm": 0.9830108153719349, "learning_rate": 4.017857142857143e-07, "loss": 1.8574, "step": 225 }, { "epoch": 0.08941344778254649, "grad_norm": 0.5138072684249293, "learning_rate": 4.464285714285714e-07, "loss": 1.8498, "step": 250 }, { "epoch": 0.09835479256080114, "grad_norm": 0.6040471717328045, "learning_rate": 4.910714285714285e-07, "loss": 1.853, "step": 275 }, { "epoch": 0.1072961373390558, "grad_norm": 0.6457522376165349, "learning_rate": 5.357142857142857e-07, "loss": 1.849, "step": 300 }, { "epoch": 0.11623748211731044, "grad_norm": 0.6334455155915341, "learning_rate": 5.803571428571429e-07, "loss": 1.8404, "step": 325 }, { "epoch": 0.1251788268955651, "grad_norm": 0.6379008256976824, "learning_rate": 6.249999999999999e-07, "loss": 1.8591, "step": 350 }, { "epoch": 0.13412017167381973, "grad_norm": 0.7299148775857892, "learning_rate": 6.69642857142857e-07, "loss": 1.8275, "step": 375 }, { "epoch": 0.1430615164520744, "grad_norm": 0.7276311175701822, "learning_rate": 7.142857142857143e-07, "loss": 1.7953, "step": 400 }, { "epoch": 0.15200286123032905, "grad_norm": 0.7388527300740464, "learning_rate": 7.589285714285714e-07, "loss": 1.7676, "step": 425 }, { "epoch": 0.1609442060085837, "grad_norm": 0.8114217398767946, "learning_rate": 8.035714285714286e-07, "loss": 1.7693, "step": 450 }, { "epoch": 0.16988555078683834, "grad_norm": 0.8300068303598062, "learning_rate": 8.482142857142857e-07, "loss": 1.7249, "step": 475 }, { "epoch": 0.17882689556509299, "grad_norm": 0.7407086667105495, "learning_rate": 8.928571428571428e-07, "loss": 1.7008, "step": 500 }, { "epoch": 0.18776824034334763, "grad_norm": 0.6185854806094319, "learning_rate": 9.374999999999999e-07, "loss": 1.6907, "step": 525 }, { "epoch": 0.19670958512160228, "grad_norm": 0.6491552501216883, "learning_rate": 9.82142857142857e-07, "loss": 1.6586, "step": 550 }, { "epoch": 0.20565092989985695, "grad_norm": 0.6005414444540486, "learning_rate": 9.970190779014309e-07, "loss": 1.6455, "step": 575 }, { "epoch": 0.2145922746781116, "grad_norm": 0.584948620571865, "learning_rate": 9.920508744038154e-07, "loss": 1.6181, "step": 600 }, { "epoch": 0.22353361945636624, "grad_norm": 0.5689579459890339, "learning_rate": 9.870826709062002e-07, "loss": 1.5861, "step": 625 }, { "epoch": 0.23247496423462088, "grad_norm": 0.5479875081985119, "learning_rate": 9.821144674085851e-07, "loss": 1.5784, "step": 650 }, { "epoch": 0.24141630901287553, "grad_norm": 0.5642004770422899, "learning_rate": 9.771462639109697e-07, "loss": 1.5524, "step": 675 }, { "epoch": 0.2503576537911302, "grad_norm": 0.5928876776119925, "learning_rate": 9.721780604133544e-07, "loss": 1.5371, "step": 700 }, { "epoch": 0.2592989985693848, "grad_norm": 0.5977759151905918, "learning_rate": 9.672098569157392e-07, "loss": 1.5132, "step": 725 }, { "epoch": 0.26824034334763946, "grad_norm": 0.5575061860434638, "learning_rate": 9.62241653418124e-07, "loss": 1.5064, "step": 750 }, { "epoch": 0.2771816881258941, "grad_norm": 0.5630188145960676, "learning_rate": 9.572734499205087e-07, "loss": 1.4761, "step": 775 }, { "epoch": 0.2861230329041488, "grad_norm": 0.5611716812815478, "learning_rate": 9.523052464228934e-07, "loss": 1.4614, "step": 800 }, { "epoch": 0.29506437768240346, "grad_norm": 0.46795354610537326, "learning_rate": 9.473370429252782e-07, "loss": 1.4298, "step": 825 }, { "epoch": 0.3040057224606581, "grad_norm": 0.44554190069369415, "learning_rate": 9.423688394276629e-07, "loss": 1.4318, "step": 850 }, { "epoch": 0.31294706723891275, "grad_norm": 1.3485889211129667, "learning_rate": 9.374006359300477e-07, "loss": 1.4183, "step": 875 }, { "epoch": 0.3218884120171674, "grad_norm": 0.5688806165123188, "learning_rate": 9.324324324324324e-07, "loss": 1.3918, "step": 900 }, { "epoch": 0.33082975679542204, "grad_norm": 0.5874567786702063, "learning_rate": 9.274642289348172e-07, "loss": 1.3886, "step": 925 }, { "epoch": 0.3397711015736767, "grad_norm": 0.3766300486264232, "learning_rate": 9.224960254372018e-07, "loss": 1.3839, "step": 950 }, { "epoch": 0.3487124463519313, "grad_norm": 0.39812336031911777, "learning_rate": 9.175278219395866e-07, "loss": 1.3677, "step": 975 }, { "epoch": 0.35765379113018597, "grad_norm": 0.3843707797396072, "learning_rate": 9.125596184419714e-07, "loss": 1.358, "step": 1000 }, { "epoch": 0.3665951359084406, "grad_norm": 0.3847089855262342, "learning_rate": 9.075914149443561e-07, "loss": 1.3469, "step": 1025 }, { "epoch": 0.37553648068669526, "grad_norm": 0.5315974805229721, "learning_rate": 9.026232114467408e-07, "loss": 1.3377, "step": 1050 }, { "epoch": 0.3844778254649499, "grad_norm": 0.37212834114959115, "learning_rate": 8.976550079491256e-07, "loss": 1.326, "step": 1075 }, { "epoch": 0.39341917024320455, "grad_norm": 0.3369886595418098, "learning_rate": 8.926868044515103e-07, "loss": 1.3141, "step": 1100 }, { "epoch": 0.40236051502145925, "grad_norm": 0.3232053255299734, "learning_rate": 8.877186009538951e-07, "loss": 1.3176, "step": 1125 }, { "epoch": 0.4113018597997139, "grad_norm": 0.5405167043382479, "learning_rate": 8.827503974562798e-07, "loss": 1.3125, "step": 1150 }, { "epoch": 0.42024320457796854, "grad_norm": 0.28257166431534414, "learning_rate": 8.777821939586645e-07, "loss": 1.2911, "step": 1175 }, { "epoch": 0.4291845493562232, "grad_norm": 0.29182342370487174, "learning_rate": 8.728139904610492e-07, "loss": 1.2935, "step": 1200 }, { "epoch": 0.43812589413447783, "grad_norm": 0.22773442408616737, "learning_rate": 8.678457869634341e-07, "loss": 1.2888, "step": 1225 }, { "epoch": 0.4470672389127325, "grad_norm": 0.22436263710764737, "learning_rate": 8.628775834658187e-07, "loss": 1.2745, "step": 1250 }, { "epoch": 0.4560085836909871, "grad_norm": 0.21775793386766282, "learning_rate": 8.579093799682035e-07, "loss": 1.269, "step": 1275 }, { "epoch": 0.46494992846924177, "grad_norm": 0.22214064111096374, "learning_rate": 8.529411764705882e-07, "loss": 1.267, "step": 1300 }, { "epoch": 0.4738912732474964, "grad_norm": 0.20243266736371351, "learning_rate": 8.47972972972973e-07, "loss": 1.276, "step": 1325 }, { "epoch": 0.48283261802575106, "grad_norm": 0.20705292614851298, "learning_rate": 8.430047694753577e-07, "loss": 1.272, "step": 1350 }, { "epoch": 0.4917739628040057, "grad_norm": 0.19119742077379076, "learning_rate": 8.380365659777425e-07, "loss": 1.2724, "step": 1375 }, { "epoch": 0.5007153075822603, "grad_norm": 0.23794007928706434, "learning_rate": 8.330683624801271e-07, "loss": 1.2583, "step": 1400 }, { "epoch": 0.509656652360515, "grad_norm": 0.18340837203234114, "learning_rate": 8.281001589825118e-07, "loss": 1.2499, "step": 1425 }, { "epoch": 0.5185979971387696, "grad_norm": 0.193128543688102, "learning_rate": 8.231319554848967e-07, "loss": 1.2612, "step": 1450 }, { "epoch": 0.5275393419170243, "grad_norm": 0.1751575826741444, "learning_rate": 8.181637519872813e-07, "loss": 1.2502, "step": 1475 }, { "epoch": 0.5364806866952789, "grad_norm": 0.2364503348095419, "learning_rate": 8.131955484896661e-07, "loss": 1.243, "step": 1500 }, { "epoch": 0.5454220314735336, "grad_norm": 0.25652812638575234, "learning_rate": 8.082273449920508e-07, "loss": 1.2559, "step": 1525 }, { "epoch": 0.5543633762517882, "grad_norm": 0.1737798761662478, "learning_rate": 8.032591414944355e-07, "loss": 1.2529, "step": 1550 }, { "epoch": 0.5633047210300429, "grad_norm": 0.16909231999113233, "learning_rate": 7.982909379968203e-07, "loss": 1.2425, "step": 1575 }, { "epoch": 0.5722460658082976, "grad_norm": 0.16621585163311706, "learning_rate": 7.933227344992051e-07, "loss": 1.2428, "step": 1600 }, { "epoch": 0.5811874105865522, "grad_norm": 0.29941509287671747, "learning_rate": 7.883545310015897e-07, "loss": 1.2501, "step": 1625 }, { "epoch": 0.5901287553648069, "grad_norm": 0.17035369815136836, "learning_rate": 7.833863275039745e-07, "loss": 1.2515, "step": 1650 }, { "epoch": 0.5990701001430615, "grad_norm": 0.17629538126188918, "learning_rate": 7.784181240063593e-07, "loss": 1.2463, "step": 1675 }, { "epoch": 0.6080114449213162, "grad_norm": 0.1697497467753232, "learning_rate": 7.73449920508744e-07, "loss": 1.2311, "step": 1700 }, { "epoch": 0.6169527896995708, "grad_norm": 0.15981908691911512, "learning_rate": 7.684817170111287e-07, "loss": 1.2449, "step": 1725 }, { "epoch": 0.6258941344778255, "grad_norm": 0.1529795356453295, "learning_rate": 7.635135135135135e-07, "loss": 1.2558, "step": 1750 }, { "epoch": 0.6348354792560801, "grad_norm": 0.1609652679207339, "learning_rate": 7.585453100158981e-07, "loss": 1.238, "step": 1775 }, { "epoch": 0.6437768240343348, "grad_norm": 0.14621942114492062, "learning_rate": 7.53577106518283e-07, "loss": 1.2281, "step": 1800 }, { "epoch": 0.6527181688125894, "grad_norm": 0.1618220669804669, "learning_rate": 7.486089030206677e-07, "loss": 1.2362, "step": 1825 }, { "epoch": 0.6616595135908441, "grad_norm": 0.16201225587717186, "learning_rate": 7.436406995230524e-07, "loss": 1.2261, "step": 1850 }, { "epoch": 0.6706008583690987, "grad_norm": 0.16891334320378984, "learning_rate": 7.386724960254371e-07, "loss": 1.2233, "step": 1875 }, { "epoch": 0.6795422031473534, "grad_norm": 0.15836253423551233, "learning_rate": 7.33704292527822e-07, "loss": 1.2041, "step": 1900 }, { "epoch": 0.6884835479256081, "grad_norm": 0.16173299579695544, "learning_rate": 7.287360890302066e-07, "loss": 1.2112, "step": 1925 }, { "epoch": 0.6974248927038627, "grad_norm": 0.15311569879619952, "learning_rate": 7.237678855325914e-07, "loss": 1.2045, "step": 1950 }, { "epoch": 0.7063662374821174, "grad_norm": 0.1454174761720946, "learning_rate": 7.187996820349761e-07, "loss": 1.2142, "step": 1975 }, { "epoch": 0.7153075822603719, "grad_norm": 0.14358617606783605, "learning_rate": 7.138314785373608e-07, "loss": 1.2254, "step": 2000 }, { "epoch": 0.7242489270386266, "grad_norm": 0.15123594116890207, "learning_rate": 7.088632750397456e-07, "loss": 1.2043, "step": 2025 }, { "epoch": 0.7331902718168812, "grad_norm": 0.1484178022383261, "learning_rate": 7.038950715421304e-07, "loss": 1.2102, "step": 2050 }, { "epoch": 0.7421316165951359, "grad_norm": 0.15779752084285467, "learning_rate": 6.98926868044515e-07, "loss": 1.219, "step": 2075 }, { "epoch": 0.7510729613733905, "grad_norm": 0.14639571745115018, "learning_rate": 6.939586645468998e-07, "loss": 1.2053, "step": 2100 }, { "epoch": 0.7600143061516452, "grad_norm": 0.18349123256744296, "learning_rate": 6.889904610492846e-07, "loss": 1.2154, "step": 2125 }, { "epoch": 0.7689556509298998, "grad_norm": 0.1564150328615537, "learning_rate": 6.840222575516693e-07, "loss": 1.2004, "step": 2150 }, { "epoch": 0.7778969957081545, "grad_norm": 0.15591360624463577, "learning_rate": 6.79054054054054e-07, "loss": 1.2097, "step": 2175 }, { "epoch": 0.7868383404864091, "grad_norm": 0.1525230509402887, "learning_rate": 6.740858505564388e-07, "loss": 1.2009, "step": 2200 }, { "epoch": 0.7957796852646638, "grad_norm": 0.13632266793566936, "learning_rate": 6.691176470588234e-07, "loss": 1.2169, "step": 2225 }, { "epoch": 0.8047210300429185, "grad_norm": 0.165619157435053, "learning_rate": 6.641494435612083e-07, "loss": 1.2032, "step": 2250 }, { "epoch": 0.8136623748211731, "grad_norm": 0.1382749276518515, "learning_rate": 6.59181240063593e-07, "loss": 1.1998, "step": 2275 }, { "epoch": 0.8226037195994278, "grad_norm": 0.15894676488709247, "learning_rate": 6.542130365659777e-07, "loss": 1.2046, "step": 2300 }, { "epoch": 0.8315450643776824, "grad_norm": 0.14139510562282273, "learning_rate": 6.492448330683624e-07, "loss": 1.2022, "step": 2325 }, { "epoch": 0.8404864091559371, "grad_norm": 0.16356442362082368, "learning_rate": 6.442766295707473e-07, "loss": 1.1888, "step": 2350 }, { "epoch": 0.8494277539341917, "grad_norm": 0.13001938286669296, "learning_rate": 6.393084260731319e-07, "loss": 1.2036, "step": 2375 }, { "epoch": 0.8583690987124464, "grad_norm": 0.1376171055331258, "learning_rate": 6.343402225755167e-07, "loss": 1.1967, "step": 2400 }, { "epoch": 0.867310443490701, "grad_norm": 0.1555553003869712, "learning_rate": 6.293720190779014e-07, "loss": 1.202, "step": 2425 }, { "epoch": 0.8762517882689557, "grad_norm": 0.1536545724212523, "learning_rate": 6.24403815580286e-07, "loss": 1.1993, "step": 2450 }, { "epoch": 0.8851931330472103, "grad_norm": 0.15017510970577388, "learning_rate": 6.194356120826709e-07, "loss": 1.1878, "step": 2475 }, { "epoch": 0.894134477825465, "grad_norm": 0.1501666434092056, "learning_rate": 6.144674085850557e-07, "loss": 1.1855, "step": 2500 }, { "epoch": 0.9030758226037195, "grad_norm": 0.14083948154628234, "learning_rate": 6.094992050874403e-07, "loss": 1.1858, "step": 2525 }, { "epoch": 0.9120171673819742, "grad_norm": 0.1480314047950012, "learning_rate": 6.04531001589825e-07, "loss": 1.1718, "step": 2550 }, { "epoch": 0.920958512160229, "grad_norm": 0.14172604863631122, "learning_rate": 5.995627980922098e-07, "loss": 1.1819, "step": 2575 }, { "epoch": 0.9298998569384835, "grad_norm": 0.12782051260136137, "learning_rate": 5.945945945945947e-07, "loss": 1.1956, "step": 2600 }, { "epoch": 0.9388412017167382, "grad_norm": 0.1451085538055058, "learning_rate": 5.896263910969793e-07, "loss": 1.1744, "step": 2625 }, { "epoch": 0.9477825464949928, "grad_norm": 0.40345434298159216, "learning_rate": 5.84658187599364e-07, "loss": 1.1783, "step": 2650 }, { "epoch": 0.9567238912732475, "grad_norm": 0.13772381526739436, "learning_rate": 5.796899841017488e-07, "loss": 1.1839, "step": 2675 }, { "epoch": 0.9656652360515021, "grad_norm": 0.15802899189862257, "learning_rate": 5.747217806041335e-07, "loss": 1.1792, "step": 2700 }, { "epoch": 0.9746065808297568, "grad_norm": 0.1467263131203197, "learning_rate": 5.697535771065183e-07, "loss": 1.1881, "step": 2725 }, { "epoch": 0.9835479256080114, "grad_norm": 0.18086918496034873, "learning_rate": 5.64785373608903e-07, "loss": 1.1847, "step": 2750 }, { "epoch": 0.9924892703862661, "grad_norm": 0.13322108072315525, "learning_rate": 5.598171701112877e-07, "loss": 1.174, "step": 2775 }, { "epoch": 1.0014306151645207, "grad_norm": 0.13758424586995027, "learning_rate": 5.548489666136724e-07, "loss": 1.1744, "step": 2800 }, { "epoch": 1.0103719599427754, "grad_norm": 0.14695511905283026, "learning_rate": 5.498807631160573e-07, "loss": 1.1753, "step": 2825 }, { "epoch": 1.01931330472103, "grad_norm": 0.13583673106446567, "learning_rate": 5.449125596184419e-07, "loss": 1.177, "step": 2850 }, { "epoch": 1.0282546494992848, "grad_norm": 0.14863281353489802, "learning_rate": 5.399443561208267e-07, "loss": 1.1698, "step": 2875 }, { "epoch": 1.0371959942775393, "grad_norm": 0.1719230111879567, "learning_rate": 5.349761526232114e-07, "loss": 1.17, "step": 2900 }, { "epoch": 1.046137339055794, "grad_norm": 0.14198220020823107, "learning_rate": 5.300079491255962e-07, "loss": 1.1599, "step": 2925 }, { "epoch": 1.0550786838340487, "grad_norm": 0.14656880263067942, "learning_rate": 5.250397456279809e-07, "loss": 1.1613, "step": 2950 }, { "epoch": 1.0640200286123034, "grad_norm": 0.17984614771854038, "learning_rate": 5.200715421303657e-07, "loss": 1.1614, "step": 2975 }, { "epoch": 1.0729613733905579, "grad_norm": 0.16125844481926355, "learning_rate": 5.151033386327503e-07, "loss": 1.1456, "step": 3000 }, { "epoch": 1.0819027181688126, "grad_norm": 0.15302132985828396, "learning_rate": 5.101351351351351e-07, "loss": 1.1582, "step": 3025 }, { "epoch": 1.0908440629470673, "grad_norm": 0.17416017697667222, "learning_rate": 5.051669316375199e-07, "loss": 1.1645, "step": 3050 }, { "epoch": 1.099785407725322, "grad_norm": 0.1576441191719269, "learning_rate": 5.001987281399046e-07, "loss": 1.1614, "step": 3075 }, { "epoch": 1.1087267525035764, "grad_norm": 0.15708931920894004, "learning_rate": 4.952305246422893e-07, "loss": 1.1589, "step": 3100 }, { "epoch": 1.1176680972818311, "grad_norm": 0.18088098015223084, "learning_rate": 4.902623211446741e-07, "loss": 1.1516, "step": 3125 }, { "epoch": 1.1266094420600858, "grad_norm": 0.15266497982250174, "learning_rate": 4.852941176470588e-07, "loss": 1.1432, "step": 3150 }, { "epoch": 1.1355507868383405, "grad_norm": 0.16769944079369944, "learning_rate": 4.803259141494435e-07, "loss": 1.151, "step": 3175 }, { "epoch": 1.144492131616595, "grad_norm": 0.183750642878192, "learning_rate": 4.7535771065182827e-07, "loss": 1.1629, "step": 3200 }, { "epoch": 1.1534334763948497, "grad_norm": 0.15008294024377883, "learning_rate": 4.70389507154213e-07, "loss": 1.1497, "step": 3225 }, { "epoch": 1.1623748211731044, "grad_norm": 0.1792182775394022, "learning_rate": 4.6542130365659777e-07, "loss": 1.1442, "step": 3250 }, { "epoch": 1.1713161659513591, "grad_norm": 0.1815534124009153, "learning_rate": 4.6045310015898247e-07, "loss": 1.1424, "step": 3275 }, { "epoch": 1.1802575107296138, "grad_norm": 0.1634530123009294, "learning_rate": 4.5548489666136727e-07, "loss": 1.1445, "step": 3300 }, { "epoch": 1.1891988555078683, "grad_norm": 0.15595048975318404, "learning_rate": 4.5051669316375196e-07, "loss": 1.1471, "step": 3325 }, { "epoch": 1.198140200286123, "grad_norm": 0.15375493348738128, "learning_rate": 4.455484896661367e-07, "loss": 1.1559, "step": 3350 }, { "epoch": 1.2070815450643777, "grad_norm": 0.15595330109052555, "learning_rate": 4.4058028616852146e-07, "loss": 1.1454, "step": 3375 }, { "epoch": 1.2160228898426324, "grad_norm": 0.1531828453320636, "learning_rate": 4.3561208267090616e-07, "loss": 1.1447, "step": 3400 }, { "epoch": 1.224964234620887, "grad_norm": 0.13496531066929612, "learning_rate": 4.306438791732909e-07, "loss": 1.1228, "step": 3425 }, { "epoch": 1.2339055793991416, "grad_norm": 0.16876704458729294, "learning_rate": 4.2567567567567566e-07, "loss": 1.1267, "step": 3450 }, { "epoch": 1.2428469241773963, "grad_norm": 0.35531759007502683, "learning_rate": 4.207074721780604e-07, "loss": 1.1282, "step": 3475 }, { "epoch": 1.251788268955651, "grad_norm": 0.14942448217724147, "learning_rate": 4.157392686804451e-07, "loss": 1.1321, "step": 3500 }, { "epoch": 1.2607296137339055, "grad_norm": 0.16823840198797532, "learning_rate": 4.107710651828299e-07, "loss": 1.1297, "step": 3525 }, { "epoch": 1.2696709585121602, "grad_norm": 0.1373099361223873, "learning_rate": 4.058028616852146e-07, "loss": 1.1289, "step": 3550 }, { "epoch": 1.2786123032904149, "grad_norm": 0.1238133749616558, "learning_rate": 4.0083465818759935e-07, "loss": 1.1197, "step": 3575 }, { "epoch": 1.2875536480686696, "grad_norm": 0.16232957883088006, "learning_rate": 3.958664546899841e-07, "loss": 1.1517, "step": 3600 }, { "epoch": 1.2964949928469243, "grad_norm": 0.15052195486822473, "learning_rate": 3.908982511923688e-07, "loss": 1.1115, "step": 3625 }, { "epoch": 1.3054363376251787, "grad_norm": 0.1464299159190494, "learning_rate": 3.8593004769475355e-07, "loss": 1.121, "step": 3650 }, { "epoch": 1.3143776824034334, "grad_norm": 0.16556774569358795, "learning_rate": 3.809618441971383e-07, "loss": 1.1248, "step": 3675 }, { "epoch": 1.3233190271816881, "grad_norm": 0.14677494246881193, "learning_rate": 3.7599364069952305e-07, "loss": 1.1301, "step": 3700 }, { "epoch": 1.3322603719599428, "grad_norm": 0.1385988595636303, "learning_rate": 3.7102543720190775e-07, "loss": 1.1217, "step": 3725 }, { "epoch": 1.3412017167381975, "grad_norm": 0.132770703438204, "learning_rate": 3.6605723370429255e-07, "loss": 1.128, "step": 3750 }, { "epoch": 1.350143061516452, "grad_norm": 0.14471226652721184, "learning_rate": 3.6108903020667724e-07, "loss": 1.1258, "step": 3775 }, { "epoch": 1.3590844062947067, "grad_norm": 0.15829087639433673, "learning_rate": 3.5612082670906194e-07, "loss": 1.1232, "step": 3800 }, { "epoch": 1.3680257510729614, "grad_norm": 0.14881510104209672, "learning_rate": 3.5115262321144674e-07, "loss": 1.1326, "step": 3825 }, { "epoch": 1.376967095851216, "grad_norm": 0.14588466142604242, "learning_rate": 3.4618441971383144e-07, "loss": 1.1164, "step": 3850 }, { "epoch": 1.3859084406294706, "grad_norm": 0.13477416021173907, "learning_rate": 3.412162162162162e-07, "loss": 1.1285, "step": 3875 }, { "epoch": 1.3948497854077253, "grad_norm": 0.21722908586608208, "learning_rate": 3.3624801271860094e-07, "loss": 1.1201, "step": 3900 }, { "epoch": 1.40379113018598, "grad_norm": 0.13558184653823538, "learning_rate": 3.312798092209857e-07, "loss": 1.1156, "step": 3925 }, { "epoch": 1.4127324749642347, "grad_norm": 0.14185323455679683, "learning_rate": 3.263116057233704e-07, "loss": 1.1177, "step": 3950 }, { "epoch": 1.4216738197424892, "grad_norm": 0.14569471186135233, "learning_rate": 3.213434022257552e-07, "loss": 1.117, "step": 3975 }, { "epoch": 1.4306151645207439, "grad_norm": 0.1303851673327878, "learning_rate": 3.163751987281399e-07, "loss": 1.1194, "step": 4000 }, { "epoch": 1.4395565092989986, "grad_norm": 0.13823108648889798, "learning_rate": 3.114069952305246e-07, "loss": 1.1153, "step": 4025 }, { "epoch": 1.4484978540772533, "grad_norm": 0.14544212946759183, "learning_rate": 3.064387917329094e-07, "loss": 1.1043, "step": 4050 }, { "epoch": 1.457439198855508, "grad_norm": 0.1290933120501116, "learning_rate": 3.014705882352941e-07, "loss": 1.1213, "step": 4075 }, { "epoch": 1.4663805436337625, "grad_norm": 0.12809510682493896, "learning_rate": 2.965023847376789e-07, "loss": 1.113, "step": 4100 }, { "epoch": 1.4753218884120172, "grad_norm": 0.1419793306501672, "learning_rate": 2.915341812400636e-07, "loss": 1.1309, "step": 4125 }, { "epoch": 1.4842632331902719, "grad_norm": 0.13406734402981146, "learning_rate": 2.8656597774244833e-07, "loss": 1.1256, "step": 4150 }, { "epoch": 1.4932045779685263, "grad_norm": 0.13587793689155384, "learning_rate": 2.815977742448331e-07, "loss": 1.1187, "step": 4175 }, { "epoch": 1.5021459227467813, "grad_norm": 0.13258607901887373, "learning_rate": 2.766295707472178e-07, "loss": 1.1228, "step": 4200 }, { "epoch": 1.5110872675250357, "grad_norm": 0.1247874755702546, "learning_rate": 2.716613672496025e-07, "loss": 1.1164, "step": 4225 }, { "epoch": 1.5200286123032904, "grad_norm": 0.12239855788322539, "learning_rate": 2.666931637519873e-07, "loss": 1.1117, "step": 4250 }, { "epoch": 1.5289699570815452, "grad_norm": 0.1369183999332412, "learning_rate": 2.61724960254372e-07, "loss": 1.1172, "step": 4275 }, { "epoch": 1.5379113018597996, "grad_norm": 1.1332621790402788, "learning_rate": 2.567567567567567e-07, "loss": 1.0993, "step": 4300 }, { "epoch": 1.5468526466380543, "grad_norm": 0.1253338454363431, "learning_rate": 2.517885532591415e-07, "loss": 1.1165, "step": 4325 }, { "epoch": 1.555793991416309, "grad_norm": 0.12665869326621307, "learning_rate": 2.468203497615262e-07, "loss": 1.1075, "step": 4350 }, { "epoch": 1.5647353361945635, "grad_norm": 0.12307463927377256, "learning_rate": 2.4185214626391097e-07, "loss": 1.1091, "step": 4375 }, { "epoch": 1.5736766809728184, "grad_norm": 0.13556347702963126, "learning_rate": 2.368839427662957e-07, "loss": 1.1106, "step": 4400 }, { "epoch": 1.582618025751073, "grad_norm": 0.1437851683344807, "learning_rate": 2.3191573926868044e-07, "loss": 1.1258, "step": 4425 }, { "epoch": 1.5915593705293276, "grad_norm": 0.1483471734540286, "learning_rate": 2.269475357710652e-07, "loss": 1.1185, "step": 4450 }, { "epoch": 1.6005007153075823, "grad_norm": 0.1417713425850489, "learning_rate": 2.2197933227344991e-07, "loss": 1.1091, "step": 4475 }, { "epoch": 1.6094420600858368, "grad_norm": 0.11258187644158707, "learning_rate": 2.1701112877583466e-07, "loss": 1.1141, "step": 4500 }, { "epoch": 1.6183834048640917, "grad_norm": 1.4844013392798963, "learning_rate": 2.120429252782194e-07, "loss": 1.1083, "step": 4525 }, { "epoch": 1.6273247496423462, "grad_norm": 0.14485866801127223, "learning_rate": 2.070747217806041e-07, "loss": 1.1128, "step": 4550 }, { "epoch": 1.636266094420601, "grad_norm": 0.1310091219191068, "learning_rate": 2.0210651828298886e-07, "loss": 1.1081, "step": 4575 }, { "epoch": 1.6452074391988556, "grad_norm": 0.1324864798139755, "learning_rate": 1.971383147853736e-07, "loss": 1.1218, "step": 4600 }, { "epoch": 1.65414878397711, "grad_norm": 0.16408287715823638, "learning_rate": 1.9217011128775833e-07, "loss": 1.1128, "step": 4625 }, { "epoch": 1.6630901287553648, "grad_norm": 0.13453997589781927, "learning_rate": 1.8720190779014308e-07, "loss": 1.109, "step": 4650 }, { "epoch": 1.6720314735336195, "grad_norm": 0.11431592419320451, "learning_rate": 1.8223370429252783e-07, "loss": 1.1128, "step": 4675 }, { "epoch": 1.680972818311874, "grad_norm": 0.12937079012352717, "learning_rate": 1.7726550079491255e-07, "loss": 1.107, "step": 4700 }, { "epoch": 1.6899141630901289, "grad_norm": 0.13493617023722984, "learning_rate": 1.7229729729729728e-07, "loss": 1.1026, "step": 4725 }, { "epoch": 1.6988555078683834, "grad_norm": 0.12427164940365912, "learning_rate": 1.6732909379968203e-07, "loss": 1.1002, "step": 4750 }, { "epoch": 1.707796852646638, "grad_norm": 0.12582353400466703, "learning_rate": 1.6236089030206675e-07, "loss": 1.1091, "step": 4775 }, { "epoch": 1.7167381974248928, "grad_norm": 0.1209250119003803, "learning_rate": 1.573926868044515e-07, "loss": 1.0988, "step": 4800 }, { "epoch": 1.7256795422031472, "grad_norm": 0.1416217314930489, "learning_rate": 1.5242448330683625e-07, "loss": 1.1083, "step": 4825 }, { "epoch": 1.7346208869814022, "grad_norm": 0.13057929166120621, "learning_rate": 1.4745627980922097e-07, "loss": 1.1164, "step": 4850 }, { "epoch": 1.7435622317596566, "grad_norm": 0.13275668946657931, "learning_rate": 1.4248807631160572e-07, "loss": 1.1164, "step": 4875 }, { "epoch": 1.7525035765379113, "grad_norm": 0.19255405164097672, "learning_rate": 1.3751987281399047e-07, "loss": 1.0945, "step": 4900 }, { "epoch": 1.761444921316166, "grad_norm": 0.13958161437492914, "learning_rate": 1.3255166931637517e-07, "loss": 1.1094, "step": 4925 }, { "epoch": 1.7703862660944205, "grad_norm": 0.1379908503816713, "learning_rate": 1.2758346581875992e-07, "loss": 1.1027, "step": 4950 }, { "epoch": 1.7793276108726752, "grad_norm": 0.1146403989881727, "learning_rate": 1.2261526232114467e-07, "loss": 1.0966, "step": 4975 }, { "epoch": 1.78826895565093, "grad_norm": 0.1296359557728573, "learning_rate": 1.176470588235294e-07, "loss": 1.1163, "step": 5000 }, { "epoch": 1.7972103004291844, "grad_norm": 0.1242709050610396, "learning_rate": 1.1267885532591414e-07, "loss": 1.109, "step": 5025 }, { "epoch": 1.8061516452074393, "grad_norm": 0.12401932092728496, "learning_rate": 1.0771065182829889e-07, "loss": 1.0986, "step": 5050 }, { "epoch": 1.8150929899856938, "grad_norm": 0.1268226445976438, "learning_rate": 1.0274244833068361e-07, "loss": 1.1023, "step": 5075 }, { "epoch": 1.8240343347639485, "grad_norm": 0.12184905639120043, "learning_rate": 9.777424483306836e-08, "loss": 1.0985, "step": 5100 }, { "epoch": 1.8329756795422032, "grad_norm": 0.12734274404323123, "learning_rate": 9.28060413354531e-08, "loss": 1.1154, "step": 5125 }, { "epoch": 1.8419170243204577, "grad_norm": 0.13235924797466722, "learning_rate": 8.783783783783784e-08, "loss": 1.098, "step": 5150 }, { "epoch": 1.8508583690987126, "grad_norm": 0.11628761020569767, "learning_rate": 8.286963434022257e-08, "loss": 1.106, "step": 5175 }, { "epoch": 1.859799713876967, "grad_norm": 0.19214819884604664, "learning_rate": 7.790143084260731e-08, "loss": 1.0889, "step": 5200 }, { "epoch": 1.8687410586552218, "grad_norm": 0.487598636016336, "learning_rate": 7.293322734499204e-08, "loss": 1.1045, "step": 5225 }, { "epoch": 1.8776824034334765, "grad_norm": 0.13127933734364353, "learning_rate": 6.79650238473768e-08, "loss": 1.1017, "step": 5250 }, { "epoch": 1.886623748211731, "grad_norm": 0.1314482800625486, "learning_rate": 6.299682034976152e-08, "loss": 1.1044, "step": 5275 }, { "epoch": 1.8955650929899857, "grad_norm": 0.12676434160993588, "learning_rate": 5.802861685214626e-08, "loss": 1.1043, "step": 5300 }, { "epoch": 1.9045064377682404, "grad_norm": 0.1323305276547178, "learning_rate": 5.3060413354531e-08, "loss": 1.1026, "step": 5325 }, { "epoch": 1.9134477825464948, "grad_norm": 0.13391474060102793, "learning_rate": 4.809220985691573e-08, "loss": 1.0969, "step": 5350 }, { "epoch": 1.9223891273247498, "grad_norm": 0.12245753375271169, "learning_rate": 4.3124006359300475e-08, "loss": 1.1081, "step": 5375 }, { "epoch": 1.9313304721030042, "grad_norm": 0.11951679306957765, "learning_rate": 3.815580286168521e-08, "loss": 1.1092, "step": 5400 }, { "epoch": 1.940271816881259, "grad_norm": 0.11961090020470673, "learning_rate": 3.3187599364069955e-08, "loss": 1.1007, "step": 5425 }, { "epoch": 1.9492131616595136, "grad_norm": 0.12422327512042645, "learning_rate": 2.8219395866454688e-08, "loss": 1.098, "step": 5450 }, { "epoch": 1.9581545064377681, "grad_norm": 0.125577214373895, "learning_rate": 2.3251192368839427e-08, "loss": 1.1009, "step": 5475 }, { "epoch": 1.967095851216023, "grad_norm": 0.12978760398009362, "learning_rate": 1.8282988871224164e-08, "loss": 1.0965, "step": 5500 }, { "epoch": 1.9760371959942775, "grad_norm": 0.15452388925779964, "learning_rate": 1.3314785373608903e-08, "loss": 1.1058, "step": 5525 }, { "epoch": 1.9849785407725322, "grad_norm": 0.12170241302022762, "learning_rate": 8.346581875993641e-09, "loss": 1.1066, "step": 5550 }, { "epoch": 1.993919885550787, "grad_norm": 0.13568263553279267, "learning_rate": 3.3783783783783785e-09, "loss": 1.0966, "step": 5575 } ], "logging_steps": 25, "max_steps": 5592, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.1563888596221952e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }