{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 17057, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.8626956674679014e-05, "grad_norm": 0.232421875, "learning_rate": 9.999413730433253e-06, "loss": 1.0997, "step": 1 }, { "epoch": 0.0014656739168669754, "grad_norm": 0.25, "learning_rate": 9.985343260831331e-06, "loss": 0.8715, "step": 25 }, { "epoch": 0.0029313478337339507, "grad_norm": 0.59765625, "learning_rate": 9.970686521662662e-06, "loss": 0.8613, "step": 50 }, { "epoch": 0.004397021750600926, "grad_norm": 0.3671875, "learning_rate": 9.95602978249399e-06, "loss": 0.9177, "step": 75 }, { "epoch": 0.0058626956674679015, "grad_norm": 0.412109375, "learning_rate": 9.941373043325322e-06, "loss": 0.8869, "step": 100 }, { "epoch": 0.007328369584334877, "grad_norm": 0.25, "learning_rate": 9.926716304156651e-06, "loss": 0.852, "step": 125 }, { "epoch": 0.008794043501201852, "grad_norm": 0.4921875, "learning_rate": 9.912059564987983e-06, "loss": 0.8612, "step": 150 }, { "epoch": 0.010259717418068828, "grad_norm": 0.484375, "learning_rate": 9.897402825819312e-06, "loss": 0.9075, "step": 175 }, { "epoch": 0.011725391334935803, "grad_norm": 0.578125, "learning_rate": 9.882746086650642e-06, "loss": 0.9488, "step": 200 }, { "epoch": 0.013191065251802778, "grad_norm": 0.486328125, "learning_rate": 9.868089347481973e-06, "loss": 0.8637, "step": 225 }, { "epoch": 0.014656739168669754, "grad_norm": 0.578125, "learning_rate": 9.853432608313303e-06, "loss": 0.9587, "step": 250 }, { "epoch": 0.01612241308553673, "grad_norm": 0.2451171875, "learning_rate": 9.838775869144634e-06, "loss": 0.8171, "step": 275 }, { "epoch": 0.017588087002403704, "grad_norm": 0.251953125, "learning_rate": 9.824119129975964e-06, "loss": 0.8413, "step": 300 }, { "epoch": 0.01905376091927068, "grad_norm": 0.39453125, "learning_rate": 9.809462390807294e-06, "loss": 0.796, "step": 325 }, { "epoch": 0.020519434836137655, "grad_norm": 0.326171875, "learning_rate": 9.794805651638625e-06, "loss": 0.8025, "step": 350 }, { "epoch": 0.02198510875300463, "grad_norm": 0.1630859375, "learning_rate": 9.780148912469955e-06, "loss": 0.8279, "step": 375 }, { "epoch": 0.023450782669871606, "grad_norm": 0.5078125, "learning_rate": 9.765492173301284e-06, "loss": 0.807, "step": 400 }, { "epoch": 0.02491645658673858, "grad_norm": 0.29296875, "learning_rate": 9.750835434132614e-06, "loss": 0.9146, "step": 425 }, { "epoch": 0.026382130503605557, "grad_norm": 0.3125, "learning_rate": 9.736178694963945e-06, "loss": 0.8667, "step": 450 }, { "epoch": 0.027847804420472532, "grad_norm": 0.349609375, "learning_rate": 9.721521955795275e-06, "loss": 0.8888, "step": 475 }, { "epoch": 0.029313478337339507, "grad_norm": 0.1982421875, "learning_rate": 9.706865216626606e-06, "loss": 0.8417, "step": 500 }, { "epoch": 0.030779152254206483, "grad_norm": 0.20703125, "learning_rate": 9.692208477457936e-06, "loss": 0.8205, "step": 525 }, { "epoch": 0.03224482617107346, "grad_norm": 0.2216796875, "learning_rate": 9.677551738289266e-06, "loss": 0.8273, "step": 550 }, { "epoch": 0.03371050008794044, "grad_norm": 0.39453125, "learning_rate": 9.662894999120597e-06, "loss": 0.8107, "step": 575 }, { "epoch": 0.03517617400480741, "grad_norm": 0.1728515625, "learning_rate": 9.648238259951927e-06, "loss": 0.7997, "step": 600 }, { "epoch": 0.03664184792167439, "grad_norm": 0.458984375, "learning_rate": 9.633581520783258e-06, "loss": 0.8802, "step": 625 }, { "epoch": 0.03810752183854136, "grad_norm": 0.2333984375, "learning_rate": 9.618924781614586e-06, "loss": 0.8859, "step": 650 }, { "epoch": 0.03957319575540834, "grad_norm": 0.294921875, "learning_rate": 9.604268042445918e-06, "loss": 0.8317, "step": 675 }, { "epoch": 0.04103886967227531, "grad_norm": 0.208984375, "learning_rate": 9.589611303277247e-06, "loss": 0.8483, "step": 700 }, { "epoch": 0.04250454358914229, "grad_norm": 0.2490234375, "learning_rate": 9.57495456410858e-06, "loss": 0.8267, "step": 725 }, { "epoch": 0.04397021750600926, "grad_norm": 0.3984375, "learning_rate": 9.560297824939908e-06, "loss": 0.9042, "step": 750 }, { "epoch": 0.04543589142287624, "grad_norm": 0.126953125, "learning_rate": 9.545641085771238e-06, "loss": 0.8058, "step": 775 }, { "epoch": 0.04690156533974321, "grad_norm": 0.34375, "learning_rate": 9.530984346602569e-06, "loss": 0.7874, "step": 800 }, { "epoch": 0.04836723925661019, "grad_norm": 0.216796875, "learning_rate": 9.516327607433899e-06, "loss": 0.8562, "step": 825 }, { "epoch": 0.04983291317347716, "grad_norm": 0.2890625, "learning_rate": 9.50167086826523e-06, "loss": 0.8571, "step": 850 }, { "epoch": 0.05129858709034414, "grad_norm": 0.2138671875, "learning_rate": 9.487014129096558e-06, "loss": 0.8252, "step": 875 }, { "epoch": 0.05276426100721111, "grad_norm": 0.2890625, "learning_rate": 9.47235738992789e-06, "loss": 0.9093, "step": 900 }, { "epoch": 0.05422993492407809, "grad_norm": 0.369140625, "learning_rate": 9.457700650759219e-06, "loss": 0.7995, "step": 925 }, { "epoch": 0.055695608840945064, "grad_norm": 0.330078125, "learning_rate": 9.443043911590551e-06, "loss": 0.9485, "step": 950 }, { "epoch": 0.05716128275781204, "grad_norm": 0.248046875, "learning_rate": 9.42838717242188e-06, "loss": 0.8695, "step": 975 }, { "epoch": 0.058626956674679015, "grad_norm": 0.33203125, "learning_rate": 9.41373043325321e-06, "loss": 0.8645, "step": 1000 }, { "epoch": 0.060092630591545994, "grad_norm": 1.5546875, "learning_rate": 9.39907369408454e-06, "loss": 0.9043, "step": 1025 }, { "epoch": 0.061558304508412966, "grad_norm": 2.25, "learning_rate": 9.384416954915871e-06, "loss": 0.8794, "step": 1050 }, { "epoch": 0.06302397842527994, "grad_norm": 0.248046875, "learning_rate": 9.369760215747201e-06, "loss": 0.8327, "step": 1075 }, { "epoch": 0.06448965234214692, "grad_norm": 0.353515625, "learning_rate": 9.355103476578532e-06, "loss": 0.9179, "step": 1100 }, { "epoch": 0.06595532625901389, "grad_norm": 1.140625, "learning_rate": 9.340446737409862e-06, "loss": 0.8321, "step": 1125 }, { "epoch": 0.06742100017588087, "grad_norm": 0.2412109375, "learning_rate": 9.325789998241193e-06, "loss": 0.8061, "step": 1150 }, { "epoch": 0.06888667409274785, "grad_norm": 0.25390625, "learning_rate": 9.311133259072523e-06, "loss": 0.8817, "step": 1175 }, { "epoch": 0.07035234800961482, "grad_norm": 0.283203125, "learning_rate": 9.296476519903852e-06, "loss": 0.9002, "step": 1200 }, { "epoch": 0.07181802192648179, "grad_norm": 0.38671875, "learning_rate": 9.281819780735182e-06, "loss": 0.9576, "step": 1225 }, { "epoch": 0.07328369584334878, "grad_norm": 0.439453125, "learning_rate": 9.267163041566513e-06, "loss": 0.8482, "step": 1250 }, { "epoch": 0.07474936976021575, "grad_norm": 0.1474609375, "learning_rate": 9.252506302397843e-06, "loss": 0.8323, "step": 1275 }, { "epoch": 0.07621504367708272, "grad_norm": 0.29296875, "learning_rate": 9.237849563229173e-06, "loss": 0.8371, "step": 1300 }, { "epoch": 0.07768071759394969, "grad_norm": 0.232421875, "learning_rate": 9.223192824060504e-06, "loss": 0.8462, "step": 1325 }, { "epoch": 0.07914639151081668, "grad_norm": 0.333984375, "learning_rate": 9.208536084891834e-06, "loss": 0.9729, "step": 1350 }, { "epoch": 0.08061206542768365, "grad_norm": 0.2177734375, "learning_rate": 9.193879345723165e-06, "loss": 1.0097, "step": 1375 }, { "epoch": 0.08207773934455062, "grad_norm": 0.205078125, "learning_rate": 9.179222606554495e-06, "loss": 0.8492, "step": 1400 }, { "epoch": 0.0835434132614176, "grad_norm": 0.2197265625, "learning_rate": 9.164565867385825e-06, "loss": 0.8352, "step": 1425 }, { "epoch": 0.08500908717828458, "grad_norm": 0.40625, "learning_rate": 9.149909128217154e-06, "loss": 0.82, "step": 1450 }, { "epoch": 0.08647476109515155, "grad_norm": 0.23828125, "learning_rate": 9.135252389048486e-06, "loss": 0.8827, "step": 1475 }, { "epoch": 0.08794043501201852, "grad_norm": 0.267578125, "learning_rate": 9.120595649879815e-06, "loss": 0.8238, "step": 1500 }, { "epoch": 0.08940610892888551, "grad_norm": 0.32421875, "learning_rate": 9.105938910711145e-06, "loss": 0.9153, "step": 1525 }, { "epoch": 0.09087178284575248, "grad_norm": 0.265625, "learning_rate": 9.091282171542476e-06, "loss": 0.8383, "step": 1550 }, { "epoch": 0.09233745676261945, "grad_norm": 0.5546875, "learning_rate": 9.076625432373806e-06, "loss": 0.8326, "step": 1575 }, { "epoch": 0.09380313067948642, "grad_norm": 0.197265625, "learning_rate": 9.061968693205137e-06, "loss": 0.9013, "step": 1600 }, { "epoch": 0.09526880459635341, "grad_norm": 0.23828125, "learning_rate": 9.047311954036467e-06, "loss": 0.882, "step": 1625 }, { "epoch": 0.09673447851322038, "grad_norm": 0.427734375, "learning_rate": 9.032655214867797e-06, "loss": 0.8533, "step": 1650 }, { "epoch": 0.09820015243008735, "grad_norm": 0.44140625, "learning_rate": 9.017998475699126e-06, "loss": 0.8696, "step": 1675 }, { "epoch": 0.09966582634695433, "grad_norm": 5.71875, "learning_rate": 9.003341736530458e-06, "loss": 1.0214, "step": 1700 }, { "epoch": 0.10113150026382131, "grad_norm": 0.404296875, "learning_rate": 8.988684997361787e-06, "loss": 0.8732, "step": 1725 }, { "epoch": 0.10259717418068828, "grad_norm": 0.1787109375, "learning_rate": 8.974028258193117e-06, "loss": 0.9454, "step": 1750 }, { "epoch": 0.10406284809755526, "grad_norm": 0.2294921875, "learning_rate": 8.959371519024448e-06, "loss": 0.8994, "step": 1775 }, { "epoch": 0.10552852201442223, "grad_norm": 0.25390625, "learning_rate": 8.944714779855778e-06, "loss": 0.9015, "step": 1800 }, { "epoch": 0.10699419593128921, "grad_norm": 0.138671875, "learning_rate": 8.930058040687109e-06, "loss": 0.884, "step": 1825 }, { "epoch": 0.10845986984815618, "grad_norm": 0.404296875, "learning_rate": 8.915401301518439e-06, "loss": 0.7944, "step": 1850 }, { "epoch": 0.10992554376502316, "grad_norm": 0.1953125, "learning_rate": 8.90074456234977e-06, "loss": 0.8855, "step": 1875 }, { "epoch": 0.11139121768189013, "grad_norm": 0.55859375, "learning_rate": 8.8860878231811e-06, "loss": 1.0339, "step": 1900 }, { "epoch": 0.11285689159875711, "grad_norm": 0.48046875, "learning_rate": 8.87143108401243e-06, "loss": 1.0966, "step": 1925 }, { "epoch": 0.11432256551562409, "grad_norm": 0.2421875, "learning_rate": 8.85677434484376e-06, "loss": 0.8355, "step": 1950 }, { "epoch": 0.11578823943249106, "grad_norm": 0.314453125, "learning_rate": 8.84211760567509e-06, "loss": 0.8109, "step": 1975 }, { "epoch": 0.11725391334935803, "grad_norm": 0.26171875, "learning_rate": 8.82746086650642e-06, "loss": 0.9596, "step": 2000 }, { "epoch": 0.11871958726622502, "grad_norm": 0.375, "learning_rate": 8.81280412733775e-06, "loss": 0.8931, "step": 2025 }, { "epoch": 0.12018526118309199, "grad_norm": 0.20703125, "learning_rate": 8.79814738816908e-06, "loss": 0.818, "step": 2050 }, { "epoch": 0.12165093509995896, "grad_norm": 0.2265625, "learning_rate": 8.783490649000411e-06, "loss": 0.8465, "step": 2075 }, { "epoch": 0.12311660901682593, "grad_norm": 0.1689453125, "learning_rate": 8.768833909831741e-06, "loss": 0.8754, "step": 2100 }, { "epoch": 0.12458228293369292, "grad_norm": 0.158203125, "learning_rate": 8.754177170663072e-06, "loss": 0.8368, "step": 2125 }, { "epoch": 0.1260479568505599, "grad_norm": 0.2734375, "learning_rate": 8.739520431494402e-06, "loss": 0.8626, "step": 2150 }, { "epoch": 0.12751363076742686, "grad_norm": 0.22265625, "learning_rate": 8.724863692325733e-06, "loss": 0.9038, "step": 2175 }, { "epoch": 0.12897930468429383, "grad_norm": 0.1796875, "learning_rate": 8.710206953157061e-06, "loss": 0.8421, "step": 2200 }, { "epoch": 0.1304449786011608, "grad_norm": 0.5, "learning_rate": 8.695550213988393e-06, "loss": 0.9368, "step": 2225 }, { "epoch": 0.13191065251802778, "grad_norm": 0.3671875, "learning_rate": 8.680893474819722e-06, "loss": 0.8676, "step": 2250 }, { "epoch": 0.13337632643489478, "grad_norm": 0.271484375, "learning_rate": 8.666236735651054e-06, "loss": 0.7942, "step": 2275 }, { "epoch": 0.13484200035176175, "grad_norm": 0.2216796875, "learning_rate": 8.651579996482383e-06, "loss": 0.895, "step": 2300 }, { "epoch": 0.13630767426862872, "grad_norm": 0.2119140625, "learning_rate": 8.636923257313713e-06, "loss": 0.8239, "step": 2325 }, { "epoch": 0.1377733481854957, "grad_norm": 0.1650390625, "learning_rate": 8.622266518145044e-06, "loss": 0.8594, "step": 2350 }, { "epoch": 0.13923902210236266, "grad_norm": 0.1923828125, "learning_rate": 8.607609778976374e-06, "loss": 0.9231, "step": 2375 }, { "epoch": 0.14070469601922964, "grad_norm": 0.400390625, "learning_rate": 8.592953039807705e-06, "loss": 0.8337, "step": 2400 }, { "epoch": 0.1421703699360966, "grad_norm": 0.10498046875, "learning_rate": 8.578296300639033e-06, "loss": 0.8142, "step": 2425 }, { "epoch": 0.14363604385296358, "grad_norm": 0.7421875, "learning_rate": 8.563639561470365e-06, "loss": 0.8561, "step": 2450 }, { "epoch": 0.14510171776983058, "grad_norm": 0.263671875, "learning_rate": 8.548982822301694e-06, "loss": 0.9175, "step": 2475 }, { "epoch": 0.14656739168669755, "grad_norm": 0.146484375, "learning_rate": 8.534326083133026e-06, "loss": 0.9498, "step": 2500 }, { "epoch": 0.14803306560356452, "grad_norm": 0.36328125, "learning_rate": 8.519669343964355e-06, "loss": 0.9313, "step": 2525 }, { "epoch": 0.1494987395204315, "grad_norm": 0.392578125, "learning_rate": 8.505012604795685e-06, "loss": 0.9784, "step": 2550 }, { "epoch": 0.15096441343729847, "grad_norm": 0.35546875, "learning_rate": 8.490355865627016e-06, "loss": 0.9337, "step": 2575 }, { "epoch": 0.15243008735416544, "grad_norm": 0.1025390625, "learning_rate": 8.475699126458346e-06, "loss": 0.975, "step": 2600 }, { "epoch": 0.1538957612710324, "grad_norm": 0.375, "learning_rate": 8.461042387289676e-06, "loss": 0.8933, "step": 2625 }, { "epoch": 0.15536143518789938, "grad_norm": 0.234375, "learning_rate": 8.446385648121007e-06, "loss": 0.7984, "step": 2650 }, { "epoch": 0.15682710910476638, "grad_norm": 0.2578125, "learning_rate": 8.431728908952337e-06, "loss": 0.8407, "step": 2675 }, { "epoch": 0.15829278302163335, "grad_norm": 0.23828125, "learning_rate": 8.417072169783668e-06, "loss": 0.8591, "step": 2700 }, { "epoch": 0.15975845693850033, "grad_norm": 0.439453125, "learning_rate": 8.402415430614998e-06, "loss": 0.906, "step": 2725 }, { "epoch": 0.1612241308553673, "grad_norm": 0.455078125, "learning_rate": 8.387758691446328e-06, "loss": 0.8644, "step": 2750 }, { "epoch": 0.16268980477223427, "grad_norm": 0.162109375, "learning_rate": 8.373101952277657e-06, "loss": 0.8917, "step": 2775 }, { "epoch": 0.16415547868910124, "grad_norm": 0.2197265625, "learning_rate": 8.358445213108988e-06, "loss": 0.852, "step": 2800 }, { "epoch": 0.1656211526059682, "grad_norm": 0.25390625, "learning_rate": 8.343788473940318e-06, "loss": 0.9059, "step": 2825 }, { "epoch": 0.1670868265228352, "grad_norm": 0.392578125, "learning_rate": 8.329131734771648e-06, "loss": 0.9189, "step": 2850 }, { "epoch": 0.16855250043970219, "grad_norm": 0.19140625, "learning_rate": 8.314474995602979e-06, "loss": 0.9356, "step": 2875 }, { "epoch": 0.17001817435656916, "grad_norm": 0.232421875, "learning_rate": 8.29981825643431e-06, "loss": 0.8653, "step": 2900 }, { "epoch": 0.17148384827343613, "grad_norm": 0.3125, "learning_rate": 8.28516151726564e-06, "loss": 0.9481, "step": 2925 }, { "epoch": 0.1729495221903031, "grad_norm": 0.267578125, "learning_rate": 8.27050477809697e-06, "loss": 0.8533, "step": 2950 }, { "epoch": 0.17441519610717007, "grad_norm": 0.353515625, "learning_rate": 8.2558480389283e-06, "loss": 0.9067, "step": 2975 }, { "epoch": 0.17588087002403704, "grad_norm": 0.7890625, "learning_rate": 8.24119129975963e-06, "loss": 0.8769, "step": 3000 }, { "epoch": 0.17734654394090402, "grad_norm": 0.19921875, "learning_rate": 8.226534560590961e-06, "loss": 1.0016, "step": 3025 }, { "epoch": 0.17881221785777102, "grad_norm": 0.2060546875, "learning_rate": 8.21187782142229e-06, "loss": 0.8201, "step": 3050 }, { "epoch": 0.180277891774638, "grad_norm": 0.287109375, "learning_rate": 8.197221082253622e-06, "loss": 0.9004, "step": 3075 }, { "epoch": 0.18174356569150496, "grad_norm": 0.26953125, "learning_rate": 8.18256434308495e-06, "loss": 0.9973, "step": 3100 }, { "epoch": 0.18320923960837193, "grad_norm": 0.2353515625, "learning_rate": 8.167907603916281e-06, "loss": 0.8702, "step": 3125 }, { "epoch": 0.1846749135252389, "grad_norm": 0.21875, "learning_rate": 8.153250864747612e-06, "loss": 1.1181, "step": 3150 }, { "epoch": 0.18614058744210588, "grad_norm": 0.671875, "learning_rate": 8.138594125578942e-06, "loss": 0.9449, "step": 3175 }, { "epoch": 0.18760626135897285, "grad_norm": 0.392578125, "learning_rate": 8.123937386410272e-06, "loss": 1.0326, "step": 3200 }, { "epoch": 0.18907193527583982, "grad_norm": 0.2099609375, "learning_rate": 8.109280647241601e-06, "loss": 0.836, "step": 3225 }, { "epoch": 0.19053760919270682, "grad_norm": 0.201171875, "learning_rate": 8.094623908072933e-06, "loss": 0.8702, "step": 3250 }, { "epoch": 0.1920032831095738, "grad_norm": 0.18359375, "learning_rate": 8.079967168904262e-06, "loss": 0.9676, "step": 3275 }, { "epoch": 0.19346895702644076, "grad_norm": 0.31640625, "learning_rate": 8.065310429735594e-06, "loss": 0.9803, "step": 3300 }, { "epoch": 0.19493463094330774, "grad_norm": 0.125, "learning_rate": 8.050653690566923e-06, "loss": 0.8786, "step": 3325 }, { "epoch": 0.1964003048601747, "grad_norm": 0.216796875, "learning_rate": 8.035996951398253e-06, "loss": 0.8793, "step": 3350 }, { "epoch": 0.19786597877704168, "grad_norm": 0.263671875, "learning_rate": 8.021340212229584e-06, "loss": 0.9167, "step": 3375 }, { "epoch": 0.19933165269390865, "grad_norm": 0.1484375, "learning_rate": 8.006683473060914e-06, "loss": 0.8833, "step": 3400 }, { "epoch": 0.20079732661077562, "grad_norm": 0.431640625, "learning_rate": 7.992026733892244e-06, "loss": 0.9054, "step": 3425 }, { "epoch": 0.20226300052764262, "grad_norm": 0.287109375, "learning_rate": 7.977369994723575e-06, "loss": 0.9006, "step": 3450 }, { "epoch": 0.2037286744445096, "grad_norm": 0.271484375, "learning_rate": 7.962713255554905e-06, "loss": 0.8351, "step": 3475 }, { "epoch": 0.20519434836137657, "grad_norm": 0.2099609375, "learning_rate": 7.948056516386236e-06, "loss": 0.8815, "step": 3500 }, { "epoch": 0.20666002227824354, "grad_norm": 0.396484375, "learning_rate": 7.933399777217566e-06, "loss": 0.8492, "step": 3525 }, { "epoch": 0.2081256961951105, "grad_norm": 0.23046875, "learning_rate": 7.918743038048895e-06, "loss": 0.7399, "step": 3550 }, { "epoch": 0.20959137011197748, "grad_norm": 0.2490234375, "learning_rate": 7.904086298880225e-06, "loss": 0.8195, "step": 3575 }, { "epoch": 0.21105704402884445, "grad_norm": 0.2099609375, "learning_rate": 7.889429559711556e-06, "loss": 0.8085, "step": 3600 }, { "epoch": 0.21252271794571143, "grad_norm": 0.2578125, "learning_rate": 7.874772820542886e-06, "loss": 0.9241, "step": 3625 }, { "epoch": 0.21398839186257843, "grad_norm": 0.189453125, "learning_rate": 7.860116081374216e-06, "loss": 0.8614, "step": 3650 }, { "epoch": 0.2154540657794454, "grad_norm": 0.279296875, "learning_rate": 7.845459342205547e-06, "loss": 0.8509, "step": 3675 }, { "epoch": 0.21691973969631237, "grad_norm": 0.26171875, "learning_rate": 7.830802603036877e-06, "loss": 0.9153, "step": 3700 }, { "epoch": 0.21838541361317934, "grad_norm": 0.435546875, "learning_rate": 7.816145863868208e-06, "loss": 0.857, "step": 3725 }, { "epoch": 0.2198510875300463, "grad_norm": 0.1904296875, "learning_rate": 7.801489124699538e-06, "loss": 0.8623, "step": 3750 }, { "epoch": 0.22131676144691328, "grad_norm": 0.2216796875, "learning_rate": 7.786832385530868e-06, "loss": 0.9246, "step": 3775 }, { "epoch": 0.22278243536378026, "grad_norm": 0.2890625, "learning_rate": 7.772175646362197e-06, "loss": 0.9, "step": 3800 }, { "epoch": 0.22424810928064723, "grad_norm": 0.294921875, "learning_rate": 7.75751890719353e-06, "loss": 1.0074, "step": 3825 }, { "epoch": 0.22571378319751423, "grad_norm": 0.98046875, "learning_rate": 7.742862168024858e-06, "loss": 0.8933, "step": 3850 }, { "epoch": 0.2271794571143812, "grad_norm": 0.462890625, "learning_rate": 7.72820542885619e-06, "loss": 0.8541, "step": 3875 }, { "epoch": 0.22864513103124817, "grad_norm": 0.48828125, "learning_rate": 7.713548689687519e-06, "loss": 0.8043, "step": 3900 }, { "epoch": 0.23011080494811514, "grad_norm": 0.365234375, "learning_rate": 7.698891950518849e-06, "loss": 0.9046, "step": 3925 }, { "epoch": 0.23157647886498212, "grad_norm": 0.3671875, "learning_rate": 7.68423521135018e-06, "loss": 1.0241, "step": 3950 }, { "epoch": 0.2330421527818491, "grad_norm": 0.412109375, "learning_rate": 7.66957847218151e-06, "loss": 0.8257, "step": 3975 }, { "epoch": 0.23450782669871606, "grad_norm": 0.248046875, "learning_rate": 7.65492173301284e-06, "loss": 0.9551, "step": 4000 }, { "epoch": 0.23597350061558303, "grad_norm": 0.2314453125, "learning_rate": 7.640264993844169e-06, "loss": 0.9757, "step": 4025 }, { "epoch": 0.23743917453245003, "grad_norm": 0.287109375, "learning_rate": 7.6256082546755e-06, "loss": 0.9187, "step": 4050 }, { "epoch": 0.238904848449317, "grad_norm": 0.5546875, "learning_rate": 7.610951515506831e-06, "loss": 0.9166, "step": 4075 }, { "epoch": 0.24037052236618398, "grad_norm": 0.275390625, "learning_rate": 7.596294776338161e-06, "loss": 0.9277, "step": 4100 }, { "epoch": 0.24183619628305095, "grad_norm": 0.16796875, "learning_rate": 7.581638037169491e-06, "loss": 0.8612, "step": 4125 }, { "epoch": 0.24330187019991792, "grad_norm": 0.357421875, "learning_rate": 7.566981298000822e-06, "loss": 0.8533, "step": 4150 }, { "epoch": 0.2447675441167849, "grad_norm": 0.2470703125, "learning_rate": 7.5523245588321515e-06, "loss": 0.8259, "step": 4175 }, { "epoch": 0.24623321803365186, "grad_norm": 0.126953125, "learning_rate": 7.537667819663482e-06, "loss": 0.882, "step": 4200 }, { "epoch": 0.24769889195051886, "grad_norm": 0.193359375, "learning_rate": 7.523011080494812e-06, "loss": 0.9179, "step": 4225 }, { "epoch": 0.24916456586738583, "grad_norm": 0.2138671875, "learning_rate": 7.508354341326143e-06, "loss": 0.7686, "step": 4250 }, { "epoch": 0.2506302397842528, "grad_norm": 0.1982421875, "learning_rate": 7.493697602157472e-06, "loss": 0.854, "step": 4275 }, { "epoch": 0.2520959137011198, "grad_norm": 0.2275390625, "learning_rate": 7.4790408629888035e-06, "loss": 0.9073, "step": 4300 }, { "epoch": 0.25356158761798675, "grad_norm": 0.232421875, "learning_rate": 7.464384123820133e-06, "loss": 0.8411, "step": 4325 }, { "epoch": 0.2550272615348537, "grad_norm": 0.2578125, "learning_rate": 7.449727384651463e-06, "loss": 0.911, "step": 4350 }, { "epoch": 0.2564929354517207, "grad_norm": 0.2158203125, "learning_rate": 7.435070645482794e-06, "loss": 0.8707, "step": 4375 }, { "epoch": 0.25795860936858767, "grad_norm": 0.1962890625, "learning_rate": 7.420413906314123e-06, "loss": 0.8607, "step": 4400 }, { "epoch": 0.25942428328545464, "grad_norm": 0.45703125, "learning_rate": 7.405757167145454e-06, "loss": 0.8238, "step": 4425 }, { "epoch": 0.2608899572023216, "grad_norm": 0.255859375, "learning_rate": 7.391100427976784e-06, "loss": 0.9302, "step": 4450 }, { "epoch": 0.2623556311191886, "grad_norm": 0.59375, "learning_rate": 7.376443688808115e-06, "loss": 0.8033, "step": 4475 }, { "epoch": 0.26382130503605555, "grad_norm": 0.2060546875, "learning_rate": 7.361786949639444e-06, "loss": 0.8793, "step": 4500 }, { "epoch": 0.2652869789529226, "grad_norm": 0.27734375, "learning_rate": 7.3471302104707754e-06, "loss": 0.8292, "step": 4525 }, { "epoch": 0.26675265286978955, "grad_norm": 0.232421875, "learning_rate": 7.332473471302105e-06, "loss": 0.8131, "step": 4550 }, { "epoch": 0.2682183267866565, "grad_norm": 0.51171875, "learning_rate": 7.317816732133436e-06, "loss": 0.8427, "step": 4575 }, { "epoch": 0.2696840007035235, "grad_norm": 0.333984375, "learning_rate": 7.303159992964766e-06, "loss": 0.9036, "step": 4600 }, { "epoch": 0.27114967462039047, "grad_norm": 0.484375, "learning_rate": 7.288503253796096e-06, "loss": 0.9235, "step": 4625 }, { "epoch": 0.27261534853725744, "grad_norm": 0.201171875, "learning_rate": 7.273846514627426e-06, "loss": 0.9909, "step": 4650 }, { "epoch": 0.2740810224541244, "grad_norm": 0.373046875, "learning_rate": 7.259189775458757e-06, "loss": 0.9281, "step": 4675 }, { "epoch": 0.2755466963709914, "grad_norm": 0.189453125, "learning_rate": 7.244533036290087e-06, "loss": 0.8731, "step": 4700 }, { "epoch": 0.27701237028785836, "grad_norm": 0.15234375, "learning_rate": 7.229876297121416e-06, "loss": 0.8967, "step": 4725 }, { "epoch": 0.27847804420472533, "grad_norm": 0.376953125, "learning_rate": 7.215219557952747e-06, "loss": 0.8448, "step": 4750 }, { "epoch": 0.2799437181215923, "grad_norm": 0.376953125, "learning_rate": 7.200562818784077e-06, "loss": 0.846, "step": 4775 }, { "epoch": 0.28140939203845927, "grad_norm": 0.2412109375, "learning_rate": 7.185906079615408e-06, "loss": 0.785, "step": 4800 }, { "epoch": 0.28287506595532624, "grad_norm": 0.396484375, "learning_rate": 7.171249340446738e-06, "loss": 0.995, "step": 4825 }, { "epoch": 0.2843407398721932, "grad_norm": 0.162109375, "learning_rate": 7.156592601278068e-06, "loss": 0.879, "step": 4850 }, { "epoch": 0.2858064137890602, "grad_norm": 0.1748046875, "learning_rate": 7.141935862109399e-06, "loss": 0.9504, "step": 4875 }, { "epoch": 0.28727208770592716, "grad_norm": 0.2080078125, "learning_rate": 7.127279122940729e-06, "loss": 1.0018, "step": 4900 }, { "epoch": 0.2887377616227942, "grad_norm": 0.26171875, "learning_rate": 7.1126223837720585e-06, "loss": 0.9797, "step": 4925 }, { "epoch": 0.29020343553966116, "grad_norm": 0.271484375, "learning_rate": 7.09796564460339e-06, "loss": 0.8494, "step": 4950 }, { "epoch": 0.29166910945652813, "grad_norm": 0.341796875, "learning_rate": 7.083308905434719e-06, "loss": 0.9362, "step": 4975 }, { "epoch": 0.2931347833733951, "grad_norm": 0.146484375, "learning_rate": 7.06865216626605e-06, "loss": 0.7694, "step": 5000 }, { "epoch": 0.2946004572902621, "grad_norm": 0.166015625, "learning_rate": 7.05399542709738e-06, "loss": 0.9893, "step": 5025 }, { "epoch": 0.29606613120712905, "grad_norm": 0.333984375, "learning_rate": 7.0393386879287106e-06, "loss": 0.8555, "step": 5050 }, { "epoch": 0.297531805123996, "grad_norm": 0.3046875, "learning_rate": 7.02468194876004e-06, "loss": 0.8452, "step": 5075 }, { "epoch": 0.298997479040863, "grad_norm": 0.21484375, "learning_rate": 7.010025209591371e-06, "loss": 0.8275, "step": 5100 }, { "epoch": 0.30046315295772996, "grad_norm": 0.185546875, "learning_rate": 6.995368470422701e-06, "loss": 0.7464, "step": 5125 }, { "epoch": 0.30192882687459693, "grad_norm": 0.140625, "learning_rate": 6.9807117312540305e-06, "loss": 0.8753, "step": 5150 }, { "epoch": 0.3033945007914639, "grad_norm": 0.2734375, "learning_rate": 6.966054992085362e-06, "loss": 1.016, "step": 5175 }, { "epoch": 0.3048601747083309, "grad_norm": 0.1796875, "learning_rate": 6.951398252916691e-06, "loss": 0.8956, "step": 5200 }, { "epoch": 0.30632584862519785, "grad_norm": 0.185546875, "learning_rate": 6.936741513748022e-06, "loss": 0.8358, "step": 5225 }, { "epoch": 0.3077915225420648, "grad_norm": 0.23046875, "learning_rate": 6.922084774579352e-06, "loss": 0.9687, "step": 5250 }, { "epoch": 0.3092571964589318, "grad_norm": 0.216796875, "learning_rate": 6.9074280354106825e-06, "loss": 0.8358, "step": 5275 }, { "epoch": 0.31072287037579877, "grad_norm": 0.451171875, "learning_rate": 6.892771296242012e-06, "loss": 0.8345, "step": 5300 }, { "epoch": 0.3121885442926658, "grad_norm": 0.40234375, "learning_rate": 6.878114557073343e-06, "loss": 0.9074, "step": 5325 }, { "epoch": 0.31365421820953276, "grad_norm": 4.96875, "learning_rate": 6.863457817904673e-06, "loss": 0.8969, "step": 5350 }, { "epoch": 0.31511989212639974, "grad_norm": 0.2373046875, "learning_rate": 6.848801078736004e-06, "loss": 0.7103, "step": 5375 }, { "epoch": 0.3165855660432667, "grad_norm": 0.1552734375, "learning_rate": 6.834144339567334e-06, "loss": 0.8087, "step": 5400 }, { "epoch": 0.3180512399601337, "grad_norm": 0.1982421875, "learning_rate": 6.819487600398664e-06, "loss": 0.9289, "step": 5425 }, { "epoch": 0.31951691387700065, "grad_norm": 0.24609375, "learning_rate": 6.804830861229994e-06, "loss": 0.8306, "step": 5450 }, { "epoch": 0.3209825877938676, "grad_norm": 0.19921875, "learning_rate": 6.790174122061325e-06, "loss": 0.793, "step": 5475 }, { "epoch": 0.3224482617107346, "grad_norm": 0.7109375, "learning_rate": 6.7755173828926545e-06, "loss": 0.768, "step": 5500 }, { "epoch": 0.32391393562760157, "grad_norm": 0.2041015625, "learning_rate": 6.760860643723984e-06, "loss": 0.8649, "step": 5525 }, { "epoch": 0.32537960954446854, "grad_norm": 0.2373046875, "learning_rate": 6.746203904555315e-06, "loss": 0.9173, "step": 5550 }, { "epoch": 0.3268452834613355, "grad_norm": 0.341796875, "learning_rate": 6.731547165386645e-06, "loss": 0.8569, "step": 5575 }, { "epoch": 0.3283109573782025, "grad_norm": 0.28515625, "learning_rate": 6.716890426217976e-06, "loss": 0.8013, "step": 5600 }, { "epoch": 0.32977663129506946, "grad_norm": 0.1767578125, "learning_rate": 6.702233687049306e-06, "loss": 0.9285, "step": 5625 }, { "epoch": 0.3312423052119364, "grad_norm": 0.357421875, "learning_rate": 6.687576947880636e-06, "loss": 0.9202, "step": 5650 }, { "epoch": 0.3327079791288034, "grad_norm": 0.61328125, "learning_rate": 6.672920208711966e-06, "loss": 0.8727, "step": 5675 }, { "epoch": 0.3341736530456704, "grad_norm": 0.205078125, "learning_rate": 6.658263469543297e-06, "loss": 0.8877, "step": 5700 }, { "epoch": 0.3356393269625374, "grad_norm": 0.28515625, "learning_rate": 6.6436067303746264e-06, "loss": 0.7748, "step": 5725 }, { "epoch": 0.33710500087940437, "grad_norm": 0.2578125, "learning_rate": 6.628949991205958e-06, "loss": 0.805, "step": 5750 }, { "epoch": 0.33857067479627134, "grad_norm": 0.341796875, "learning_rate": 6.614293252037287e-06, "loss": 0.8786, "step": 5775 }, { "epoch": 0.3400363487131383, "grad_norm": 0.64453125, "learning_rate": 6.599636512868618e-06, "loss": 0.8441, "step": 5800 }, { "epoch": 0.3415020226300053, "grad_norm": 0.228515625, "learning_rate": 6.584979773699948e-06, "loss": 0.8751, "step": 5825 }, { "epoch": 0.34296769654687226, "grad_norm": 0.37109375, "learning_rate": 6.5703230345312785e-06, "loss": 0.9569, "step": 5850 }, { "epoch": 0.34443337046373923, "grad_norm": 0.294921875, "learning_rate": 6.555666295362608e-06, "loss": 0.9255, "step": 5875 }, { "epoch": 0.3458990443806062, "grad_norm": 0.3125, "learning_rate": 6.541009556193938e-06, "loss": 0.8501, "step": 5900 }, { "epoch": 0.3473647182974732, "grad_norm": 0.2001953125, "learning_rate": 6.526352817025269e-06, "loss": 0.8933, "step": 5925 }, { "epoch": 0.34883039221434015, "grad_norm": 0.19921875, "learning_rate": 6.511696077856598e-06, "loss": 0.8941, "step": 5950 }, { "epoch": 0.3502960661312071, "grad_norm": 0.25390625, "learning_rate": 6.49703933868793e-06, "loss": 1.0143, "step": 5975 }, { "epoch": 0.3517617400480741, "grad_norm": 0.279296875, "learning_rate": 6.482382599519259e-06, "loss": 0.8929, "step": 6000 }, { "epoch": 0.35322741396494106, "grad_norm": 0.333984375, "learning_rate": 6.46772586035059e-06, "loss": 0.8641, "step": 6025 }, { "epoch": 0.35469308788180803, "grad_norm": 0.263671875, "learning_rate": 6.45306912118192e-06, "loss": 0.8743, "step": 6050 }, { "epoch": 0.356158761798675, "grad_norm": 0.333984375, "learning_rate": 6.4384123820132504e-06, "loss": 0.8072, "step": 6075 }, { "epoch": 0.35762443571554203, "grad_norm": 0.345703125, "learning_rate": 6.42375564284458e-06, "loss": 0.8964, "step": 6100 }, { "epoch": 0.359090109632409, "grad_norm": 0.27734375, "learning_rate": 6.409098903675911e-06, "loss": 0.9049, "step": 6125 }, { "epoch": 0.360555783549276, "grad_norm": 0.17578125, "learning_rate": 6.394442164507241e-06, "loss": 0.9438, "step": 6150 }, { "epoch": 0.36202145746614295, "grad_norm": 0.255859375, "learning_rate": 6.379785425338571e-06, "loss": 0.8891, "step": 6175 }, { "epoch": 0.3634871313830099, "grad_norm": 0.130859375, "learning_rate": 6.365128686169902e-06, "loss": 1.1601, "step": 6200 }, { "epoch": 0.3649528052998769, "grad_norm": 0.216796875, "learning_rate": 6.350471947001232e-06, "loss": 0.968, "step": 6225 }, { "epoch": 0.36641847921674386, "grad_norm": 0.7109375, "learning_rate": 6.3358152078325616e-06, "loss": 0.962, "step": 6250 }, { "epoch": 0.36788415313361084, "grad_norm": 0.4453125, "learning_rate": 6.321158468663893e-06, "loss": 0.8971, "step": 6275 }, { "epoch": 0.3693498270504778, "grad_norm": 0.2255859375, "learning_rate": 6.306501729495222e-06, "loss": 0.8835, "step": 6300 }, { "epoch": 0.3708155009673448, "grad_norm": 0.361328125, "learning_rate": 6.291844990326552e-06, "loss": 0.8783, "step": 6325 }, { "epoch": 0.37228117488421175, "grad_norm": 0.490234375, "learning_rate": 6.277188251157883e-06, "loss": 0.8959, "step": 6350 }, { "epoch": 0.3737468488010787, "grad_norm": 0.3125, "learning_rate": 6.262531511989213e-06, "loss": 0.8945, "step": 6375 }, { "epoch": 0.3752125227179457, "grad_norm": 0.173828125, "learning_rate": 6.247874772820543e-06, "loss": 0.9492, "step": 6400 }, { "epoch": 0.37667819663481267, "grad_norm": 0.37890625, "learning_rate": 6.2332180336518736e-06, "loss": 0.8728, "step": 6425 }, { "epoch": 0.37814387055167964, "grad_norm": 0.255859375, "learning_rate": 6.218561294483204e-06, "loss": 0.8508, "step": 6450 }, { "epoch": 0.3796095444685466, "grad_norm": 0.1884765625, "learning_rate": 6.2039045553145335e-06, "loss": 0.9759, "step": 6475 }, { "epoch": 0.38107521838541364, "grad_norm": 0.216796875, "learning_rate": 6.189247816145865e-06, "loss": 1.0576, "step": 6500 }, { "epoch": 0.3825408923022806, "grad_norm": 0.240234375, "learning_rate": 6.174591076977194e-06, "loss": 0.7754, "step": 6525 }, { "epoch": 0.3840065662191476, "grad_norm": 0.189453125, "learning_rate": 6.159934337808526e-06, "loss": 0.9326, "step": 6550 }, { "epoch": 0.38547224013601455, "grad_norm": 0.1796875, "learning_rate": 6.145277598639855e-06, "loss": 0.8764, "step": 6575 }, { "epoch": 0.3869379140528815, "grad_norm": 0.2314453125, "learning_rate": 6.1306208594711856e-06, "loss": 0.8589, "step": 6600 }, { "epoch": 0.3884035879697485, "grad_norm": 1.1953125, "learning_rate": 6.115964120302515e-06, "loss": 1.0026, "step": 6625 }, { "epoch": 0.38986926188661547, "grad_norm": 0.19140625, "learning_rate": 6.101307381133846e-06, "loss": 0.8483, "step": 6650 }, { "epoch": 0.39133493580348244, "grad_norm": 0.294921875, "learning_rate": 6.086650641965176e-06, "loss": 0.9063, "step": 6675 }, { "epoch": 0.3928006097203494, "grad_norm": 0.353515625, "learning_rate": 6.0719939027965055e-06, "loss": 0.8658, "step": 6700 }, { "epoch": 0.3942662836372164, "grad_norm": 0.2197265625, "learning_rate": 6.057337163627837e-06, "loss": 0.8389, "step": 6725 }, { "epoch": 0.39573195755408336, "grad_norm": 0.94921875, "learning_rate": 6.042680424459166e-06, "loss": 0.8769, "step": 6750 }, { "epoch": 0.39719763147095033, "grad_norm": 1.1796875, "learning_rate": 6.0280236852904975e-06, "loss": 0.9461, "step": 6775 }, { "epoch": 0.3986633053878173, "grad_norm": 0.17578125, "learning_rate": 6.013366946121827e-06, "loss": 0.9144, "step": 6800 }, { "epoch": 0.4001289793046843, "grad_norm": 0.2431640625, "learning_rate": 5.9987102069531575e-06, "loss": 0.8746, "step": 6825 }, { "epoch": 0.40159465322155125, "grad_norm": 0.60546875, "learning_rate": 5.984053467784487e-06, "loss": 0.8977, "step": 6850 }, { "epoch": 0.4030603271384182, "grad_norm": 0.478515625, "learning_rate": 5.969396728615818e-06, "loss": 0.8404, "step": 6875 }, { "epoch": 0.40452600105528524, "grad_norm": 0.326171875, "learning_rate": 5.954739989447148e-06, "loss": 0.8321, "step": 6900 }, { "epoch": 0.4059916749721522, "grad_norm": 0.169921875, "learning_rate": 5.940083250278479e-06, "loss": 0.8963, "step": 6925 }, { "epoch": 0.4074573488890192, "grad_norm": 0.234375, "learning_rate": 5.925426511109809e-06, "loss": 0.9668, "step": 6950 }, { "epoch": 0.40892302280588616, "grad_norm": 0.15625, "learning_rate": 5.910769771941139e-06, "loss": 0.8921, "step": 6975 }, { "epoch": 0.41038869672275313, "grad_norm": 0.2734375, "learning_rate": 5.8961130327724695e-06, "loss": 0.9034, "step": 7000 }, { "epoch": 0.4118543706396201, "grad_norm": 0.236328125, "learning_rate": 5.8814562936038e-06, "loss": 0.873, "step": 7025 }, { "epoch": 0.4133200445564871, "grad_norm": 0.28515625, "learning_rate": 5.8667995544351295e-06, "loss": 0.9082, "step": 7050 }, { "epoch": 0.41478571847335405, "grad_norm": 3.171875, "learning_rate": 5.852142815266459e-06, "loss": 0.9186, "step": 7075 }, { "epoch": 0.416251392390221, "grad_norm": 0.1396484375, "learning_rate": 5.83748607609779e-06, "loss": 0.9711, "step": 7100 }, { "epoch": 0.417717066307088, "grad_norm": 0.8046875, "learning_rate": 5.82282933692912e-06, "loss": 0.8168, "step": 7125 }, { "epoch": 0.41918274022395496, "grad_norm": 0.205078125, "learning_rate": 5.808172597760451e-06, "loss": 0.926, "step": 7150 }, { "epoch": 0.42064841414082194, "grad_norm": 0.134765625, "learning_rate": 5.793515858591781e-06, "loss": 0.7931, "step": 7175 }, { "epoch": 0.4221140880576889, "grad_norm": 0.1904296875, "learning_rate": 5.778859119423111e-06, "loss": 0.9143, "step": 7200 }, { "epoch": 0.4235797619745559, "grad_norm": 0.298828125, "learning_rate": 5.7642023802544415e-06, "loss": 0.846, "step": 7225 }, { "epoch": 0.42504543589142285, "grad_norm": 0.328125, "learning_rate": 5.749545641085772e-06, "loss": 0.8936, "step": 7250 }, { "epoch": 0.4265111098082899, "grad_norm": 0.1923828125, "learning_rate": 5.7348889019171014e-06, "loss": 0.9142, "step": 7275 }, { "epoch": 0.42797678372515685, "grad_norm": 0.470703125, "learning_rate": 5.720232162748433e-06, "loss": 0.9059, "step": 7300 }, { "epoch": 0.4294424576420238, "grad_norm": 1.1015625, "learning_rate": 5.705575423579762e-06, "loss": 0.7794, "step": 7325 }, { "epoch": 0.4309081315588908, "grad_norm": 0.2060546875, "learning_rate": 5.6909186844110935e-06, "loss": 0.7724, "step": 7350 }, { "epoch": 0.43237380547575777, "grad_norm": 0.169921875, "learning_rate": 5.676261945242423e-06, "loss": 0.9143, "step": 7375 }, { "epoch": 0.43383947939262474, "grad_norm": 0.2275390625, "learning_rate": 5.6616052060737535e-06, "loss": 0.8556, "step": 7400 }, { "epoch": 0.4353051533094917, "grad_norm": 0.259765625, "learning_rate": 5.646948466905083e-06, "loss": 0.8668, "step": 7425 }, { "epoch": 0.4367708272263587, "grad_norm": 0.6328125, "learning_rate": 5.632291727736414e-06, "loss": 0.87, "step": 7450 }, { "epoch": 0.43823650114322565, "grad_norm": 0.173828125, "learning_rate": 5.617634988567744e-06, "loss": 0.8162, "step": 7475 }, { "epoch": 0.4397021750600926, "grad_norm": 0.279296875, "learning_rate": 5.602978249399073e-06, "loss": 0.8705, "step": 7500 }, { "epoch": 0.4411678489769596, "grad_norm": 0.26953125, "learning_rate": 5.588321510230405e-06, "loss": 0.8221, "step": 7525 }, { "epoch": 0.44263352289382657, "grad_norm": 0.236328125, "learning_rate": 5.573664771061734e-06, "loss": 0.9005, "step": 7550 }, { "epoch": 0.44409919681069354, "grad_norm": 0.2890625, "learning_rate": 5.5590080318930654e-06, "loss": 0.8577, "step": 7575 }, { "epoch": 0.4455648707275605, "grad_norm": 0.28125, "learning_rate": 5.544351292724395e-06, "loss": 0.9489, "step": 7600 }, { "epoch": 0.4470305446444275, "grad_norm": 0.29296875, "learning_rate": 5.529694553555725e-06, "loss": 0.78, "step": 7625 }, { "epoch": 0.44849621856129446, "grad_norm": 0.390625, "learning_rate": 5.515037814387055e-06, "loss": 0.7977, "step": 7650 }, { "epoch": 0.4499618924781615, "grad_norm": 0.203125, "learning_rate": 5.500381075218386e-06, "loss": 0.8348, "step": 7675 }, { "epoch": 0.45142756639502846, "grad_norm": 0.287109375, "learning_rate": 5.485724336049716e-06, "loss": 0.9294, "step": 7700 }, { "epoch": 0.45289324031189543, "grad_norm": 0.333984375, "learning_rate": 5.471067596881047e-06, "loss": 0.905, "step": 7725 }, { "epoch": 0.4543589142287624, "grad_norm": 0.2490234375, "learning_rate": 5.456410857712377e-06, "loss": 0.865, "step": 7750 }, { "epoch": 0.4558245881456294, "grad_norm": 0.2578125, "learning_rate": 5.441754118543707e-06, "loss": 0.8706, "step": 7775 }, { "epoch": 0.45729026206249634, "grad_norm": 1.2109375, "learning_rate": 5.427097379375037e-06, "loss": 1.0095, "step": 7800 }, { "epoch": 0.4587559359793633, "grad_norm": 0.177734375, "learning_rate": 5.412440640206368e-06, "loss": 0.801, "step": 7825 }, { "epoch": 0.4602216098962303, "grad_norm": 0.2578125, "learning_rate": 5.397783901037697e-06, "loss": 0.8175, "step": 7850 }, { "epoch": 0.46168728381309726, "grad_norm": 0.298828125, "learning_rate": 5.383127161869027e-06, "loss": 0.9689, "step": 7875 }, { "epoch": 0.46315295772996423, "grad_norm": 0.2373046875, "learning_rate": 5.368470422700358e-06, "loss": 0.8495, "step": 7900 }, { "epoch": 0.4646186316468312, "grad_norm": 0.1796875, "learning_rate": 5.353813683531688e-06, "loss": 0.9133, "step": 7925 }, { "epoch": 0.4660843055636982, "grad_norm": 0.2021484375, "learning_rate": 5.339156944363019e-06, "loss": 0.8828, "step": 7950 }, { "epoch": 0.46754997948056515, "grad_norm": 0.2578125, "learning_rate": 5.3245002051943485e-06, "loss": 1.0062, "step": 7975 }, { "epoch": 0.4690156533974321, "grad_norm": 0.240234375, "learning_rate": 5.309843466025679e-06, "loss": 0.8171, "step": 8000 }, { "epoch": 0.4704813273142991, "grad_norm": 0.28515625, "learning_rate": 5.295186726857009e-06, "loss": 0.8704, "step": 8025 }, { "epoch": 0.47194700123116606, "grad_norm": 0.4140625, "learning_rate": 5.28052998768834e-06, "loss": 0.866, "step": 8050 }, { "epoch": 0.4734126751480331, "grad_norm": 0.15625, "learning_rate": 5.265873248519669e-06, "loss": 0.8298, "step": 8075 }, { "epoch": 0.47487834906490006, "grad_norm": 0.330078125, "learning_rate": 5.251216509351001e-06, "loss": 0.8654, "step": 8100 }, { "epoch": 0.47634402298176703, "grad_norm": 0.205078125, "learning_rate": 5.23655977018233e-06, "loss": 0.9916, "step": 8125 }, { "epoch": 0.477809696898634, "grad_norm": 0.9609375, "learning_rate": 5.2219030310136605e-06, "loss": 0.9645, "step": 8150 }, { "epoch": 0.479275370815501, "grad_norm": 0.314453125, "learning_rate": 5.207246291844991e-06, "loss": 0.8867, "step": 8175 }, { "epoch": 0.48074104473236795, "grad_norm": 0.326171875, "learning_rate": 5.192589552676321e-06, "loss": 0.9363, "step": 8200 }, { "epoch": 0.4822067186492349, "grad_norm": 0.283203125, "learning_rate": 5.177932813507651e-06, "loss": 0.7848, "step": 8225 }, { "epoch": 0.4836723925661019, "grad_norm": 0.33203125, "learning_rate": 5.163276074338982e-06, "loss": 0.8899, "step": 8250 }, { "epoch": 0.48513806648296887, "grad_norm": 0.1748046875, "learning_rate": 5.148619335170312e-06, "loss": 0.8092, "step": 8275 }, { "epoch": 0.48660374039983584, "grad_norm": 0.41796875, "learning_rate": 5.133962596001641e-06, "loss": 0.899, "step": 8300 }, { "epoch": 0.4880694143167028, "grad_norm": 0.169921875, "learning_rate": 5.1193058568329725e-06, "loss": 0.9528, "step": 8325 }, { "epoch": 0.4895350882335698, "grad_norm": 0.1474609375, "learning_rate": 5.104649117664302e-06, "loss": 0.8668, "step": 8350 }, { "epoch": 0.49100076215043675, "grad_norm": 0.3828125, "learning_rate": 5.0899923784956325e-06, "loss": 0.9176, "step": 8375 }, { "epoch": 0.4924664360673037, "grad_norm": 0.44140625, "learning_rate": 5.075335639326963e-06, "loss": 0.9135, "step": 8400 }, { "epoch": 0.4939321099841707, "grad_norm": 0.279296875, "learning_rate": 5.060678900158293e-06, "loss": 0.8201, "step": 8425 }, { "epoch": 0.4953977839010377, "grad_norm": 0.62109375, "learning_rate": 5.046022160989623e-06, "loss": 0.8947, "step": 8450 }, { "epoch": 0.4968634578179047, "grad_norm": 0.279296875, "learning_rate": 5.031365421820954e-06, "loss": 0.8838, "step": 8475 }, { "epoch": 0.49832913173477167, "grad_norm": 0.1962890625, "learning_rate": 5.016708682652284e-06, "loss": 0.8648, "step": 8500 }, { "epoch": 0.49979480565163864, "grad_norm": 2.390625, "learning_rate": 5.002051943483615e-06, "loss": 0.9247, "step": 8525 }, { "epoch": 0.5012604795685056, "grad_norm": 0.2333984375, "learning_rate": 4.9873952043149445e-06, "loss": 0.8839, "step": 8550 }, { "epoch": 0.5027261534853725, "grad_norm": 0.51953125, "learning_rate": 4.972738465146275e-06, "loss": 0.9194, "step": 8575 }, { "epoch": 0.5041918274022396, "grad_norm": 0.26953125, "learning_rate": 4.9580817259776045e-06, "loss": 0.8232, "step": 8600 }, { "epoch": 0.5056575013191065, "grad_norm": 0.37890625, "learning_rate": 4.943424986808935e-06, "loss": 1.0866, "step": 8625 }, { "epoch": 0.5071231752359735, "grad_norm": 0.23046875, "learning_rate": 4.928768247640265e-06, "loss": 0.9213, "step": 8650 }, { "epoch": 0.5085888491528405, "grad_norm": 0.2236328125, "learning_rate": 4.914111508471596e-06, "loss": 0.993, "step": 8675 }, { "epoch": 0.5100545230697074, "grad_norm": 0.5234375, "learning_rate": 4.899454769302926e-06, "loss": 0.9335, "step": 8700 }, { "epoch": 0.5115201969865745, "grad_norm": 0.3125, "learning_rate": 4.8847980301342565e-06, "loss": 0.9327, "step": 8725 }, { "epoch": 0.5129858709034414, "grad_norm": 0.515625, "learning_rate": 4.870141290965587e-06, "loss": 0.8948, "step": 8750 }, { "epoch": 0.5144515448203084, "grad_norm": 0.1591796875, "learning_rate": 4.8554845517969164e-06, "loss": 0.9082, "step": 8775 }, { "epoch": 0.5159172187371753, "grad_norm": 0.2138671875, "learning_rate": 4.840827812628247e-06, "loss": 0.894, "step": 8800 }, { "epoch": 0.5173828926540424, "grad_norm": 0.21484375, "learning_rate": 4.826171073459577e-06, "loss": 0.9065, "step": 8825 }, { "epoch": 0.5188485665709093, "grad_norm": 0.1669921875, "learning_rate": 4.811514334290907e-06, "loss": 0.8453, "step": 8850 }, { "epoch": 0.5203142404877763, "grad_norm": 0.21484375, "learning_rate": 4.796857595122237e-06, "loss": 1.0002, "step": 8875 }, { "epoch": 0.5217799144046432, "grad_norm": 0.9921875, "learning_rate": 4.782200855953568e-06, "loss": 0.9558, "step": 8900 }, { "epoch": 0.5232455883215102, "grad_norm": 0.1884765625, "learning_rate": 4.767544116784898e-06, "loss": 0.7789, "step": 8925 }, { "epoch": 0.5247112622383772, "grad_norm": 0.365234375, "learning_rate": 4.7528873776162284e-06, "loss": 0.8602, "step": 8950 }, { "epoch": 0.5261769361552442, "grad_norm": 0.451171875, "learning_rate": 4.738230638447559e-06, "loss": 0.849, "step": 8975 }, { "epoch": 0.5276426100721111, "grad_norm": 0.1591796875, "learning_rate": 4.723573899278888e-06, "loss": 0.8461, "step": 9000 }, { "epoch": 0.5291082839889781, "grad_norm": 0.47265625, "learning_rate": 4.708917160110219e-06, "loss": 0.8729, "step": 9025 }, { "epoch": 0.5305739579058452, "grad_norm": 0.341796875, "learning_rate": 4.694260420941549e-06, "loss": 1.0451, "step": 9050 }, { "epoch": 0.5320396318227121, "grad_norm": 0.1943359375, "learning_rate": 4.67960368177288e-06, "loss": 0.817, "step": 9075 }, { "epoch": 0.5335053057395791, "grad_norm": 0.2119140625, "learning_rate": 4.66494694260421e-06, "loss": 0.9171, "step": 9100 }, { "epoch": 0.534970979656446, "grad_norm": 0.353515625, "learning_rate": 4.6502902034355404e-06, "loss": 0.7783, "step": 9125 }, { "epoch": 0.536436653573313, "grad_norm": 0.44921875, "learning_rate": 4.635633464266871e-06, "loss": 0.8943, "step": 9150 }, { "epoch": 0.53790232749018, "grad_norm": 0.2490234375, "learning_rate": 4.6209767250982e-06, "loss": 0.9554, "step": 9175 }, { "epoch": 0.539368001407047, "grad_norm": 0.162109375, "learning_rate": 4.606319985929531e-06, "loss": 0.7875, "step": 9200 }, { "epoch": 0.5408336753239139, "grad_norm": 0.33984375, "learning_rate": 4.591663246760861e-06, "loss": 0.9086, "step": 9225 }, { "epoch": 0.5422993492407809, "grad_norm": 0.279296875, "learning_rate": 4.577006507592191e-06, "loss": 0.9511, "step": 9250 }, { "epoch": 0.5437650231576479, "grad_norm": 0.2353515625, "learning_rate": 4.562349768423521e-06, "loss": 0.8182, "step": 9275 }, { "epoch": 0.5452306970745149, "grad_norm": 0.302734375, "learning_rate": 4.547693029254852e-06, "loss": 0.8487, "step": 9300 }, { "epoch": 0.5466963709913818, "grad_norm": 0.1845703125, "learning_rate": 4.533036290086182e-06, "loss": 0.8708, "step": 9325 }, { "epoch": 0.5481620449082488, "grad_norm": 0.265625, "learning_rate": 4.518379550917512e-06, "loss": 0.8577, "step": 9350 }, { "epoch": 0.5496277188251157, "grad_norm": 0.36328125, "learning_rate": 4.503722811748843e-06, "loss": 0.8896, "step": 9375 }, { "epoch": 0.5510933927419828, "grad_norm": 0.2314453125, "learning_rate": 4.489066072580172e-06, "loss": 0.8778, "step": 9400 }, { "epoch": 0.5525590666588497, "grad_norm": 0.283203125, "learning_rate": 4.474409333411503e-06, "loss": 0.847, "step": 9425 }, { "epoch": 0.5540247405757167, "grad_norm": 0.384765625, "learning_rate": 4.459752594242833e-06, "loss": 0.8678, "step": 9450 }, { "epoch": 0.5554904144925837, "grad_norm": 0.228515625, "learning_rate": 4.4450958550741636e-06, "loss": 0.7334, "step": 9475 }, { "epoch": 0.5569560884094507, "grad_norm": 0.2080078125, "learning_rate": 4.430439115905494e-06, "loss": 1.1603, "step": 9500 }, { "epoch": 0.5584217623263177, "grad_norm": 0.19921875, "learning_rate": 4.415782376736824e-06, "loss": 0.8804, "step": 9525 }, { "epoch": 0.5598874362431846, "grad_norm": 0.431640625, "learning_rate": 4.401125637568155e-06, "loss": 0.9023, "step": 9550 }, { "epoch": 0.5613531101600516, "grad_norm": 0.2353515625, "learning_rate": 4.386468898399484e-06, "loss": 0.8169, "step": 9575 }, { "epoch": 0.5628187840769185, "grad_norm": 0.119140625, "learning_rate": 4.371812159230815e-06, "loss": 0.919, "step": 9600 }, { "epoch": 0.5642844579937856, "grad_norm": 0.546875, "learning_rate": 4.357155420062145e-06, "loss": 0.8916, "step": 9625 }, { "epoch": 0.5657501319106525, "grad_norm": 0.359375, "learning_rate": 4.342498680893475e-06, "loss": 0.832, "step": 9650 }, { "epoch": 0.5672158058275195, "grad_norm": 0.302734375, "learning_rate": 4.327841941724805e-06, "loss": 0.824, "step": 9675 }, { "epoch": 0.5686814797443864, "grad_norm": 0.361328125, "learning_rate": 4.3131852025561355e-06, "loss": 0.8998, "step": 9700 }, { "epoch": 0.5701471536612535, "grad_norm": 0.44140625, "learning_rate": 4.298528463387466e-06, "loss": 0.8192, "step": 9725 }, { "epoch": 0.5716128275781204, "grad_norm": 0.169921875, "learning_rate": 4.283871724218796e-06, "loss": 0.825, "step": 9750 }, { "epoch": 0.5730785014949874, "grad_norm": 0.2734375, "learning_rate": 4.269214985050127e-06, "loss": 0.9014, "step": 9775 }, { "epoch": 0.5745441754118543, "grad_norm": 0.65234375, "learning_rate": 4.254558245881456e-06, "loss": 1.1582, "step": 9800 }, { "epoch": 0.5760098493287213, "grad_norm": 0.2138671875, "learning_rate": 4.239901506712787e-06, "loss": 0.9225, "step": 9825 }, { "epoch": 0.5774755232455884, "grad_norm": 0.328125, "learning_rate": 4.225244767544117e-06, "loss": 0.8293, "step": 9850 }, { "epoch": 0.5789411971624553, "grad_norm": 0.26171875, "learning_rate": 4.2105880283754475e-06, "loss": 0.9205, "step": 9875 }, { "epoch": 0.5804068710793223, "grad_norm": 0.28125, "learning_rate": 4.195931289206778e-06, "loss": 0.9015, "step": 9900 }, { "epoch": 0.5818725449961892, "grad_norm": 0.205078125, "learning_rate": 4.181274550038108e-06, "loss": 0.7924, "step": 9925 }, { "epoch": 0.5833382189130563, "grad_norm": 0.78515625, "learning_rate": 4.166617810869438e-06, "loss": 1.0072, "step": 9950 }, { "epoch": 0.5848038928299232, "grad_norm": 0.2353515625, "learning_rate": 4.151961071700768e-06, "loss": 0.9099, "step": 9975 }, { "epoch": 0.5862695667467902, "grad_norm": 1.2265625, "learning_rate": 4.137304332532099e-06, "loss": 1.1021, "step": 10000 }, { "epoch": 0.5877352406636571, "grad_norm": 0.33984375, "learning_rate": 4.122647593363428e-06, "loss": 0.8166, "step": 10025 }, { "epoch": 0.5892009145805241, "grad_norm": 0.296875, "learning_rate": 4.107990854194759e-06, "loss": 0.9566, "step": 10050 }, { "epoch": 0.5906665884973911, "grad_norm": 0.2119140625, "learning_rate": 4.093334115026089e-06, "loss": 0.8494, "step": 10075 }, { "epoch": 0.5921322624142581, "grad_norm": 0.2451171875, "learning_rate": 4.0786773758574195e-06, "loss": 0.9279, "step": 10100 }, { "epoch": 0.593597936331125, "grad_norm": 0.2275390625, "learning_rate": 4.06402063668875e-06, "loss": 0.8628, "step": 10125 }, { "epoch": 0.595063610247992, "grad_norm": 0.310546875, "learning_rate": 4.04936389752008e-06, "loss": 0.9406, "step": 10150 }, { "epoch": 0.596529284164859, "grad_norm": 0.212890625, "learning_rate": 4.03470715835141e-06, "loss": 0.896, "step": 10175 }, { "epoch": 0.597994958081726, "grad_norm": 0.421875, "learning_rate": 4.02005041918274e-06, "loss": 0.8608, "step": 10200 }, { "epoch": 0.599460631998593, "grad_norm": 0.255859375, "learning_rate": 4.005393680014071e-06, "loss": 0.9103, "step": 10225 }, { "epoch": 0.6009263059154599, "grad_norm": 0.29296875, "learning_rate": 3.990736940845401e-06, "loss": 0.8136, "step": 10250 }, { "epoch": 0.602391979832327, "grad_norm": 0.19921875, "learning_rate": 3.9760802016767315e-06, "loss": 0.8374, "step": 10275 }, { "epoch": 0.6038576537491939, "grad_norm": 0.1806640625, "learning_rate": 3.961423462508062e-06, "loss": 0.8049, "step": 10300 }, { "epoch": 0.6053233276660609, "grad_norm": 0.392578125, "learning_rate": 3.946766723339392e-06, "loss": 0.7943, "step": 10325 }, { "epoch": 0.6067890015829278, "grad_norm": 1.0, "learning_rate": 3.932109984170722e-06, "loss": 0.8691, "step": 10350 }, { "epoch": 0.6082546754997948, "grad_norm": 0.146484375, "learning_rate": 3.917453245002052e-06, "loss": 0.8565, "step": 10375 }, { "epoch": 0.6097203494166618, "grad_norm": 0.357421875, "learning_rate": 3.902796505833383e-06, "loss": 0.9866, "step": 10400 }, { "epoch": 0.6111860233335288, "grad_norm": 0.236328125, "learning_rate": 3.888139766664712e-06, "loss": 0.9026, "step": 10425 }, { "epoch": 0.6126516972503957, "grad_norm": 0.416015625, "learning_rate": 3.873483027496043e-06, "loss": 0.9329, "step": 10450 }, { "epoch": 0.6141173711672627, "grad_norm": 0.458984375, "learning_rate": 3.858826288327373e-06, "loss": 0.8534, "step": 10475 }, { "epoch": 0.6155830450841296, "grad_norm": 0.279296875, "learning_rate": 3.8441695491587034e-06, "loss": 0.824, "step": 10500 }, { "epoch": 0.6170487190009967, "grad_norm": 0.255859375, "learning_rate": 3.829512809990034e-06, "loss": 0.8444, "step": 10525 }, { "epoch": 0.6185143929178636, "grad_norm": 0.26953125, "learning_rate": 3.814856070821364e-06, "loss": 0.8843, "step": 10550 }, { "epoch": 0.6199800668347306, "grad_norm": 0.1748046875, "learning_rate": 3.8001993316526942e-06, "loss": 0.8791, "step": 10575 }, { "epoch": 0.6214457407515975, "grad_norm": 0.287109375, "learning_rate": 3.7855425924840246e-06, "loss": 0.8695, "step": 10600 }, { "epoch": 0.6229114146684646, "grad_norm": 0.1748046875, "learning_rate": 3.7708858533153546e-06, "loss": 0.8917, "step": 10625 }, { "epoch": 0.6243770885853316, "grad_norm": 0.4921875, "learning_rate": 3.756229114146685e-06, "loss": 0.7642, "step": 10650 }, { "epoch": 0.6258427625021985, "grad_norm": 0.2421875, "learning_rate": 3.7415723749780154e-06, "loss": 0.8826, "step": 10675 }, { "epoch": 0.6273084364190655, "grad_norm": 0.197265625, "learning_rate": 3.7269156358093454e-06, "loss": 0.8513, "step": 10700 }, { "epoch": 0.6287741103359324, "grad_norm": 0.50390625, "learning_rate": 3.712258896640676e-06, "loss": 1.0076, "step": 10725 }, { "epoch": 0.6302397842527995, "grad_norm": 0.220703125, "learning_rate": 3.697602157472006e-06, "loss": 0.8629, "step": 10750 }, { "epoch": 0.6317054581696664, "grad_norm": 0.251953125, "learning_rate": 3.6829454183033366e-06, "loss": 0.7775, "step": 10775 }, { "epoch": 0.6331711320865334, "grad_norm": 0.2265625, "learning_rate": 3.6682886791346666e-06, "loss": 0.9906, "step": 10800 }, { "epoch": 0.6346368060034003, "grad_norm": 0.298828125, "learning_rate": 3.6536319399659966e-06, "loss": 0.8601, "step": 10825 }, { "epoch": 0.6361024799202674, "grad_norm": 1.6875, "learning_rate": 3.6389752007973266e-06, "loss": 0.848, "step": 10850 }, { "epoch": 0.6375681538371343, "grad_norm": 0.2490234375, "learning_rate": 3.624318461628657e-06, "loss": 0.9411, "step": 10875 }, { "epoch": 0.6390338277540013, "grad_norm": 0.2431640625, "learning_rate": 3.6096617224599874e-06, "loss": 0.9371, "step": 10900 }, { "epoch": 0.6404995016708682, "grad_norm": 0.302734375, "learning_rate": 3.5950049832913174e-06, "loss": 0.8688, "step": 10925 }, { "epoch": 0.6419651755877352, "grad_norm": 0.333984375, "learning_rate": 3.5803482441226478e-06, "loss": 0.9022, "step": 10950 }, { "epoch": 0.6434308495046022, "grad_norm": 0.259765625, "learning_rate": 3.565691504953978e-06, "loss": 0.8537, "step": 10975 }, { "epoch": 0.6448965234214692, "grad_norm": 0.5703125, "learning_rate": 3.5510347657853086e-06, "loss": 0.8228, "step": 11000 }, { "epoch": 0.6463621973383362, "grad_norm": 0.259765625, "learning_rate": 3.5363780266166386e-06, "loss": 0.8286, "step": 11025 }, { "epoch": 0.6478278712552031, "grad_norm": 0.26171875, "learning_rate": 3.521721287447969e-06, "loss": 0.8312, "step": 11050 }, { "epoch": 0.6492935451720702, "grad_norm": 0.294921875, "learning_rate": 3.5070645482792994e-06, "loss": 0.8555, "step": 11075 }, { "epoch": 0.6507592190889371, "grad_norm": 0.37890625, "learning_rate": 3.4924078091106293e-06, "loss": 0.8602, "step": 11100 }, { "epoch": 0.6522248930058041, "grad_norm": 0.294921875, "learning_rate": 3.4777510699419598e-06, "loss": 0.875, "step": 11125 }, { "epoch": 0.653690566922671, "grad_norm": 3.171875, "learning_rate": 3.46309433077329e-06, "loss": 1.0157, "step": 11150 }, { "epoch": 0.655156240839538, "grad_norm": 0.2197265625, "learning_rate": 3.44843759160462e-06, "loss": 0.8448, "step": 11175 }, { "epoch": 0.656621914756405, "grad_norm": 0.53515625, "learning_rate": 3.4337808524359505e-06, "loss": 0.8291, "step": 11200 }, { "epoch": 0.658087588673272, "grad_norm": 0.2578125, "learning_rate": 3.4191241132672805e-06, "loss": 0.9121, "step": 11225 }, { "epoch": 0.6595532625901389, "grad_norm": 0.1962890625, "learning_rate": 3.4044673740986105e-06, "loss": 0.9451, "step": 11250 }, { "epoch": 0.6610189365070059, "grad_norm": 0.458984375, "learning_rate": 3.389810634929941e-06, "loss": 0.8054, "step": 11275 }, { "epoch": 0.6624846104238729, "grad_norm": 0.2216796875, "learning_rate": 3.3751538957612713e-06, "loss": 0.7991, "step": 11300 }, { "epoch": 0.6639502843407399, "grad_norm": 0.57421875, "learning_rate": 3.3604971565926013e-06, "loss": 0.9013, "step": 11325 }, { "epoch": 0.6654159582576068, "grad_norm": 0.373046875, "learning_rate": 3.3458404174239317e-06, "loss": 1.0514, "step": 11350 }, { "epoch": 0.6668816321744738, "grad_norm": 0.3125, "learning_rate": 3.331183678255262e-06, "loss": 0.8199, "step": 11375 }, { "epoch": 0.6683473060913409, "grad_norm": 0.416015625, "learning_rate": 3.316526939086592e-06, "loss": 0.8791, "step": 11400 }, { "epoch": 0.6698129800082078, "grad_norm": 0.39453125, "learning_rate": 3.3018701999179225e-06, "loss": 0.9326, "step": 11425 }, { "epoch": 0.6712786539250748, "grad_norm": 0.365234375, "learning_rate": 3.287213460749253e-06, "loss": 0.8829, "step": 11450 }, { "epoch": 0.6727443278419417, "grad_norm": 0.30859375, "learning_rate": 3.2725567215805833e-06, "loss": 0.8332, "step": 11475 }, { "epoch": 0.6742100017588087, "grad_norm": 0.65625, "learning_rate": 3.2578999824119133e-06, "loss": 0.9214, "step": 11500 }, { "epoch": 0.6756756756756757, "grad_norm": 0.203125, "learning_rate": 3.2432432432432437e-06, "loss": 0.814, "step": 11525 }, { "epoch": 0.6771413495925427, "grad_norm": 0.255859375, "learning_rate": 3.228586504074574e-06, "loss": 0.8175, "step": 11550 }, { "epoch": 0.6786070235094096, "grad_norm": 0.369140625, "learning_rate": 3.213929764905904e-06, "loss": 0.8749, "step": 11575 }, { "epoch": 0.6800726974262766, "grad_norm": 0.2177734375, "learning_rate": 3.199273025737234e-06, "loss": 0.7743, "step": 11600 }, { "epoch": 0.6815383713431435, "grad_norm": 0.26171875, "learning_rate": 3.184616286568564e-06, "loss": 0.8811, "step": 11625 }, { "epoch": 0.6830040452600106, "grad_norm": 0.173828125, "learning_rate": 3.1699595473998945e-06, "loss": 0.8967, "step": 11650 }, { "epoch": 0.6844697191768775, "grad_norm": 0.18359375, "learning_rate": 3.155302808231225e-06, "loss": 0.9144, "step": 11675 }, { "epoch": 0.6859353930937445, "grad_norm": 0.296875, "learning_rate": 3.1406460690625553e-06, "loss": 0.8852, "step": 11700 }, { "epoch": 0.6874010670106114, "grad_norm": 0.248046875, "learning_rate": 3.1259893298938853e-06, "loss": 0.875, "step": 11725 }, { "epoch": 0.6888667409274785, "grad_norm": 0.189453125, "learning_rate": 3.1113325907252157e-06, "loss": 0.9148, "step": 11750 }, { "epoch": 0.6903324148443454, "grad_norm": 0.63671875, "learning_rate": 3.096675851556546e-06, "loss": 0.8413, "step": 11775 }, { "epoch": 0.6917980887612124, "grad_norm": 0.1787109375, "learning_rate": 3.082019112387876e-06, "loss": 0.9005, "step": 11800 }, { "epoch": 0.6932637626780794, "grad_norm": 0.302734375, "learning_rate": 3.0673623732192065e-06, "loss": 0.8366, "step": 11825 }, { "epoch": 0.6947294365949463, "grad_norm": 0.318359375, "learning_rate": 3.052705634050537e-06, "loss": 0.9477, "step": 11850 }, { "epoch": 0.6961951105118134, "grad_norm": 0.302734375, "learning_rate": 3.038048894881867e-06, "loss": 0.9308, "step": 11875 }, { "epoch": 0.6976607844286803, "grad_norm": 0.119140625, "learning_rate": 3.0233921557131972e-06, "loss": 0.7818, "step": 11900 }, { "epoch": 0.6991264583455473, "grad_norm": 0.296875, "learning_rate": 3.0087354165445277e-06, "loss": 0.9406, "step": 11925 }, { "epoch": 0.7005921322624142, "grad_norm": 0.244140625, "learning_rate": 2.994078677375858e-06, "loss": 1.0378, "step": 11950 }, { "epoch": 0.7020578061792813, "grad_norm": 5.25, "learning_rate": 2.979421938207188e-06, "loss": 0.8277, "step": 11975 }, { "epoch": 0.7035234800961482, "grad_norm": 0.275390625, "learning_rate": 2.964765199038518e-06, "loss": 0.8431, "step": 12000 }, { "epoch": 0.7049891540130152, "grad_norm": 0.1982421875, "learning_rate": 2.950108459869848e-06, "loss": 0.8696, "step": 12025 }, { "epoch": 0.7064548279298821, "grad_norm": 0.287109375, "learning_rate": 2.9354517207011784e-06, "loss": 0.8615, "step": 12050 }, { "epoch": 0.7079205018467492, "grad_norm": 0.416015625, "learning_rate": 2.920794981532509e-06, "loss": 0.9367, "step": 12075 }, { "epoch": 0.7093861757636161, "grad_norm": 0.2294921875, "learning_rate": 2.9061382423638392e-06, "loss": 0.9749, "step": 12100 }, { "epoch": 0.7108518496804831, "grad_norm": 0.375, "learning_rate": 2.891481503195169e-06, "loss": 0.8747, "step": 12125 }, { "epoch": 0.71231752359735, "grad_norm": 0.306640625, "learning_rate": 2.8768247640264996e-06, "loss": 0.8151, "step": 12150 }, { "epoch": 0.713783197514217, "grad_norm": 0.30078125, "learning_rate": 2.86216802485783e-06, "loss": 0.8211, "step": 12175 }, { "epoch": 0.7152488714310841, "grad_norm": 0.318359375, "learning_rate": 2.84751128568916e-06, "loss": 0.8662, "step": 12200 }, { "epoch": 0.716714545347951, "grad_norm": 0.6640625, "learning_rate": 2.8328545465204904e-06, "loss": 0.909, "step": 12225 }, { "epoch": 0.718180219264818, "grad_norm": 1.0234375, "learning_rate": 2.818197807351821e-06, "loss": 0.9167, "step": 12250 }, { "epoch": 0.7196458931816849, "grad_norm": 0.73828125, "learning_rate": 2.803541068183151e-06, "loss": 0.8634, "step": 12275 }, { "epoch": 0.721111567098552, "grad_norm": 0.16015625, "learning_rate": 2.788884329014481e-06, "loss": 1.0852, "step": 12300 }, { "epoch": 0.7225772410154189, "grad_norm": 0.2216796875, "learning_rate": 2.7742275898458116e-06, "loss": 0.9097, "step": 12325 }, { "epoch": 0.7240429149322859, "grad_norm": 0.373046875, "learning_rate": 2.759570850677142e-06, "loss": 0.8853, "step": 12350 }, { "epoch": 0.7255085888491528, "grad_norm": 0.1513671875, "learning_rate": 2.744914111508472e-06, "loss": 0.9239, "step": 12375 }, { "epoch": 0.7269742627660198, "grad_norm": 0.1708984375, "learning_rate": 2.730257372339802e-06, "loss": 1.0119, "step": 12400 }, { "epoch": 0.7284399366828868, "grad_norm": 0.279296875, "learning_rate": 2.715600633171132e-06, "loss": 0.9003, "step": 12425 }, { "epoch": 0.7299056105997538, "grad_norm": 0.2333984375, "learning_rate": 2.7009438940024624e-06, "loss": 0.8912, "step": 12450 }, { "epoch": 0.7313712845166207, "grad_norm": 0.3125, "learning_rate": 2.6862871548337928e-06, "loss": 0.9916, "step": 12475 }, { "epoch": 0.7328369584334877, "grad_norm": 0.208984375, "learning_rate": 2.6716304156651227e-06, "loss": 1.1092, "step": 12500 }, { "epoch": 0.7343026323503546, "grad_norm": 0.27734375, "learning_rate": 2.656973676496453e-06, "loss": 1.1346, "step": 12525 }, { "epoch": 0.7357683062672217, "grad_norm": 0.546875, "learning_rate": 2.6423169373277836e-06, "loss": 0.9109, "step": 12550 }, { "epoch": 0.7372339801840886, "grad_norm": 0.25390625, "learning_rate": 2.627660198159114e-06, "loss": 0.8605, "step": 12575 }, { "epoch": 0.7386996541009556, "grad_norm": 4.1875, "learning_rate": 2.613003458990444e-06, "loss": 0.8093, "step": 12600 }, { "epoch": 0.7401653280178226, "grad_norm": 0.1875, "learning_rate": 2.5983467198217744e-06, "loss": 0.9045, "step": 12625 }, { "epoch": 0.7416310019346896, "grad_norm": 0.25390625, "learning_rate": 2.5836899806531048e-06, "loss": 0.9586, "step": 12650 }, { "epoch": 0.7430966758515566, "grad_norm": 0.318359375, "learning_rate": 2.5690332414844347e-06, "loss": 0.8817, "step": 12675 }, { "epoch": 0.7445623497684235, "grad_norm": 0.6484375, "learning_rate": 2.554376502315765e-06, "loss": 1.0672, "step": 12700 }, { "epoch": 0.7460280236852905, "grad_norm": 0.37890625, "learning_rate": 2.5397197631470956e-06, "loss": 0.7547, "step": 12725 }, { "epoch": 0.7474936976021574, "grad_norm": 0.271484375, "learning_rate": 2.5250630239784255e-06, "loss": 0.86, "step": 12750 }, { "epoch": 0.7489593715190245, "grad_norm": 0.19921875, "learning_rate": 2.5104062848097555e-06, "loss": 0.837, "step": 12775 }, { "epoch": 0.7504250454358914, "grad_norm": 0.271484375, "learning_rate": 2.495749545641086e-06, "loss": 0.969, "step": 12800 }, { "epoch": 0.7518907193527584, "grad_norm": 0.2275390625, "learning_rate": 2.4810928064724163e-06, "loss": 0.858, "step": 12825 }, { "epoch": 0.7533563932696253, "grad_norm": 0.318359375, "learning_rate": 2.4664360673037467e-06, "loss": 0.8171, "step": 12850 }, { "epoch": 0.7548220671864924, "grad_norm": 0.23046875, "learning_rate": 2.4517793281350767e-06, "loss": 0.9102, "step": 12875 }, { "epoch": 0.7562877411033593, "grad_norm": 0.59765625, "learning_rate": 2.4371225889664067e-06, "loss": 0.991, "step": 12900 }, { "epoch": 0.7577534150202263, "grad_norm": 0.2333984375, "learning_rate": 2.422465849797737e-06, "loss": 0.8257, "step": 12925 }, { "epoch": 0.7592190889370932, "grad_norm": 0.435546875, "learning_rate": 2.4078091106290675e-06, "loss": 0.9095, "step": 12950 }, { "epoch": 0.7606847628539603, "grad_norm": 0.1337890625, "learning_rate": 2.3931523714603975e-06, "loss": 0.9126, "step": 12975 }, { "epoch": 0.7621504367708273, "grad_norm": 0.2314453125, "learning_rate": 2.378495632291728e-06, "loss": 0.8467, "step": 13000 }, { "epoch": 0.7636161106876942, "grad_norm": 0.291015625, "learning_rate": 2.3638388931230583e-06, "loss": 0.8176, "step": 13025 }, { "epoch": 0.7650817846045612, "grad_norm": 0.2333984375, "learning_rate": 2.3491821539543887e-06, "loss": 0.8675, "step": 13050 }, { "epoch": 0.7665474585214281, "grad_norm": 0.35546875, "learning_rate": 2.3345254147857187e-06, "loss": 0.8793, "step": 13075 }, { "epoch": 0.7680131324382952, "grad_norm": 0.1865234375, "learning_rate": 2.3198686756170487e-06, "loss": 0.9777, "step": 13100 }, { "epoch": 0.7694788063551621, "grad_norm": 0.31640625, "learning_rate": 2.305211936448379e-06, "loss": 0.8507, "step": 13125 }, { "epoch": 0.7709444802720291, "grad_norm": 0.189453125, "learning_rate": 2.2905551972797095e-06, "loss": 0.8181, "step": 13150 }, { "epoch": 0.772410154188896, "grad_norm": 0.2451171875, "learning_rate": 2.2758984581110395e-06, "loss": 0.8968, "step": 13175 }, { "epoch": 0.773875828105763, "grad_norm": 0.35546875, "learning_rate": 2.26124171894237e-06, "loss": 0.8177, "step": 13200 }, { "epoch": 0.77534150202263, "grad_norm": 0.2216796875, "learning_rate": 2.2465849797737003e-06, "loss": 0.8724, "step": 13225 }, { "epoch": 0.776807175939497, "grad_norm": 0.404296875, "learning_rate": 2.2319282406050307e-06, "loss": 0.965, "step": 13250 }, { "epoch": 0.7782728498563639, "grad_norm": 0.263671875, "learning_rate": 2.2172715014363607e-06, "loss": 0.8441, "step": 13275 }, { "epoch": 0.7797385237732309, "grad_norm": 0.18359375, "learning_rate": 2.2026147622676906e-06, "loss": 0.8694, "step": 13300 }, { "epoch": 0.7812041976900979, "grad_norm": 0.3671875, "learning_rate": 2.187958023099021e-06, "loss": 0.8556, "step": 13325 }, { "epoch": 0.7826698716069649, "grad_norm": 0.2197265625, "learning_rate": 2.1733012839303515e-06, "loss": 0.8436, "step": 13350 }, { "epoch": 0.7841355455238319, "grad_norm": 0.1748046875, "learning_rate": 2.1586445447616814e-06, "loss": 0.8208, "step": 13375 }, { "epoch": 0.7856012194406988, "grad_norm": 0.146484375, "learning_rate": 2.143987805593012e-06, "loss": 0.8849, "step": 13400 }, { "epoch": 0.7870668933575659, "grad_norm": 0.25390625, "learning_rate": 2.1293310664243422e-06, "loss": 0.785, "step": 13425 }, { "epoch": 0.7885325672744328, "grad_norm": 0.1630859375, "learning_rate": 2.1146743272556727e-06, "loss": 0.8334, "step": 13450 }, { "epoch": 0.7899982411912998, "grad_norm": 0.1455078125, "learning_rate": 2.1000175880870026e-06, "loss": 0.8887, "step": 13475 }, { "epoch": 0.7914639151081667, "grad_norm": 4.5, "learning_rate": 2.0853608489183326e-06, "loss": 0.9221, "step": 13500 }, { "epoch": 0.7929295890250337, "grad_norm": 0.166015625, "learning_rate": 2.070704109749663e-06, "loss": 0.9046, "step": 13525 }, { "epoch": 0.7943952629419007, "grad_norm": 0.94921875, "learning_rate": 2.0560473705809934e-06, "loss": 0.9419, "step": 13550 }, { "epoch": 0.7958609368587677, "grad_norm": 0.17578125, "learning_rate": 2.0413906314123234e-06, "loss": 0.9732, "step": 13575 }, { "epoch": 0.7973266107756346, "grad_norm": 0.376953125, "learning_rate": 2.026733892243654e-06, "loss": 0.8457, "step": 13600 }, { "epoch": 0.7987922846925016, "grad_norm": 0.23046875, "learning_rate": 2.0120771530749842e-06, "loss": 0.9258, "step": 13625 }, { "epoch": 0.8002579586093685, "grad_norm": 0.25, "learning_rate": 1.997420413906314e-06, "loss": 0.8751, "step": 13650 }, { "epoch": 0.8017236325262356, "grad_norm": 0.240234375, "learning_rate": 1.9827636747376446e-06, "loss": 0.892, "step": 13675 }, { "epoch": 0.8031893064431025, "grad_norm": 0.234375, "learning_rate": 1.9681069355689746e-06, "loss": 0.8987, "step": 13700 }, { "epoch": 0.8046549803599695, "grad_norm": 0.1474609375, "learning_rate": 1.953450196400305e-06, "loss": 0.8482, "step": 13725 }, { "epoch": 0.8061206542768364, "grad_norm": 0.2314453125, "learning_rate": 1.9387934572316354e-06, "loss": 0.965, "step": 13750 }, { "epoch": 0.8075863281937035, "grad_norm": 0.2275390625, "learning_rate": 1.9241367180629654e-06, "loss": 0.8086, "step": 13775 }, { "epoch": 0.8090520021105705, "grad_norm": 0.359375, "learning_rate": 1.909479978894296e-06, "loss": 0.8894, "step": 13800 }, { "epoch": 0.8105176760274374, "grad_norm": 1.953125, "learning_rate": 1.8948232397256262e-06, "loss": 0.8153, "step": 13825 }, { "epoch": 0.8119833499443044, "grad_norm": 0.50390625, "learning_rate": 1.8801665005569564e-06, "loss": 0.8529, "step": 13850 }, { "epoch": 0.8134490238611713, "grad_norm": 0.1884765625, "learning_rate": 1.8655097613882864e-06, "loss": 0.9254, "step": 13875 }, { "epoch": 0.8149146977780384, "grad_norm": 0.6015625, "learning_rate": 1.8508530222196166e-06, "loss": 0.932, "step": 13900 }, { "epoch": 0.8163803716949053, "grad_norm": 0.275390625, "learning_rate": 1.836196283050947e-06, "loss": 0.9038, "step": 13925 }, { "epoch": 0.8178460456117723, "grad_norm": 0.193359375, "learning_rate": 1.8215395438822772e-06, "loss": 0.8293, "step": 13950 }, { "epoch": 0.8193117195286392, "grad_norm": 1.078125, "learning_rate": 1.8068828047136076e-06, "loss": 0.8581, "step": 13975 }, { "epoch": 0.8207773934455063, "grad_norm": 0.294921875, "learning_rate": 1.7922260655449378e-06, "loss": 0.8774, "step": 14000 }, { "epoch": 0.8222430673623732, "grad_norm": 0.166015625, "learning_rate": 1.777569326376268e-06, "loss": 0.9594, "step": 14025 }, { "epoch": 0.8237087412792402, "grad_norm": 0.251953125, "learning_rate": 1.7629125872075984e-06, "loss": 0.9275, "step": 14050 }, { "epoch": 0.8251744151961071, "grad_norm": 0.212890625, "learning_rate": 1.7482558480389283e-06, "loss": 0.8318, "step": 14075 }, { "epoch": 0.8266400891129742, "grad_norm": 0.64453125, "learning_rate": 1.7335991088702585e-06, "loss": 0.9462, "step": 14100 }, { "epoch": 0.8281057630298411, "grad_norm": 0.33203125, "learning_rate": 1.718942369701589e-06, "loss": 0.879, "step": 14125 }, { "epoch": 0.8295714369467081, "grad_norm": 0.486328125, "learning_rate": 1.7042856305329191e-06, "loss": 0.8837, "step": 14150 }, { "epoch": 0.8310371108635751, "grad_norm": 0.390625, "learning_rate": 1.6896288913642495e-06, "loss": 0.8956, "step": 14175 }, { "epoch": 0.832502784780442, "grad_norm": 0.765625, "learning_rate": 1.6749721521955797e-06, "loss": 0.9283, "step": 14200 }, { "epoch": 0.8339684586973091, "grad_norm": 0.64453125, "learning_rate": 1.66031541302691e-06, "loss": 0.9279, "step": 14225 }, { "epoch": 0.835434132614176, "grad_norm": 0.5859375, "learning_rate": 1.64565867385824e-06, "loss": 0.8446, "step": 14250 }, { "epoch": 0.836899806531043, "grad_norm": 0.154296875, "learning_rate": 1.6310019346895703e-06, "loss": 0.8585, "step": 14275 }, { "epoch": 0.8383654804479099, "grad_norm": 0.2412109375, "learning_rate": 1.6163451955209005e-06, "loss": 0.9229, "step": 14300 }, { "epoch": 0.839831154364777, "grad_norm": 0.197265625, "learning_rate": 1.601688456352231e-06, "loss": 0.7812, "step": 14325 }, { "epoch": 0.8412968282816439, "grad_norm": 0.228515625, "learning_rate": 1.5870317171835611e-06, "loss": 0.925, "step": 14350 }, { "epoch": 0.8427625021985109, "grad_norm": 0.25, "learning_rate": 1.5723749780148915e-06, "loss": 0.9276, "step": 14375 }, { "epoch": 0.8442281761153778, "grad_norm": 0.365234375, "learning_rate": 1.5577182388462217e-06, "loss": 0.8881, "step": 14400 }, { "epoch": 0.8456938500322448, "grad_norm": 0.166015625, "learning_rate": 1.543061499677552e-06, "loss": 0.8279, "step": 14425 }, { "epoch": 0.8471595239491118, "grad_norm": 0.466796875, "learning_rate": 1.5284047605088819e-06, "loss": 0.8415, "step": 14450 }, { "epoch": 0.8486251978659788, "grad_norm": 0.220703125, "learning_rate": 1.5137480213402123e-06, "loss": 0.9089, "step": 14475 }, { "epoch": 0.8500908717828457, "grad_norm": 0.1953125, "learning_rate": 1.4990912821715425e-06, "loss": 0.8786, "step": 14500 }, { "epoch": 0.8515565456997127, "grad_norm": 0.29296875, "learning_rate": 1.484434543002873e-06, "loss": 1.0085, "step": 14525 }, { "epoch": 0.8530222196165798, "grad_norm": 0.240234375, "learning_rate": 1.469777803834203e-06, "loss": 0.9566, "step": 14550 }, { "epoch": 0.8544878935334467, "grad_norm": 0.3828125, "learning_rate": 1.4551210646655333e-06, "loss": 0.7709, "step": 14575 }, { "epoch": 0.8559535674503137, "grad_norm": 0.18359375, "learning_rate": 1.4404643254968637e-06, "loss": 0.8845, "step": 14600 }, { "epoch": 0.8574192413671806, "grad_norm": 0.2890625, "learning_rate": 1.4258075863281939e-06, "loss": 0.8734, "step": 14625 }, { "epoch": 0.8588849152840476, "grad_norm": 0.2099609375, "learning_rate": 1.4111508471595239e-06, "loss": 0.8424, "step": 14650 }, { "epoch": 0.8603505892009146, "grad_norm": 0.4375, "learning_rate": 1.3964941079908543e-06, "loss": 0.8982, "step": 14675 }, { "epoch": 0.8618162631177816, "grad_norm": 0.18359375, "learning_rate": 1.3818373688221845e-06, "loss": 0.8867, "step": 14700 }, { "epoch": 0.8632819370346485, "grad_norm": 0.2138671875, "learning_rate": 1.3671806296535149e-06, "loss": 0.768, "step": 14725 }, { "epoch": 0.8647476109515155, "grad_norm": 0.265625, "learning_rate": 1.352523890484845e-06, "loss": 0.7438, "step": 14750 }, { "epoch": 0.8662132848683824, "grad_norm": 0.61328125, "learning_rate": 1.3378671513161753e-06, "loss": 0.8944, "step": 14775 }, { "epoch": 0.8676789587852495, "grad_norm": 0.419921875, "learning_rate": 1.3232104121475057e-06, "loss": 0.8471, "step": 14800 }, { "epoch": 0.8691446327021164, "grad_norm": 0.2275390625, "learning_rate": 1.3085536729788359e-06, "loss": 0.9804, "step": 14825 }, { "epoch": 0.8706103066189834, "grad_norm": 0.39453125, "learning_rate": 1.2938969338101658e-06, "loss": 0.8258, "step": 14850 }, { "epoch": 0.8720759805358503, "grad_norm": 4.28125, "learning_rate": 1.2792401946414962e-06, "loss": 1.0303, "step": 14875 }, { "epoch": 0.8735416544527174, "grad_norm": 0.2314453125, "learning_rate": 1.2645834554728264e-06, "loss": 0.9401, "step": 14900 }, { "epoch": 0.8750073283695843, "grad_norm": 0.1845703125, "learning_rate": 1.2499267163041566e-06, "loss": 0.8907, "step": 14925 }, { "epoch": 0.8764730022864513, "grad_norm": 0.1865234375, "learning_rate": 1.235269977135487e-06, "loss": 1.0413, "step": 14950 }, { "epoch": 0.8779386762033183, "grad_norm": 0.263671875, "learning_rate": 1.2206132379668172e-06, "loss": 0.8915, "step": 14975 }, { "epoch": 0.8794043501201853, "grad_norm": 0.62109375, "learning_rate": 1.2059564987981474e-06, "loss": 1.0251, "step": 15000 }, { "epoch": 0.8808700240370523, "grad_norm": 0.291015625, "learning_rate": 1.1912997596294776e-06, "loss": 0.9363, "step": 15025 }, { "epoch": 0.8823356979539192, "grad_norm": 0.28515625, "learning_rate": 1.176643020460808e-06, "loss": 0.9041, "step": 15050 }, { "epoch": 0.8838013718707862, "grad_norm": 0.28515625, "learning_rate": 1.1619862812921382e-06, "loss": 0.9011, "step": 15075 }, { "epoch": 0.8852670457876531, "grad_norm": 0.640625, "learning_rate": 1.1473295421234684e-06, "loss": 0.8542, "step": 15100 }, { "epoch": 0.8867327197045202, "grad_norm": 0.26953125, "learning_rate": 1.1326728029547986e-06, "loss": 0.8591, "step": 15125 }, { "epoch": 0.8881983936213871, "grad_norm": 0.337890625, "learning_rate": 1.118016063786129e-06, "loss": 0.8726, "step": 15150 }, { "epoch": 0.8896640675382541, "grad_norm": 0.62890625, "learning_rate": 1.1033593246174592e-06, "loss": 0.8511, "step": 15175 }, { "epoch": 0.891129741455121, "grad_norm": 0.1884765625, "learning_rate": 1.0887025854487894e-06, "loss": 0.8862, "step": 15200 }, { "epoch": 0.892595415371988, "grad_norm": 0.400390625, "learning_rate": 1.0740458462801196e-06, "loss": 0.8561, "step": 15225 }, { "epoch": 0.894061089288855, "grad_norm": 0.271484375, "learning_rate": 1.05938910711145e-06, "loss": 0.9929, "step": 15250 }, { "epoch": 0.895526763205722, "grad_norm": 0.2470703125, "learning_rate": 1.0447323679427802e-06, "loss": 0.8928, "step": 15275 }, { "epoch": 0.8969924371225889, "grad_norm": 0.1650390625, "learning_rate": 1.0300756287741104e-06, "loss": 0.8723, "step": 15300 }, { "epoch": 0.8984581110394559, "grad_norm": 0.2109375, "learning_rate": 1.0154188896054406e-06, "loss": 0.8111, "step": 15325 }, { "epoch": 0.899923784956323, "grad_norm": 0.29296875, "learning_rate": 1.000762150436771e-06, "loss": 0.8699, "step": 15350 }, { "epoch": 0.9013894588731899, "grad_norm": 0.369140625, "learning_rate": 9.861054112681012e-07, "loss": 0.9412, "step": 15375 }, { "epoch": 0.9028551327900569, "grad_norm": 0.171875, "learning_rate": 9.714486720994314e-07, "loss": 0.9887, "step": 15400 }, { "epoch": 0.9043208067069238, "grad_norm": 0.1533203125, "learning_rate": 9.567919329307616e-07, "loss": 0.9231, "step": 15425 }, { "epoch": 0.9057864806237909, "grad_norm": 0.228515625, "learning_rate": 9.421351937620919e-07, "loss": 0.8929, "step": 15450 }, { "epoch": 0.9072521545406578, "grad_norm": 0.2197265625, "learning_rate": 9.274784545934222e-07, "loss": 0.8695, "step": 15475 }, { "epoch": 0.9087178284575248, "grad_norm": 0.2392578125, "learning_rate": 9.128217154247524e-07, "loss": 0.8565, "step": 15500 }, { "epoch": 0.9101835023743917, "grad_norm": 0.2109375, "learning_rate": 8.981649762560827e-07, "loss": 0.8357, "step": 15525 }, { "epoch": 0.9116491762912587, "grad_norm": 0.28515625, "learning_rate": 8.835082370874129e-07, "loss": 0.9423, "step": 15550 }, { "epoch": 0.9131148502081257, "grad_norm": 0.21484375, "learning_rate": 8.688514979187431e-07, "loss": 0.7956, "step": 15575 }, { "epoch": 0.9145805241249927, "grad_norm": 0.193359375, "learning_rate": 8.541947587500734e-07, "loss": 0.8043, "step": 15600 }, { "epoch": 0.9160461980418596, "grad_norm": 0.54296875, "learning_rate": 8.395380195814037e-07, "loss": 1.0452, "step": 15625 }, { "epoch": 0.9175118719587266, "grad_norm": 0.357421875, "learning_rate": 8.248812804127338e-07, "loss": 0.8549, "step": 15650 }, { "epoch": 0.9189775458755935, "grad_norm": 0.1689453125, "learning_rate": 8.10224541244064e-07, "loss": 0.904, "step": 15675 }, { "epoch": 0.9204432197924606, "grad_norm": 2.1875, "learning_rate": 7.955678020753943e-07, "loss": 0.9144, "step": 15700 }, { "epoch": 0.9219088937093276, "grad_norm": 0.2080078125, "learning_rate": 7.809110629067245e-07, "loss": 0.9322, "step": 15725 }, { "epoch": 0.9233745676261945, "grad_norm": 0.1689453125, "learning_rate": 7.662543237380548e-07, "loss": 0.8968, "step": 15750 }, { "epoch": 0.9248402415430615, "grad_norm": 0.45703125, "learning_rate": 7.51597584569385e-07, "loss": 0.834, "step": 15775 }, { "epoch": 0.9263059154599285, "grad_norm": 0.2158203125, "learning_rate": 7.369408454007153e-07, "loss": 0.8769, "step": 15800 }, { "epoch": 0.9277715893767955, "grad_norm": 0.359375, "learning_rate": 7.222841062320455e-07, "loss": 0.8159, "step": 15825 }, { "epoch": 0.9292372632936624, "grad_norm": 0.24609375, "learning_rate": 7.076273670633758e-07, "loss": 0.8529, "step": 15850 }, { "epoch": 0.9307029372105294, "grad_norm": 0.2236328125, "learning_rate": 6.92970627894706e-07, "loss": 0.8882, "step": 15875 }, { "epoch": 0.9321686111273964, "grad_norm": 0.2041015625, "learning_rate": 6.783138887260363e-07, "loss": 0.9047, "step": 15900 }, { "epoch": 0.9336342850442634, "grad_norm": 0.3671875, "learning_rate": 6.636571495573665e-07, "loss": 0.9791, "step": 15925 }, { "epoch": 0.9350999589611303, "grad_norm": 0.66796875, "learning_rate": 6.490004103886968e-07, "loss": 0.8344, "step": 15950 }, { "epoch": 0.9365656328779973, "grad_norm": 0.2158203125, "learning_rate": 6.34343671220027e-07, "loss": 0.8912, "step": 15975 }, { "epoch": 0.9380313067948642, "grad_norm": 0.26171875, "learning_rate": 6.196869320513572e-07, "loss": 0.8693, "step": 16000 }, { "epoch": 0.9394969807117313, "grad_norm": 0.296875, "learning_rate": 6.050301928826875e-07, "loss": 0.897, "step": 16025 }, { "epoch": 0.9409626546285982, "grad_norm": 0.2890625, "learning_rate": 5.903734537140177e-07, "loss": 0.8517, "step": 16050 }, { "epoch": 0.9424283285454652, "grad_norm": 0.2197265625, "learning_rate": 5.75716714545348e-07, "loss": 0.9227, "step": 16075 }, { "epoch": 0.9438940024623321, "grad_norm": 0.12060546875, "learning_rate": 5.610599753766782e-07, "loss": 0.8516, "step": 16100 }, { "epoch": 0.9453596763791992, "grad_norm": 0.451171875, "learning_rate": 5.464032362080085e-07, "loss": 0.9763, "step": 16125 }, { "epoch": 0.9468253502960662, "grad_norm": 0.765625, "learning_rate": 5.317464970393387e-07, "loss": 0.8946, "step": 16150 }, { "epoch": 0.9482910242129331, "grad_norm": 0.57421875, "learning_rate": 5.17089757870669e-07, "loss": 0.7753, "step": 16175 }, { "epoch": 0.9497566981298001, "grad_norm": 0.43359375, "learning_rate": 5.024330187019992e-07, "loss": 0.7681, "step": 16200 }, { "epoch": 0.951222372046667, "grad_norm": 0.267578125, "learning_rate": 4.877762795333295e-07, "loss": 0.9284, "step": 16225 }, { "epoch": 0.9526880459635341, "grad_norm": 0.447265625, "learning_rate": 4.731195403646597e-07, "loss": 0.8657, "step": 16250 }, { "epoch": 0.954153719880401, "grad_norm": 0.546875, "learning_rate": 4.5846280119598996e-07, "loss": 0.8839, "step": 16275 }, { "epoch": 0.955619393797268, "grad_norm": 0.1845703125, "learning_rate": 4.4380606202732016e-07, "loss": 0.9618, "step": 16300 }, { "epoch": 0.9570850677141349, "grad_norm": 0.2294921875, "learning_rate": 4.2914932285865046e-07, "loss": 0.9123, "step": 16325 }, { "epoch": 0.958550741631002, "grad_norm": 0.291015625, "learning_rate": 4.1449258368998065e-07, "loss": 1.0314, "step": 16350 }, { "epoch": 0.9600164155478689, "grad_norm": 0.267578125, "learning_rate": 3.9983584452131095e-07, "loss": 0.8674, "step": 16375 }, { "epoch": 0.9614820894647359, "grad_norm": 0.318359375, "learning_rate": 3.8517910535264115e-07, "loss": 0.8991, "step": 16400 }, { "epoch": 0.9629477633816028, "grad_norm": 0.2412109375, "learning_rate": 3.7052236618397145e-07, "loss": 0.8511, "step": 16425 }, { "epoch": 0.9644134372984698, "grad_norm": 1.140625, "learning_rate": 3.5586562701530164e-07, "loss": 0.8599, "step": 16450 }, { "epoch": 0.9658791112153368, "grad_norm": 0.279296875, "learning_rate": 3.4120888784663194e-07, "loss": 0.8352, "step": 16475 }, { "epoch": 0.9673447851322038, "grad_norm": 0.470703125, "learning_rate": 3.2655214867796213e-07, "loss": 0.7832, "step": 16500 }, { "epoch": 0.9688104590490708, "grad_norm": 0.1982421875, "learning_rate": 3.118954095092924e-07, "loss": 0.8793, "step": 16525 }, { "epoch": 0.9702761329659377, "grad_norm": 0.306640625, "learning_rate": 2.972386703406226e-07, "loss": 0.8602, "step": 16550 }, { "epoch": 0.9717418068828048, "grad_norm": 0.2353515625, "learning_rate": 2.8258193117195287e-07, "loss": 0.8972, "step": 16575 }, { "epoch": 0.9732074807996717, "grad_norm": 0.111328125, "learning_rate": 2.679251920032831e-07, "loss": 0.8231, "step": 16600 }, { "epoch": 0.9746731547165387, "grad_norm": 0.2734375, "learning_rate": 2.5326845283461337e-07, "loss": 0.8406, "step": 16625 }, { "epoch": 0.9761388286334056, "grad_norm": 0.77734375, "learning_rate": 2.386117136659436e-07, "loss": 0.9921, "step": 16650 }, { "epoch": 0.9776045025502726, "grad_norm": 0.357421875, "learning_rate": 2.2395497449727386e-07, "loss": 0.8458, "step": 16675 }, { "epoch": 0.9790701764671396, "grad_norm": 0.287109375, "learning_rate": 2.092982353286041e-07, "loss": 0.9056, "step": 16700 }, { "epoch": 0.9805358503840066, "grad_norm": 0.189453125, "learning_rate": 1.9464149615993435e-07, "loss": 0.8358, "step": 16725 }, { "epoch": 0.9820015243008735, "grad_norm": 0.1552734375, "learning_rate": 1.799847569912646e-07, "loss": 0.9285, "step": 16750 }, { "epoch": 0.9834671982177405, "grad_norm": 0.248046875, "learning_rate": 1.6532801782259485e-07, "loss": 0.8284, "step": 16775 }, { "epoch": 0.9849328721346075, "grad_norm": 0.216796875, "learning_rate": 1.506712786539251e-07, "loss": 0.8364, "step": 16800 }, { "epoch": 0.9863985460514745, "grad_norm": 0.326171875, "learning_rate": 1.3601453948525534e-07, "loss": 0.9212, "step": 16825 }, { "epoch": 0.9878642199683414, "grad_norm": 0.234375, "learning_rate": 1.213578003165856e-07, "loss": 0.8225, "step": 16850 }, { "epoch": 0.9893298938852084, "grad_norm": 0.16015625, "learning_rate": 1.0670106114791582e-07, "loss": 0.9779, "step": 16875 }, { "epoch": 0.9907955678020754, "grad_norm": 0.1845703125, "learning_rate": 9.204432197924607e-08, "loss": 0.816, "step": 16900 }, { "epoch": 0.9922612417189424, "grad_norm": 0.23046875, "learning_rate": 7.738758281057632e-08, "loss": 0.8379, "step": 16925 }, { "epoch": 0.9937269156358094, "grad_norm": 0.275390625, "learning_rate": 6.273084364190655e-08, "loss": 0.8998, "step": 16950 }, { "epoch": 0.9951925895526763, "grad_norm": 0.34375, "learning_rate": 4.80741044732368e-08, "loss": 0.9126, "step": 16975 }, { "epoch": 0.9966582634695433, "grad_norm": 0.275390625, "learning_rate": 3.341736530456704e-08, "loss": 0.8822, "step": 17000 }, { "epoch": 0.9981239373864103, "grad_norm": 0.466796875, "learning_rate": 1.8760626135897286e-08, "loss": 0.8701, "step": 17025 }, { "epoch": 0.9995896113032773, "grad_norm": 4.75, "learning_rate": 4.103886967227532e-09, "loss": 0.9079, "step": 17050 } ], "logging_steps": 25, "max_steps": 17057, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4293390811789451e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }