{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15198026808855228, "eval_steps": 500, "global_step": 131000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.28216952085494995, "learning_rate": 9.998839845281768e-05, "loss": 5.404, "step": 100 }, { "epoch": 0.0, "grad_norm": 0.7618443965911865, "learning_rate": 9.997679690563535e-05, "loss": 4.9594, "step": 200 }, { "epoch": 0.0, "grad_norm": 0.8141049742698669, "learning_rate": 9.996519535845302e-05, "loss": 4.9031, "step": 300 }, { "epoch": 0.0, "grad_norm": 2.8989076614379883, "learning_rate": 9.995359381127067e-05, "loss": 4.8882, "step": 400 }, { "epoch": 0.0, "grad_norm": 1.622833251953125, "learning_rate": 9.994199226408834e-05, "loss": 4.8752, "step": 500 }, { "epoch": 0.0, "grad_norm": 0.2884034812450409, "learning_rate": 9.993039071690601e-05, "loss": 4.8623, "step": 600 }, { "epoch": 0.0, "grad_norm": 3.4078500270843506, "learning_rate": 9.991878916972368e-05, "loss": 4.8539, "step": 700 }, { "epoch": 0.0, "grad_norm": 4.187527179718018, "learning_rate": 9.990718762254134e-05, "loss": 4.8479, "step": 800 }, { "epoch": 0.0, "grad_norm": 1.675564169883728, "learning_rate": 9.989558607535901e-05, "loss": 4.8473, "step": 900 }, { "epoch": 0.0, "grad_norm": 1.7946183681488037, "learning_rate": 9.988398452817668e-05, "loss": 4.8428, "step": 1000 }, { "epoch": 0.0, "grad_norm": 3.8930752277374268, "learning_rate": 9.987238298099435e-05, "loss": 4.8403, "step": 1100 }, { "epoch": 0.0, "grad_norm": 1.8201618194580078, "learning_rate": 9.986078143381202e-05, "loss": 4.8341, "step": 1200 }, { "epoch": 0.0, "grad_norm": 1.5597014427185059, "learning_rate": 9.984917988662968e-05, "loss": 4.8324, "step": 1300 }, { "epoch": 0.0, "grad_norm": 3.2638113498687744, "learning_rate": 9.983757833944736e-05, "loss": 4.8274, "step": 1400 }, { "epoch": 0.0, "grad_norm": 1.5054211616516113, "learning_rate": 9.982597679226502e-05, "loss": 4.8262, "step": 1500 }, { "epoch": 0.0, "grad_norm": 2.0371103286743164, "learning_rate": 9.981437524508269e-05, "loss": 4.826, "step": 1600 }, { "epoch": 0.0, "grad_norm": 2.8935482501983643, "learning_rate": 9.980277369790035e-05, "loss": 4.823, "step": 1700 }, { "epoch": 0.0, "grad_norm": 1.6551401615142822, "learning_rate": 9.979117215071803e-05, "loss": 4.8232, "step": 1800 }, { "epoch": 0.0, "grad_norm": 1.8115674257278442, "learning_rate": 9.977957060353569e-05, "loss": 4.8182, "step": 1900 }, { "epoch": 0.0, "grad_norm": 3.1292033195495605, "learning_rate": 9.976796905635336e-05, "loss": 4.8205, "step": 2000 }, { "epoch": 0.0, "grad_norm": 1.3206559419631958, "learning_rate": 9.975636750917103e-05, "loss": 4.8176, "step": 2100 }, { "epoch": 0.0, "grad_norm": 2.540377140045166, "learning_rate": 9.97447659619887e-05, "loss": 4.8147, "step": 2200 }, { "epoch": 0.0, "grad_norm": 4.180980682373047, "learning_rate": 9.973316441480637e-05, "loss": 4.8146, "step": 2300 }, { "epoch": 0.0, "grad_norm": 1.8660849332809448, "learning_rate": 9.972156286762403e-05, "loss": 4.8105, "step": 2400 }, { "epoch": 0.0, "grad_norm": 2.911318778991699, "learning_rate": 9.97099613204417e-05, "loss": 4.8131, "step": 2500 }, { "epoch": 0.0, "grad_norm": 4.366623878479004, "learning_rate": 9.969835977325937e-05, "loss": 4.8113, "step": 2600 }, { "epoch": 0.0, "grad_norm": 1.6765426397323608, "learning_rate": 9.968675822607704e-05, "loss": 4.8092, "step": 2700 }, { "epoch": 0.0, "grad_norm": 2.4829912185668945, "learning_rate": 9.96751566788947e-05, "loss": 4.8102, "step": 2800 }, { "epoch": 0.0, "grad_norm": 3.6401925086975098, "learning_rate": 9.966355513171237e-05, "loss": 4.8039, "step": 2900 }, { "epoch": 0.0, "grad_norm": 2.1426470279693604, "learning_rate": 9.965195358453004e-05, "loss": 4.8059, "step": 3000 }, { "epoch": 0.0, "grad_norm": 2.311486005783081, "learning_rate": 9.96403520373477e-05, "loss": 4.8068, "step": 3100 }, { "epoch": 0.0, "grad_norm": 2.069549083709717, "learning_rate": 9.962875049016536e-05, "loss": 4.8054, "step": 3200 }, { "epoch": 0.0, "grad_norm": 3.6770570278167725, "learning_rate": 9.961714894298305e-05, "loss": 4.8008, "step": 3300 }, { "epoch": 0.0, "grad_norm": 1.9226771593093872, "learning_rate": 9.960554739580072e-05, "loss": 4.8004, "step": 3400 }, { "epoch": 0.0, "grad_norm": 3.2032277584075928, "learning_rate": 9.959394584861837e-05, "loss": 4.7977, "step": 3500 }, { "epoch": 0.0, "grad_norm": 2.310493230819702, "learning_rate": 9.958234430143604e-05, "loss": 4.7997, "step": 3600 }, { "epoch": 0.0, "grad_norm": 2.559783935546875, "learning_rate": 9.957074275425372e-05, "loss": 4.7961, "step": 3700 }, { "epoch": 0.0, "grad_norm": 2.714775562286377, "learning_rate": 9.955914120707139e-05, "loss": 4.7943, "step": 3800 }, { "epoch": 0.0, "grad_norm": 2.100062847137451, "learning_rate": 9.954753965988904e-05, "loss": 4.7961, "step": 3900 }, { "epoch": 0.0, "grad_norm": 5.665365695953369, "learning_rate": 9.953593811270671e-05, "loss": 4.7992, "step": 4000 }, { "epoch": 0.0, "grad_norm": 2.1073904037475586, "learning_rate": 9.952433656552438e-05, "loss": 4.7957, "step": 4100 }, { "epoch": 0.0, "grad_norm": 3.3605892658233643, "learning_rate": 9.951273501834205e-05, "loss": 4.7959, "step": 4200 }, { "epoch": 0.0, "grad_norm": 4.566422939300537, "learning_rate": 9.950113347115971e-05, "loss": 4.7975, "step": 4300 }, { "epoch": 0.01, "grad_norm": 1.812922716140747, "learning_rate": 9.948953192397738e-05, "loss": 4.7942, "step": 4400 }, { "epoch": 0.01, "grad_norm": 3.2284045219421387, "learning_rate": 9.947793037679507e-05, "loss": 4.7942, "step": 4500 }, { "epoch": 0.01, "grad_norm": 4.2551093101501465, "learning_rate": 9.946632882961272e-05, "loss": 4.791, "step": 4600 }, { "epoch": 0.01, "grad_norm": 1.8002365827560425, "learning_rate": 9.945472728243039e-05, "loss": 4.7905, "step": 4700 }, { "epoch": 0.01, "grad_norm": 2.5465445518493652, "learning_rate": 9.944312573524805e-05, "loss": 4.7939, "step": 4800 }, { "epoch": 0.01, "grad_norm": 4.329401969909668, "learning_rate": 9.943152418806573e-05, "loss": 4.795, "step": 4900 }, { "epoch": 0.01, "grad_norm": 2.2818596363067627, "learning_rate": 9.941992264088339e-05, "loss": 4.7952, "step": 5000 }, { "epoch": 0.01, "grad_norm": 2.9942195415496826, "learning_rate": 9.940832109370106e-05, "loss": 4.791, "step": 5100 }, { "epoch": 0.01, "grad_norm": 5.174030780792236, "learning_rate": 9.939671954651872e-05, "loss": 4.7921, "step": 5200 }, { "epoch": 0.01, "grad_norm": 1.8438475131988525, "learning_rate": 9.93851179993364e-05, "loss": 4.79, "step": 5300 }, { "epoch": 0.01, "grad_norm": 3.8706634044647217, "learning_rate": 9.937351645215407e-05, "loss": 4.7902, "step": 5400 }, { "epoch": 0.01, "grad_norm": 5.187880516052246, "learning_rate": 9.936191490497173e-05, "loss": 4.7917, "step": 5500 }, { "epoch": 0.01, "grad_norm": 2.2282915115356445, "learning_rate": 9.93503133577894e-05, "loss": 4.7869, "step": 5600 }, { "epoch": 0.01, "grad_norm": 3.4940683841705322, "learning_rate": 9.933871181060707e-05, "loss": 4.7932, "step": 5700 }, { "epoch": 0.01, "grad_norm": 4.8060383796691895, "learning_rate": 9.932711026342474e-05, "loss": 4.788, "step": 5800 }, { "epoch": 0.01, "grad_norm": 2.129607677459717, "learning_rate": 9.93155087162424e-05, "loss": 4.7885, "step": 5900 }, { "epoch": 0.01, "grad_norm": 3.5312142372131348, "learning_rate": 9.930390716906007e-05, "loss": 4.786, "step": 6000 }, { "epoch": 0.01, "grad_norm": 4.944730281829834, "learning_rate": 9.929230562187774e-05, "loss": 4.7867, "step": 6100 }, { "epoch": 0.01, "grad_norm": 1.5370301008224487, "learning_rate": 9.928070407469541e-05, "loss": 4.7864, "step": 6200 }, { "epoch": 0.01, "grad_norm": 2.7849390506744385, "learning_rate": 9.926910252751306e-05, "loss": 4.7885, "step": 6300 }, { "epoch": 0.01, "grad_norm": 5.041503429412842, "learning_rate": 9.925750098033075e-05, "loss": 4.7867, "step": 6400 }, { "epoch": 0.01, "grad_norm": 1.8032560348510742, "learning_rate": 9.924589943314842e-05, "loss": 4.7851, "step": 6500 }, { "epoch": 0.01, "grad_norm": 3.296067953109741, "learning_rate": 9.923429788596608e-05, "loss": 4.7847, "step": 6600 }, { "epoch": 0.01, "grad_norm": 5.063502311706543, "learning_rate": 9.922269633878375e-05, "loss": 4.7832, "step": 6700 }, { "epoch": 0.01, "grad_norm": 2.149693012237549, "learning_rate": 9.921109479160142e-05, "loss": 4.779, "step": 6800 }, { "epoch": 0.01, "grad_norm": 3.4809441566467285, "learning_rate": 9.919949324441909e-05, "loss": 4.7823, "step": 6900 }, { "epoch": 0.01, "grad_norm": 4.948490142822266, "learning_rate": 9.918789169723674e-05, "loss": 4.7842, "step": 7000 }, { "epoch": 0.01, "grad_norm": 1.5616728067398071, "learning_rate": 9.917629015005441e-05, "loss": 4.7861, "step": 7100 }, { "epoch": 0.01, "grad_norm": 3.1099443435668945, "learning_rate": 9.916468860287209e-05, "loss": 4.7829, "step": 7200 }, { "epoch": 0.01, "grad_norm": 5.170409679412842, "learning_rate": 9.915308705568976e-05, "loss": 4.7817, "step": 7300 }, { "epoch": 0.01, "grad_norm": 1.6011829376220703, "learning_rate": 9.914148550850741e-05, "loss": 4.7805, "step": 7400 }, { "epoch": 0.01, "grad_norm": 3.222562789916992, "learning_rate": 9.912988396132508e-05, "loss": 4.7802, "step": 7500 }, { "epoch": 0.01, "grad_norm": 4.803954601287842, "learning_rate": 9.911828241414277e-05, "loss": 4.7828, "step": 7600 }, { "epoch": 0.01, "grad_norm": 1.8963723182678223, "learning_rate": 9.910668086696042e-05, "loss": 4.7823, "step": 7700 }, { "epoch": 0.01, "grad_norm": 2.4876952171325684, "learning_rate": 9.90950793197781e-05, "loss": 4.7768, "step": 7800 }, { "epoch": 0.01, "grad_norm": 5.709277629852295, "learning_rate": 9.908347777259575e-05, "loss": 4.7806, "step": 7900 }, { "epoch": 0.01, "grad_norm": 1.8382807970046997, "learning_rate": 9.907187622541343e-05, "loss": 4.7791, "step": 8000 }, { "epoch": 0.01, "grad_norm": 3.2304890155792236, "learning_rate": 9.906027467823109e-05, "loss": 4.778, "step": 8100 }, { "epoch": 0.01, "grad_norm": 5.252920150756836, "learning_rate": 9.904867313104876e-05, "loss": 4.7796, "step": 8200 }, { "epoch": 0.01, "grad_norm": 1.16917884349823, "learning_rate": 9.903707158386642e-05, "loss": 4.7817, "step": 8300 }, { "epoch": 0.01, "grad_norm": 2.6509056091308594, "learning_rate": 9.90254700366841e-05, "loss": 4.778, "step": 8400 }, { "epoch": 0.01, "grad_norm": 1.1537126302719116, "learning_rate": 9.901386848950176e-05, "loss": 4.7755, "step": 8500 }, { "epoch": 0.01, "grad_norm": 1.7410359382629395, "learning_rate": 9.900226694231943e-05, "loss": 4.7751, "step": 8600 }, { "epoch": 0.01, "grad_norm": 3.2682690620422363, "learning_rate": 9.89906653951371e-05, "loss": 4.7741, "step": 8700 }, { "epoch": 0.01, "grad_norm": 1.825622320175171, "learning_rate": 9.897906384795477e-05, "loss": 4.7759, "step": 8800 }, { "epoch": 0.01, "grad_norm": 0.8567586541175842, "learning_rate": 9.896746230077244e-05, "loss": 4.7738, "step": 8900 }, { "epoch": 0.01, "grad_norm": 0.5261878967285156, "learning_rate": 9.89558607535901e-05, "loss": 4.7722, "step": 9000 }, { "epoch": 0.01, "grad_norm": 0.7664969563484192, "learning_rate": 9.894425920640777e-05, "loss": 4.7757, "step": 9100 }, { "epoch": 0.01, "grad_norm": 1.6178815364837646, "learning_rate": 9.893265765922544e-05, "loss": 4.7715, "step": 9200 }, { "epoch": 0.01, "grad_norm": 5.013388156890869, "learning_rate": 9.892105611204311e-05, "loss": 4.7718, "step": 9300 }, { "epoch": 0.01, "grad_norm": 0.765504777431488, "learning_rate": 9.890945456486077e-05, "loss": 4.7707, "step": 9400 }, { "epoch": 0.01, "grad_norm": 1.0577245950698853, "learning_rate": 9.889785301767844e-05, "loss": 4.7709, "step": 9500 }, { "epoch": 0.01, "grad_norm": 0.5963281393051147, "learning_rate": 9.888625147049611e-05, "loss": 4.7738, "step": 9600 }, { "epoch": 0.01, "grad_norm": 1.5044456720352173, "learning_rate": 9.887464992331378e-05, "loss": 4.772, "step": 9700 }, { "epoch": 0.01, "grad_norm": 2.449915647506714, "learning_rate": 9.886304837613145e-05, "loss": 4.7695, "step": 9800 }, { "epoch": 0.01, "grad_norm": 4.757066249847412, "learning_rate": 9.885144682894912e-05, "loss": 4.7723, "step": 9900 }, { "epoch": 0.01, "grad_norm": 2.8173577785491943, "learning_rate": 9.883984528176679e-05, "loss": 4.7711, "step": 10000 }, { "epoch": 0.01, "grad_norm": 3.4169297218322754, "learning_rate": 9.882824373458445e-05, "loss": 4.7739, "step": 10100 }, { "epoch": 0.01, "grad_norm": 5.771091938018799, "learning_rate": 9.881664218740212e-05, "loss": 4.7747, "step": 10200 }, { "epoch": 0.01, "grad_norm": 2.6634373664855957, "learning_rate": 9.880504064021979e-05, "loss": 4.7732, "step": 10300 }, { "epoch": 0.01, "grad_norm": 4.215581893920898, "learning_rate": 9.879343909303746e-05, "loss": 4.7704, "step": 10400 }, { "epoch": 0.01, "grad_norm": 6.152705192565918, "learning_rate": 9.878183754585511e-05, "loss": 4.7699, "step": 10500 }, { "epoch": 0.01, "grad_norm": 1.5891577005386353, "learning_rate": 9.877023599867278e-05, "loss": 4.7717, "step": 10600 }, { "epoch": 0.01, "grad_norm": 4.256149768829346, "learning_rate": 9.875863445149045e-05, "loss": 4.7664, "step": 10700 }, { "epoch": 0.01, "grad_norm": 4.941390037536621, "learning_rate": 9.874703290430813e-05, "loss": 4.7721, "step": 10800 }, { "epoch": 0.01, "grad_norm": 1.2469244003295898, "learning_rate": 9.87354313571258e-05, "loss": 4.7676, "step": 10900 }, { "epoch": 0.01, "grad_norm": 1.073249340057373, "learning_rate": 9.872382980994345e-05, "loss": 4.7644, "step": 11000 }, { "epoch": 0.01, "grad_norm": 1.6434860229492188, "learning_rate": 9.871222826276114e-05, "loss": 4.7645, "step": 11100 }, { "epoch": 0.01, "grad_norm": 0.8313368558883667, "learning_rate": 9.87006267155788e-05, "loss": 4.7637, "step": 11200 }, { "epoch": 0.01, "grad_norm": 0.8201664090156555, "learning_rate": 9.868902516839646e-05, "loss": 4.7667, "step": 11300 }, { "epoch": 0.01, "grad_norm": 0.47900694608688354, "learning_rate": 9.867742362121412e-05, "loss": 4.7652, "step": 11400 }, { "epoch": 0.01, "grad_norm": 0.9311307072639465, "learning_rate": 9.86658220740318e-05, "loss": 4.766, "step": 11500 }, { "epoch": 0.01, "grad_norm": 8.805180549621582, "learning_rate": 9.865422052684946e-05, "loss": 4.7641, "step": 11600 }, { "epoch": 0.01, "grad_norm": 9.388609886169434, "learning_rate": 9.864261897966713e-05, "loss": 4.7717, "step": 11700 }, { "epoch": 0.01, "grad_norm": 2.2487683296203613, "learning_rate": 9.86310174324848e-05, "loss": 4.7689, "step": 11800 }, { "epoch": 0.01, "grad_norm": 4.081148624420166, "learning_rate": 9.861941588530247e-05, "loss": 4.7673, "step": 11900 }, { "epoch": 0.01, "grad_norm": 7.44789981842041, "learning_rate": 9.860781433812014e-05, "loss": 4.7722, "step": 12000 }, { "epoch": 0.01, "grad_norm": 1.649463176727295, "learning_rate": 9.85962127909378e-05, "loss": 4.7667, "step": 12100 }, { "epoch": 0.01, "grad_norm": 3.827794075012207, "learning_rate": 9.858461124375547e-05, "loss": 4.768, "step": 12200 }, { "epoch": 0.01, "grad_norm": 6.945425510406494, "learning_rate": 9.857300969657314e-05, "loss": 4.7689, "step": 12300 }, { "epoch": 0.01, "grad_norm": 1.1629202365875244, "learning_rate": 9.856140814939081e-05, "loss": 4.7656, "step": 12400 }, { "epoch": 0.01, "grad_norm": 0.5653738379478455, "learning_rate": 9.854980660220847e-05, "loss": 4.7631, "step": 12500 }, { "epoch": 0.01, "grad_norm": 0.724161684513092, "learning_rate": 9.853820505502614e-05, "loss": 4.7622, "step": 12600 }, { "epoch": 0.01, "grad_norm": 0.5713109970092773, "learning_rate": 9.852660350784381e-05, "loss": 4.7633, "step": 12700 }, { "epoch": 0.01, "grad_norm": 0.9219802021980286, "learning_rate": 9.851500196066148e-05, "loss": 4.7616, "step": 12800 }, { "epoch": 0.01, "grad_norm": 0.7306973338127136, "learning_rate": 9.850340041347914e-05, "loss": 4.7616, "step": 12900 }, { "epoch": 0.02, "grad_norm": 2.1123743057250977, "learning_rate": 9.849179886629682e-05, "loss": 4.7635, "step": 13000 }, { "epoch": 0.02, "grad_norm": 0.5764673352241516, "learning_rate": 9.848019731911449e-05, "loss": 4.7602, "step": 13100 }, { "epoch": 0.02, "grad_norm": 1.456514835357666, "learning_rate": 9.846859577193215e-05, "loss": 4.762, "step": 13200 }, { "epoch": 0.02, "grad_norm": 0.6817034482955933, "learning_rate": 9.845699422474982e-05, "loss": 4.7585, "step": 13300 }, { "epoch": 0.02, "grad_norm": 1.166177749633789, "learning_rate": 9.844539267756749e-05, "loss": 4.7653, "step": 13400 }, { "epoch": 0.02, "grad_norm": 1.546487808227539, "learning_rate": 9.843379113038516e-05, "loss": 4.7607, "step": 13500 }, { "epoch": 0.02, "grad_norm": 0.610224187374115, "learning_rate": 9.842218958320282e-05, "loss": 4.7598, "step": 13600 }, { "epoch": 0.02, "grad_norm": 0.5171063542366028, "learning_rate": 9.841058803602049e-05, "loss": 4.7598, "step": 13700 }, { "epoch": 0.02, "grad_norm": 2.632072687149048, "learning_rate": 9.839898648883816e-05, "loss": 4.7634, "step": 13800 }, { "epoch": 0.02, "grad_norm": 0.6531881093978882, "learning_rate": 9.838738494165583e-05, "loss": 4.7585, "step": 13900 }, { "epoch": 0.02, "grad_norm": 0.6771488189697266, "learning_rate": 9.837578339447348e-05, "loss": 4.757, "step": 14000 }, { "epoch": 0.02, "grad_norm": 1.341787338256836, "learning_rate": 9.836418184729115e-05, "loss": 4.7579, "step": 14100 }, { "epoch": 0.02, "grad_norm": 2.1839771270751953, "learning_rate": 9.835258030010884e-05, "loss": 4.7581, "step": 14200 }, { "epoch": 0.02, "grad_norm": 1.8916560411453247, "learning_rate": 9.83409787529265e-05, "loss": 4.7599, "step": 14300 }, { "epoch": 0.02, "grad_norm": 2.7338578701019287, "learning_rate": 9.832937720574417e-05, "loss": 4.7607, "step": 14400 }, { "epoch": 0.02, "grad_norm": 0.7070731520652771, "learning_rate": 9.831777565856182e-05, "loss": 4.7587, "step": 14500 }, { "epoch": 0.02, "grad_norm": 0.5772482752799988, "learning_rate": 9.83061741113795e-05, "loss": 4.763, "step": 14600 }, { "epoch": 0.02, "grad_norm": 0.9034737348556519, "learning_rate": 9.829457256419716e-05, "loss": 4.7597, "step": 14700 }, { "epoch": 0.02, "grad_norm": 2.127034902572632, "learning_rate": 9.828297101701483e-05, "loss": 4.7573, "step": 14800 }, { "epoch": 0.02, "grad_norm": 1.8689446449279785, "learning_rate": 9.82713694698325e-05, "loss": 4.757, "step": 14900 }, { "epoch": 0.02, "grad_norm": 0.7055838108062744, "learning_rate": 9.825976792265017e-05, "loss": 4.7557, "step": 15000 }, { "epoch": 0.02, "grad_norm": 1.9538499116897583, "learning_rate": 9.824816637546783e-05, "loss": 4.7611, "step": 15100 }, { "epoch": 0.02, "grad_norm": 1.487042784690857, "learning_rate": 9.82365648282855e-05, "loss": 4.7574, "step": 15200 }, { "epoch": 0.02, "grad_norm": 2.1112914085388184, "learning_rate": 9.822496328110317e-05, "loss": 4.7566, "step": 15300 }, { "epoch": 0.02, "grad_norm": 0.9239519834518433, "learning_rate": 9.821336173392084e-05, "loss": 4.7587, "step": 15400 }, { "epoch": 0.02, "grad_norm": 3.975541353225708, "learning_rate": 9.820176018673851e-05, "loss": 4.7577, "step": 15500 }, { "epoch": 0.02, "grad_norm": 0.5043540000915527, "learning_rate": 9.819015863955617e-05, "loss": 4.7565, "step": 15600 }, { "epoch": 0.02, "grad_norm": 0.8457772731781006, "learning_rate": 9.817855709237384e-05, "loss": 4.7573, "step": 15700 }, { "epoch": 0.02, "grad_norm": 0.5468181371688843, "learning_rate": 9.816695554519151e-05, "loss": 4.7555, "step": 15800 }, { "epoch": 0.02, "grad_norm": 5.87544584274292, "learning_rate": 9.815535399800918e-05, "loss": 4.7588, "step": 15900 }, { "epoch": 0.02, "grad_norm": 1.0000234842300415, "learning_rate": 9.814375245082684e-05, "loss": 4.753, "step": 16000 }, { "epoch": 0.02, "grad_norm": 0.6940212249755859, "learning_rate": 9.813215090364452e-05, "loss": 4.7543, "step": 16100 }, { "epoch": 0.02, "grad_norm": 0.9902929663658142, "learning_rate": 9.812054935646219e-05, "loss": 4.7562, "step": 16200 }, { "epoch": 0.02, "grad_norm": 17.795963287353516, "learning_rate": 9.810894780927985e-05, "loss": 4.7551, "step": 16300 }, { "epoch": 0.02, "grad_norm": 7.341447830200195, "learning_rate": 9.809734626209752e-05, "loss": 4.7569, "step": 16400 }, { "epoch": 0.02, "grad_norm": 7.844032287597656, "learning_rate": 9.808574471491519e-05, "loss": 4.7546, "step": 16500 }, { "epoch": 0.02, "grad_norm": 0.8450008034706116, "learning_rate": 9.807414316773286e-05, "loss": 4.7554, "step": 16600 }, { "epoch": 0.02, "grad_norm": 1.3422025442123413, "learning_rate": 9.806254162055052e-05, "loss": 4.7524, "step": 16700 }, { "epoch": 0.02, "grad_norm": 0.6606966853141785, "learning_rate": 9.805094007336819e-05, "loss": 4.7533, "step": 16800 }, { "epoch": 0.02, "grad_norm": 0.6223414540290833, "learning_rate": 9.803933852618586e-05, "loss": 4.7544, "step": 16900 }, { "epoch": 0.02, "grad_norm": 0.53001469373703, "learning_rate": 9.802773697900353e-05, "loss": 4.7529, "step": 17000 }, { "epoch": 0.02, "grad_norm": 0.538820743560791, "learning_rate": 9.801613543182119e-05, "loss": 4.7514, "step": 17100 }, { "epoch": 0.02, "grad_norm": 6.79536247253418, "learning_rate": 9.800453388463886e-05, "loss": 4.7522, "step": 17200 }, { "epoch": 0.02, "grad_norm": 1.8284698724746704, "learning_rate": 9.799293233745654e-05, "loss": 4.7532, "step": 17300 }, { "epoch": 0.02, "grad_norm": 0.8321298956871033, "learning_rate": 9.79813307902742e-05, "loss": 4.7553, "step": 17400 }, { "epoch": 0.02, "grad_norm": 5.962058067321777, "learning_rate": 9.796972924309187e-05, "loss": 4.7641, "step": 17500 }, { "epoch": 0.02, "grad_norm": 11.184019088745117, "learning_rate": 9.795812769590952e-05, "loss": 4.7608, "step": 17600 }, { "epoch": 0.02, "grad_norm": 1.882926106452942, "learning_rate": 9.794652614872721e-05, "loss": 4.7582, "step": 17700 }, { "epoch": 0.02, "grad_norm": 4.5953168869018555, "learning_rate": 9.793492460154486e-05, "loss": 4.756, "step": 17800 }, { "epoch": 0.02, "grad_norm": 7.986889839172363, "learning_rate": 9.792332305436253e-05, "loss": 4.759, "step": 17900 }, { "epoch": 0.02, "grad_norm": 1.7497138977050781, "learning_rate": 9.791172150718019e-05, "loss": 4.7567, "step": 18000 }, { "epoch": 0.02, "grad_norm": 0.4782524108886719, "learning_rate": 9.790011995999788e-05, "loss": 4.753, "step": 18100 }, { "epoch": 0.02, "grad_norm": 0.7002395987510681, "learning_rate": 9.788851841281553e-05, "loss": 4.7513, "step": 18200 }, { "epoch": 0.02, "grad_norm": 0.49617066979408264, "learning_rate": 9.78769168656332e-05, "loss": 4.7508, "step": 18300 }, { "epoch": 0.02, "grad_norm": 0.459750235080719, "learning_rate": 9.786531531845087e-05, "loss": 4.7518, "step": 18400 }, { "epoch": 0.02, "grad_norm": 0.6577441692352295, "learning_rate": 9.785371377126854e-05, "loss": 4.7512, "step": 18500 }, { "epoch": 0.02, "grad_norm": 1.284149408340454, "learning_rate": 9.784211222408621e-05, "loss": 4.752, "step": 18600 }, { "epoch": 0.02, "grad_norm": 2.703396797180176, "learning_rate": 9.783051067690387e-05, "loss": 4.754, "step": 18700 }, { "epoch": 0.02, "grad_norm": 0.5742882490158081, "learning_rate": 9.781890912972154e-05, "loss": 4.7521, "step": 18800 }, { "epoch": 0.02, "grad_norm": 0.5588614344596863, "learning_rate": 9.780730758253921e-05, "loss": 4.7541, "step": 18900 }, { "epoch": 0.02, "grad_norm": 1.6749471426010132, "learning_rate": 9.779570603535688e-05, "loss": 4.7521, "step": 19000 }, { "epoch": 0.02, "grad_norm": 1.4725801944732666, "learning_rate": 9.778410448817454e-05, "loss": 4.7503, "step": 19100 }, { "epoch": 0.02, "grad_norm": 0.7287809252738953, "learning_rate": 9.777250294099222e-05, "loss": 4.7492, "step": 19200 }, { "epoch": 0.02, "grad_norm": 3.7486460208892822, "learning_rate": 9.776090139380988e-05, "loss": 4.7488, "step": 19300 }, { "epoch": 0.02, "grad_norm": 2.859056234359741, "learning_rate": 9.774929984662755e-05, "loss": 4.7492, "step": 19400 }, { "epoch": 0.02, "grad_norm": 0.5980411171913147, "learning_rate": 9.773769829944522e-05, "loss": 4.7534, "step": 19500 }, { "epoch": 0.02, "grad_norm": 0.6939360499382019, "learning_rate": 9.772609675226289e-05, "loss": 4.7484, "step": 19600 }, { "epoch": 0.02, "grad_norm": 7.793490886688232, "learning_rate": 9.771449520508056e-05, "loss": 4.751, "step": 19700 }, { "epoch": 0.02, "grad_norm": 0.7629129886627197, "learning_rate": 9.770289365789822e-05, "loss": 4.7474, "step": 19800 }, { "epoch": 0.02, "grad_norm": 2.9731252193450928, "learning_rate": 9.769129211071589e-05, "loss": 4.7499, "step": 19900 }, { "epoch": 0.02, "grad_norm": 0.9709353446960449, "learning_rate": 9.767969056353356e-05, "loss": 4.7488, "step": 20000 }, { "epoch": 0.02, "grad_norm": 0.8048213720321655, "learning_rate": 9.766808901635123e-05, "loss": 4.7483, "step": 20100 }, { "epoch": 0.02, "grad_norm": 0.6946632862091064, "learning_rate": 9.765648746916889e-05, "loss": 4.748, "step": 20200 }, { "epoch": 0.02, "grad_norm": 13.102263450622559, "learning_rate": 9.764488592198656e-05, "loss": 4.7483, "step": 20300 }, { "epoch": 0.02, "grad_norm": 0.7083731889724731, "learning_rate": 9.763328437480423e-05, "loss": 4.7495, "step": 20400 }, { "epoch": 0.02, "grad_norm": 0.5328574180603027, "learning_rate": 9.76216828276219e-05, "loss": 4.7492, "step": 20500 }, { "epoch": 0.02, "grad_norm": 2.2578125, "learning_rate": 9.761008128043957e-05, "loss": 4.7474, "step": 20600 }, { "epoch": 0.02, "grad_norm": 7.049394607543945, "learning_rate": 9.759847973325723e-05, "loss": 4.7484, "step": 20700 }, { "epoch": 0.02, "grad_norm": 0.5719186663627625, "learning_rate": 9.758687818607491e-05, "loss": 4.7476, "step": 20800 }, { "epoch": 0.02, "grad_norm": 1.7516430616378784, "learning_rate": 9.757527663889257e-05, "loss": 4.7468, "step": 20900 }, { "epoch": 0.02, "grad_norm": 0.7269846200942993, "learning_rate": 9.756367509171024e-05, "loss": 4.7482, "step": 21000 }, { "epoch": 0.02, "grad_norm": 1.0633318424224854, "learning_rate": 9.75520735445279e-05, "loss": 4.7469, "step": 21100 }, { "epoch": 0.02, "grad_norm": 0.5097702145576477, "learning_rate": 9.754047199734558e-05, "loss": 4.7482, "step": 21200 }, { "epoch": 0.02, "grad_norm": 0.5308384895324707, "learning_rate": 9.752887045016323e-05, "loss": 4.7444, "step": 21300 }, { "epoch": 0.02, "grad_norm": 0.5017550587654114, "learning_rate": 9.75172689029809e-05, "loss": 4.7474, "step": 21400 }, { "epoch": 0.02, "grad_norm": 0.526594340801239, "learning_rate": 9.750566735579858e-05, "loss": 4.7463, "step": 21500 }, { "epoch": 0.03, "grad_norm": 2.675493001937866, "learning_rate": 9.749406580861625e-05, "loss": 4.7475, "step": 21600 }, { "epoch": 0.03, "grad_norm": 5.905905723571777, "learning_rate": 9.748246426143392e-05, "loss": 4.7499, "step": 21700 }, { "epoch": 0.03, "grad_norm": 4.458377838134766, "learning_rate": 9.747086271425157e-05, "loss": 4.7474, "step": 21800 }, { "epoch": 0.03, "grad_norm": 1.3677464723587036, "learning_rate": 9.745926116706924e-05, "loss": 4.7502, "step": 21900 }, { "epoch": 0.03, "grad_norm": 9.98462200164795, "learning_rate": 9.744765961988691e-05, "loss": 4.7486, "step": 22000 }, { "epoch": 0.03, "grad_norm": 6.801756858825684, "learning_rate": 9.743605807270458e-05, "loss": 4.7458, "step": 22100 }, { "epoch": 0.03, "grad_norm": 0.5047688484191895, "learning_rate": 9.742445652552224e-05, "loss": 4.7574, "step": 22200 }, { "epoch": 0.03, "grad_norm": 0.5461822152137756, "learning_rate": 9.741285497833991e-05, "loss": 4.7441, "step": 22300 }, { "epoch": 0.03, "grad_norm": 5.560943603515625, "learning_rate": 9.740125343115758e-05, "loss": 4.7447, "step": 22400 }, { "epoch": 0.03, "grad_norm": 0.5233502984046936, "learning_rate": 9.738965188397525e-05, "loss": 4.7474, "step": 22500 }, { "epoch": 0.03, "grad_norm": 0.5336612462997437, "learning_rate": 9.737805033679291e-05, "loss": 4.7425, "step": 22600 }, { "epoch": 0.03, "grad_norm": 2.945470094680786, "learning_rate": 9.736644878961059e-05, "loss": 4.7482, "step": 22700 }, { "epoch": 0.03, "grad_norm": 0.4918281137943268, "learning_rate": 9.735484724242826e-05, "loss": 4.7469, "step": 22800 }, { "epoch": 0.03, "grad_norm": 2.591059923171997, "learning_rate": 9.734324569524592e-05, "loss": 4.7458, "step": 22900 }, { "epoch": 0.03, "grad_norm": 0.5464999079704285, "learning_rate": 9.733164414806359e-05, "loss": 4.7443, "step": 23000 }, { "epoch": 0.03, "grad_norm": 6.337867259979248, "learning_rate": 9.732004260088126e-05, "loss": 4.7453, "step": 23100 }, { "epoch": 0.03, "grad_norm": 0.4762401878833771, "learning_rate": 9.730844105369893e-05, "loss": 4.7418, "step": 23200 }, { "epoch": 0.03, "grad_norm": 2.0473411083221436, "learning_rate": 9.729683950651659e-05, "loss": 4.7454, "step": 23300 }, { "epoch": 0.03, "grad_norm": 0.72366863489151, "learning_rate": 9.728523795933426e-05, "loss": 4.7425, "step": 23400 }, { "epoch": 0.03, "grad_norm": 0.5242981314659119, "learning_rate": 9.727363641215193e-05, "loss": 4.7461, "step": 23500 }, { "epoch": 0.03, "grad_norm": 0.6705228090286255, "learning_rate": 9.72620348649696e-05, "loss": 4.7495, "step": 23600 }, { "epoch": 0.03, "grad_norm": 1.549325704574585, "learning_rate": 9.725043331778726e-05, "loss": 4.7436, "step": 23700 }, { "epoch": 0.03, "grad_norm": 0.599336564540863, "learning_rate": 9.723883177060493e-05, "loss": 4.743, "step": 23800 }, { "epoch": 0.03, "grad_norm": 1.103011965751648, "learning_rate": 9.722723022342261e-05, "loss": 4.7437, "step": 23900 }, { "epoch": 0.03, "grad_norm": 3.4058070182800293, "learning_rate": 9.721562867624027e-05, "loss": 4.7466, "step": 24000 }, { "epoch": 0.03, "grad_norm": 1.0680614709854126, "learning_rate": 9.720402712905794e-05, "loss": 4.7424, "step": 24100 }, { "epoch": 0.03, "grad_norm": 0.5411515235900879, "learning_rate": 9.71924255818756e-05, "loss": 4.7408, "step": 24200 }, { "epoch": 0.03, "grad_norm": 8.111778259277344, "learning_rate": 9.718082403469328e-05, "loss": 4.7454, "step": 24300 }, { "epoch": 0.03, "grad_norm": 0.44278204441070557, "learning_rate": 9.716922248751094e-05, "loss": 4.7599, "step": 24400 }, { "epoch": 0.03, "grad_norm": 0.6122348308563232, "learning_rate": 9.71576209403286e-05, "loss": 4.7427, "step": 24500 }, { "epoch": 0.03, "grad_norm": 0.5633386373519897, "learning_rate": 9.714601939314628e-05, "loss": 4.746, "step": 24600 }, { "epoch": 0.03, "grad_norm": 0.8872265219688416, "learning_rate": 9.713441784596395e-05, "loss": 4.7439, "step": 24700 }, { "epoch": 0.03, "grad_norm": 1.6827236413955688, "learning_rate": 9.71228162987816e-05, "loss": 4.7413, "step": 24800 }, { "epoch": 0.03, "grad_norm": 0.6328549385070801, "learning_rate": 9.711121475159927e-05, "loss": 4.7408, "step": 24900 }, { "epoch": 0.03, "grad_norm": 0.4887780249118805, "learning_rate": 9.709961320441694e-05, "loss": 4.7425, "step": 25000 }, { "epoch": 0.03, "grad_norm": 2.9315319061279297, "learning_rate": 9.708801165723462e-05, "loss": 4.7431, "step": 25100 }, { "epoch": 0.03, "grad_norm": 0.6957463026046753, "learning_rate": 9.707641011005229e-05, "loss": 4.7414, "step": 25200 }, { "epoch": 0.03, "grad_norm": 0.5213463306427002, "learning_rate": 9.706480856286994e-05, "loss": 4.7429, "step": 25300 }, { "epoch": 0.03, "grad_norm": 0.486128032207489, "learning_rate": 9.705320701568761e-05, "loss": 4.7405, "step": 25400 }, { "epoch": 0.03, "grad_norm": 2.561840295791626, "learning_rate": 9.704160546850528e-05, "loss": 4.741, "step": 25500 }, { "epoch": 0.03, "grad_norm": 5.198677062988281, "learning_rate": 9.703000392132295e-05, "loss": 4.7447, "step": 25600 }, { "epoch": 0.03, "grad_norm": 3.6826071739196777, "learning_rate": 9.701840237414061e-05, "loss": 4.7435, "step": 25700 }, { "epoch": 0.03, "grad_norm": 0.7129253149032593, "learning_rate": 9.70068008269583e-05, "loss": 4.7432, "step": 25800 }, { "epoch": 0.03, "grad_norm": 2.101804733276367, "learning_rate": 9.699519927977595e-05, "loss": 4.7518, "step": 25900 }, { "epoch": 0.03, "grad_norm": 0.5510717034339905, "learning_rate": 9.698359773259362e-05, "loss": 4.7417, "step": 26000 }, { "epoch": 0.03, "grad_norm": 0.45920300483703613, "learning_rate": 9.697199618541129e-05, "loss": 4.7421, "step": 26100 }, { "epoch": 0.03, "grad_norm": 0.5336456894874573, "learning_rate": 9.696039463822896e-05, "loss": 4.7411, "step": 26200 }, { "epoch": 0.03, "grad_norm": 0.5019949078559875, "learning_rate": 9.694879309104663e-05, "loss": 4.7436, "step": 26300 }, { "epoch": 0.03, "grad_norm": 33.536495208740234, "learning_rate": 9.693719154386429e-05, "loss": 4.7436, "step": 26400 }, { "epoch": 0.03, "grad_norm": 1.4645074605941772, "learning_rate": 9.692558999668196e-05, "loss": 4.7447, "step": 26500 }, { "epoch": 0.03, "grad_norm": 3.655017614364624, "learning_rate": 9.691398844949963e-05, "loss": 4.7404, "step": 26600 }, { "epoch": 0.03, "grad_norm": 0.49820154905319214, "learning_rate": 9.69023869023173e-05, "loss": 4.7432, "step": 26700 }, { "epoch": 0.03, "grad_norm": 0.6071832180023193, "learning_rate": 9.689078535513496e-05, "loss": 4.7436, "step": 26800 }, { "epoch": 0.03, "grad_norm": 0.5458092093467712, "learning_rate": 9.687918380795263e-05, "loss": 4.7438, "step": 26900 }, { "epoch": 0.03, "grad_norm": 0.5005242824554443, "learning_rate": 9.686758226077031e-05, "loss": 4.741, "step": 27000 }, { "epoch": 0.03, "grad_norm": 0.5511128306388855, "learning_rate": 9.685598071358797e-05, "loss": 4.743, "step": 27100 }, { "epoch": 0.03, "grad_norm": 17.676786422729492, "learning_rate": 9.684437916640564e-05, "loss": 4.7423, "step": 27200 }, { "epoch": 0.03, "grad_norm": 1.3633731603622437, "learning_rate": 9.68327776192233e-05, "loss": 4.7408, "step": 27300 }, { "epoch": 0.03, "grad_norm": 1.5432199239730835, "learning_rate": 9.682117607204098e-05, "loss": 4.7391, "step": 27400 }, { "epoch": 0.03, "grad_norm": 5.02588415145874, "learning_rate": 9.680957452485864e-05, "loss": 4.7416, "step": 27500 }, { "epoch": 0.03, "grad_norm": 2.1455700397491455, "learning_rate": 9.679797297767631e-05, "loss": 4.7398, "step": 27600 }, { "epoch": 0.03, "grad_norm": 1.6886168718338013, "learning_rate": 9.678637143049398e-05, "loss": 4.7418, "step": 27700 }, { "epoch": 0.03, "grad_norm": 0.5645326375961304, "learning_rate": 9.677476988331165e-05, "loss": 4.7381, "step": 27800 }, { "epoch": 0.03, "grad_norm": 6.7613911628723145, "learning_rate": 9.67631683361293e-05, "loss": 4.7404, "step": 27900 }, { "epoch": 0.03, "grad_norm": 0.5324685573577881, "learning_rate": 9.675156678894698e-05, "loss": 4.7378, "step": 28000 }, { "epoch": 0.03, "grad_norm": 4.839243412017822, "learning_rate": 9.673996524176465e-05, "loss": 4.7395, "step": 28100 }, { "epoch": 0.03, "grad_norm": 0.458783894777298, "learning_rate": 9.672836369458232e-05, "loss": 4.7422, "step": 28200 }, { "epoch": 0.03, "grad_norm": 0.9166305065155029, "learning_rate": 9.671676214739999e-05, "loss": 4.7428, "step": 28300 }, { "epoch": 0.03, "grad_norm": 1.108514428138733, "learning_rate": 9.670516060021764e-05, "loss": 4.7403, "step": 28400 }, { "epoch": 0.03, "grad_norm": 1.5330324172973633, "learning_rate": 9.669355905303531e-05, "loss": 4.7366, "step": 28500 }, { "epoch": 0.03, "grad_norm": 0.4888221025466919, "learning_rate": 9.668195750585298e-05, "loss": 4.7384, "step": 28600 }, { "epoch": 0.03, "grad_norm": 0.600286602973938, "learning_rate": 9.667035595867066e-05, "loss": 4.7367, "step": 28700 }, { "epoch": 0.03, "grad_norm": 2.486511468887329, "learning_rate": 9.665875441148831e-05, "loss": 4.7391, "step": 28800 }, { "epoch": 0.03, "grad_norm": 0.9454842209815979, "learning_rate": 9.6647152864306e-05, "loss": 4.7384, "step": 28900 }, { "epoch": 0.03, "grad_norm": 1.6840468645095825, "learning_rate": 9.663555131712365e-05, "loss": 4.7379, "step": 29000 }, { "epoch": 0.03, "grad_norm": 0.795483410358429, "learning_rate": 9.662394976994132e-05, "loss": 4.7393, "step": 29100 }, { "epoch": 0.03, "grad_norm": 0.8666725158691406, "learning_rate": 9.6612348222759e-05, "loss": 4.7389, "step": 29200 }, { "epoch": 0.03, "grad_norm": 4.16463565826416, "learning_rate": 9.660074667557666e-05, "loss": 4.7395, "step": 29300 }, { "epoch": 0.03, "grad_norm": 0.6866464614868164, "learning_rate": 9.658914512839433e-05, "loss": 4.7401, "step": 29400 }, { "epoch": 0.03, "grad_norm": 23.52367401123047, "learning_rate": 9.657754358121199e-05, "loss": 4.7562, "step": 29500 }, { "epoch": 0.03, "grad_norm": 4.992708683013916, "learning_rate": 9.656594203402966e-05, "loss": 4.7743, "step": 29600 }, { "epoch": 0.03, "grad_norm": 4.8701677322387695, "learning_rate": 9.655434048684733e-05, "loss": 4.7586, "step": 29700 }, { "epoch": 0.03, "grad_norm": 4.0893425941467285, "learning_rate": 9.6542738939665e-05, "loss": 4.7572, "step": 29800 }, { "epoch": 0.03, "grad_norm": 4.048985958099365, "learning_rate": 9.653113739248266e-05, "loss": 4.7514, "step": 29900 }, { "epoch": 0.03, "grad_norm": 4.577606678009033, "learning_rate": 9.651953584530033e-05, "loss": 4.7529, "step": 30000 }, { "epoch": 0.03, "grad_norm": 4.831415176391602, "learning_rate": 9.6507934298118e-05, "loss": 4.7512, "step": 30100 }, { "epoch": 0.04, "grad_norm": 4.870159149169922, "learning_rate": 9.649633275093567e-05, "loss": 4.7494, "step": 30200 }, { "epoch": 0.04, "grad_norm": 4.836753845214844, "learning_rate": 9.648473120375334e-05, "loss": 4.7493, "step": 30300 }, { "epoch": 0.04, "grad_norm": 4.718664169311523, "learning_rate": 9.6473129656571e-05, "loss": 4.7511, "step": 30400 }, { "epoch": 0.04, "grad_norm": 5.49000358581543, "learning_rate": 9.646152810938868e-05, "loss": 4.7494, "step": 30500 }, { "epoch": 0.04, "grad_norm": 4.819366931915283, "learning_rate": 9.644992656220634e-05, "loss": 4.7504, "step": 30600 }, { "epoch": 0.04, "grad_norm": 0.7323962450027466, "learning_rate": 9.643832501502401e-05, "loss": 4.7461, "step": 30700 }, { "epoch": 0.04, "grad_norm": 0.8463295102119446, "learning_rate": 9.642672346784168e-05, "loss": 4.7376, "step": 30800 }, { "epoch": 0.04, "grad_norm": 0.5474389791488647, "learning_rate": 9.641512192065935e-05, "loss": 4.7394, "step": 30900 }, { "epoch": 0.04, "grad_norm": 0.6245602965354919, "learning_rate": 9.640352037347701e-05, "loss": 4.7363, "step": 31000 }, { "epoch": 0.04, "grad_norm": 1.6616430282592773, "learning_rate": 9.639191882629468e-05, "loss": 4.7375, "step": 31100 }, { "epoch": 0.04, "grad_norm": 2.0729475021362305, "learning_rate": 9.638031727911235e-05, "loss": 4.7393, "step": 31200 }, { "epoch": 0.04, "grad_norm": 0.4616137146949768, "learning_rate": 9.636871573193002e-05, "loss": 4.7374, "step": 31300 }, { "epoch": 0.04, "grad_norm": 1.6392802000045776, "learning_rate": 9.635711418474769e-05, "loss": 4.7383, "step": 31400 }, { "epoch": 0.04, "grad_norm": 0.5004580616950989, "learning_rate": 9.634551263756535e-05, "loss": 4.7378, "step": 31500 }, { "epoch": 0.04, "grad_norm": 0.5481105446815491, "learning_rate": 9.633391109038302e-05, "loss": 4.7382, "step": 31600 }, { "epoch": 0.04, "grad_norm": 0.6281868815422058, "learning_rate": 9.632230954320069e-05, "loss": 4.7383, "step": 31700 }, { "epoch": 0.04, "grad_norm": 0.664202868938446, "learning_rate": 9.631070799601836e-05, "loss": 4.739, "step": 31800 }, { "epoch": 0.04, "grad_norm": 0.6041770577430725, "learning_rate": 9.629910644883601e-05, "loss": 4.7359, "step": 31900 }, { "epoch": 0.04, "grad_norm": 3.237818717956543, "learning_rate": 9.62875049016537e-05, "loss": 4.7384, "step": 32000 }, { "epoch": 0.04, "grad_norm": 0.4940323829650879, "learning_rate": 9.627590335447135e-05, "loss": 4.7363, "step": 32100 }, { "epoch": 0.04, "grad_norm": 0.5114046335220337, "learning_rate": 9.626430180728902e-05, "loss": 4.7383, "step": 32200 }, { "epoch": 0.04, "grad_norm": 0.9840266704559326, "learning_rate": 9.625270026010668e-05, "loss": 4.7378, "step": 32300 }, { "epoch": 0.04, "grad_norm": 0.5324087738990784, "learning_rate": 9.624109871292437e-05, "loss": 4.7398, "step": 32400 }, { "epoch": 0.04, "grad_norm": 4.636378765106201, "learning_rate": 9.622949716574204e-05, "loss": 4.7336, "step": 32500 }, { "epoch": 0.04, "grad_norm": 0.4898914396762848, "learning_rate": 9.621789561855969e-05, "loss": 4.7374, "step": 32600 }, { "epoch": 0.04, "grad_norm": 0.6327505111694336, "learning_rate": 9.620629407137736e-05, "loss": 4.7321, "step": 32700 }, { "epoch": 0.04, "grad_norm": 10.219440460205078, "learning_rate": 9.619469252419503e-05, "loss": 4.737, "step": 32800 }, { "epoch": 0.04, "grad_norm": 0.5579794049263, "learning_rate": 9.61830909770127e-05, "loss": 4.7367, "step": 32900 }, { "epoch": 0.04, "grad_norm": 0.46085384488105774, "learning_rate": 9.617148942983036e-05, "loss": 4.7379, "step": 33000 }, { "epoch": 0.04, "grad_norm": 0.5021482110023499, "learning_rate": 9.615988788264803e-05, "loss": 4.7368, "step": 33100 }, { "epoch": 0.04, "grad_norm": 15.20097541809082, "learning_rate": 9.61482863354657e-05, "loss": 4.736, "step": 33200 }, { "epoch": 0.04, "grad_norm": 0.7271726727485657, "learning_rate": 9.613668478828337e-05, "loss": 4.7366, "step": 33300 }, { "epoch": 0.04, "grad_norm": 0.48880913853645325, "learning_rate": 9.612508324110103e-05, "loss": 4.7366, "step": 33400 }, { "epoch": 0.04, "grad_norm": 0.5590758323669434, "learning_rate": 9.61134816939187e-05, "loss": 4.7391, "step": 33500 }, { "epoch": 0.04, "grad_norm": 0.4674455225467682, "learning_rate": 9.610188014673638e-05, "loss": 4.7379, "step": 33600 }, { "epoch": 0.04, "grad_norm": 0.517918050289154, "learning_rate": 9.609027859955404e-05, "loss": 4.7367, "step": 33700 }, { "epoch": 0.04, "grad_norm": 1.2398717403411865, "learning_rate": 9.607867705237171e-05, "loss": 4.738, "step": 33800 }, { "epoch": 0.04, "grad_norm": 0.5215381979942322, "learning_rate": 9.606707550518937e-05, "loss": 4.7352, "step": 33900 }, { "epoch": 0.04, "grad_norm": 2.9021263122558594, "learning_rate": 9.605547395800705e-05, "loss": 4.7348, "step": 34000 }, { "epoch": 0.04, "grad_norm": 0.46950528025627136, "learning_rate": 9.604387241082471e-05, "loss": 4.7348, "step": 34100 }, { "epoch": 0.04, "grad_norm": 0.5117696523666382, "learning_rate": 9.603227086364238e-05, "loss": 4.7371, "step": 34200 }, { "epoch": 0.04, "grad_norm": 0.7093480825424194, "learning_rate": 9.602066931646005e-05, "loss": 4.7383, "step": 34300 }, { "epoch": 0.04, "grad_norm": 0.4637995958328247, "learning_rate": 9.600906776927772e-05, "loss": 4.7371, "step": 34400 }, { "epoch": 0.04, "grad_norm": 0.5002011060714722, "learning_rate": 9.599746622209538e-05, "loss": 4.7359, "step": 34500 }, { "epoch": 0.04, "grad_norm": 0.7651986479759216, "learning_rate": 9.598586467491305e-05, "loss": 4.7303, "step": 34600 }, { "epoch": 0.04, "grad_norm": 1.2940102815628052, "learning_rate": 9.597426312773072e-05, "loss": 4.7331, "step": 34700 }, { "epoch": 0.04, "grad_norm": 0.5338752269744873, "learning_rate": 9.596266158054839e-05, "loss": 4.7361, "step": 34800 }, { "epoch": 0.04, "grad_norm": 1.049210786819458, "learning_rate": 9.595106003336606e-05, "loss": 4.7347, "step": 34900 }, { "epoch": 0.04, "grad_norm": 2.3599870204925537, "learning_rate": 9.593945848618372e-05, "loss": 4.7375, "step": 35000 }, { "epoch": 0.04, "grad_norm": 0.5492005348205566, "learning_rate": 9.59278569390014e-05, "loss": 4.7352, "step": 35100 }, { "epoch": 0.04, "grad_norm": 0.5505391955375671, "learning_rate": 9.591625539181906e-05, "loss": 4.7332, "step": 35200 }, { "epoch": 0.04, "grad_norm": 0.6726617813110352, "learning_rate": 9.590465384463673e-05, "loss": 4.7315, "step": 35300 }, { "epoch": 0.04, "grad_norm": 0.558929979801178, "learning_rate": 9.589305229745438e-05, "loss": 4.732, "step": 35400 }, { "epoch": 0.04, "grad_norm": 1.7398250102996826, "learning_rate": 9.588145075027207e-05, "loss": 4.7329, "step": 35500 }, { "epoch": 0.04, "grad_norm": 0.5553580522537231, "learning_rate": 9.586984920308972e-05, "loss": 4.7355, "step": 35600 }, { "epoch": 0.04, "grad_norm": 0.5289317965507507, "learning_rate": 9.58582476559074e-05, "loss": 4.7363, "step": 35700 }, { "epoch": 0.04, "grad_norm": 3.049525499343872, "learning_rate": 9.584664610872507e-05, "loss": 4.7342, "step": 35800 }, { "epoch": 0.04, "grad_norm": 0.4871656000614166, "learning_rate": 9.583504456154274e-05, "loss": 4.7356, "step": 35900 }, { "epoch": 0.04, "grad_norm": 2.0833821296691895, "learning_rate": 9.58234430143604e-05, "loss": 4.7345, "step": 36000 }, { "epoch": 0.04, "grad_norm": 5.624002456665039, "learning_rate": 9.581184146717806e-05, "loss": 4.7323, "step": 36100 }, { "epoch": 0.04, "grad_norm": 0.6378651261329651, "learning_rate": 9.580023991999573e-05, "loss": 4.7349, "step": 36200 }, { "epoch": 0.04, "grad_norm": 2.1615560054779053, "learning_rate": 9.57886383728134e-05, "loss": 4.7345, "step": 36300 }, { "epoch": 0.04, "grad_norm": 0.5154897570610046, "learning_rate": 9.577703682563107e-05, "loss": 4.7357, "step": 36400 }, { "epoch": 0.04, "grad_norm": 6.502463340759277, "learning_rate": 9.576543527844873e-05, "loss": 4.7354, "step": 36500 }, { "epoch": 0.04, "grad_norm": 0.5349368453025818, "learning_rate": 9.57538337312664e-05, "loss": 4.7342, "step": 36600 }, { "epoch": 0.04, "grad_norm": 1.0265626907348633, "learning_rate": 9.574223218408407e-05, "loss": 4.7327, "step": 36700 }, { "epoch": 0.04, "grad_norm": 0.8190938830375671, "learning_rate": 9.573063063690174e-05, "loss": 4.732, "step": 36800 }, { "epoch": 0.04, "grad_norm": 9.420807838439941, "learning_rate": 9.571902908971941e-05, "loss": 4.7322, "step": 36900 }, { "epoch": 0.04, "grad_norm": 0.5019901990890503, "learning_rate": 9.570742754253707e-05, "loss": 4.7314, "step": 37000 }, { "epoch": 0.04, "grad_norm": 2.35811448097229, "learning_rate": 9.569582599535475e-05, "loss": 4.7314, "step": 37100 }, { "epoch": 0.04, "grad_norm": 6.828240871429443, "learning_rate": 9.568422444817241e-05, "loss": 4.7329, "step": 37200 }, { "epoch": 0.04, "grad_norm": 0.4542797803878784, "learning_rate": 9.567262290099008e-05, "loss": 4.7297, "step": 37300 }, { "epoch": 0.04, "grad_norm": 0.4769699275493622, "learning_rate": 9.566102135380775e-05, "loss": 4.7334, "step": 37400 }, { "epoch": 0.04, "grad_norm": 0.6346319317817688, "learning_rate": 9.564941980662542e-05, "loss": 4.7324, "step": 37500 }, { "epoch": 0.04, "grad_norm": 0.621337890625, "learning_rate": 9.563781825944308e-05, "loss": 4.7335, "step": 37600 }, { "epoch": 0.04, "grad_norm": 0.49926477670669556, "learning_rate": 9.562621671226075e-05, "loss": 4.7334, "step": 37700 }, { "epoch": 0.04, "grad_norm": 1.4097819328308105, "learning_rate": 9.561461516507842e-05, "loss": 4.7316, "step": 37800 }, { "epoch": 0.04, "grad_norm": 0.4786432087421417, "learning_rate": 9.560301361789609e-05, "loss": 4.7323, "step": 37900 }, { "epoch": 0.04, "grad_norm": 0.7441820502281189, "learning_rate": 9.559141207071376e-05, "loss": 4.7322, "step": 38000 }, { "epoch": 0.04, "grad_norm": 0.5333019495010376, "learning_rate": 9.557981052353142e-05, "loss": 4.7326, "step": 38100 }, { "epoch": 0.04, "grad_norm": 0.9219884872436523, "learning_rate": 9.556820897634909e-05, "loss": 4.732, "step": 38200 }, { "epoch": 0.04, "grad_norm": 0.7354302406311035, "learning_rate": 9.555660742916676e-05, "loss": 4.7322, "step": 38300 }, { "epoch": 0.04, "grad_norm": 0.43798136711120605, "learning_rate": 9.554500588198443e-05, "loss": 4.7319, "step": 38400 }, { "epoch": 0.04, "grad_norm": 0.5193877220153809, "learning_rate": 9.553340433480208e-05, "loss": 4.7313, "step": 38500 }, { "epoch": 0.04, "grad_norm": 0.6578642725944519, "learning_rate": 9.552180278761977e-05, "loss": 4.7301, "step": 38600 }, { "epoch": 0.04, "grad_norm": 0.481916606426239, "learning_rate": 9.551020124043743e-05, "loss": 4.7336, "step": 38700 }, { "epoch": 0.05, "grad_norm": 0.495345801115036, "learning_rate": 9.54985996932551e-05, "loss": 4.7292, "step": 38800 }, { "epoch": 0.05, "grad_norm": 1.2032209634780884, "learning_rate": 9.548699814607277e-05, "loss": 4.731, "step": 38900 }, { "epoch": 0.05, "grad_norm": 0.45609620213508606, "learning_rate": 9.547539659889044e-05, "loss": 4.7309, "step": 39000 }, { "epoch": 0.05, "grad_norm": 0.46040889620780945, "learning_rate": 9.546379505170811e-05, "loss": 4.7319, "step": 39100 }, { "epoch": 0.05, "grad_norm": 0.8751170635223389, "learning_rate": 9.545219350452576e-05, "loss": 4.7304, "step": 39200 }, { "epoch": 0.05, "grad_norm": 0.4785304665565491, "learning_rate": 9.544059195734343e-05, "loss": 4.729, "step": 39300 }, { "epoch": 0.05, "grad_norm": 4.840359210968018, "learning_rate": 9.54289904101611e-05, "loss": 4.7368, "step": 39400 }, { "epoch": 0.05, "grad_norm": 0.5387877225875854, "learning_rate": 9.541738886297878e-05, "loss": 4.7325, "step": 39500 }, { "epoch": 0.05, "grad_norm": 1.0035640001296997, "learning_rate": 9.540578731579643e-05, "loss": 4.7307, "step": 39600 }, { "epoch": 0.05, "grad_norm": 0.5036232471466064, "learning_rate": 9.53941857686141e-05, "loss": 4.7324, "step": 39700 }, { "epoch": 0.05, "grad_norm": 0.8626024127006531, "learning_rate": 9.538258422143177e-05, "loss": 4.7286, "step": 39800 }, { "epoch": 0.05, "grad_norm": 6.899303436279297, "learning_rate": 9.537098267424944e-05, "loss": 4.7311, "step": 39900 }, { "epoch": 0.05, "grad_norm": 0.5646871328353882, "learning_rate": 9.535938112706711e-05, "loss": 4.7303, "step": 40000 }, { "epoch": 0.05, "grad_norm": 0.9518368244171143, "learning_rate": 9.534777957988477e-05, "loss": 4.7314, "step": 40100 }, { "epoch": 0.05, "grad_norm": 0.5652722120285034, "learning_rate": 9.533617803270245e-05, "loss": 4.7304, "step": 40200 }, { "epoch": 0.05, "grad_norm": 0.5541896224021912, "learning_rate": 9.532457648552011e-05, "loss": 4.73, "step": 40300 }, { "epoch": 0.05, "grad_norm": 0.49006637930870056, "learning_rate": 9.531297493833778e-05, "loss": 4.7311, "step": 40400 }, { "epoch": 0.05, "grad_norm": 0.5993065237998962, "learning_rate": 9.530137339115545e-05, "loss": 4.7309, "step": 40500 }, { "epoch": 0.05, "grad_norm": 0.5642876029014587, "learning_rate": 9.528977184397312e-05, "loss": 4.7299, "step": 40600 }, { "epoch": 0.05, "grad_norm": 7.888554096221924, "learning_rate": 9.527817029679078e-05, "loss": 4.7435, "step": 40700 }, { "epoch": 0.05, "grad_norm": 12.3725004196167, "learning_rate": 9.526656874960845e-05, "loss": 4.7401, "step": 40800 }, { "epoch": 0.05, "grad_norm": 2.139461040496826, "learning_rate": 9.525496720242612e-05, "loss": 4.7359, "step": 40900 }, { "epoch": 0.05, "grad_norm": 0.4835149645805359, "learning_rate": 9.524336565524379e-05, "loss": 4.7327, "step": 41000 }, { "epoch": 0.05, "grad_norm": 0.4717291593551636, "learning_rate": 9.523176410806146e-05, "loss": 4.7305, "step": 41100 }, { "epoch": 0.05, "grad_norm": 0.48854538798332214, "learning_rate": 9.522016256087912e-05, "loss": 4.7265, "step": 41200 }, { "epoch": 0.05, "grad_norm": 0.48703619837760925, "learning_rate": 9.520856101369679e-05, "loss": 4.7299, "step": 41300 }, { "epoch": 0.05, "grad_norm": 0.49546846747398376, "learning_rate": 9.519695946651446e-05, "loss": 4.7297, "step": 41400 }, { "epoch": 0.05, "grad_norm": 0.4952056109905243, "learning_rate": 9.518535791933213e-05, "loss": 4.7326, "step": 41500 }, { "epoch": 0.05, "grad_norm": 8.376193046569824, "learning_rate": 9.517375637214979e-05, "loss": 4.7306, "step": 41600 }, { "epoch": 0.05, "grad_norm": 0.4306395351886749, "learning_rate": 9.516215482496747e-05, "loss": 4.7286, "step": 41700 }, { "epoch": 0.05, "grad_norm": 0.9443186521530151, "learning_rate": 9.515055327778513e-05, "loss": 4.7308, "step": 41800 }, { "epoch": 0.05, "grad_norm": 0.47867128252983093, "learning_rate": 9.51389517306028e-05, "loss": 4.7289, "step": 41900 }, { "epoch": 0.05, "grad_norm": 1.2091776132583618, "learning_rate": 9.512735018342045e-05, "loss": 4.7378, "step": 42000 }, { "epoch": 0.05, "grad_norm": 0.4395917057991028, "learning_rate": 9.511574863623814e-05, "loss": 4.7328, "step": 42100 }, { "epoch": 0.05, "grad_norm": 0.723639965057373, "learning_rate": 9.510414708905581e-05, "loss": 4.7288, "step": 42200 }, { "epoch": 0.05, "grad_norm": 5.487166881561279, "learning_rate": 9.509254554187347e-05, "loss": 4.7298, "step": 42300 }, { "epoch": 0.05, "grad_norm": 2.0608367919921875, "learning_rate": 9.508094399469114e-05, "loss": 4.7312, "step": 42400 }, { "epoch": 0.05, "grad_norm": 0.465772807598114, "learning_rate": 9.50693424475088e-05, "loss": 4.7308, "step": 42500 }, { "epoch": 0.05, "grad_norm": 0.4866921007633209, "learning_rate": 9.505774090032648e-05, "loss": 4.7295, "step": 42600 }, { "epoch": 0.05, "grad_norm": 1.1015464067459106, "learning_rate": 9.504613935314413e-05, "loss": 4.7296, "step": 42700 }, { "epoch": 0.05, "grad_norm": 0.6402974128723145, "learning_rate": 9.50345378059618e-05, "loss": 4.7285, "step": 42800 }, { "epoch": 0.05, "grad_norm": 2.76873517036438, "learning_rate": 9.502293625877947e-05, "loss": 4.7306, "step": 42900 }, { "epoch": 0.05, "grad_norm": 0.7945021390914917, "learning_rate": 9.501133471159715e-05, "loss": 4.729, "step": 43000 }, { "epoch": 0.05, "grad_norm": 0.3986837565898895, "learning_rate": 9.49997331644148e-05, "loss": 4.7306, "step": 43100 }, { "epoch": 0.05, "grad_norm": 0.4017668068408966, "learning_rate": 9.498813161723247e-05, "loss": 4.7277, "step": 43200 }, { "epoch": 0.05, "grad_norm": 1.6943633556365967, "learning_rate": 9.497653007005016e-05, "loss": 4.7301, "step": 43300 }, { "epoch": 0.05, "grad_norm": 2.686843156814575, "learning_rate": 9.496492852286781e-05, "loss": 4.728, "step": 43400 }, { "epoch": 0.05, "grad_norm": 0.7867078185081482, "learning_rate": 9.495332697568548e-05, "loss": 4.7272, "step": 43500 }, { "epoch": 0.05, "grad_norm": 1.0648784637451172, "learning_rate": 9.494172542850315e-05, "loss": 4.7263, "step": 43600 }, { "epoch": 0.05, "grad_norm": 1.6653295755386353, "learning_rate": 9.493012388132082e-05, "loss": 4.7309, "step": 43700 }, { "epoch": 0.05, "grad_norm": 0.9347316026687622, "learning_rate": 9.491852233413848e-05, "loss": 4.7303, "step": 43800 }, { "epoch": 0.05, "grad_norm": 1.3211824893951416, "learning_rate": 9.490692078695615e-05, "loss": 4.7294, "step": 43900 }, { "epoch": 0.05, "grad_norm": 0.8230929970741272, "learning_rate": 9.489531923977382e-05, "loss": 4.7282, "step": 44000 }, { "epoch": 0.05, "grad_norm": 1.3526966571807861, "learning_rate": 9.488371769259149e-05, "loss": 4.7267, "step": 44100 }, { "epoch": 0.05, "grad_norm": 0.4441579282283783, "learning_rate": 9.487211614540915e-05, "loss": 4.7265, "step": 44200 }, { "epoch": 0.05, "grad_norm": 5.511246204376221, "learning_rate": 9.486051459822682e-05, "loss": 4.7302, "step": 44300 }, { "epoch": 0.05, "grad_norm": 0.5045779943466187, "learning_rate": 9.484891305104449e-05, "loss": 4.73, "step": 44400 }, { "epoch": 0.05, "grad_norm": 0.4756096601486206, "learning_rate": 9.483731150386216e-05, "loss": 4.7291, "step": 44500 }, { "epoch": 0.05, "grad_norm": 0.4292340576648712, "learning_rate": 9.482570995667983e-05, "loss": 4.7272, "step": 44600 }, { "epoch": 0.05, "grad_norm": 0.6878976821899414, "learning_rate": 9.481410840949749e-05, "loss": 4.726, "step": 44700 }, { "epoch": 0.05, "grad_norm": 0.502358078956604, "learning_rate": 9.480250686231517e-05, "loss": 4.7266, "step": 44800 }, { "epoch": 0.05, "grad_norm": 0.6721329092979431, "learning_rate": 9.479090531513283e-05, "loss": 4.7294, "step": 44900 }, { "epoch": 0.05, "grad_norm": 0.438326895236969, "learning_rate": 9.47793037679505e-05, "loss": 4.728, "step": 45000 }, { "epoch": 0.05, "grad_norm": 0.722699761390686, "learning_rate": 9.476770222076816e-05, "loss": 4.7298, "step": 45100 }, { "epoch": 0.05, "grad_norm": 7.889718532562256, "learning_rate": 9.475610067358584e-05, "loss": 4.724, "step": 45200 }, { "epoch": 0.05, "grad_norm": 25.698381423950195, "learning_rate": 9.47444991264035e-05, "loss": 4.7291, "step": 45300 }, { "epoch": 0.05, "grad_norm": 0.5010745525360107, "learning_rate": 9.473289757922117e-05, "loss": 4.7344, "step": 45400 }, { "epoch": 0.05, "grad_norm": 3.237755537033081, "learning_rate": 9.472129603203884e-05, "loss": 4.7266, "step": 45500 }, { "epoch": 0.05, "grad_norm": 0.9018293619155884, "learning_rate": 9.470969448485651e-05, "loss": 4.728, "step": 45600 }, { "epoch": 0.05, "grad_norm": 0.5156495571136475, "learning_rate": 9.469809293767418e-05, "loss": 4.7259, "step": 45700 }, { "epoch": 0.05, "grad_norm": 5.2465362548828125, "learning_rate": 9.468649139049184e-05, "loss": 4.7269, "step": 45800 }, { "epoch": 0.05, "grad_norm": 0.610849916934967, "learning_rate": 9.46748898433095e-05, "loss": 4.7292, "step": 45900 }, { "epoch": 0.05, "grad_norm": 2.3922882080078125, "learning_rate": 9.466328829612718e-05, "loss": 4.7283, "step": 46000 }, { "epoch": 0.05, "grad_norm": 0.42459169030189514, "learning_rate": 9.465168674894485e-05, "loss": 4.727, "step": 46100 }, { "epoch": 0.05, "grad_norm": 0.4948953092098236, "learning_rate": 9.46400852017625e-05, "loss": 4.7247, "step": 46200 }, { "epoch": 0.05, "grad_norm": 0.4495919346809387, "learning_rate": 9.462848365458017e-05, "loss": 4.7275, "step": 46300 }, { "epoch": 0.05, "grad_norm": 1.1093862056732178, "learning_rate": 9.461688210739784e-05, "loss": 4.7297, "step": 46400 }, { "epoch": 0.05, "grad_norm": 0.4342365860939026, "learning_rate": 9.460528056021551e-05, "loss": 4.7306, "step": 46500 }, { "epoch": 0.05, "grad_norm": 25.432937622070312, "learning_rate": 9.459367901303319e-05, "loss": 4.732, "step": 46600 }, { "epoch": 0.05, "grad_norm": 2.0753910541534424, "learning_rate": 9.458207746585086e-05, "loss": 4.7368, "step": 46700 }, { "epoch": 0.05, "grad_norm": 4.888164520263672, "learning_rate": 9.457047591866853e-05, "loss": 4.7355, "step": 46800 }, { "epoch": 0.05, "grad_norm": 0.5124868750572205, "learning_rate": 9.455887437148618e-05, "loss": 4.7255, "step": 46900 }, { "epoch": 0.05, "grad_norm": 0.4563619792461395, "learning_rate": 9.454727282430385e-05, "loss": 4.7266, "step": 47000 }, { "epoch": 0.05, "grad_norm": 0.6962786316871643, "learning_rate": 9.453567127712152e-05, "loss": 4.7266, "step": 47100 }, { "epoch": 0.05, "grad_norm": 0.6182125210762024, "learning_rate": 9.45240697299392e-05, "loss": 4.7262, "step": 47200 }, { "epoch": 0.05, "grad_norm": 0.44259679317474365, "learning_rate": 9.451246818275685e-05, "loss": 4.7289, "step": 47300 }, { "epoch": 0.05, "grad_norm": 1.4451597929000854, "learning_rate": 9.450086663557452e-05, "loss": 4.7253, "step": 47400 }, { "epoch": 0.06, "grad_norm": 6.354624271392822, "learning_rate": 9.448926508839219e-05, "loss": 4.7252, "step": 47500 }, { "epoch": 0.06, "grad_norm": 0.49475088715553284, "learning_rate": 9.447766354120986e-05, "loss": 4.7261, "step": 47600 }, { "epoch": 0.06, "grad_norm": 5.40777587890625, "learning_rate": 9.446606199402753e-05, "loss": 4.7274, "step": 47700 }, { "epoch": 0.06, "grad_norm": 2.0609936714172363, "learning_rate": 9.445446044684519e-05, "loss": 4.7283, "step": 47800 }, { "epoch": 0.06, "grad_norm": 1.6895414590835571, "learning_rate": 9.444285889966287e-05, "loss": 4.7247, "step": 47900 }, { "epoch": 0.06, "grad_norm": 0.4589858949184418, "learning_rate": 9.443125735248053e-05, "loss": 4.7275, "step": 48000 }, { "epoch": 0.06, "grad_norm": 3.8931922912597656, "learning_rate": 9.44196558052982e-05, "loss": 4.7245, "step": 48100 }, { "epoch": 0.06, "grad_norm": 0.5537588000297546, "learning_rate": 9.440805425811586e-05, "loss": 4.7283, "step": 48200 }, { "epoch": 0.06, "grad_norm": 4.216842174530029, "learning_rate": 9.439645271093354e-05, "loss": 4.7238, "step": 48300 }, { "epoch": 0.06, "grad_norm": 0.4659540355205536, "learning_rate": 9.43848511637512e-05, "loss": 4.7269, "step": 48400 }, { "epoch": 0.06, "grad_norm": 0.4347868859767914, "learning_rate": 9.437324961656887e-05, "loss": 4.7272, "step": 48500 }, { "epoch": 0.06, "grad_norm": 0.4908686578273773, "learning_rate": 9.436164806938654e-05, "loss": 4.7217, "step": 48600 }, { "epoch": 0.06, "grad_norm": 0.4566729664802551, "learning_rate": 9.435004652220421e-05, "loss": 4.7259, "step": 48700 }, { "epoch": 0.06, "grad_norm": 0.6793931126594543, "learning_rate": 9.433844497502188e-05, "loss": 4.7272, "step": 48800 }, { "epoch": 0.06, "grad_norm": 0.4818676710128784, "learning_rate": 9.432684342783954e-05, "loss": 4.7261, "step": 48900 }, { "epoch": 0.06, "grad_norm": 0.475321888923645, "learning_rate": 9.431524188065721e-05, "loss": 4.7261, "step": 49000 }, { "epoch": 0.06, "grad_norm": 0.4802456200122833, "learning_rate": 9.430364033347488e-05, "loss": 4.7276, "step": 49100 }, { "epoch": 0.06, "grad_norm": 1.9790329933166504, "learning_rate": 9.429203878629255e-05, "loss": 4.7238, "step": 49200 }, { "epoch": 0.06, "grad_norm": 0.4709641933441162, "learning_rate": 9.42804372391102e-05, "loss": 4.7238, "step": 49300 }, { "epoch": 0.06, "grad_norm": 0.43033114075660706, "learning_rate": 9.426883569192788e-05, "loss": 4.7242, "step": 49400 }, { "epoch": 0.06, "grad_norm": 0.4878010153770447, "learning_rate": 9.425723414474555e-05, "loss": 4.7291, "step": 49500 }, { "epoch": 0.06, "grad_norm": 0.6672825813293457, "learning_rate": 9.424563259756322e-05, "loss": 4.7276, "step": 49600 }, { "epoch": 0.06, "grad_norm": 2.705127239227295, "learning_rate": 9.423403105038089e-05, "loss": 4.7258, "step": 49700 }, { "epoch": 0.06, "grad_norm": 0.4465647339820862, "learning_rate": 9.422242950319854e-05, "loss": 4.7254, "step": 49800 }, { "epoch": 0.06, "grad_norm": 1.3617794513702393, "learning_rate": 9.421082795601623e-05, "loss": 4.7275, "step": 49900 }, { "epoch": 0.06, "grad_norm": 1.3789376020431519, "learning_rate": 9.419922640883388e-05, "loss": 4.727, "step": 50000 }, { "epoch": 0.06, "grad_norm": 0.8773742318153381, "learning_rate": 9.418762486165156e-05, "loss": 4.7279, "step": 50100 }, { "epoch": 0.06, "grad_norm": 0.47493115067481995, "learning_rate": 9.417602331446923e-05, "loss": 4.7244, "step": 50200 }, { "epoch": 0.06, "grad_norm": 0.4560215175151825, "learning_rate": 9.41644217672869e-05, "loss": 4.7258, "step": 50300 }, { "epoch": 0.06, "grad_norm": 0.5064975023269653, "learning_rate": 9.415282022010455e-05, "loss": 4.7246, "step": 50400 }, { "epoch": 0.06, "grad_norm": 1.276395320892334, "learning_rate": 9.414121867292222e-05, "loss": 4.73, "step": 50500 }, { "epoch": 0.06, "grad_norm": 0.6075153350830078, "learning_rate": 9.41296171257399e-05, "loss": 4.726, "step": 50600 }, { "epoch": 0.06, "grad_norm": 3.9961352348327637, "learning_rate": 9.411801557855756e-05, "loss": 4.7256, "step": 50700 }, { "epoch": 0.06, "grad_norm": 0.469164103269577, "learning_rate": 9.410641403137523e-05, "loss": 4.7264, "step": 50800 }, { "epoch": 0.06, "grad_norm": 0.9661677479743958, "learning_rate": 9.409481248419289e-05, "loss": 4.7256, "step": 50900 }, { "epoch": 0.06, "grad_norm": 4.348484039306641, "learning_rate": 9.408321093701058e-05, "loss": 4.7207, "step": 51000 }, { "epoch": 0.06, "grad_norm": 0.4262404143810272, "learning_rate": 9.407160938982823e-05, "loss": 4.7248, "step": 51100 }, { "epoch": 0.06, "grad_norm": 2.119818687438965, "learning_rate": 9.40600078426459e-05, "loss": 4.7234, "step": 51200 }, { "epoch": 0.06, "grad_norm": 0.43322816491127014, "learning_rate": 9.404840629546356e-05, "loss": 4.7262, "step": 51300 }, { "epoch": 0.06, "grad_norm": 0.43069151043891907, "learning_rate": 9.403680474828124e-05, "loss": 4.725, "step": 51400 }, { "epoch": 0.06, "grad_norm": 0.5061689019203186, "learning_rate": 9.40252032010989e-05, "loss": 4.7281, "step": 51500 }, { "epoch": 0.06, "grad_norm": 1.387876272201538, "learning_rate": 9.401360165391657e-05, "loss": 4.7265, "step": 51600 }, { "epoch": 0.06, "grad_norm": 0.48391860723495483, "learning_rate": 9.400200010673423e-05, "loss": 4.7293, "step": 51700 }, { "epoch": 0.06, "grad_norm": 0.8254903554916382, "learning_rate": 9.399039855955191e-05, "loss": 4.7232, "step": 51800 }, { "epoch": 0.06, "grad_norm": 3.460291862487793, "learning_rate": 9.397879701236958e-05, "loss": 4.7237, "step": 51900 }, { "epoch": 0.06, "grad_norm": 1.3960816860198975, "learning_rate": 9.396719546518724e-05, "loss": 4.7254, "step": 52000 }, { "epoch": 0.06, "grad_norm": 6.771088600158691, "learning_rate": 9.395559391800491e-05, "loss": 4.7251, "step": 52100 }, { "epoch": 0.06, "grad_norm": 0.49326056241989136, "learning_rate": 9.394399237082258e-05, "loss": 4.7239, "step": 52200 }, { "epoch": 0.06, "grad_norm": 0.45919495820999146, "learning_rate": 9.393239082364025e-05, "loss": 4.7262, "step": 52300 }, { "epoch": 0.06, "grad_norm": 0.7638592720031738, "learning_rate": 9.392078927645791e-05, "loss": 4.7253, "step": 52400 }, { "epoch": 0.06, "grad_norm": 1.0643978118896484, "learning_rate": 9.390918772927558e-05, "loss": 4.722, "step": 52500 }, { "epoch": 0.06, "grad_norm": 0.4550251364707947, "learning_rate": 9.389758618209325e-05, "loss": 4.7239, "step": 52600 }, { "epoch": 0.06, "grad_norm": 0.7114538550376892, "learning_rate": 9.388598463491092e-05, "loss": 4.7239, "step": 52700 }, { "epoch": 0.06, "grad_norm": 0.4556228220462799, "learning_rate": 9.387438308772857e-05, "loss": 4.725, "step": 52800 }, { "epoch": 0.06, "grad_norm": 0.4266548752784729, "learning_rate": 9.386278154054625e-05, "loss": 4.7238, "step": 52900 }, { "epoch": 0.06, "grad_norm": 1.829122543334961, "learning_rate": 9.385117999336393e-05, "loss": 4.7246, "step": 53000 }, { "epoch": 0.06, "grad_norm": 0.4608537554740906, "learning_rate": 9.383957844618159e-05, "loss": 4.723, "step": 53100 }, { "epoch": 0.06, "grad_norm": 2.6241915225982666, "learning_rate": 9.382797689899926e-05, "loss": 4.7239, "step": 53200 }, { "epoch": 0.06, "grad_norm": 0.4583094120025635, "learning_rate": 9.381637535181693e-05, "loss": 4.722, "step": 53300 }, { "epoch": 0.06, "grad_norm": 1.0573890209197998, "learning_rate": 9.38047738046346e-05, "loss": 4.726, "step": 53400 }, { "epoch": 0.06, "grad_norm": 1.0593518018722534, "learning_rate": 9.379317225745225e-05, "loss": 4.7224, "step": 53500 }, { "epoch": 0.06, "grad_norm": 15.059647560119629, "learning_rate": 9.378157071026992e-05, "loss": 4.7271, "step": 53600 }, { "epoch": 0.06, "grad_norm": 0.4176161587238312, "learning_rate": 9.37699691630876e-05, "loss": 4.7219, "step": 53700 }, { "epoch": 0.06, "grad_norm": 7.073505401611328, "learning_rate": 9.375836761590527e-05, "loss": 4.7261, "step": 53800 }, { "epoch": 0.06, "grad_norm": 0.4442752003669739, "learning_rate": 9.374676606872292e-05, "loss": 4.7203, "step": 53900 }, { "epoch": 0.06, "grad_norm": 0.9274020195007324, "learning_rate": 9.373516452154059e-05, "loss": 4.7251, "step": 54000 }, { "epoch": 0.06, "grad_norm": 0.5794118046760559, "learning_rate": 9.372356297435826e-05, "loss": 4.7229, "step": 54100 }, { "epoch": 0.06, "grad_norm": 0.49662327766418457, "learning_rate": 9.371196142717593e-05, "loss": 4.7233, "step": 54200 }, { "epoch": 0.06, "grad_norm": 1.1663075685501099, "learning_rate": 9.37003598799936e-05, "loss": 4.7245, "step": 54300 }, { "epoch": 0.06, "grad_norm": 0.5266515612602234, "learning_rate": 9.368875833281126e-05, "loss": 4.7285, "step": 54400 }, { "epoch": 0.06, "grad_norm": 0.9966477751731873, "learning_rate": 9.367715678562894e-05, "loss": 4.7221, "step": 54500 }, { "epoch": 0.06, "grad_norm": 0.42000776529312134, "learning_rate": 9.36655552384466e-05, "loss": 4.7277, "step": 54600 }, { "epoch": 0.06, "grad_norm": 0.4215773344039917, "learning_rate": 9.365395369126427e-05, "loss": 4.7204, "step": 54700 }, { "epoch": 0.06, "grad_norm": 0.4621349573135376, "learning_rate": 9.364235214408193e-05, "loss": 4.7214, "step": 54800 }, { "epoch": 0.06, "grad_norm": 0.4330434203147888, "learning_rate": 9.363075059689961e-05, "loss": 4.7244, "step": 54900 }, { "epoch": 0.06, "grad_norm": 0.45555201172828674, "learning_rate": 9.361914904971727e-05, "loss": 4.7201, "step": 55000 }, { "epoch": 0.06, "grad_norm": 0.45038706064224243, "learning_rate": 9.360754750253494e-05, "loss": 4.7254, "step": 55100 }, { "epoch": 0.06, "grad_norm": 1.2518788576126099, "learning_rate": 9.359594595535261e-05, "loss": 4.7237, "step": 55200 }, { "epoch": 0.06, "grad_norm": 0.44323647022247314, "learning_rate": 9.358434440817028e-05, "loss": 4.7269, "step": 55300 }, { "epoch": 0.06, "grad_norm": 1.72452974319458, "learning_rate": 9.357274286098795e-05, "loss": 4.7218, "step": 55400 }, { "epoch": 0.06, "grad_norm": 10.425994873046875, "learning_rate": 9.356114131380561e-05, "loss": 4.7237, "step": 55500 }, { "epoch": 0.06, "grad_norm": 1.400721549987793, "learning_rate": 9.354953976662328e-05, "loss": 4.7282, "step": 55600 }, { "epoch": 0.06, "grad_norm": 0.49108538031578064, "learning_rate": 9.353793821944095e-05, "loss": 4.7187, "step": 55700 }, { "epoch": 0.06, "grad_norm": 0.47107619047164917, "learning_rate": 9.352633667225862e-05, "loss": 4.7217, "step": 55800 }, { "epoch": 0.06, "grad_norm": 0.43458929657936096, "learning_rate": 9.351473512507628e-05, "loss": 4.72, "step": 55900 }, { "epoch": 0.06, "grad_norm": 0.4294680655002594, "learning_rate": 9.350313357789395e-05, "loss": 4.7232, "step": 56000 }, { "epoch": 0.07, "grad_norm": 0.8897697329521179, "learning_rate": 9.349153203071162e-05, "loss": 4.721, "step": 56100 }, { "epoch": 0.07, "grad_norm": 0.4212525188922882, "learning_rate": 9.347993048352929e-05, "loss": 4.7234, "step": 56200 }, { "epoch": 0.07, "grad_norm": 0.9617418646812439, "learning_rate": 9.346832893634696e-05, "loss": 4.7209, "step": 56300 }, { "epoch": 0.07, "grad_norm": 0.46716544032096863, "learning_rate": 9.345672738916463e-05, "loss": 4.7212, "step": 56400 }, { "epoch": 0.07, "grad_norm": 0.8855605721473694, "learning_rate": 9.34451258419823e-05, "loss": 4.7255, "step": 56500 }, { "epoch": 0.07, "grad_norm": 0.47797083854675293, "learning_rate": 9.343352429479996e-05, "loss": 4.7252, "step": 56600 }, { "epoch": 0.07, "grad_norm": 0.5321144461631775, "learning_rate": 9.342192274761763e-05, "loss": 4.7234, "step": 56700 }, { "epoch": 0.07, "grad_norm": 1.9607291221618652, "learning_rate": 9.34103212004353e-05, "loss": 4.7244, "step": 56800 }, { "epoch": 0.07, "grad_norm": 0.45025816559791565, "learning_rate": 9.339871965325297e-05, "loss": 4.7214, "step": 56900 }, { "epoch": 0.07, "grad_norm": 0.46686848998069763, "learning_rate": 9.338711810607062e-05, "loss": 4.7222, "step": 57000 }, { "epoch": 0.07, "grad_norm": 0.4851933419704437, "learning_rate": 9.33755165588883e-05, "loss": 4.7246, "step": 57100 }, { "epoch": 0.07, "grad_norm": 0.9222490787506104, "learning_rate": 9.336391501170596e-05, "loss": 4.7234, "step": 57200 }, { "epoch": 0.07, "grad_norm": 2.9428153038024902, "learning_rate": 9.335231346452364e-05, "loss": 4.7218, "step": 57300 }, { "epoch": 0.07, "grad_norm": 0.5928464531898499, "learning_rate": 9.33407119173413e-05, "loss": 4.7201, "step": 57400 }, { "epoch": 0.07, "grad_norm": 0.47166940569877625, "learning_rate": 9.332911037015896e-05, "loss": 4.719, "step": 57500 }, { "epoch": 0.07, "grad_norm": 0.5025938153266907, "learning_rate": 9.331750882297665e-05, "loss": 4.7216, "step": 57600 }, { "epoch": 0.07, "grad_norm": 0.40926048159599304, "learning_rate": 9.33059072757943e-05, "loss": 4.7214, "step": 57700 }, { "epoch": 0.07, "grad_norm": 0.5747073292732239, "learning_rate": 9.329430572861197e-05, "loss": 4.7236, "step": 57800 }, { "epoch": 0.07, "grad_norm": 0.4234209656715393, "learning_rate": 9.328270418142963e-05, "loss": 4.7218, "step": 57900 }, { "epoch": 0.07, "grad_norm": 0.47158312797546387, "learning_rate": 9.327110263424731e-05, "loss": 4.7212, "step": 58000 }, { "epoch": 0.07, "grad_norm": 0.48128268122673035, "learning_rate": 9.325950108706497e-05, "loss": 4.7217, "step": 58100 }, { "epoch": 0.07, "grad_norm": 0.6812758445739746, "learning_rate": 9.324789953988264e-05, "loss": 4.7229, "step": 58200 }, { "epoch": 0.07, "grad_norm": 0.4417859613895416, "learning_rate": 9.32362979927003e-05, "loss": 4.7203, "step": 58300 }, { "epoch": 0.07, "grad_norm": 0.41741812229156494, "learning_rate": 9.322469644551798e-05, "loss": 4.7253, "step": 58400 }, { "epoch": 0.07, "grad_norm": 0.9673991799354553, "learning_rate": 9.321309489833565e-05, "loss": 4.7205, "step": 58500 }, { "epoch": 0.07, "grad_norm": 0.4821186363697052, "learning_rate": 9.320149335115331e-05, "loss": 4.7232, "step": 58600 }, { "epoch": 0.07, "grad_norm": 0.46331697702407837, "learning_rate": 9.318989180397098e-05, "loss": 4.7198, "step": 58700 }, { "epoch": 0.07, "grad_norm": 0.4690409004688263, "learning_rate": 9.317829025678865e-05, "loss": 4.7235, "step": 58800 }, { "epoch": 0.07, "grad_norm": 5.2196197509765625, "learning_rate": 9.316668870960632e-05, "loss": 4.7232, "step": 58900 }, { "epoch": 0.07, "grad_norm": 0.4771330952644348, "learning_rate": 9.315508716242398e-05, "loss": 4.7252, "step": 59000 }, { "epoch": 0.07, "grad_norm": 0.5050112009048462, "learning_rate": 9.314348561524165e-05, "loss": 4.7231, "step": 59100 }, { "epoch": 0.07, "grad_norm": 5.69675874710083, "learning_rate": 9.313188406805932e-05, "loss": 4.7234, "step": 59200 }, { "epoch": 0.07, "grad_norm": 0.6608816981315613, "learning_rate": 9.312028252087699e-05, "loss": 4.721, "step": 59300 }, { "epoch": 0.07, "grad_norm": 0.42843931913375854, "learning_rate": 9.310868097369466e-05, "loss": 4.7207, "step": 59400 }, { "epoch": 0.07, "grad_norm": 0.5942014455795288, "learning_rate": 9.309707942651233e-05, "loss": 4.7235, "step": 59500 }, { "epoch": 0.07, "grad_norm": 4.165734767913818, "learning_rate": 9.308547787933e-05, "loss": 4.7201, "step": 59600 }, { "epoch": 0.07, "grad_norm": 0.4411238729953766, "learning_rate": 9.307387633214766e-05, "loss": 4.7214, "step": 59700 }, { "epoch": 0.07, "grad_norm": 0.42770957946777344, "learning_rate": 9.306227478496533e-05, "loss": 4.7232, "step": 59800 }, { "epoch": 0.07, "grad_norm": 1.3136483430862427, "learning_rate": 9.3050673237783e-05, "loss": 4.7232, "step": 59900 }, { "epoch": 0.07, "grad_norm": 3.146456241607666, "learning_rate": 9.303907169060067e-05, "loss": 4.7231, "step": 60000 }, { "epoch": 0.07, "grad_norm": 0.6570073366165161, "learning_rate": 9.302747014341833e-05, "loss": 4.7241, "step": 60100 }, { "epoch": 0.07, "grad_norm": 0.44109871983528137, "learning_rate": 9.3015868596236e-05, "loss": 4.7227, "step": 60200 }, { "epoch": 0.07, "grad_norm": 4.838292598724365, "learning_rate": 9.300426704905367e-05, "loss": 4.7241, "step": 60300 }, { "epoch": 0.07, "grad_norm": 0.6892091631889343, "learning_rate": 9.299266550187134e-05, "loss": 4.7245, "step": 60400 }, { "epoch": 0.07, "grad_norm": 0.4367011785507202, "learning_rate": 9.298106395468901e-05, "loss": 4.7186, "step": 60500 }, { "epoch": 0.07, "grad_norm": 0.45093631744384766, "learning_rate": 9.296946240750666e-05, "loss": 4.7211, "step": 60600 }, { "epoch": 0.07, "grad_norm": 0.8475301861763, "learning_rate": 9.295786086032435e-05, "loss": 4.7201, "step": 60700 }, { "epoch": 0.07, "grad_norm": 0.5478389263153076, "learning_rate": 9.2946259313142e-05, "loss": 4.7238, "step": 60800 }, { "epoch": 0.07, "grad_norm": 0.43729883432388306, "learning_rate": 9.293465776595968e-05, "loss": 4.7177, "step": 60900 }, { "epoch": 0.07, "grad_norm": 1.243775725364685, "learning_rate": 9.292305621877733e-05, "loss": 4.7232, "step": 61000 }, { "epoch": 0.07, "grad_norm": 0.4283261299133301, "learning_rate": 9.291145467159502e-05, "loss": 4.7211, "step": 61100 }, { "epoch": 0.07, "grad_norm": 0.5506263375282288, "learning_rate": 9.289985312441267e-05, "loss": 4.7194, "step": 61200 }, { "epoch": 0.07, "grad_norm": 2.6256954669952393, "learning_rate": 9.288825157723034e-05, "loss": 4.7229, "step": 61300 }, { "epoch": 0.07, "grad_norm": 0.47135502099990845, "learning_rate": 9.2876650030048e-05, "loss": 4.7194, "step": 61400 }, { "epoch": 0.07, "grad_norm": 0.4235610067844391, "learning_rate": 9.286504848286568e-05, "loss": 4.7228, "step": 61500 }, { "epoch": 0.07, "grad_norm": 0.691027820110321, "learning_rate": 9.285344693568335e-05, "loss": 4.7178, "step": 61600 }, { "epoch": 0.07, "grad_norm": 1.3785357475280762, "learning_rate": 9.284184538850101e-05, "loss": 4.722, "step": 61700 }, { "epoch": 0.07, "grad_norm": 0.7555792927742004, "learning_rate": 9.283024384131868e-05, "loss": 4.7192, "step": 61800 }, { "epoch": 0.07, "grad_norm": 0.45149528980255127, "learning_rate": 9.281864229413635e-05, "loss": 4.7223, "step": 61900 }, { "epoch": 0.07, "grad_norm": 0.3915523886680603, "learning_rate": 9.280704074695402e-05, "loss": 4.7281, "step": 62000 }, { "epoch": 0.07, "grad_norm": 0.6808405518531799, "learning_rate": 9.279543919977168e-05, "loss": 4.7186, "step": 62100 }, { "epoch": 0.07, "grad_norm": 0.4248678982257843, "learning_rate": 9.278383765258935e-05, "loss": 4.7212, "step": 62200 }, { "epoch": 0.07, "grad_norm": 0.4634566605091095, "learning_rate": 9.277223610540702e-05, "loss": 4.7232, "step": 62300 }, { "epoch": 0.07, "grad_norm": 0.435062438249588, "learning_rate": 9.276063455822469e-05, "loss": 4.7195, "step": 62400 }, { "epoch": 0.07, "grad_norm": 0.44273582100868225, "learning_rate": 9.274903301104235e-05, "loss": 4.7201, "step": 62500 }, { "epoch": 0.07, "grad_norm": 1.6790070533752441, "learning_rate": 9.273743146386002e-05, "loss": 4.7233, "step": 62600 }, { "epoch": 0.07, "grad_norm": 0.6105982065200806, "learning_rate": 9.27258299166777e-05, "loss": 4.7195, "step": 62700 }, { "epoch": 0.07, "grad_norm": 0.4533052444458008, "learning_rate": 9.271422836949536e-05, "loss": 4.7196, "step": 62800 }, { "epoch": 0.07, "grad_norm": 2.9967572689056396, "learning_rate": 9.270262682231303e-05, "loss": 4.7203, "step": 62900 }, { "epoch": 0.07, "grad_norm": 0.4764550030231476, "learning_rate": 9.26910252751307e-05, "loss": 4.7199, "step": 63000 }, { "epoch": 0.07, "grad_norm": 0.6913366913795471, "learning_rate": 9.267942372794837e-05, "loss": 4.7205, "step": 63100 }, { "epoch": 0.07, "grad_norm": 0.43220970034599304, "learning_rate": 9.266782218076603e-05, "loss": 4.7217, "step": 63200 }, { "epoch": 0.07, "grad_norm": 0.44269081950187683, "learning_rate": 9.26562206335837e-05, "loss": 4.7206, "step": 63300 }, { "epoch": 0.07, "grad_norm": 0.4221411645412445, "learning_rate": 9.264461908640137e-05, "loss": 4.7208, "step": 63400 }, { "epoch": 0.07, "grad_norm": 2.0757687091827393, "learning_rate": 9.263301753921904e-05, "loss": 4.7207, "step": 63500 }, { "epoch": 0.07, "grad_norm": 3.6522767543792725, "learning_rate": 9.26214159920367e-05, "loss": 4.721, "step": 63600 }, { "epoch": 0.07, "grad_norm": 0.5005024075508118, "learning_rate": 9.260981444485437e-05, "loss": 4.7216, "step": 63700 }, { "epoch": 0.07, "grad_norm": 0.45725876092910767, "learning_rate": 9.259821289767205e-05, "loss": 4.7182, "step": 63800 }, { "epoch": 0.07, "grad_norm": 7.862248420715332, "learning_rate": 9.25866113504897e-05, "loss": 4.7165, "step": 63900 }, { "epoch": 0.07, "grad_norm": 0.43823152780532837, "learning_rate": 9.257500980330738e-05, "loss": 4.7194, "step": 64000 }, { "epoch": 0.07, "grad_norm": 0.4907792806625366, "learning_rate": 9.256340825612503e-05, "loss": 4.7216, "step": 64100 }, { "epoch": 0.07, "grad_norm": 0.4658365547657013, "learning_rate": 9.255180670894272e-05, "loss": 4.7176, "step": 64200 }, { "epoch": 0.07, "grad_norm": 0.7001955509185791, "learning_rate": 9.254020516176037e-05, "loss": 4.7205, "step": 64300 }, { "epoch": 0.07, "grad_norm": 0.4331388771533966, "learning_rate": 9.252860361457805e-05, "loss": 4.7193, "step": 64400 }, { "epoch": 0.07, "grad_norm": 1.5930231809616089, "learning_rate": 9.25170020673957e-05, "loss": 4.7191, "step": 64500 }, { "epoch": 0.07, "grad_norm": 0.5135887265205383, "learning_rate": 9.250540052021339e-05, "loss": 4.7156, "step": 64600 }, { "epoch": 0.08, "grad_norm": 13.119867324829102, "learning_rate": 9.249379897303104e-05, "loss": 4.717, "step": 64700 }, { "epoch": 0.08, "grad_norm": 0.43670549988746643, "learning_rate": 9.248219742584871e-05, "loss": 4.7208, "step": 64800 }, { "epoch": 0.08, "grad_norm": 1.4829180240631104, "learning_rate": 9.247059587866638e-05, "loss": 4.717, "step": 64900 }, { "epoch": 0.08, "grad_norm": 0.5183930397033691, "learning_rate": 9.245899433148405e-05, "loss": 4.7213, "step": 65000 }, { "epoch": 0.08, "grad_norm": 0.3946682810783386, "learning_rate": 9.244739278430172e-05, "loss": 4.7212, "step": 65100 }, { "epoch": 0.08, "grad_norm": 0.46105512976646423, "learning_rate": 9.243579123711938e-05, "loss": 4.7192, "step": 65200 }, { "epoch": 0.08, "grad_norm": 0.44661739468574524, "learning_rate": 9.242418968993705e-05, "loss": 4.7211, "step": 65300 }, { "epoch": 0.08, "grad_norm": 0.6695325374603271, "learning_rate": 9.241258814275472e-05, "loss": 4.7193, "step": 65400 }, { "epoch": 0.08, "grad_norm": 2.9194817543029785, "learning_rate": 9.240098659557239e-05, "loss": 4.7224, "step": 65500 }, { "epoch": 0.08, "grad_norm": 0.4088338017463684, "learning_rate": 9.238938504839005e-05, "loss": 4.7186, "step": 65600 }, { "epoch": 0.08, "grad_norm": 0.5131499767303467, "learning_rate": 9.237778350120772e-05, "loss": 4.7158, "step": 65700 }, { "epoch": 0.08, "grad_norm": 9.862509727478027, "learning_rate": 9.236618195402539e-05, "loss": 4.716, "step": 65800 }, { "epoch": 0.08, "grad_norm": 0.40794941782951355, "learning_rate": 9.235458040684306e-05, "loss": 4.718, "step": 65900 }, { "epoch": 0.08, "grad_norm": 0.40948912501335144, "learning_rate": 9.234297885966073e-05, "loss": 4.7162, "step": 66000 }, { "epoch": 0.08, "grad_norm": 3.3764171600341797, "learning_rate": 9.23313773124784e-05, "loss": 4.7209, "step": 66100 }, { "epoch": 0.08, "grad_norm": 0.44073447585105896, "learning_rate": 9.231977576529607e-05, "loss": 4.7222, "step": 66200 }, { "epoch": 0.08, "grad_norm": 0.5790233016014099, "learning_rate": 9.230817421811373e-05, "loss": 4.7167, "step": 66300 }, { "epoch": 0.08, "grad_norm": 0.9112139344215393, "learning_rate": 9.22965726709314e-05, "loss": 4.7198, "step": 66400 }, { "epoch": 0.08, "grad_norm": 0.5049501061439514, "learning_rate": 9.228497112374907e-05, "loss": 4.7158, "step": 66500 }, { "epoch": 0.08, "grad_norm": 0.6256558895111084, "learning_rate": 9.227336957656674e-05, "loss": 4.7217, "step": 66600 }, { "epoch": 0.08, "grad_norm": 0.392869234085083, "learning_rate": 9.22617680293844e-05, "loss": 4.72, "step": 66700 }, { "epoch": 0.08, "grad_norm": 0.4745340347290039, "learning_rate": 9.225016648220207e-05, "loss": 4.719, "step": 66800 }, { "epoch": 0.08, "grad_norm": 0.4279547333717346, "learning_rate": 9.223856493501974e-05, "loss": 4.7191, "step": 66900 }, { "epoch": 0.08, "grad_norm": 3.0647037029266357, "learning_rate": 9.222696338783741e-05, "loss": 4.7241, "step": 67000 }, { "epoch": 0.08, "grad_norm": 0.4187820255756378, "learning_rate": 9.221536184065508e-05, "loss": 4.7211, "step": 67100 }, { "epoch": 0.08, "grad_norm": 0.9273259043693542, "learning_rate": 9.220376029347274e-05, "loss": 4.7245, "step": 67200 }, { "epoch": 0.08, "grad_norm": 0.49627095460891724, "learning_rate": 9.219215874629042e-05, "loss": 4.7182, "step": 67300 }, { "epoch": 0.08, "grad_norm": 1.4430867433547974, "learning_rate": 9.218055719910808e-05, "loss": 4.7196, "step": 67400 }, { "epoch": 0.08, "grad_norm": 0.4100867509841919, "learning_rate": 9.216895565192575e-05, "loss": 4.7207, "step": 67500 }, { "epoch": 0.08, "grad_norm": 0.49010923504829407, "learning_rate": 9.21573541047434e-05, "loss": 4.7169, "step": 67600 }, { "epoch": 0.08, "grad_norm": 0.4209338426589966, "learning_rate": 9.214575255756109e-05, "loss": 4.7162, "step": 67700 }, { "epoch": 0.08, "grad_norm": 0.45588409900665283, "learning_rate": 9.213415101037874e-05, "loss": 4.7177, "step": 67800 }, { "epoch": 0.08, "grad_norm": 0.405272901058197, "learning_rate": 9.212254946319641e-05, "loss": 4.7175, "step": 67900 }, { "epoch": 0.08, "grad_norm": 2.1375739574432373, "learning_rate": 9.211094791601409e-05, "loss": 4.7211, "step": 68000 }, { "epoch": 0.08, "grad_norm": 0.4474073648452759, "learning_rate": 9.209934636883176e-05, "loss": 4.7152, "step": 68100 }, { "epoch": 0.08, "grad_norm": 3.441603422164917, "learning_rate": 9.208774482164943e-05, "loss": 4.7209, "step": 68200 }, { "epoch": 0.08, "grad_norm": 0.562470018863678, "learning_rate": 9.207614327446708e-05, "loss": 4.7188, "step": 68300 }, { "epoch": 0.08, "grad_norm": 0.4169847071170807, "learning_rate": 9.206454172728475e-05, "loss": 4.7166, "step": 68400 }, { "epoch": 0.08, "grad_norm": 0.43674352765083313, "learning_rate": 9.205294018010242e-05, "loss": 4.7136, "step": 68500 }, { "epoch": 0.08, "grad_norm": 0.39852142333984375, "learning_rate": 9.20413386329201e-05, "loss": 4.7176, "step": 68600 }, { "epoch": 0.08, "grad_norm": 0.41788631677627563, "learning_rate": 9.202973708573775e-05, "loss": 4.7202, "step": 68700 }, { "epoch": 0.08, "grad_norm": 0.4472859501838684, "learning_rate": 9.201813553855542e-05, "loss": 4.7204, "step": 68800 }, { "epoch": 0.08, "grad_norm": 0.3944435119628906, "learning_rate": 9.200653399137309e-05, "loss": 4.7181, "step": 68900 }, { "epoch": 0.08, "grad_norm": 0.444679319858551, "learning_rate": 9.199493244419076e-05, "loss": 4.723, "step": 69000 }, { "epoch": 0.08, "grad_norm": 0.4338514804840088, "learning_rate": 9.198333089700842e-05, "loss": 4.7213, "step": 69100 }, { "epoch": 0.08, "grad_norm": 0.4399813711643219, "learning_rate": 9.19717293498261e-05, "loss": 4.7167, "step": 69200 }, { "epoch": 0.08, "grad_norm": 0.46220558881759644, "learning_rate": 9.196012780264377e-05, "loss": 4.7158, "step": 69300 }, { "epoch": 0.08, "grad_norm": 1.313628911972046, "learning_rate": 9.194852625546143e-05, "loss": 4.7165, "step": 69400 }, { "epoch": 0.08, "grad_norm": 0.5480996370315552, "learning_rate": 9.19369247082791e-05, "loss": 4.7174, "step": 69500 }, { "epoch": 0.08, "grad_norm": 0.4436434805393219, "learning_rate": 9.192532316109677e-05, "loss": 4.7198, "step": 69600 }, { "epoch": 0.08, "grad_norm": 5.995100498199463, "learning_rate": 9.191372161391444e-05, "loss": 4.7176, "step": 69700 }, { "epoch": 0.08, "grad_norm": 0.44759735465049744, "learning_rate": 9.19021200667321e-05, "loss": 4.7166, "step": 69800 }, { "epoch": 0.08, "grad_norm": 0.9924051761627197, "learning_rate": 9.189051851954977e-05, "loss": 4.7191, "step": 69900 }, { "epoch": 0.08, "grad_norm": 3.259697675704956, "learning_rate": 9.187891697236744e-05, "loss": 4.7166, "step": 70000 }, { "epoch": 0.08, "grad_norm": 0.5200223326683044, "learning_rate": 9.186731542518511e-05, "loss": 4.72, "step": 70100 }, { "epoch": 0.08, "grad_norm": 0.5573539733886719, "learning_rate": 9.185571387800278e-05, "loss": 4.7173, "step": 70200 }, { "epoch": 0.08, "grad_norm": 0.46177276968955994, "learning_rate": 9.184411233082044e-05, "loss": 4.718, "step": 70300 }, { "epoch": 0.08, "grad_norm": 0.4673067033290863, "learning_rate": 9.183251078363812e-05, "loss": 4.7181, "step": 70400 }, { "epoch": 0.08, "grad_norm": 0.4644356667995453, "learning_rate": 9.182090923645578e-05, "loss": 4.7146, "step": 70500 }, { "epoch": 0.08, "grad_norm": 0.4181482195854187, "learning_rate": 9.180930768927345e-05, "loss": 4.7132, "step": 70600 }, { "epoch": 0.08, "grad_norm": 0.42789730429649353, "learning_rate": 9.17977061420911e-05, "loss": 4.7172, "step": 70700 }, { "epoch": 0.08, "grad_norm": 3.978273391723633, "learning_rate": 9.178610459490879e-05, "loss": 4.7201, "step": 70800 }, { "epoch": 0.08, "grad_norm": 6.412673473358154, "learning_rate": 9.177450304772645e-05, "loss": 4.7201, "step": 70900 }, { "epoch": 0.08, "grad_norm": 0.45621249079704285, "learning_rate": 9.176290150054412e-05, "loss": 4.7167, "step": 71000 }, { "epoch": 0.08, "grad_norm": 0.4043485224246979, "learning_rate": 9.175129995336179e-05, "loss": 4.7154, "step": 71100 }, { "epoch": 0.08, "grad_norm": 0.45778888463974, "learning_rate": 9.173969840617946e-05, "loss": 4.7193, "step": 71200 }, { "epoch": 0.08, "grad_norm": 0.42115822434425354, "learning_rate": 9.172809685899713e-05, "loss": 4.7165, "step": 71300 }, { "epoch": 0.08, "grad_norm": 0.421622633934021, "learning_rate": 9.171649531181478e-05, "loss": 4.717, "step": 71400 }, { "epoch": 0.08, "grad_norm": 0.4167540967464447, "learning_rate": 9.170489376463245e-05, "loss": 4.7161, "step": 71500 }, { "epoch": 0.08, "grad_norm": 0.7092134356498718, "learning_rate": 9.169329221745013e-05, "loss": 4.7197, "step": 71600 }, { "epoch": 0.08, "grad_norm": 0.5543451905250549, "learning_rate": 9.16816906702678e-05, "loss": 4.7185, "step": 71700 }, { "epoch": 0.08, "grad_norm": 0.44146397709846497, "learning_rate": 9.167008912308545e-05, "loss": 4.7146, "step": 71800 }, { "epoch": 0.08, "grad_norm": 0.5362756848335266, "learning_rate": 9.165848757590312e-05, "loss": 4.72, "step": 71900 }, { "epoch": 0.08, "grad_norm": 0.4360448122024536, "learning_rate": 9.16468860287208e-05, "loss": 4.7198, "step": 72000 }, { "epoch": 0.08, "grad_norm": 0.40648597478866577, "learning_rate": 9.163528448153846e-05, "loss": 4.7168, "step": 72100 }, { "epoch": 0.08, "grad_norm": 0.8124480843544006, "learning_rate": 9.162368293435612e-05, "loss": 4.7185, "step": 72200 }, { "epoch": 0.08, "grad_norm": 0.48873409628868103, "learning_rate": 9.16120813871738e-05, "loss": 4.7192, "step": 72300 }, { "epoch": 0.08, "grad_norm": 1.013222098350525, "learning_rate": 9.160047983999148e-05, "loss": 4.7156, "step": 72400 }, { "epoch": 0.08, "grad_norm": 0.4423314034938812, "learning_rate": 9.158887829280913e-05, "loss": 4.7172, "step": 72500 }, { "epoch": 0.08, "grad_norm": 0.5054183602333069, "learning_rate": 9.15772767456268e-05, "loss": 4.7199, "step": 72600 }, { "epoch": 0.08, "grad_norm": 0.4140661060810089, "learning_rate": 9.156567519844447e-05, "loss": 4.7142, "step": 72700 }, { "epoch": 0.08, "grad_norm": 1.734790325164795, "learning_rate": 9.155407365126214e-05, "loss": 4.7135, "step": 72800 }, { "epoch": 0.08, "grad_norm": 0.3996482193470001, "learning_rate": 9.15424721040798e-05, "loss": 4.7194, "step": 72900 }, { "epoch": 0.08, "grad_norm": 0.4260541796684265, "learning_rate": 9.153087055689747e-05, "loss": 4.7143, "step": 73000 }, { "epoch": 0.08, "grad_norm": 2.339716672897339, "learning_rate": 9.151926900971514e-05, "loss": 4.7185, "step": 73100 }, { "epoch": 0.08, "grad_norm": 0.48179367184638977, "learning_rate": 9.150766746253281e-05, "loss": 4.7161, "step": 73200 }, { "epoch": 0.09, "grad_norm": 0.4243863821029663, "learning_rate": 9.149606591535047e-05, "loss": 4.7168, "step": 73300 }, { "epoch": 0.09, "grad_norm": 0.38295459747314453, "learning_rate": 9.148446436816814e-05, "loss": 4.715, "step": 73400 }, { "epoch": 0.09, "grad_norm": 0.5023996829986572, "learning_rate": 9.147286282098582e-05, "loss": 4.7115, "step": 73500 }, { "epoch": 0.09, "grad_norm": 0.5002963542938232, "learning_rate": 9.146126127380348e-05, "loss": 4.7145, "step": 73600 }, { "epoch": 0.09, "grad_norm": 9.655466079711914, "learning_rate": 9.144965972662115e-05, "loss": 4.7166, "step": 73700 }, { "epoch": 0.09, "grad_norm": 0.5054044723510742, "learning_rate": 9.14380581794388e-05, "loss": 4.7193, "step": 73800 }, { "epoch": 0.09, "grad_norm": 0.453080415725708, "learning_rate": 9.142645663225649e-05, "loss": 4.7161, "step": 73900 }, { "epoch": 0.09, "grad_norm": 0.44624748826026917, "learning_rate": 9.141485508507415e-05, "loss": 4.7187, "step": 74000 }, { "epoch": 0.09, "grad_norm": 0.4639555811882019, "learning_rate": 9.140325353789182e-05, "loss": 4.7176, "step": 74100 }, { "epoch": 0.09, "grad_norm": 0.41725417971611023, "learning_rate": 9.139165199070947e-05, "loss": 4.7163, "step": 74200 }, { "epoch": 0.09, "grad_norm": 0.866028904914856, "learning_rate": 9.138005044352716e-05, "loss": 4.716, "step": 74300 }, { "epoch": 0.09, "grad_norm": 0.46224868297576904, "learning_rate": 9.136844889634482e-05, "loss": 4.7139, "step": 74400 }, { "epoch": 0.09, "grad_norm": 0.43956613540649414, "learning_rate": 9.135684734916249e-05, "loss": 4.7171, "step": 74500 }, { "epoch": 0.09, "grad_norm": 0.4600673019886017, "learning_rate": 9.134524580198016e-05, "loss": 4.7157, "step": 74600 }, { "epoch": 0.09, "grad_norm": 1.0996651649475098, "learning_rate": 9.133364425479783e-05, "loss": 4.7146, "step": 74700 }, { "epoch": 0.09, "grad_norm": 0.5937373042106628, "learning_rate": 9.13220427076155e-05, "loss": 4.719, "step": 74800 }, { "epoch": 0.09, "grad_norm": 3.145916223526001, "learning_rate": 9.131044116043315e-05, "loss": 4.7144, "step": 74900 }, { "epoch": 0.09, "grad_norm": 5.668306827545166, "learning_rate": 9.129883961325082e-05, "loss": 4.7148, "step": 75000 }, { "epoch": 0.09, "grad_norm": 0.5367663502693176, "learning_rate": 9.12872380660685e-05, "loss": 4.7152, "step": 75100 }, { "epoch": 0.09, "grad_norm": 0.4246484637260437, "learning_rate": 9.127563651888617e-05, "loss": 4.7177, "step": 75200 }, { "epoch": 0.09, "grad_norm": 0.47237107157707214, "learning_rate": 9.126403497170382e-05, "loss": 4.7166, "step": 75300 }, { "epoch": 0.09, "grad_norm": 0.5941634178161621, "learning_rate": 9.12524334245215e-05, "loss": 4.7158, "step": 75400 }, { "epoch": 0.09, "grad_norm": 0.4129926562309265, "learning_rate": 9.124083187733916e-05, "loss": 4.7151, "step": 75500 }, { "epoch": 0.09, "grad_norm": 0.5305066704750061, "learning_rate": 9.122923033015683e-05, "loss": 4.7151, "step": 75600 }, { "epoch": 0.09, "grad_norm": 0.43299585580825806, "learning_rate": 9.12176287829745e-05, "loss": 4.716, "step": 75700 }, { "epoch": 0.09, "grad_norm": 10.120293617248535, "learning_rate": 9.120602723579217e-05, "loss": 4.7183, "step": 75800 }, { "epoch": 0.09, "grad_norm": 0.393725723028183, "learning_rate": 9.119442568860984e-05, "loss": 4.7173, "step": 75900 }, { "epoch": 0.09, "grad_norm": 5.1044440269470215, "learning_rate": 9.11828241414275e-05, "loss": 4.7144, "step": 76000 }, { "epoch": 0.09, "grad_norm": 0.38623175024986267, "learning_rate": 9.117122259424517e-05, "loss": 4.7165, "step": 76100 }, { "epoch": 0.09, "grad_norm": 2.17921781539917, "learning_rate": 9.115962104706284e-05, "loss": 4.7171, "step": 76200 }, { "epoch": 0.09, "grad_norm": 0.4206479489803314, "learning_rate": 9.114801949988051e-05, "loss": 4.7119, "step": 76300 }, { "epoch": 0.09, "grad_norm": 0.46056804060935974, "learning_rate": 9.113641795269817e-05, "loss": 4.7129, "step": 76400 }, { "epoch": 0.09, "grad_norm": 0.4047752320766449, "learning_rate": 9.112481640551584e-05, "loss": 4.7133, "step": 76500 }, { "epoch": 0.09, "grad_norm": 0.4942108988761902, "learning_rate": 9.111321485833351e-05, "loss": 4.7143, "step": 76600 }, { "epoch": 0.09, "grad_norm": 1.5642993450164795, "learning_rate": 9.110161331115118e-05, "loss": 4.7126, "step": 76700 }, { "epoch": 0.09, "grad_norm": 0.4183765947818756, "learning_rate": 9.109001176396885e-05, "loss": 4.7138, "step": 76800 }, { "epoch": 0.09, "grad_norm": 0.4382226765155792, "learning_rate": 9.107841021678651e-05, "loss": 4.7156, "step": 76900 }, { "epoch": 0.09, "grad_norm": 0.429971843957901, "learning_rate": 9.106680866960419e-05, "loss": 4.7152, "step": 77000 }, { "epoch": 0.09, "grad_norm": 0.4051077663898468, "learning_rate": 9.105520712242185e-05, "loss": 4.7146, "step": 77100 }, { "epoch": 0.09, "grad_norm": 0.4416183829307556, "learning_rate": 9.104360557523952e-05, "loss": 4.716, "step": 77200 }, { "epoch": 0.09, "grad_norm": 1.668931007385254, "learning_rate": 9.103200402805718e-05, "loss": 4.7116, "step": 77300 }, { "epoch": 0.09, "grad_norm": 0.42298269271850586, "learning_rate": 9.102040248087486e-05, "loss": 4.7173, "step": 77400 }, { "epoch": 0.09, "grad_norm": 0.7401153445243835, "learning_rate": 9.100880093369252e-05, "loss": 4.7148, "step": 77500 }, { "epoch": 0.09, "grad_norm": 0.6153188943862915, "learning_rate": 9.099719938651019e-05, "loss": 4.7157, "step": 77600 }, { "epoch": 0.09, "grad_norm": 0.41621410846710205, "learning_rate": 9.098559783932786e-05, "loss": 4.7169, "step": 77700 }, { "epoch": 0.09, "grad_norm": 0.4147420823574066, "learning_rate": 9.097399629214553e-05, "loss": 4.7155, "step": 77800 }, { "epoch": 0.09, "grad_norm": 8.179230690002441, "learning_rate": 9.09623947449632e-05, "loss": 4.7142, "step": 77900 }, { "epoch": 0.09, "grad_norm": 0.8721107244491577, "learning_rate": 9.095079319778086e-05, "loss": 4.7145, "step": 78000 }, { "epoch": 0.09, "grad_norm": 1.0575512647628784, "learning_rate": 9.093919165059853e-05, "loss": 4.7156, "step": 78100 }, { "epoch": 0.09, "grad_norm": 0.7179790139198303, "learning_rate": 9.09275901034162e-05, "loss": 4.7145, "step": 78200 }, { "epoch": 0.09, "grad_norm": 1.8817191123962402, "learning_rate": 9.091598855623387e-05, "loss": 4.7154, "step": 78300 }, { "epoch": 0.09, "grad_norm": 0.47256988286972046, "learning_rate": 9.090438700905152e-05, "loss": 4.7176, "step": 78400 }, { "epoch": 0.09, "grad_norm": 0.7659066319465637, "learning_rate": 9.08927854618692e-05, "loss": 4.7176, "step": 78500 }, { "epoch": 0.09, "grad_norm": 0.4704316556453705, "learning_rate": 9.088118391468686e-05, "loss": 4.7173, "step": 78600 }, { "epoch": 0.09, "grad_norm": 0.47578001022338867, "learning_rate": 9.086958236750454e-05, "loss": 4.7122, "step": 78700 }, { "epoch": 0.09, "grad_norm": 3.0918359756469727, "learning_rate": 9.085798082032219e-05, "loss": 4.712, "step": 78800 }, { "epoch": 0.09, "grad_norm": 0.712931752204895, "learning_rate": 9.084637927313988e-05, "loss": 4.7125, "step": 78900 }, { "epoch": 0.09, "grad_norm": 0.39107823371887207, "learning_rate": 9.083477772595755e-05, "loss": 4.7133, "step": 79000 }, { "epoch": 0.09, "grad_norm": 0.45276641845703125, "learning_rate": 9.08231761787752e-05, "loss": 4.7135, "step": 79100 }, { "epoch": 0.09, "grad_norm": 0.4419955611228943, "learning_rate": 9.081157463159287e-05, "loss": 4.7133, "step": 79200 }, { "epoch": 0.09, "grad_norm": 0.39920610189437866, "learning_rate": 9.079997308441054e-05, "loss": 4.7121, "step": 79300 }, { "epoch": 0.09, "grad_norm": 0.8428456783294678, "learning_rate": 9.078837153722821e-05, "loss": 4.7135, "step": 79400 }, { "epoch": 0.09, "grad_norm": 0.664378821849823, "learning_rate": 9.077676999004587e-05, "loss": 4.7165, "step": 79500 }, { "epoch": 0.09, "grad_norm": 0.4127453863620758, "learning_rate": 9.076516844286354e-05, "loss": 4.7149, "step": 79600 }, { "epoch": 0.09, "grad_norm": 1.8230032920837402, "learning_rate": 9.075356689568121e-05, "loss": 4.7123, "step": 79700 }, { "epoch": 0.09, "grad_norm": 0.4852809011936188, "learning_rate": 9.074196534849888e-05, "loss": 4.7155, "step": 79800 }, { "epoch": 0.09, "grad_norm": 10.022642135620117, "learning_rate": 9.073036380131654e-05, "loss": 4.7138, "step": 79900 }, { "epoch": 0.09, "grad_norm": 0.4676394462585449, "learning_rate": 9.071876225413421e-05, "loss": 4.7148, "step": 80000 }, { "epoch": 0.09, "grad_norm": 0.4943694770336151, "learning_rate": 9.07071607069519e-05, "loss": 4.7136, "step": 80100 }, { "epoch": 0.09, "grad_norm": 0.45253655314445496, "learning_rate": 9.069555915976955e-05, "loss": 4.7134, "step": 80200 }, { "epoch": 0.09, "grad_norm": 0.42640450596809387, "learning_rate": 9.068395761258722e-05, "loss": 4.7107, "step": 80300 }, { "epoch": 0.09, "grad_norm": 0.409261018037796, "learning_rate": 9.067235606540488e-05, "loss": 4.7141, "step": 80400 }, { "epoch": 0.09, "grad_norm": 0.9824286103248596, "learning_rate": 9.066075451822256e-05, "loss": 4.7141, "step": 80500 }, { "epoch": 0.09, "grad_norm": 0.6486682891845703, "learning_rate": 9.064915297104022e-05, "loss": 4.7161, "step": 80600 }, { "epoch": 0.09, "grad_norm": 0.587768018245697, "learning_rate": 9.063755142385789e-05, "loss": 4.712, "step": 80700 }, { "epoch": 0.09, "grad_norm": 0.4047750234603882, "learning_rate": 9.062594987667556e-05, "loss": 4.7133, "step": 80800 }, { "epoch": 0.09, "grad_norm": 0.6553632616996765, "learning_rate": 9.061434832949323e-05, "loss": 4.7139, "step": 80900 }, { "epoch": 0.09, "grad_norm": 0.46661463379859924, "learning_rate": 9.06027467823109e-05, "loss": 4.7138, "step": 81000 }, { "epoch": 0.09, "grad_norm": 0.44048547744750977, "learning_rate": 9.059114523512856e-05, "loss": 4.7159, "step": 81100 }, { "epoch": 0.09, "grad_norm": 0.9623928666114807, "learning_rate": 9.057954368794623e-05, "loss": 4.7134, "step": 81200 }, { "epoch": 0.09, "grad_norm": 0.40359175205230713, "learning_rate": 9.05679421407639e-05, "loss": 4.7147, "step": 81300 }, { "epoch": 0.09, "grad_norm": 0.6080948114395142, "learning_rate": 9.055634059358157e-05, "loss": 4.7132, "step": 81400 }, { "epoch": 0.09, "grad_norm": 2.992509126663208, "learning_rate": 9.054473904639923e-05, "loss": 4.7137, "step": 81500 }, { "epoch": 0.09, "grad_norm": 0.4392452538013458, "learning_rate": 9.05331374992169e-05, "loss": 4.7122, "step": 81600 }, { "epoch": 0.09, "grad_norm": 0.9614498019218445, "learning_rate": 9.052153595203457e-05, "loss": 4.71, "step": 81700 }, { "epoch": 0.09, "grad_norm": 0.4181789755821228, "learning_rate": 9.050993440485224e-05, "loss": 4.7127, "step": 81800 }, { "epoch": 0.1, "grad_norm": 1.0023537874221802, "learning_rate": 9.04983328576699e-05, "loss": 4.7144, "step": 81900 }, { "epoch": 0.1, "grad_norm": 9.406643867492676, "learning_rate": 9.048673131048758e-05, "loss": 4.7134, "step": 82000 }, { "epoch": 0.1, "grad_norm": 0.4215865433216095, "learning_rate": 9.047512976330525e-05, "loss": 4.7137, "step": 82100 }, { "epoch": 0.1, "grad_norm": 0.4328692853450775, "learning_rate": 9.04635282161229e-05, "loss": 4.7125, "step": 82200 }, { "epoch": 0.1, "grad_norm": 0.38356509804725647, "learning_rate": 9.045192666894058e-05, "loss": 4.7195, "step": 82300 }, { "epoch": 0.1, "grad_norm": 0.4107106626033783, "learning_rate": 9.044032512175825e-05, "loss": 4.7097, "step": 82400 }, { "epoch": 0.1, "grad_norm": 18.908138275146484, "learning_rate": 9.042872357457592e-05, "loss": 4.7187, "step": 82500 }, { "epoch": 0.1, "grad_norm": 0.48430249094963074, "learning_rate": 9.041712202739357e-05, "loss": 4.7174, "step": 82600 }, { "epoch": 0.1, "grad_norm": 0.4645005166530609, "learning_rate": 9.040552048021124e-05, "loss": 4.7146, "step": 82700 }, { "epoch": 0.1, "grad_norm": 1.38825261592865, "learning_rate": 9.039391893302891e-05, "loss": 4.7159, "step": 82800 }, { "epoch": 0.1, "grad_norm": 1.1587923765182495, "learning_rate": 9.038231738584658e-05, "loss": 4.7182, "step": 82900 }, { "epoch": 0.1, "grad_norm": 0.42529210448265076, "learning_rate": 9.037071583866424e-05, "loss": 4.7118, "step": 83000 }, { "epoch": 0.1, "grad_norm": 6.866576671600342, "learning_rate": 9.035911429148191e-05, "loss": 4.7124, "step": 83100 }, { "epoch": 0.1, "grad_norm": 0.415056973695755, "learning_rate": 9.03475127442996e-05, "loss": 4.7136, "step": 83200 }, { "epoch": 0.1, "grad_norm": 0.4472216069698334, "learning_rate": 9.033591119711725e-05, "loss": 4.71, "step": 83300 }, { "epoch": 0.1, "grad_norm": 0.5460192561149597, "learning_rate": 9.032430964993492e-05, "loss": 4.7155, "step": 83400 }, { "epoch": 0.1, "grad_norm": 1.7862334251403809, "learning_rate": 9.031270810275258e-05, "loss": 4.7135, "step": 83500 }, { "epoch": 0.1, "grad_norm": 0.4369088411331177, "learning_rate": 9.030110655557026e-05, "loss": 4.7136, "step": 83600 }, { "epoch": 0.1, "grad_norm": 0.40611496567726135, "learning_rate": 9.028950500838792e-05, "loss": 4.7144, "step": 83700 }, { "epoch": 0.1, "grad_norm": 0.434088796377182, "learning_rate": 9.027790346120559e-05, "loss": 4.7161, "step": 83800 }, { "epoch": 0.1, "grad_norm": 0.3754402995109558, "learning_rate": 9.026630191402326e-05, "loss": 4.7112, "step": 83900 }, { "epoch": 0.1, "grad_norm": 0.507156491279602, "learning_rate": 9.025470036684093e-05, "loss": 4.7091, "step": 84000 }, { "epoch": 0.1, "grad_norm": 0.43876662850379944, "learning_rate": 9.024309881965859e-05, "loss": 4.7107, "step": 84100 }, { "epoch": 0.1, "grad_norm": 1.3095866441726685, "learning_rate": 9.023149727247626e-05, "loss": 4.7114, "step": 84200 }, { "epoch": 0.1, "grad_norm": 1.327343225479126, "learning_rate": 9.021989572529393e-05, "loss": 4.713, "step": 84300 }, { "epoch": 0.1, "grad_norm": 0.4155600965023041, "learning_rate": 9.02082941781116e-05, "loss": 4.7136, "step": 84400 }, { "epoch": 0.1, "grad_norm": 0.4112358093261719, "learning_rate": 9.019669263092927e-05, "loss": 4.7096, "step": 84500 }, { "epoch": 0.1, "grad_norm": 1.422731637954712, "learning_rate": 9.018509108374693e-05, "loss": 4.7132, "step": 84600 }, { "epoch": 0.1, "grad_norm": 0.42826709151268005, "learning_rate": 9.01734895365646e-05, "loss": 4.7125, "step": 84700 }, { "epoch": 0.1, "grad_norm": 0.4018416702747345, "learning_rate": 9.016188798938227e-05, "loss": 4.7123, "step": 84800 }, { "epoch": 0.1, "grad_norm": 0.3829653561115265, "learning_rate": 9.015028644219994e-05, "loss": 4.7143, "step": 84900 }, { "epoch": 0.1, "grad_norm": 0.45742008090019226, "learning_rate": 9.01386848950176e-05, "loss": 4.7114, "step": 85000 }, { "epoch": 0.1, "grad_norm": 2.7736401557922363, "learning_rate": 9.012708334783528e-05, "loss": 4.7138, "step": 85100 }, { "epoch": 0.1, "grad_norm": 0.40791991353034973, "learning_rate": 9.011548180065294e-05, "loss": 4.7116, "step": 85200 }, { "epoch": 0.1, "grad_norm": 0.40508711338043213, "learning_rate": 9.01038802534706e-05, "loss": 4.7109, "step": 85300 }, { "epoch": 0.1, "grad_norm": 0.41179603338241577, "learning_rate": 9.009227870628828e-05, "loss": 4.7147, "step": 85400 }, { "epoch": 0.1, "grad_norm": 0.49404746294021606, "learning_rate": 9.008067715910595e-05, "loss": 4.714, "step": 85500 }, { "epoch": 0.1, "grad_norm": 0.4049994647502899, "learning_rate": 9.006907561192362e-05, "loss": 4.7131, "step": 85600 }, { "epoch": 0.1, "grad_norm": 0.3943819999694824, "learning_rate": 9.005747406474127e-05, "loss": 4.7099, "step": 85700 }, { "epoch": 0.1, "grad_norm": 0.3848678171634674, "learning_rate": 9.004587251755894e-05, "loss": 4.7105, "step": 85800 }, { "epoch": 0.1, "grad_norm": 0.39097893238067627, "learning_rate": 9.003427097037662e-05, "loss": 4.7134, "step": 85900 }, { "epoch": 0.1, "grad_norm": 0.5381259918212891, "learning_rate": 9.002266942319429e-05, "loss": 4.7111, "step": 86000 }, { "epoch": 0.1, "grad_norm": 0.44293051958084106, "learning_rate": 9.001106787601194e-05, "loss": 4.7132, "step": 86100 }, { "epoch": 0.1, "grad_norm": 2.449153423309326, "learning_rate": 8.999946632882961e-05, "loss": 4.7108, "step": 86200 }, { "epoch": 0.1, "grad_norm": 1.677003026008606, "learning_rate": 8.998786478164728e-05, "loss": 4.7117, "step": 86300 }, { "epoch": 0.1, "grad_norm": 0.4721989035606384, "learning_rate": 8.997626323446495e-05, "loss": 4.7084, "step": 86400 }, { "epoch": 0.1, "grad_norm": 0.4921315908432007, "learning_rate": 8.996466168728262e-05, "loss": 4.7157, "step": 86500 }, { "epoch": 0.1, "grad_norm": 0.6939030885696411, "learning_rate": 8.995306014010028e-05, "loss": 4.7114, "step": 86600 }, { "epoch": 0.1, "grad_norm": 1.3602544069290161, "learning_rate": 8.994145859291797e-05, "loss": 4.7116, "step": 86700 }, { "epoch": 0.1, "grad_norm": 0.40210890769958496, "learning_rate": 8.992985704573562e-05, "loss": 4.7132, "step": 86800 }, { "epoch": 0.1, "grad_norm": 0.41714897751808167, "learning_rate": 8.991825549855329e-05, "loss": 4.713, "step": 86900 }, { "epoch": 0.1, "grad_norm": 3.1149511337280273, "learning_rate": 8.990665395137096e-05, "loss": 4.709, "step": 87000 }, { "epoch": 0.1, "grad_norm": 1.3356086015701294, "learning_rate": 8.989505240418863e-05, "loss": 4.7102, "step": 87100 }, { "epoch": 0.1, "grad_norm": 0.36242830753326416, "learning_rate": 8.988345085700629e-05, "loss": 4.7091, "step": 87200 }, { "epoch": 0.1, "grad_norm": 0.45975425839424133, "learning_rate": 8.987184930982396e-05, "loss": 4.7098, "step": 87300 }, { "epoch": 0.1, "grad_norm": 0.38699838519096375, "learning_rate": 8.986024776264163e-05, "loss": 4.7113, "step": 87400 }, { "epoch": 0.1, "grad_norm": 2.0604562759399414, "learning_rate": 8.98486462154593e-05, "loss": 4.7108, "step": 87500 }, { "epoch": 0.1, "grad_norm": 0.39834097027778625, "learning_rate": 8.983704466827697e-05, "loss": 4.7131, "step": 87600 }, { "epoch": 0.1, "grad_norm": 0.40465226769447327, "learning_rate": 8.982544312109463e-05, "loss": 4.7101, "step": 87700 }, { "epoch": 0.1, "grad_norm": 0.43859535455703735, "learning_rate": 8.98138415739123e-05, "loss": 4.7118, "step": 87800 }, { "epoch": 0.1, "grad_norm": 0.5537000894546509, "learning_rate": 8.980224002672997e-05, "loss": 4.7106, "step": 87900 }, { "epoch": 0.1, "grad_norm": 0.5020445585250854, "learning_rate": 8.979063847954764e-05, "loss": 4.7094, "step": 88000 }, { "epoch": 0.1, "grad_norm": 0.419494092464447, "learning_rate": 8.97790369323653e-05, "loss": 4.7111, "step": 88100 }, { "epoch": 0.1, "grad_norm": 0.4324280023574829, "learning_rate": 8.976743538518298e-05, "loss": 4.7151, "step": 88200 }, { "epoch": 0.1, "grad_norm": 0.410915732383728, "learning_rate": 8.975583383800064e-05, "loss": 4.7126, "step": 88300 }, { "epoch": 0.1, "grad_norm": 0.4660918414592743, "learning_rate": 8.974423229081831e-05, "loss": 4.7097, "step": 88400 }, { "epoch": 0.1, "grad_norm": 1.4695265293121338, "learning_rate": 8.973263074363596e-05, "loss": 4.7094, "step": 88500 }, { "epoch": 0.1, "grad_norm": 0.7962493896484375, "learning_rate": 8.972102919645365e-05, "loss": 4.7106, "step": 88600 }, { "epoch": 0.1, "grad_norm": 0.41975751519203186, "learning_rate": 8.970942764927132e-05, "loss": 4.7082, "step": 88700 }, { "epoch": 0.1, "grad_norm": 2.06545090675354, "learning_rate": 8.969782610208898e-05, "loss": 4.7115, "step": 88800 }, { "epoch": 0.1, "grad_norm": 0.5522103309631348, "learning_rate": 8.968622455490665e-05, "loss": 4.7116, "step": 88900 }, { "epoch": 0.1, "grad_norm": 0.4133060574531555, "learning_rate": 8.967462300772432e-05, "loss": 4.7117, "step": 89000 }, { "epoch": 0.1, "grad_norm": 0.6902609467506409, "learning_rate": 8.966302146054199e-05, "loss": 4.7114, "step": 89100 }, { "epoch": 0.1, "grad_norm": 0.39645135402679443, "learning_rate": 8.965141991335964e-05, "loss": 4.7075, "step": 89200 }, { "epoch": 0.1, "grad_norm": 0.702682614326477, "learning_rate": 8.963981836617731e-05, "loss": 4.7132, "step": 89300 }, { "epoch": 0.1, "grad_norm": 0.4847519099712372, "learning_rate": 8.962821681899499e-05, "loss": 4.7107, "step": 89400 }, { "epoch": 0.1, "grad_norm": 0.4338213801383972, "learning_rate": 8.961661527181266e-05, "loss": 4.7112, "step": 89500 }, { "epoch": 0.1, "grad_norm": 0.5931529402732849, "learning_rate": 8.960501372463031e-05, "loss": 4.713, "step": 89600 }, { "epoch": 0.1, "grad_norm": 0.4164227545261383, "learning_rate": 8.959341217744798e-05, "loss": 4.7124, "step": 89700 }, { "epoch": 0.1, "grad_norm": 0.41698023676872253, "learning_rate": 8.958181063026567e-05, "loss": 4.7098, "step": 89800 }, { "epoch": 0.1, "grad_norm": 0.45001962780952454, "learning_rate": 8.957020908308332e-05, "loss": 4.7099, "step": 89900 }, { "epoch": 0.1, "grad_norm": 0.5672646164894104, "learning_rate": 8.9558607535901e-05, "loss": 4.7101, "step": 90000 }, { "epoch": 0.1, "grad_norm": 0.4138546586036682, "learning_rate": 8.954700598871865e-05, "loss": 4.7114, "step": 90100 }, { "epoch": 0.1, "grad_norm": 0.5982836484909058, "learning_rate": 8.953540444153633e-05, "loss": 4.7077, "step": 90200 }, { "epoch": 0.1, "grad_norm": 0.39376482367515564, "learning_rate": 8.952380289435399e-05, "loss": 4.713, "step": 90300 }, { "epoch": 0.1, "grad_norm": 0.39758920669555664, "learning_rate": 8.951220134717166e-05, "loss": 4.7105, "step": 90400 }, { "epoch": 0.1, "grad_norm": 1.4163142442703247, "learning_rate": 8.950059979998933e-05, "loss": 4.7071, "step": 90500 }, { "epoch": 0.11, "grad_norm": 0.4065834581851959, "learning_rate": 8.9488998252807e-05, "loss": 4.7136, "step": 90600 }, { "epoch": 0.11, "grad_norm": 0.4078928530216217, "learning_rate": 8.947739670562466e-05, "loss": 4.7081, "step": 90700 }, { "epoch": 0.11, "grad_norm": 2.143505096435547, "learning_rate": 8.946579515844233e-05, "loss": 4.7119, "step": 90800 }, { "epoch": 0.11, "grad_norm": 0.4692690074443817, "learning_rate": 8.945419361126e-05, "loss": 4.71, "step": 90900 }, { "epoch": 0.11, "grad_norm": 0.4491908848285675, "learning_rate": 8.944259206407767e-05, "loss": 4.709, "step": 91000 }, { "epoch": 0.11, "grad_norm": 0.4015776515007019, "learning_rate": 8.943099051689534e-05, "loss": 4.7133, "step": 91100 }, { "epoch": 0.11, "grad_norm": 12.54626178741455, "learning_rate": 8.9419388969713e-05, "loss": 4.7115, "step": 91200 }, { "epoch": 0.11, "grad_norm": 0.4521196484565735, "learning_rate": 8.940778742253068e-05, "loss": 4.7098, "step": 91300 }, { "epoch": 0.11, "grad_norm": 0.40918394923210144, "learning_rate": 8.939618587534834e-05, "loss": 4.7064, "step": 91400 }, { "epoch": 0.11, "grad_norm": 0.7197582721710205, "learning_rate": 8.938458432816601e-05, "loss": 4.7083, "step": 91500 }, { "epoch": 0.11, "grad_norm": 0.4415791630744934, "learning_rate": 8.937298278098367e-05, "loss": 4.7121, "step": 91600 }, { "epoch": 0.11, "grad_norm": 0.37231072783470154, "learning_rate": 8.936138123380135e-05, "loss": 4.7076, "step": 91700 }, { "epoch": 0.11, "grad_norm": 0.5812079906463623, "learning_rate": 8.934977968661902e-05, "loss": 4.7103, "step": 91800 }, { "epoch": 0.11, "grad_norm": 0.42401689291000366, "learning_rate": 8.933817813943668e-05, "loss": 4.7087, "step": 91900 }, { "epoch": 0.11, "grad_norm": 0.433369904756546, "learning_rate": 8.932657659225435e-05, "loss": 4.7102, "step": 92000 }, { "epoch": 0.11, "grad_norm": 0.4144749045372009, "learning_rate": 8.931497504507202e-05, "loss": 4.711, "step": 92100 }, { "epoch": 0.11, "grad_norm": 0.3901808559894562, "learning_rate": 8.930337349788969e-05, "loss": 4.712, "step": 92200 }, { "epoch": 0.11, "grad_norm": 0.41925275325775146, "learning_rate": 8.929177195070735e-05, "loss": 4.7123, "step": 92300 }, { "epoch": 0.11, "grad_norm": 0.44110241532325745, "learning_rate": 8.928017040352502e-05, "loss": 4.7107, "step": 92400 }, { "epoch": 0.11, "grad_norm": 0.3761931359767914, "learning_rate": 8.926856885634269e-05, "loss": 4.7067, "step": 92500 }, { "epoch": 0.11, "grad_norm": 0.37371134757995605, "learning_rate": 8.925696730916036e-05, "loss": 4.7076, "step": 92600 }, { "epoch": 0.11, "grad_norm": 2.6820991039276123, "learning_rate": 8.924536576197801e-05, "loss": 4.7101, "step": 92700 }, { "epoch": 0.11, "grad_norm": 8.034706115722656, "learning_rate": 8.923376421479568e-05, "loss": 4.7052, "step": 92800 }, { "epoch": 0.11, "grad_norm": 0.6186811327934265, "learning_rate": 8.922216266761337e-05, "loss": 4.7108, "step": 92900 }, { "epoch": 0.11, "grad_norm": 16.127689361572266, "learning_rate": 8.921056112043103e-05, "loss": 4.7109, "step": 93000 }, { "epoch": 0.11, "grad_norm": 0.4097209572792053, "learning_rate": 8.91989595732487e-05, "loss": 4.7107, "step": 93100 }, { "epoch": 0.11, "grad_norm": 0.4645543098449707, "learning_rate": 8.918735802606635e-05, "loss": 4.706, "step": 93200 }, { "epoch": 0.11, "grad_norm": 0.4279698431491852, "learning_rate": 8.917575647888404e-05, "loss": 4.7081, "step": 93300 }, { "epoch": 0.11, "grad_norm": 0.3847067654132843, "learning_rate": 8.91641549317017e-05, "loss": 4.7104, "step": 93400 }, { "epoch": 0.11, "grad_norm": 0.9405023455619812, "learning_rate": 8.915255338451936e-05, "loss": 4.7077, "step": 93500 }, { "epoch": 0.11, "grad_norm": 0.4204134941101074, "learning_rate": 8.914095183733703e-05, "loss": 4.7102, "step": 93600 }, { "epoch": 0.11, "grad_norm": 0.411493718624115, "learning_rate": 8.91293502901547e-05, "loss": 4.7128, "step": 93700 }, { "epoch": 0.11, "grad_norm": 0.4645240604877472, "learning_rate": 8.911774874297236e-05, "loss": 4.708, "step": 93800 }, { "epoch": 0.11, "grad_norm": 8.295918464660645, "learning_rate": 8.910614719579003e-05, "loss": 4.7087, "step": 93900 }, { "epoch": 0.11, "grad_norm": 0.5342881083488464, "learning_rate": 8.90945456486077e-05, "loss": 4.7076, "step": 94000 }, { "epoch": 0.11, "grad_norm": 0.41708967089653015, "learning_rate": 8.908294410142537e-05, "loss": 4.7096, "step": 94100 }, { "epoch": 0.11, "grad_norm": 0.50910484790802, "learning_rate": 8.907134255424304e-05, "loss": 4.7074, "step": 94200 }, { "epoch": 0.11, "grad_norm": 0.8689035773277283, "learning_rate": 8.90597410070607e-05, "loss": 4.7084, "step": 94300 }, { "epoch": 0.11, "grad_norm": 1.477888584136963, "learning_rate": 8.904813945987837e-05, "loss": 4.7089, "step": 94400 }, { "epoch": 0.11, "grad_norm": 0.3866770565509796, "learning_rate": 8.903653791269604e-05, "loss": 4.71, "step": 94500 }, { "epoch": 0.11, "grad_norm": 0.4197094440460205, "learning_rate": 8.902493636551371e-05, "loss": 4.7062, "step": 94600 }, { "epoch": 0.11, "grad_norm": 0.4031221866607666, "learning_rate": 8.901333481833137e-05, "loss": 4.7105, "step": 94700 }, { "epoch": 0.11, "grad_norm": 0.4098550081253052, "learning_rate": 8.900173327114905e-05, "loss": 4.7099, "step": 94800 }, { "epoch": 0.11, "grad_norm": 0.37702327966690063, "learning_rate": 8.899013172396671e-05, "loss": 4.7073, "step": 94900 }, { "epoch": 0.11, "grad_norm": 0.4298543632030487, "learning_rate": 8.897853017678438e-05, "loss": 4.7092, "step": 95000 }, { "epoch": 0.11, "grad_norm": 0.49779465794563293, "learning_rate": 8.896692862960205e-05, "loss": 4.7059, "step": 95100 }, { "epoch": 0.11, "grad_norm": 0.3974045515060425, "learning_rate": 8.895532708241972e-05, "loss": 4.7116, "step": 95200 }, { "epoch": 0.11, "grad_norm": 4.801963806152344, "learning_rate": 8.894372553523739e-05, "loss": 4.7103, "step": 95300 }, { "epoch": 0.11, "grad_norm": 0.6826298236846924, "learning_rate": 8.893212398805505e-05, "loss": 4.7124, "step": 95400 }, { "epoch": 0.11, "grad_norm": 0.8550306558609009, "learning_rate": 8.892052244087272e-05, "loss": 4.7078, "step": 95500 }, { "epoch": 0.11, "grad_norm": 0.4076642394065857, "learning_rate": 8.890892089369039e-05, "loss": 4.7065, "step": 95600 }, { "epoch": 0.11, "grad_norm": 0.4062858521938324, "learning_rate": 8.889731934650806e-05, "loss": 4.7086, "step": 95700 }, { "epoch": 0.11, "grad_norm": 2.432161808013916, "learning_rate": 8.888571779932572e-05, "loss": 4.7049, "step": 95800 }, { "epoch": 0.11, "grad_norm": 0.41037145256996155, "learning_rate": 8.887411625214339e-05, "loss": 4.7042, "step": 95900 }, { "epoch": 0.11, "grad_norm": 0.3852427005767822, "learning_rate": 8.886251470496106e-05, "loss": 4.7051, "step": 96000 }, { "epoch": 0.11, "grad_norm": 0.43618643283843994, "learning_rate": 8.885091315777873e-05, "loss": 4.7092, "step": 96100 }, { "epoch": 0.11, "grad_norm": 0.4727887511253357, "learning_rate": 8.88393116105964e-05, "loss": 4.7076, "step": 96200 }, { "epoch": 0.11, "grad_norm": 0.4139232337474823, "learning_rate": 8.882771006341405e-05, "loss": 4.7078, "step": 96300 }, { "epoch": 0.11, "grad_norm": 2.8521788120269775, "learning_rate": 8.881610851623174e-05, "loss": 4.7086, "step": 96400 }, { "epoch": 0.11, "grad_norm": 0.43892329931259155, "learning_rate": 8.88045069690494e-05, "loss": 4.7067, "step": 96500 }, { "epoch": 0.11, "grad_norm": 0.40643131732940674, "learning_rate": 8.879290542186707e-05, "loss": 4.7078, "step": 96600 }, { "epoch": 0.11, "grad_norm": 0.44639840722084045, "learning_rate": 8.878130387468474e-05, "loss": 4.7092, "step": 96700 }, { "epoch": 0.11, "grad_norm": 10.743888854980469, "learning_rate": 8.87697023275024e-05, "loss": 4.7048, "step": 96800 }, { "epoch": 0.11, "grad_norm": 0.39078572392463684, "learning_rate": 8.875810078032006e-05, "loss": 4.7015, "step": 96900 }, { "epoch": 0.11, "grad_norm": 0.4048124849796295, "learning_rate": 8.874649923313773e-05, "loss": 4.7126, "step": 97000 }, { "epoch": 0.11, "grad_norm": 0.5833559632301331, "learning_rate": 8.87348976859554e-05, "loss": 4.7057, "step": 97100 }, { "epoch": 0.11, "grad_norm": 0.4627319276332855, "learning_rate": 8.872329613877307e-05, "loss": 4.7091, "step": 97200 }, { "epoch": 0.11, "grad_norm": 0.460225909948349, "learning_rate": 8.871169459159074e-05, "loss": 4.7079, "step": 97300 }, { "epoch": 0.11, "grad_norm": 0.44441863894462585, "learning_rate": 8.87000930444084e-05, "loss": 4.7039, "step": 97400 }, { "epoch": 0.11, "grad_norm": 0.40614771842956543, "learning_rate": 8.868849149722607e-05, "loss": 4.7067, "step": 97500 }, { "epoch": 0.11, "grad_norm": 0.48935455083847046, "learning_rate": 8.867688995004374e-05, "loss": 4.7072, "step": 97600 }, { "epoch": 0.11, "grad_norm": 1.503010869026184, "learning_rate": 8.866528840286141e-05, "loss": 4.7051, "step": 97700 }, { "epoch": 0.11, "grad_norm": 0.40835824608802795, "learning_rate": 8.865368685567907e-05, "loss": 4.7085, "step": 97800 }, { "epoch": 0.11, "grad_norm": 0.47787487506866455, "learning_rate": 8.864208530849675e-05, "loss": 4.7042, "step": 97900 }, { "epoch": 0.11, "grad_norm": 0.44062966108322144, "learning_rate": 8.863048376131441e-05, "loss": 4.7088, "step": 98000 }, { "epoch": 0.11, "grad_norm": 0.5162871479988098, "learning_rate": 8.861888221413208e-05, "loss": 4.705, "step": 98100 }, { "epoch": 0.11, "grad_norm": 0.7115480899810791, "learning_rate": 8.860728066694974e-05, "loss": 4.7069, "step": 98200 }, { "epoch": 0.11, "grad_norm": 0.43299001455307007, "learning_rate": 8.859567911976742e-05, "loss": 4.7037, "step": 98300 }, { "epoch": 0.11, "grad_norm": 0.5156663656234741, "learning_rate": 8.858407757258509e-05, "loss": 4.7057, "step": 98400 }, { "epoch": 0.11, "grad_norm": 1.3159596920013428, "learning_rate": 8.857247602540275e-05, "loss": 4.705, "step": 98500 }, { "epoch": 0.11, "grad_norm": 5.239011287689209, "learning_rate": 8.856087447822042e-05, "loss": 4.7104, "step": 98600 }, { "epoch": 0.11, "grad_norm": 0.5143164992332458, "learning_rate": 8.854927293103809e-05, "loss": 4.7075, "step": 98700 }, { "epoch": 0.11, "grad_norm": 0.4091811180114746, "learning_rate": 8.853767138385576e-05, "loss": 4.706, "step": 98800 }, { "epoch": 0.11, "grad_norm": 0.3682583272457123, "learning_rate": 8.852606983667342e-05, "loss": 4.7049, "step": 98900 }, { "epoch": 0.11, "grad_norm": 1.9574017524719238, "learning_rate": 8.851446828949109e-05, "loss": 4.7085, "step": 99000 }, { "epoch": 0.11, "grad_norm": 0.4863421320915222, "learning_rate": 8.850286674230876e-05, "loss": 4.7042, "step": 99100 }, { "epoch": 0.12, "grad_norm": 0.41971755027770996, "learning_rate": 8.849126519512643e-05, "loss": 4.7046, "step": 99200 }, { "epoch": 0.12, "grad_norm": 0.45720914006233215, "learning_rate": 8.847966364794409e-05, "loss": 4.705, "step": 99300 }, { "epoch": 0.12, "grad_norm": 0.3879556953907013, "learning_rate": 8.846806210076176e-05, "loss": 4.7051, "step": 99400 }, { "epoch": 0.12, "grad_norm": 0.6103081703186035, "learning_rate": 8.845646055357944e-05, "loss": 4.7018, "step": 99500 }, { "epoch": 0.12, "grad_norm": 0.4066198468208313, "learning_rate": 8.84448590063971e-05, "loss": 4.7039, "step": 99600 }, { "epoch": 0.12, "grad_norm": 0.568277895450592, "learning_rate": 8.843325745921477e-05, "loss": 4.7046, "step": 99700 }, { "epoch": 0.12, "grad_norm": 0.41322973370552063, "learning_rate": 8.842165591203244e-05, "loss": 4.7051, "step": 99800 }, { "epoch": 0.12, "grad_norm": 3.952580690383911, "learning_rate": 8.841005436485011e-05, "loss": 4.7017, "step": 99900 }, { "epoch": 0.12, "grad_norm": 0.40354979038238525, "learning_rate": 8.839845281766776e-05, "loss": 4.7076, "step": 100000 }, { "epoch": 0.12, "grad_norm": 0.44302529096603394, "learning_rate": 8.838685127048543e-05, "loss": 4.7071, "step": 100100 }, { "epoch": 0.12, "grad_norm": 0.5012074112892151, "learning_rate": 8.83752497233031e-05, "loss": 4.7024, "step": 100200 }, { "epoch": 0.12, "grad_norm": 0.4387538433074951, "learning_rate": 8.836364817612078e-05, "loss": 4.7019, "step": 100300 }, { "epoch": 0.12, "grad_norm": 4.33089017868042, "learning_rate": 8.835204662893843e-05, "loss": 4.7034, "step": 100400 }, { "epoch": 0.12, "grad_norm": 0.6666650176048279, "learning_rate": 8.83404450817561e-05, "loss": 4.7006, "step": 100500 }, { "epoch": 0.12, "grad_norm": 0.43695616722106934, "learning_rate": 8.832884353457377e-05, "loss": 4.7028, "step": 100600 }, { "epoch": 0.12, "grad_norm": 0.39040979743003845, "learning_rate": 8.831724198739144e-05, "loss": 4.703, "step": 100700 }, { "epoch": 0.12, "grad_norm": 0.4081096053123474, "learning_rate": 8.830564044020911e-05, "loss": 4.7062, "step": 100800 }, { "epoch": 0.12, "grad_norm": 0.4371529519557953, "learning_rate": 8.829403889302677e-05, "loss": 4.7027, "step": 100900 }, { "epoch": 0.12, "grad_norm": 0.4820528030395508, "learning_rate": 8.828243734584446e-05, "loss": 4.7026, "step": 101000 }, { "epoch": 0.12, "grad_norm": 0.4896353781223297, "learning_rate": 8.827083579866211e-05, "loss": 4.7052, "step": 101100 }, { "epoch": 0.12, "grad_norm": 0.4941895604133606, "learning_rate": 8.825923425147978e-05, "loss": 4.7065, "step": 101200 }, { "epoch": 0.12, "grad_norm": 0.4064702093601227, "learning_rate": 8.824763270429744e-05, "loss": 4.7032, "step": 101300 }, { "epoch": 0.12, "grad_norm": 0.7341260313987732, "learning_rate": 8.823603115711512e-05, "loss": 4.7029, "step": 101400 }, { "epoch": 0.12, "grad_norm": 0.41835737228393555, "learning_rate": 8.822442960993278e-05, "loss": 4.7012, "step": 101500 }, { "epoch": 0.12, "grad_norm": 0.4256590008735657, "learning_rate": 8.821282806275045e-05, "loss": 4.7038, "step": 101600 }, { "epoch": 0.12, "grad_norm": 0.43846938014030457, "learning_rate": 8.820122651556812e-05, "loss": 4.7014, "step": 101700 }, { "epoch": 0.12, "grad_norm": 0.4327394962310791, "learning_rate": 8.818962496838579e-05, "loss": 4.7022, "step": 101800 }, { "epoch": 0.12, "grad_norm": 0.42558759450912476, "learning_rate": 8.817802342120346e-05, "loss": 4.7001, "step": 101900 }, { "epoch": 0.12, "grad_norm": 0.41578829288482666, "learning_rate": 8.816642187402112e-05, "loss": 4.6996, "step": 102000 }, { "epoch": 0.12, "grad_norm": 1.1245514154434204, "learning_rate": 8.815482032683879e-05, "loss": 4.7055, "step": 102100 }, { "epoch": 0.12, "grad_norm": 0.43621131777763367, "learning_rate": 8.814321877965646e-05, "loss": 4.7011, "step": 102200 }, { "epoch": 0.12, "grad_norm": 0.555548906326294, "learning_rate": 8.813161723247413e-05, "loss": 4.7026, "step": 102300 }, { "epoch": 0.12, "grad_norm": 0.5187997221946716, "learning_rate": 8.812001568529179e-05, "loss": 4.7046, "step": 102400 }, { "epoch": 0.12, "grad_norm": 0.8702124953269958, "learning_rate": 8.810841413810946e-05, "loss": 4.7009, "step": 102500 }, { "epoch": 0.12, "grad_norm": 0.4865017533302307, "learning_rate": 8.809681259092714e-05, "loss": 4.7012, "step": 102600 }, { "epoch": 0.12, "grad_norm": 0.4582134783267975, "learning_rate": 8.80852110437448e-05, "loss": 4.7024, "step": 102700 }, { "epoch": 0.12, "grad_norm": 0.48400819301605225, "learning_rate": 8.807360949656247e-05, "loss": 4.7009, "step": 102800 }, { "epoch": 0.12, "grad_norm": 0.4129534363746643, "learning_rate": 8.806200794938013e-05, "loss": 4.6953, "step": 102900 }, { "epoch": 0.12, "grad_norm": 0.5257245302200317, "learning_rate": 8.805040640219781e-05, "loss": 4.6998, "step": 103000 }, { "epoch": 0.12, "grad_norm": 0.6813905239105225, "learning_rate": 8.803880485501547e-05, "loss": 4.7037, "step": 103100 }, { "epoch": 0.12, "grad_norm": 0.6715384721755981, "learning_rate": 8.802720330783314e-05, "loss": 4.7011, "step": 103200 }, { "epoch": 0.12, "grad_norm": 0.41265252232551575, "learning_rate": 8.801560176065081e-05, "loss": 4.7019, "step": 103300 }, { "epoch": 0.12, "grad_norm": 0.37935546040534973, "learning_rate": 8.800400021346848e-05, "loss": 4.6995, "step": 103400 }, { "epoch": 0.12, "grad_norm": 2.578775644302368, "learning_rate": 8.799239866628613e-05, "loss": 4.7025, "step": 103500 }, { "epoch": 0.12, "grad_norm": 0.5321478843688965, "learning_rate": 8.79807971191038e-05, "loss": 4.6983, "step": 103600 }, { "epoch": 0.12, "grad_norm": 0.4280731976032257, "learning_rate": 8.796919557192148e-05, "loss": 4.6947, "step": 103700 }, { "epoch": 0.12, "grad_norm": 0.559546709060669, "learning_rate": 8.795759402473915e-05, "loss": 4.7027, "step": 103800 }, { "epoch": 0.12, "grad_norm": 0.39881765842437744, "learning_rate": 8.794599247755682e-05, "loss": 4.7024, "step": 103900 }, { "epoch": 0.12, "grad_norm": 0.5303432941436768, "learning_rate": 8.793439093037447e-05, "loss": 4.6984, "step": 104000 }, { "epoch": 0.12, "grad_norm": 0.4243880808353424, "learning_rate": 8.792278938319216e-05, "loss": 4.7011, "step": 104100 }, { "epoch": 0.12, "grad_norm": 1.8046822547912598, "learning_rate": 8.791118783600981e-05, "loss": 4.703, "step": 104200 }, { "epoch": 0.12, "grad_norm": 0.45566943287849426, "learning_rate": 8.789958628882748e-05, "loss": 4.702, "step": 104300 }, { "epoch": 0.12, "grad_norm": 4.0667338371276855, "learning_rate": 8.788798474164514e-05, "loss": 4.6984, "step": 104400 }, { "epoch": 0.12, "grad_norm": 0.4209063947200775, "learning_rate": 8.787638319446282e-05, "loss": 4.6985, "step": 104500 }, { "epoch": 0.12, "grad_norm": 0.41932806372642517, "learning_rate": 8.786478164728048e-05, "loss": 4.6987, "step": 104600 }, { "epoch": 0.12, "grad_norm": 3.0371077060699463, "learning_rate": 8.785318010009815e-05, "loss": 4.7016, "step": 104700 }, { "epoch": 0.12, "grad_norm": 0.5310882925987244, "learning_rate": 8.784157855291582e-05, "loss": 4.6996, "step": 104800 }, { "epoch": 0.12, "grad_norm": 0.43061837553977966, "learning_rate": 8.782997700573349e-05, "loss": 4.6987, "step": 104900 }, { "epoch": 0.12, "grad_norm": 0.5050943493843079, "learning_rate": 8.781837545855116e-05, "loss": 4.7012, "step": 105000 }, { "epoch": 0.12, "grad_norm": 0.39062759280204773, "learning_rate": 8.780677391136882e-05, "loss": 4.6966, "step": 105100 }, { "epoch": 0.12, "grad_norm": 4.528038024902344, "learning_rate": 8.779517236418649e-05, "loss": 4.6964, "step": 105200 }, { "epoch": 0.12, "grad_norm": 0.3973850905895233, "learning_rate": 8.778357081700416e-05, "loss": 4.6989, "step": 105300 }, { "epoch": 0.12, "grad_norm": 0.42976003885269165, "learning_rate": 8.777196926982183e-05, "loss": 4.6996, "step": 105400 }, { "epoch": 0.12, "grad_norm": 0.43507105112075806, "learning_rate": 8.776036772263949e-05, "loss": 4.698, "step": 105500 }, { "epoch": 0.12, "grad_norm": 0.4329013526439667, "learning_rate": 8.774876617545716e-05, "loss": 4.6967, "step": 105600 }, { "epoch": 0.12, "grad_norm": 0.40144628286361694, "learning_rate": 8.773716462827483e-05, "loss": 4.6944, "step": 105700 }, { "epoch": 0.12, "grad_norm": 0.5410645008087158, "learning_rate": 8.77255630810925e-05, "loss": 4.6962, "step": 105800 }, { "epoch": 0.12, "grad_norm": 0.709266722202301, "learning_rate": 8.771396153391017e-05, "loss": 4.696, "step": 105900 }, { "epoch": 0.12, "grad_norm": 0.4841790199279785, "learning_rate": 8.770235998672783e-05, "loss": 4.7003, "step": 106000 }, { "epoch": 0.12, "grad_norm": 0.48105648159980774, "learning_rate": 8.769075843954551e-05, "loss": 4.6925, "step": 106100 }, { "epoch": 0.12, "grad_norm": 0.4711418151855469, "learning_rate": 8.767915689236317e-05, "loss": 4.6967, "step": 106200 }, { "epoch": 0.12, "grad_norm": 0.4943852126598358, "learning_rate": 8.766755534518084e-05, "loss": 4.6958, "step": 106300 }, { "epoch": 0.12, "grad_norm": 1.1731679439544678, "learning_rate": 8.765595379799851e-05, "loss": 4.697, "step": 106400 }, { "epoch": 0.12, "grad_norm": 2.6942670345306396, "learning_rate": 8.764435225081618e-05, "loss": 4.6988, "step": 106500 }, { "epoch": 0.12, "grad_norm": 0.5932111144065857, "learning_rate": 8.763275070363384e-05, "loss": 4.7008, "step": 106600 }, { "epoch": 0.12, "grad_norm": 0.498844176530838, "learning_rate": 8.76211491564515e-05, "loss": 4.6966, "step": 106700 }, { "epoch": 0.12, "grad_norm": 0.525398850440979, "learning_rate": 8.760954760926918e-05, "loss": 4.6992, "step": 106800 }, { "epoch": 0.12, "grad_norm": 2.405345916748047, "learning_rate": 8.759794606208685e-05, "loss": 4.6968, "step": 106900 }, { "epoch": 0.12, "grad_norm": 0.4208034873008728, "learning_rate": 8.758634451490452e-05, "loss": 4.6919, "step": 107000 }, { "epoch": 0.12, "grad_norm": 0.3912876844406128, "learning_rate": 8.757474296772217e-05, "loss": 4.6921, "step": 107100 }, { "epoch": 0.12, "grad_norm": 0.4323633909225464, "learning_rate": 8.756314142053984e-05, "loss": 4.6966, "step": 107200 }, { "epoch": 0.12, "grad_norm": 0.39893805980682373, "learning_rate": 8.755153987335752e-05, "loss": 4.6928, "step": 107300 }, { "epoch": 0.12, "grad_norm": 0.42258456349372864, "learning_rate": 8.753993832617519e-05, "loss": 4.6955, "step": 107400 }, { "epoch": 0.12, "grad_norm": 0.40447258949279785, "learning_rate": 8.752833677899284e-05, "loss": 4.6972, "step": 107500 }, { "epoch": 0.12, "grad_norm": 0.5002463459968567, "learning_rate": 8.751673523181053e-05, "loss": 4.6977, "step": 107600 }, { "epoch": 0.12, "grad_norm": 5.072608470916748, "learning_rate": 8.750513368462818e-05, "loss": 4.6992, "step": 107700 }, { "epoch": 0.13, "grad_norm": 0.4288334250450134, "learning_rate": 8.749353213744585e-05, "loss": 4.6984, "step": 107800 }, { "epoch": 0.13, "grad_norm": 0.529216468334198, "learning_rate": 8.748193059026351e-05, "loss": 4.698, "step": 107900 }, { "epoch": 0.13, "grad_norm": 0.42044612765312195, "learning_rate": 8.74703290430812e-05, "loss": 4.6957, "step": 108000 }, { "epoch": 0.13, "grad_norm": 0.8280035853385925, "learning_rate": 8.745872749589886e-05, "loss": 4.6916, "step": 108100 }, { "epoch": 0.13, "grad_norm": 0.41592320799827576, "learning_rate": 8.744712594871652e-05, "loss": 4.6946, "step": 108200 }, { "epoch": 0.13, "grad_norm": 0.425014466047287, "learning_rate": 8.743552440153419e-05, "loss": 4.6955, "step": 108300 }, { "epoch": 0.13, "grad_norm": 0.40150442719459534, "learning_rate": 8.742392285435186e-05, "loss": 4.6946, "step": 108400 }, { "epoch": 0.13, "grad_norm": 0.441785991191864, "learning_rate": 8.741232130716953e-05, "loss": 4.6908, "step": 108500 }, { "epoch": 0.13, "grad_norm": 1.035128116607666, "learning_rate": 8.740071975998719e-05, "loss": 4.6926, "step": 108600 }, { "epoch": 0.13, "grad_norm": 0.4290466606616974, "learning_rate": 8.738911821280486e-05, "loss": 4.6926, "step": 108700 }, { "epoch": 0.13, "grad_norm": 0.4321587085723877, "learning_rate": 8.737751666562253e-05, "loss": 4.6937, "step": 108800 }, { "epoch": 0.13, "grad_norm": 0.40536269545555115, "learning_rate": 8.73659151184402e-05, "loss": 4.69, "step": 108900 }, { "epoch": 0.13, "grad_norm": 0.3988741934299469, "learning_rate": 8.735431357125786e-05, "loss": 4.6935, "step": 109000 }, { "epoch": 0.13, "grad_norm": 0.8648415803909302, "learning_rate": 8.734271202407553e-05, "loss": 4.6978, "step": 109100 }, { "epoch": 0.13, "grad_norm": 0.6086846590042114, "learning_rate": 8.733111047689321e-05, "loss": 4.6929, "step": 109200 }, { "epoch": 0.13, "grad_norm": 0.38623344898223877, "learning_rate": 8.731950892971087e-05, "loss": 4.697, "step": 109300 }, { "epoch": 0.13, "grad_norm": 0.7714725732803345, "learning_rate": 8.730790738252854e-05, "loss": 4.6926, "step": 109400 }, { "epoch": 0.13, "grad_norm": 0.37906432151794434, "learning_rate": 8.729630583534621e-05, "loss": 4.6927, "step": 109500 }, { "epoch": 0.13, "grad_norm": 0.4156550168991089, "learning_rate": 8.728470428816388e-05, "loss": 4.6953, "step": 109600 }, { "epoch": 0.13, "grad_norm": 0.39191532135009766, "learning_rate": 8.727310274098154e-05, "loss": 4.6935, "step": 109700 }, { "epoch": 0.13, "grad_norm": 0.4391324818134308, "learning_rate": 8.726150119379921e-05, "loss": 4.6939, "step": 109800 }, { "epoch": 0.13, "grad_norm": 0.42475369572639465, "learning_rate": 8.724989964661688e-05, "loss": 4.6903, "step": 109900 }, { "epoch": 0.13, "grad_norm": 0.4388252794742584, "learning_rate": 8.723829809943455e-05, "loss": 4.6914, "step": 110000 }, { "epoch": 0.13, "grad_norm": 0.4022296667098999, "learning_rate": 8.72266965522522e-05, "loss": 4.6948, "step": 110100 }, { "epoch": 0.13, "grad_norm": 0.4492380917072296, "learning_rate": 8.721509500506988e-05, "loss": 4.6911, "step": 110200 }, { "epoch": 0.13, "grad_norm": 0.4049302637577057, "learning_rate": 8.720349345788755e-05, "loss": 4.6902, "step": 110300 }, { "epoch": 0.13, "grad_norm": 1.9303175210952759, "learning_rate": 8.719189191070522e-05, "loss": 4.6892, "step": 110400 }, { "epoch": 0.13, "grad_norm": 0.48216354846954346, "learning_rate": 8.718029036352289e-05, "loss": 4.6882, "step": 110500 }, { "epoch": 0.13, "grad_norm": 1.4418731927871704, "learning_rate": 8.716868881634054e-05, "loss": 4.6893, "step": 110600 }, { "epoch": 0.13, "grad_norm": 0.5436264276504517, "learning_rate": 8.715708726915823e-05, "loss": 4.6892, "step": 110700 }, { "epoch": 0.13, "grad_norm": 0.3900034427642822, "learning_rate": 8.714548572197588e-05, "loss": 4.6862, "step": 110800 }, { "epoch": 0.13, "grad_norm": 0.4286581575870514, "learning_rate": 8.713388417479356e-05, "loss": 4.6902, "step": 110900 }, { "epoch": 0.13, "grad_norm": 2.8317441940307617, "learning_rate": 8.712228262761121e-05, "loss": 4.692, "step": 111000 }, { "epoch": 0.13, "grad_norm": 0.4669002890586853, "learning_rate": 8.71106810804289e-05, "loss": 4.6933, "step": 111100 }, { "epoch": 0.13, "grad_norm": 12.876310348510742, "learning_rate": 8.709907953324655e-05, "loss": 4.6933, "step": 111200 }, { "epoch": 0.13, "grad_norm": 0.4043096601963043, "learning_rate": 8.708747798606422e-05, "loss": 4.6905, "step": 111300 }, { "epoch": 0.13, "grad_norm": 0.6544508934020996, "learning_rate": 8.70758764388819e-05, "loss": 4.6862, "step": 111400 }, { "epoch": 0.13, "grad_norm": 0.4368617534637451, "learning_rate": 8.706427489169956e-05, "loss": 4.6938, "step": 111500 }, { "epoch": 0.13, "grad_norm": 0.8235392570495605, "learning_rate": 8.705267334451723e-05, "loss": 4.6894, "step": 111600 }, { "epoch": 0.13, "grad_norm": 0.44141313433647156, "learning_rate": 8.704107179733489e-05, "loss": 4.6899, "step": 111700 }, { "epoch": 0.13, "grad_norm": 0.4662952423095703, "learning_rate": 8.702947025015256e-05, "loss": 4.6862, "step": 111800 }, { "epoch": 0.13, "grad_norm": 0.5402454137802124, "learning_rate": 8.701786870297023e-05, "loss": 4.6885, "step": 111900 }, { "epoch": 0.13, "grad_norm": 0.5668596029281616, "learning_rate": 8.70062671557879e-05, "loss": 4.6842, "step": 112000 }, { "epoch": 0.13, "grad_norm": 0.586438000202179, "learning_rate": 8.699466560860556e-05, "loss": 4.6893, "step": 112100 }, { "epoch": 0.13, "grad_norm": 0.4443139135837555, "learning_rate": 8.698306406142323e-05, "loss": 4.6914, "step": 112200 }, { "epoch": 0.13, "grad_norm": 0.4333912432193756, "learning_rate": 8.69714625142409e-05, "loss": 4.691, "step": 112300 }, { "epoch": 0.13, "grad_norm": 0.5807641744613647, "learning_rate": 8.695986096705857e-05, "loss": 4.6935, "step": 112400 }, { "epoch": 0.13, "grad_norm": 0.3854735493659973, "learning_rate": 8.694825941987624e-05, "loss": 4.6889, "step": 112500 }, { "epoch": 0.13, "grad_norm": 0.40090426802635193, "learning_rate": 8.693665787269391e-05, "loss": 4.6874, "step": 112600 }, { "epoch": 0.13, "grad_norm": 0.40820324420928955, "learning_rate": 8.692505632551158e-05, "loss": 4.6902, "step": 112700 }, { "epoch": 0.13, "grad_norm": 1.9625461101531982, "learning_rate": 8.691345477832924e-05, "loss": 4.6876, "step": 112800 }, { "epoch": 0.13, "grad_norm": 0.39590463042259216, "learning_rate": 8.690185323114691e-05, "loss": 4.6895, "step": 112900 }, { "epoch": 0.13, "grad_norm": 0.4090369641780853, "learning_rate": 8.689025168396458e-05, "loss": 4.6921, "step": 113000 }, { "epoch": 0.13, "grad_norm": 0.48455721139907837, "learning_rate": 8.687865013678225e-05, "loss": 4.686, "step": 113100 }, { "epoch": 0.13, "grad_norm": 0.5107011795043945, "learning_rate": 8.686704858959991e-05, "loss": 4.6863, "step": 113200 }, { "epoch": 0.13, "grad_norm": 0.4489879608154297, "learning_rate": 8.685544704241758e-05, "loss": 4.689, "step": 113300 }, { "epoch": 0.13, "grad_norm": 0.7847305536270142, "learning_rate": 8.684384549523525e-05, "loss": 4.687, "step": 113400 }, { "epoch": 0.13, "grad_norm": 0.4807884693145752, "learning_rate": 8.683224394805292e-05, "loss": 4.6844, "step": 113500 }, { "epoch": 0.13, "grad_norm": 0.4227710962295532, "learning_rate": 8.682064240087059e-05, "loss": 4.6882, "step": 113600 }, { "epoch": 0.13, "grad_norm": 0.425329327583313, "learning_rate": 8.680904085368825e-05, "loss": 4.6876, "step": 113700 }, { "epoch": 0.13, "grad_norm": 0.39754214882850647, "learning_rate": 8.679743930650593e-05, "loss": 4.6928, "step": 113800 }, { "epoch": 0.13, "grad_norm": 0.41271573305130005, "learning_rate": 8.678583775932359e-05, "loss": 4.6871, "step": 113900 }, { "epoch": 0.13, "grad_norm": 0.6075764298439026, "learning_rate": 8.677423621214126e-05, "loss": 4.6846, "step": 114000 }, { "epoch": 0.13, "grad_norm": 0.4607229232788086, "learning_rate": 8.676263466495891e-05, "loss": 4.6907, "step": 114100 }, { "epoch": 0.13, "grad_norm": 0.49409857392311096, "learning_rate": 8.67510331177766e-05, "loss": 4.6835, "step": 114200 }, { "epoch": 0.13, "grad_norm": 1.1049710512161255, "learning_rate": 8.673943157059425e-05, "loss": 4.6851, "step": 114300 }, { "epoch": 0.13, "grad_norm": 0.4263964593410492, "learning_rate": 8.672783002341192e-05, "loss": 4.6881, "step": 114400 }, { "epoch": 0.13, "grad_norm": 1.2732186317443848, "learning_rate": 8.67162284762296e-05, "loss": 4.6919, "step": 114500 }, { "epoch": 0.13, "grad_norm": 2.2248380184173584, "learning_rate": 8.670462692904727e-05, "loss": 4.6864, "step": 114600 }, { "epoch": 0.13, "grad_norm": 1.1274431943893433, "learning_rate": 8.669302538186494e-05, "loss": 4.6905, "step": 114700 }, { "epoch": 0.13, "grad_norm": 0.41689029335975647, "learning_rate": 8.668142383468259e-05, "loss": 4.6888, "step": 114800 }, { "epoch": 0.13, "grad_norm": 0.39221587777137756, "learning_rate": 8.666982228750026e-05, "loss": 4.6836, "step": 114900 }, { "epoch": 0.13, "grad_norm": 0.4654403626918793, "learning_rate": 8.665822074031793e-05, "loss": 4.6869, "step": 115000 }, { "epoch": 0.13, "grad_norm": 0.5727940201759338, "learning_rate": 8.66466191931356e-05, "loss": 4.6911, "step": 115100 }, { "epoch": 0.13, "grad_norm": 0.3905259966850281, "learning_rate": 8.663501764595326e-05, "loss": 4.6837, "step": 115200 }, { "epoch": 0.13, "grad_norm": 1.094152808189392, "learning_rate": 8.662341609877093e-05, "loss": 4.6847, "step": 115300 }, { "epoch": 0.13, "grad_norm": 0.39949262142181396, "learning_rate": 8.66118145515886e-05, "loss": 4.6877, "step": 115400 }, { "epoch": 0.13, "grad_norm": 0.3685074746608734, "learning_rate": 8.660021300440627e-05, "loss": 4.6849, "step": 115500 }, { "epoch": 0.13, "grad_norm": 0.41748788952827454, "learning_rate": 8.658861145722394e-05, "loss": 4.6861, "step": 115600 }, { "epoch": 0.13, "grad_norm": 5.671760082244873, "learning_rate": 8.657700991004161e-05, "loss": 4.6914, "step": 115700 }, { "epoch": 0.13, "grad_norm": 0.40038684010505676, "learning_rate": 8.656540836285928e-05, "loss": 4.6923, "step": 115800 }, { "epoch": 0.13, "grad_norm": 0.4368140995502472, "learning_rate": 8.655380681567694e-05, "loss": 4.6875, "step": 115900 }, { "epoch": 0.13, "grad_norm": 0.4060142934322357, "learning_rate": 8.654220526849461e-05, "loss": 4.6899, "step": 116000 }, { "epoch": 0.13, "grad_norm": 0.4354085922241211, "learning_rate": 8.653060372131228e-05, "loss": 4.6814, "step": 116100 }, { "epoch": 0.13, "grad_norm": 0.6066785454750061, "learning_rate": 8.651900217412995e-05, "loss": 4.6823, "step": 116200 }, { "epoch": 0.13, "grad_norm": 0.35917678475379944, "learning_rate": 8.650740062694761e-05, "loss": 4.6882, "step": 116300 }, { "epoch": 0.14, "grad_norm": 0.41072890162467957, "learning_rate": 8.649579907976528e-05, "loss": 4.6804, "step": 116400 }, { "epoch": 0.14, "grad_norm": 0.494043231010437, "learning_rate": 8.648419753258295e-05, "loss": 4.6811, "step": 116500 }, { "epoch": 0.14, "grad_norm": 0.4103025197982788, "learning_rate": 8.647259598540062e-05, "loss": 4.682, "step": 116600 }, { "epoch": 0.14, "grad_norm": 0.5634316802024841, "learning_rate": 8.646099443821829e-05, "loss": 4.6873, "step": 116700 }, { "epoch": 0.14, "grad_norm": 0.4846033453941345, "learning_rate": 8.644939289103595e-05, "loss": 4.6852, "step": 116800 }, { "epoch": 0.14, "grad_norm": 0.8417425751686096, "learning_rate": 8.643779134385363e-05, "loss": 4.6835, "step": 116900 }, { "epoch": 0.14, "grad_norm": 0.9649463295936584, "learning_rate": 8.642618979667129e-05, "loss": 4.6895, "step": 117000 }, { "epoch": 0.14, "grad_norm": 0.4397778809070587, "learning_rate": 8.641458824948896e-05, "loss": 4.6847, "step": 117100 }, { "epoch": 0.14, "grad_norm": 0.40810275077819824, "learning_rate": 8.640298670230662e-05, "loss": 4.6849, "step": 117200 }, { "epoch": 0.14, "grad_norm": 0.39354971051216125, "learning_rate": 8.63913851551243e-05, "loss": 4.687, "step": 117300 }, { "epoch": 0.14, "grad_norm": 0.6670671701431274, "learning_rate": 8.637978360794196e-05, "loss": 4.6858, "step": 117400 }, { "epoch": 0.14, "grad_norm": 0.8771277070045471, "learning_rate": 8.636818206075963e-05, "loss": 4.6842, "step": 117500 }, { "epoch": 0.14, "grad_norm": 0.6346202492713928, "learning_rate": 8.635658051357728e-05, "loss": 4.6821, "step": 117600 }, { "epoch": 0.14, "grad_norm": 0.3756822347640991, "learning_rate": 8.634497896639497e-05, "loss": 4.6852, "step": 117700 }, { "epoch": 0.14, "grad_norm": 0.809264063835144, "learning_rate": 8.633337741921264e-05, "loss": 4.6809, "step": 117800 }, { "epoch": 0.14, "grad_norm": 0.4695824980735779, "learning_rate": 8.63217758720303e-05, "loss": 4.6829, "step": 117900 }, { "epoch": 0.14, "grad_norm": 0.3882189095020294, "learning_rate": 8.631017432484797e-05, "loss": 4.6824, "step": 118000 }, { "epoch": 0.14, "grad_norm": 0.46574699878692627, "learning_rate": 8.629857277766564e-05, "loss": 4.6873, "step": 118100 }, { "epoch": 0.14, "grad_norm": 1.8306362628936768, "learning_rate": 8.62869712304833e-05, "loss": 4.6865, "step": 118200 }, { "epoch": 0.14, "grad_norm": 0.5190596580505371, "learning_rate": 8.627536968330096e-05, "loss": 4.6839, "step": 118300 }, { "epoch": 0.14, "grad_norm": 0.5554775595664978, "learning_rate": 8.626376813611863e-05, "loss": 4.6833, "step": 118400 }, { "epoch": 0.14, "grad_norm": 0.5029674768447876, "learning_rate": 8.62521665889363e-05, "loss": 4.6858, "step": 118500 }, { "epoch": 0.14, "grad_norm": 0.5281274914741516, "learning_rate": 8.624056504175397e-05, "loss": 4.6801, "step": 118600 }, { "epoch": 0.14, "grad_norm": 0.4034444987773895, "learning_rate": 8.622896349457163e-05, "loss": 4.6797, "step": 118700 }, { "epoch": 0.14, "grad_norm": 0.4782603085041046, "learning_rate": 8.62173619473893e-05, "loss": 4.6808, "step": 118800 }, { "epoch": 0.14, "grad_norm": 0.505681574344635, "learning_rate": 8.620576040020699e-05, "loss": 4.6805, "step": 118900 }, { "epoch": 0.14, "grad_norm": 0.45551398396492004, "learning_rate": 8.619415885302464e-05, "loss": 4.681, "step": 119000 }, { "epoch": 0.14, "grad_norm": 0.46791940927505493, "learning_rate": 8.618255730584231e-05, "loss": 4.6804, "step": 119100 }, { "epoch": 0.14, "grad_norm": 0.40453869104385376, "learning_rate": 8.617095575865998e-05, "loss": 4.6849, "step": 119200 }, { "epoch": 0.14, "grad_norm": 0.5323479771614075, "learning_rate": 8.615935421147765e-05, "loss": 4.6856, "step": 119300 }, { "epoch": 0.14, "grad_norm": 0.3924371600151062, "learning_rate": 8.614775266429531e-05, "loss": 4.6811, "step": 119400 }, { "epoch": 0.14, "grad_norm": 0.4504171311855316, "learning_rate": 8.613615111711298e-05, "loss": 4.6807, "step": 119500 }, { "epoch": 0.14, "grad_norm": 1.3027788400650024, "learning_rate": 8.612454956993065e-05, "loss": 4.6847, "step": 119600 }, { "epoch": 0.14, "grad_norm": 0.5537983179092407, "learning_rate": 8.611294802274832e-05, "loss": 4.681, "step": 119700 }, { "epoch": 0.14, "grad_norm": 0.3960654139518738, "learning_rate": 8.610134647556598e-05, "loss": 4.686, "step": 119800 }, { "epoch": 0.14, "grad_norm": 0.4028005003929138, "learning_rate": 8.608974492838365e-05, "loss": 4.6777, "step": 119900 }, { "epoch": 0.14, "grad_norm": 0.42540809512138367, "learning_rate": 8.607814338120133e-05, "loss": 4.6831, "step": 120000 }, { "epoch": 0.14, "grad_norm": 1.0470972061157227, "learning_rate": 8.606654183401899e-05, "loss": 4.6818, "step": 120100 }, { "epoch": 0.14, "grad_norm": 0.4977876842021942, "learning_rate": 8.605494028683666e-05, "loss": 4.6789, "step": 120200 }, { "epoch": 0.14, "grad_norm": 0.5205646753311157, "learning_rate": 8.604333873965432e-05, "loss": 4.6806, "step": 120300 }, { "epoch": 0.14, "grad_norm": 0.3748117983341217, "learning_rate": 8.6031737192472e-05, "loss": 4.6847, "step": 120400 }, { "epoch": 0.14, "grad_norm": 0.4421583414077759, "learning_rate": 8.602013564528966e-05, "loss": 4.6772, "step": 120500 }, { "epoch": 0.14, "grad_norm": 0.596479594707489, "learning_rate": 8.600853409810733e-05, "loss": 4.6808, "step": 120600 }, { "epoch": 0.14, "grad_norm": 1.599827527999878, "learning_rate": 8.599693255092498e-05, "loss": 4.6842, "step": 120700 }, { "epoch": 0.14, "grad_norm": 0.47555598616600037, "learning_rate": 8.598533100374267e-05, "loss": 4.6842, "step": 120800 }, { "epoch": 0.14, "grad_norm": 0.5468947887420654, "learning_rate": 8.597372945656033e-05, "loss": 4.6863, "step": 120900 }, { "epoch": 0.14, "grad_norm": 0.4049210548400879, "learning_rate": 8.5962127909378e-05, "loss": 4.6802, "step": 121000 }, { "epoch": 0.14, "grad_norm": 0.39633214473724365, "learning_rate": 8.595052636219567e-05, "loss": 4.681, "step": 121100 }, { "epoch": 0.14, "grad_norm": 0.3883484899997711, "learning_rate": 8.593892481501334e-05, "loss": 4.6846, "step": 121200 }, { "epoch": 0.14, "grad_norm": 0.4120100736618042, "learning_rate": 8.592732326783101e-05, "loss": 4.6829, "step": 121300 }, { "epoch": 0.14, "grad_norm": 0.554779052734375, "learning_rate": 8.591572172064866e-05, "loss": 4.6828, "step": 121400 }, { "epoch": 0.14, "grad_norm": 1.2129675149917603, "learning_rate": 8.590412017346633e-05, "loss": 4.6802, "step": 121500 }, { "epoch": 0.14, "grad_norm": 0.413712739944458, "learning_rate": 8.5892518626284e-05, "loss": 4.6826, "step": 121600 }, { "epoch": 0.14, "grad_norm": 1.532698392868042, "learning_rate": 8.588091707910168e-05, "loss": 4.6837, "step": 121700 }, { "epoch": 0.14, "grad_norm": 0.3998155891895294, "learning_rate": 8.586931553191933e-05, "loss": 4.6829, "step": 121800 }, { "epoch": 0.14, "grad_norm": 0.41272029280662537, "learning_rate": 8.5857713984737e-05, "loss": 4.6809, "step": 121900 }, { "epoch": 0.14, "grad_norm": 0.40928176045417786, "learning_rate": 8.584611243755467e-05, "loss": 4.6827, "step": 122000 }, { "epoch": 0.14, "grad_norm": 0.6598598957061768, "learning_rate": 8.583451089037234e-05, "loss": 4.6869, "step": 122100 }, { "epoch": 0.14, "grad_norm": 0.47744888067245483, "learning_rate": 8.582290934319001e-05, "loss": 4.6834, "step": 122200 }, { "epoch": 0.14, "grad_norm": 0.42328187823295593, "learning_rate": 8.581130779600768e-05, "loss": 4.6823, "step": 122300 }, { "epoch": 0.14, "grad_norm": 0.5083212852478027, "learning_rate": 8.579970624882535e-05, "loss": 4.6799, "step": 122400 }, { "epoch": 0.14, "grad_norm": 0.41937893629074097, "learning_rate": 8.578810470164301e-05, "loss": 4.6823, "step": 122500 }, { "epoch": 0.14, "grad_norm": 0.46155425906181335, "learning_rate": 8.577650315446068e-05, "loss": 4.6847, "step": 122600 }, { "epoch": 0.14, "grad_norm": 0.4063146710395813, "learning_rate": 8.576490160727835e-05, "loss": 4.6777, "step": 122700 }, { "epoch": 0.14, "grad_norm": 0.9644930362701416, "learning_rate": 8.575330006009602e-05, "loss": 4.6816, "step": 122800 }, { "epoch": 0.14, "grad_norm": 0.4703579246997833, "learning_rate": 8.574169851291368e-05, "loss": 4.6841, "step": 122900 }, { "epoch": 0.14, "grad_norm": 0.3799445629119873, "learning_rate": 8.573009696573135e-05, "loss": 4.6874, "step": 123000 }, { "epoch": 0.14, "grad_norm": 1.223569393157959, "learning_rate": 8.571849541854902e-05, "loss": 4.685, "step": 123100 }, { "epoch": 0.14, "grad_norm": 1.3866465091705322, "learning_rate": 8.570689387136669e-05, "loss": 4.6814, "step": 123200 }, { "epoch": 0.14, "grad_norm": 0.8391069769859314, "learning_rate": 8.569529232418436e-05, "loss": 4.676, "step": 123300 }, { "epoch": 0.14, "grad_norm": 0.41561365127563477, "learning_rate": 8.568369077700202e-05, "loss": 4.6841, "step": 123400 }, { "epoch": 0.14, "grad_norm": 1.2650662660598755, "learning_rate": 8.56720892298197e-05, "loss": 4.6828, "step": 123500 }, { "epoch": 0.14, "grad_norm": 0.6244620084762573, "learning_rate": 8.566048768263736e-05, "loss": 4.6762, "step": 123600 }, { "epoch": 0.14, "grad_norm": 0.36179304122924805, "learning_rate": 8.564888613545503e-05, "loss": 4.6765, "step": 123700 }, { "epoch": 0.14, "grad_norm": 0.4196653366088867, "learning_rate": 8.563728458827269e-05, "loss": 4.6831, "step": 123800 }, { "epoch": 0.14, "grad_norm": 0.3799944818019867, "learning_rate": 8.562568304109037e-05, "loss": 4.6783, "step": 123900 }, { "epoch": 0.14, "grad_norm": 0.47710663080215454, "learning_rate": 8.561408149390803e-05, "loss": 4.681, "step": 124000 }, { "epoch": 0.14, "grad_norm": 0.38043680787086487, "learning_rate": 8.56024799467257e-05, "loss": 4.6831, "step": 124100 }, { "epoch": 0.14, "grad_norm": 3.3860080242156982, "learning_rate": 8.559087839954337e-05, "loss": 4.6771, "step": 124200 }, { "epoch": 0.14, "grad_norm": 0.5481556057929993, "learning_rate": 8.557927685236104e-05, "loss": 4.6815, "step": 124300 }, { "epoch": 0.14, "grad_norm": 0.5027362704277039, "learning_rate": 8.556767530517871e-05, "loss": 4.6813, "step": 124400 }, { "epoch": 0.14, "grad_norm": 15.900703430175781, "learning_rate": 8.555607375799637e-05, "loss": 4.6881, "step": 124500 }, { "epoch": 0.14, "grad_norm": 0.3703874349594116, "learning_rate": 8.554447221081404e-05, "loss": 4.6873, "step": 124600 }, { "epoch": 0.14, "grad_norm": 0.3847333490848541, "learning_rate": 8.55328706636317e-05, "loss": 4.6814, "step": 124700 }, { "epoch": 0.14, "grad_norm": 0.6109323501586914, "learning_rate": 8.552126911644938e-05, "loss": 4.6812, "step": 124800 }, { "epoch": 0.14, "grad_norm": 0.39739909768104553, "learning_rate": 8.550966756926703e-05, "loss": 4.6791, "step": 124900 }, { "epoch": 0.15, "grad_norm": 0.4672738313674927, "learning_rate": 8.54980660220847e-05, "loss": 4.682, "step": 125000 }, { "epoch": 0.15, "grad_norm": 0.3909046947956085, "learning_rate": 8.548646447490237e-05, "loss": 4.676, "step": 125100 }, { "epoch": 0.15, "grad_norm": 0.7728373408317566, "learning_rate": 8.547486292772005e-05, "loss": 4.6798, "step": 125200 }, { "epoch": 0.15, "grad_norm": 0.6358431577682495, "learning_rate": 8.546326138053772e-05, "loss": 4.682, "step": 125300 }, { "epoch": 0.15, "grad_norm": 0.8379632234573364, "learning_rate": 8.545165983335539e-05, "loss": 4.6806, "step": 125400 }, { "epoch": 0.15, "grad_norm": 0.4461159110069275, "learning_rate": 8.544005828617306e-05, "loss": 4.6822, "step": 125500 }, { "epoch": 0.15, "grad_norm": 1.537218689918518, "learning_rate": 8.542845673899071e-05, "loss": 4.6802, "step": 125600 }, { "epoch": 0.15, "grad_norm": 0.43007928133010864, "learning_rate": 8.541685519180838e-05, "loss": 4.6779, "step": 125700 }, { "epoch": 0.15, "grad_norm": 0.5105913281440735, "learning_rate": 8.540525364462605e-05, "loss": 4.6817, "step": 125800 }, { "epoch": 0.15, "grad_norm": 0.5022956728935242, "learning_rate": 8.539365209744372e-05, "loss": 4.678, "step": 125900 }, { "epoch": 0.15, "grad_norm": 0.45969635248184204, "learning_rate": 8.538205055026138e-05, "loss": 4.6779, "step": 126000 }, { "epoch": 0.15, "grad_norm": 0.8507632613182068, "learning_rate": 8.537044900307905e-05, "loss": 4.6747, "step": 126100 }, { "epoch": 0.15, "grad_norm": 0.4339176118373871, "learning_rate": 8.535884745589672e-05, "loss": 4.6818, "step": 126200 }, { "epoch": 0.15, "grad_norm": 0.814734160900116, "learning_rate": 8.534724590871439e-05, "loss": 4.6786, "step": 126300 }, { "epoch": 0.15, "grad_norm": 0.4143752455711365, "learning_rate": 8.533564436153206e-05, "loss": 4.6783, "step": 126400 }, { "epoch": 0.15, "grad_norm": 0.44699016213417053, "learning_rate": 8.532404281434972e-05, "loss": 4.6752, "step": 126500 }, { "epoch": 0.15, "grad_norm": 0.38036221265792847, "learning_rate": 8.53124412671674e-05, "loss": 4.679, "step": 126600 }, { "epoch": 0.15, "grad_norm": 0.38587382435798645, "learning_rate": 8.530083971998506e-05, "loss": 4.6801, "step": 126700 }, { "epoch": 0.15, "grad_norm": 0.4019007682800293, "learning_rate": 8.528923817280273e-05, "loss": 4.681, "step": 126800 }, { "epoch": 0.15, "grad_norm": 0.43139323592185974, "learning_rate": 8.527763662562039e-05, "loss": 4.6849, "step": 126900 }, { "epoch": 0.15, "grad_norm": 0.3975641131401062, "learning_rate": 8.526603507843807e-05, "loss": 4.6796, "step": 127000 }, { "epoch": 0.15, "grad_norm": 1.2765636444091797, "learning_rate": 8.525443353125573e-05, "loss": 4.6789, "step": 127100 }, { "epoch": 0.15, "grad_norm": 0.3746165335178375, "learning_rate": 8.52428319840734e-05, "loss": 4.6801, "step": 127200 }, { "epoch": 0.15, "grad_norm": 3.3519067764282227, "learning_rate": 8.523123043689106e-05, "loss": 4.6818, "step": 127300 }, { "epoch": 0.15, "grad_norm": 0.41541022062301636, "learning_rate": 8.521962888970874e-05, "loss": 4.674, "step": 127400 }, { "epoch": 0.15, "grad_norm": 0.41416847705841064, "learning_rate": 8.520802734252641e-05, "loss": 4.6802, "step": 127500 }, { "epoch": 0.15, "grad_norm": 0.4105985164642334, "learning_rate": 8.519642579534407e-05, "loss": 4.6818, "step": 127600 }, { "epoch": 0.15, "grad_norm": 0.5681924223899841, "learning_rate": 8.518482424816174e-05, "loss": 4.6801, "step": 127700 }, { "epoch": 0.15, "grad_norm": 0.5585260391235352, "learning_rate": 8.517322270097941e-05, "loss": 4.6775, "step": 127800 }, { "epoch": 0.15, "grad_norm": 0.5471286177635193, "learning_rate": 8.516162115379708e-05, "loss": 4.6785, "step": 127900 }, { "epoch": 0.15, "grad_norm": 0.39895522594451904, "learning_rate": 8.515001960661474e-05, "loss": 4.6804, "step": 128000 }, { "epoch": 0.15, "grad_norm": 0.38762810826301575, "learning_rate": 8.51384180594324e-05, "loss": 4.6781, "step": 128100 }, { "epoch": 0.15, "grad_norm": 0.42689523100852966, "learning_rate": 8.512681651225008e-05, "loss": 4.6797, "step": 128200 }, { "epoch": 0.15, "grad_norm": 0.39311423897743225, "learning_rate": 8.511521496506775e-05, "loss": 4.6883, "step": 128300 }, { "epoch": 0.15, "grad_norm": 0.5697504878044128, "learning_rate": 8.51036134178854e-05, "loss": 4.6743, "step": 128400 }, { "epoch": 0.15, "grad_norm": 0.42035338282585144, "learning_rate": 8.509201187070309e-05, "loss": 4.6797, "step": 128500 }, { "epoch": 0.15, "grad_norm": 0.7120895981788635, "learning_rate": 8.508041032352076e-05, "loss": 4.6778, "step": 128600 }, { "epoch": 0.15, "grad_norm": 0.38941749930381775, "learning_rate": 8.506880877633841e-05, "loss": 4.6787, "step": 128700 }, { "epoch": 0.15, "grad_norm": 0.7306220531463623, "learning_rate": 8.505720722915609e-05, "loss": 4.6772, "step": 128800 }, { "epoch": 0.15, "grad_norm": 0.38047537207603455, "learning_rate": 8.504560568197376e-05, "loss": 4.6804, "step": 128900 }, { "epoch": 0.15, "grad_norm": 2.079150676727295, "learning_rate": 8.503400413479143e-05, "loss": 4.6788, "step": 129000 }, { "epoch": 0.15, "grad_norm": 0.44098183512687683, "learning_rate": 8.502240258760908e-05, "loss": 4.6749, "step": 129100 }, { "epoch": 0.15, "grad_norm": 1.4035288095474243, "learning_rate": 8.501080104042675e-05, "loss": 4.6691, "step": 129200 }, { "epoch": 0.15, "grad_norm": 0.43114110827445984, "learning_rate": 8.499919949324442e-05, "loss": 4.6797, "step": 129300 }, { "epoch": 0.15, "grad_norm": 0.5038126707077026, "learning_rate": 8.49875979460621e-05, "loss": 4.6818, "step": 129400 }, { "epoch": 0.15, "grad_norm": 0.44660595059394836, "learning_rate": 8.497599639887975e-05, "loss": 4.6804, "step": 129500 }, { "epoch": 0.15, "grad_norm": 0.3962138593196869, "learning_rate": 8.496439485169742e-05, "loss": 4.6718, "step": 129600 }, { "epoch": 0.15, "grad_norm": 0.5164237022399902, "learning_rate": 8.49527933045151e-05, "loss": 4.6833, "step": 129700 }, { "epoch": 0.15, "grad_norm": 1.7946994304656982, "learning_rate": 8.494119175733276e-05, "loss": 4.678, "step": 129800 }, { "epoch": 0.15, "grad_norm": 0.39110898971557617, "learning_rate": 8.492959021015043e-05, "loss": 4.6782, "step": 129900 }, { "epoch": 0.15, "grad_norm": 8.766246795654297, "learning_rate": 8.491798866296809e-05, "loss": 4.6728, "step": 130000 }, { "epoch": 0.15, "grad_norm": 7.374971866607666, "learning_rate": 8.490638711578577e-05, "loss": 4.6776, "step": 130100 }, { "epoch": 0.15, "grad_norm": 0.5221861600875854, "learning_rate": 8.489478556860343e-05, "loss": 4.679, "step": 130200 }, { "epoch": 0.15, "grad_norm": 0.3894909620285034, "learning_rate": 8.48831840214211e-05, "loss": 4.6727, "step": 130300 }, { "epoch": 0.15, "grad_norm": 0.39896321296691895, "learning_rate": 8.487158247423876e-05, "loss": 4.6779, "step": 130400 }, { "epoch": 0.15, "grad_norm": 0.4062064588069916, "learning_rate": 8.485998092705644e-05, "loss": 4.6774, "step": 130500 }, { "epoch": 0.15, "grad_norm": 0.5894352197647095, "learning_rate": 8.48483793798741e-05, "loss": 4.679, "step": 130600 }, { "epoch": 0.15, "grad_norm": 0.7942706942558289, "learning_rate": 8.483677783269177e-05, "loss": 4.673, "step": 130700 }, { "epoch": 0.15, "grad_norm": 0.4256235361099243, "learning_rate": 8.482517628550944e-05, "loss": 4.6752, "step": 130800 }, { "epoch": 0.15, "grad_norm": 0.4181482791900635, "learning_rate": 8.481357473832711e-05, "loss": 4.6809, "step": 130900 }, { "epoch": 0.15, "grad_norm": 0.37271997332572937, "learning_rate": 8.480197319114478e-05, "loss": 4.678, "step": 131000 } ], "logging_steps": 100, "max_steps": 861954, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 3.271273415079469e+18, "train_batch_size": 192, "trial_name": null, "trial_params": null }