{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": null, "learning_rate": 9.999948122981575e-05, "loss": 1.0507, "step": 32 }, { "epoch": 0.0064, "grad_norm": 1.0899442434310913, "learning_rate": 9.999770471768777e-05, "loss": 1.016, "step": 64 }, { "epoch": 0.0096, "grad_norm": 2.957014799118042, "learning_rate": 9.999466495684926e-05, "loss": 0.9928, "step": 96 }, { "epoch": 0.0128, "grad_norm": 1.3575732707977295, "learning_rate": 9.999036202410325e-05, "loss": 0.8757, "step": 128 }, { "epoch": 0.016, "grad_norm": 1.0611408948898315, "learning_rate": 9.998498908285819e-05, "loss": 0.8615, "step": 160 }, { "epoch": 0.0192, "grad_norm": 1.382992148399353, "learning_rate": 9.997819962824957e-05, "loss": 0.8216, "step": 192 }, { "epoch": 0.0224, "grad_norm": 2.863276481628418, "learning_rate": 9.997014741774866e-05, "loss": 0.7406, "step": 224 }, { "epoch": 0.0256, "grad_norm": 1.1629211902618408, "learning_rate": 9.996083265480365e-05, "loss": 0.8171, "step": 256 }, { "epoch": 0.0288, "grad_norm": 2.2264232635498047, "learning_rate": 9.995025557476261e-05, "loss": 0.8835, "step": 288 }, { "epoch": 0.032, "grad_norm": 1.7896003723144531, "learning_rate": 9.993841644486747e-05, "loss": 0.7303, "step": 320 }, { "epoch": 0.0352, "grad_norm": 1.403260350227356, "learning_rate": 9.992531556424726e-05, "loss": 0.7384, "step": 352 }, { "epoch": 0.0384, "grad_norm": 2.308896780014038, "learning_rate": 9.99109532639106e-05, "loss": 0.8211, "step": 384 }, { "epoch": 0.0416, "grad_norm": 1.282929539680481, "learning_rate": 9.989532990673728e-05, "loss": 0.7211, "step": 416 }, { "epoch": 0.0448, "grad_norm": 2.4921414852142334, "learning_rate": 9.987844588746915e-05, "loss": 0.8204, "step": 448 }, { "epoch": 0.048, "grad_norm": 1.3490195274353027, "learning_rate": 
9.986030163270011e-05, "loss": 0.7623, "step": 480 }, { "epoch": 0.0512, "grad_norm": 1.436516523361206, "learning_rate": 9.98408976008653e-05, "loss": 0.7981, "step": 512 }, { "epoch": 0.0544, "grad_norm": 2.3144304752349854, "learning_rate": 9.982023428222962e-05, "loss": 0.7422, "step": 544 }, { "epoch": 0.0576, "grad_norm": 1.2702479362487793, "learning_rate": 9.979831219887525e-05, "loss": 0.8107, "step": 576 }, { "epoch": 0.0608, "grad_norm": 3.110814332962036, "learning_rate": 9.977513190468848e-05, "loss": 0.8395, "step": 608 }, { "epoch": 0.064, "grad_norm": 4.934881687164307, "learning_rate": 9.975069398534574e-05, "loss": 0.8456, "step": 640 }, { "epoch": 0.0672, "grad_norm": 1.5248093605041504, "learning_rate": 9.972499905829875e-05, "loss": 0.7604, "step": 672 }, { "epoch": 0.0704, "grad_norm": 1.5269616842269897, "learning_rate": 9.9698047772759e-05, "loss": 0.7557, "step": 704 }, { "epoch": 0.0736, "grad_norm": 1.523474097251892, "learning_rate": 9.966984080968128e-05, "loss": 0.7622, "step": 736 }, { "epoch": 0.0768, "grad_norm": 1.3121402263641357, "learning_rate": 9.96403788817465e-05, "loss": 0.6912, "step": 768 }, { "epoch": 0.08, "grad_norm": 0.9180154800415039, "learning_rate": 9.96096627333437e-05, "loss": 0.8783, "step": 800 }, { "epoch": 0.0832, "grad_norm": 2.254473924636841, "learning_rate": 9.957769314055117e-05, "loss": 0.7987, "step": 832 }, { "epoch": 0.0864, "grad_norm": 1.9398365020751953, "learning_rate": 9.954447091111694e-05, "loss": 0.7703, "step": 864 }, { "epoch": 0.0896, "grad_norm": 1.4880696535110474, "learning_rate": 9.950999688443833e-05, "loss": 0.7258, "step": 896 }, { "epoch": 0.0928, "grad_norm": 1.8427962064743042, "learning_rate": 9.947427193154071e-05, "loss": 0.6981, "step": 928 }, { "epoch": 0.096, "grad_norm": 3.3647401332855225, "learning_rate": 9.943729695505552e-05, "loss": 0.7862, "step": 960 }, { "epoch": 0.0992, "grad_norm": 1.852992296218872, "learning_rate": 9.939907288919747e-05, "loss": 0.8016, "step": 
992 }, { "epoch": 0.1024, "grad_norm": 1.2231330871582031, "learning_rate": 9.935960069974096e-05, "loss": 0.8001, "step": 1024 }, { "epoch": 0.1056, "grad_norm": 1.2329598665237427, "learning_rate": 9.931888138399561e-05, "loss": 0.7656, "step": 1056 }, { "epoch": 0.1088, "grad_norm": 1.4887111186981201, "learning_rate": 9.927691597078108e-05, "loss": 0.7772, "step": 1088 }, { "epoch": 0.112, "grad_norm": 1.1879202127456665, "learning_rate": 9.923370552040116e-05, "loss": 0.7368, "step": 1120 }, { "epoch": 0.1152, "grad_norm": 1.4578642845153809, "learning_rate": 9.918925112461688e-05, "loss": 0.7226, "step": 1152 }, { "epoch": 0.1184, "grad_norm": 3.8356716632843018, "learning_rate": 9.914355390661896e-05, "loss": 0.7468, "step": 1184 }, { "epoch": 0.1216, "grad_norm": 3.390878200531006, "learning_rate": 9.909661502099943e-05, "loss": 0.7163, "step": 1216 }, { "epoch": 0.1248, "grad_norm": 2.217479944229126, "learning_rate": 9.904843565372248e-05, "loss": 0.7805, "step": 1248 }, { "epoch": 0.128, "grad_norm": 0.7309045195579529, "learning_rate": 9.899901702209445e-05, "loss": 0.6929, "step": 1280 }, { "epoch": 0.1312, "grad_norm": 1.173700213432312, "learning_rate": 9.89483603747331e-05, "loss": 0.726, "step": 1312 }, { "epoch": 0.1344, "grad_norm": 1.4089820384979248, "learning_rate": 9.88964669915361e-05, "loss": 0.8606, "step": 1344 }, { "epoch": 0.1376, "grad_norm": 1.0375796556472778, "learning_rate": 9.884333818364861e-05, "loss": 0.721, "step": 1376 }, { "epoch": 0.1408, "grad_norm": 2.082084894180298, "learning_rate": 9.878897529343023e-05, "loss": 0.7884, "step": 1408 }, { "epoch": 0.144, "grad_norm": 0.7961512804031372, "learning_rate": 9.873337969442101e-05, "loss": 0.774, "step": 1440 }, { "epoch": 0.1472, "grad_norm": 1.3074238300323486, "learning_rate": 9.867655279130683e-05, "loss": 0.7392, "step": 1472 }, { "epoch": 0.1504, "grad_norm": 1.5205963850021362, "learning_rate": 9.861849601988383e-05, "loss": 0.7731, "step": 1504 }, { "epoch": 0.1536, 
"grad_norm": 1.4995771646499634, "learning_rate": 9.855921084702219e-05, "loss": 0.8281, "step": 1536 }, { "epoch": 0.1568, "grad_norm": 1.0279921293258667, "learning_rate": 9.849869877062902e-05, "loss": 0.6942, "step": 1568 }, { "epoch": 0.16, "grad_norm": 2.8020853996276855, "learning_rate": 9.843696131961058e-05, "loss": 0.7389, "step": 1600 }, { "epoch": 0.1632, "grad_norm": 2.99129056930542, "learning_rate": 9.837400005383354e-05, "loss": 0.7483, "step": 1632 }, { "epoch": 0.1664, "grad_norm": 2.325167179107666, "learning_rate": 9.830981656408574e-05, "loss": 0.7483, "step": 1664 }, { "epoch": 0.1696, "grad_norm": 0.7245140671730042, "learning_rate": 9.824441247203579e-05, "loss": 0.7633, "step": 1696 }, { "epoch": 0.1728, "grad_norm": 2.7938778400421143, "learning_rate": 9.817778943019228e-05, "loss": 0.7812, "step": 1728 }, { "epoch": 0.176, "grad_norm": 1.2263625860214233, "learning_rate": 9.810994912186189e-05, "loss": 0.7712, "step": 1760 }, { "epoch": 0.1792, "grad_norm": 1.2694672346115112, "learning_rate": 9.804089326110697e-05, "loss": 0.7297, "step": 1792 }, { "epoch": 0.1824, "grad_norm": 1.255414366722107, "learning_rate": 9.797062359270215e-05, "loss": 0.735, "step": 1824 }, { "epoch": 0.1856, "grad_norm": 1.3175591230392456, "learning_rate": 9.789914189209029e-05, "loss": 0.7633, "step": 1856 }, { "epoch": 0.1888, "grad_norm": 1.0326446294784546, "learning_rate": 9.78264499653376e-05, "loss": 0.7955, "step": 1888 }, { "epoch": 0.192, "grad_norm": 1.093620777130127, "learning_rate": 9.775254964908807e-05, "loss": 0.766, "step": 1920 }, { "epoch": 0.1952, "grad_norm": 1.4234970808029175, "learning_rate": 9.767744281051701e-05, "loss": 0.6725, "step": 1952 }, { "epoch": 0.1984, "grad_norm": 0.7571769952774048, "learning_rate": 9.760113134728384e-05, "loss": 0.6953, "step": 1984 }, { "epoch": 0.2016, "grad_norm": 1.7207865715026855, "learning_rate": 9.752361718748423e-05, "loss": 0.7356, "step": 2016 }, { "epoch": 0.2048, "grad_norm": 
2.240748882293701, "learning_rate": 9.744490228960138e-05, "loss": 0.8067, "step": 2048 }, { "epoch": 0.208, "grad_norm": 1.2544214725494385, "learning_rate": 9.736498864245638e-05, "loss": 0.7618, "step": 2080 }, { "epoch": 0.2112, "grad_norm": 5.976646900177002, "learning_rate": 9.728387826515819e-05, "loss": 0.6825, "step": 2112 }, { "epoch": 0.2144, "grad_norm": 4.557011127471924, "learning_rate": 9.72015732070525e-05, "loss": 0.7623, "step": 2144 }, { "epoch": 0.2176, "grad_norm": 0.8000884056091309, "learning_rate": 9.71180755476699e-05, "loss": 0.7719, "step": 2176 }, { "epoch": 0.2208, "grad_norm": 1.115488052368164, "learning_rate": 9.703338739667346e-05, "loss": 0.7913, "step": 2208 }, { "epoch": 0.224, "grad_norm": 1.3180317878723145, "learning_rate": 9.694751089380536e-05, "loss": 0.7452, "step": 2240 }, { "epoch": 0.2272, "grad_norm": 2.9995932579040527, "learning_rate": 9.686044820883285e-05, "loss": 0.7962, "step": 2272 }, { "epoch": 0.2304, "grad_norm": 1.234027624130249, "learning_rate": 9.677220154149336e-05, "loss": 0.828, "step": 2304 }, { "epoch": 0.2336, "grad_norm": 1.6579309701919556, "learning_rate": 9.668277312143907e-05, "loss": 0.7569, "step": 2336 }, { "epoch": 0.2368, "grad_norm": 1.5580084323883057, "learning_rate": 9.65921652081804e-05, "loss": 0.7947, "step": 2368 }, { "epoch": 0.24, "grad_norm": 0.6711795330047607, "learning_rate": 9.650038009102905e-05, "loss": 0.7461, "step": 2400 }, { "epoch": 0.2432, "grad_norm": 1.2285038232803345, "learning_rate": 9.640742008904005e-05, "loss": 0.6587, "step": 2432 }, { "epoch": 0.2464, "grad_norm": 0.7901808619499207, "learning_rate": 9.631328755095334e-05, "loss": 0.7182, "step": 2464 }, { "epoch": 0.2496, "grad_norm": 0.6125284433364868, "learning_rate": 9.62179848551342e-05, "loss": 0.709, "step": 2496 }, { "epoch": 0.2528, "grad_norm": 1.1602981090545654, "learning_rate": 9.612151440951334e-05, "loss": 0.7039, "step": 2528 }, { "epoch": 0.256, "grad_norm": 2.366184711456299, 
"learning_rate": 9.602387865152597e-05, "loss": 0.8669, "step": 2560 }, { "epoch": 0.2592, "grad_norm": 2.583352565765381, "learning_rate": 9.592508004805023e-05, "loss": 0.7258, "step": 2592 }, { "epoch": 0.2624, "grad_norm": 2.132749557495117, "learning_rate": 9.58251210953449e-05, "loss": 0.6971, "step": 2624 }, { "epoch": 0.2656, "grad_norm": 1.4479436874389648, "learning_rate": 9.572400431898627e-05, "loss": 0.8086, "step": 2656 }, { "epoch": 0.2688, "grad_norm": 1.2764617204666138, "learning_rate": 9.562173227380436e-05, "loss": 0.7426, "step": 2688 }, { "epoch": 0.272, "grad_norm": 3.4120121002197266, "learning_rate": 9.55183075438184e-05, "loss": 0.7382, "step": 2720 }, { "epoch": 0.2752, "grad_norm": 1.9773039817810059, "learning_rate": 9.541373274217145e-05, "loss": 0.7903, "step": 2752 }, { "epoch": 0.2784, "grad_norm": 1.4097728729248047, "learning_rate": 9.530801051106449e-05, "loss": 0.7713, "step": 2784 }, { "epoch": 0.2816, "grad_norm": 1.0817668437957764, "learning_rate": 9.520114352168958e-05, "loss": 0.7275, "step": 2816 }, { "epoch": 0.2848, "grad_norm": 1.2667794227600098, "learning_rate": 9.509313447416242e-05, "loss": 0.6648, "step": 2848 }, { "epoch": 0.288, "grad_norm": 1.8679159879684448, "learning_rate": 9.498398609745405e-05, "loss": 0.7445, "step": 2880 }, { "epoch": 0.2912, "grad_norm": 2.8598544597625732, "learning_rate": 9.487370114932202e-05, "loss": 0.733, "step": 2912 }, { "epoch": 0.2944, "grad_norm": 0.9554559588432312, "learning_rate": 9.476228241624059e-05, "loss": 0.7487, "step": 2944 }, { "epoch": 0.2976, "grad_norm": 1.926672101020813, "learning_rate": 9.464973271333042e-05, "loss": 0.8864, "step": 2976 }, { "epoch": 0.3008, "grad_norm": 0.8425309658050537, "learning_rate": 9.45360548842874e-05, "loss": 0.7295, "step": 3008 }, { "epoch": 0.304, "grad_norm": 1.3110431432724, "learning_rate": 9.442125180131078e-05, "loss": 0.7547, "step": 3040 }, { "epoch": 0.3072, "grad_norm": 0.9774306416511536, "learning_rate": 
9.430532636503068e-05, "loss": 0.7099, "step": 3072 }, { "epoch": 0.3104, "grad_norm": 0.6718234419822693, "learning_rate": 9.418828150443469e-05, "loss": 0.7636, "step": 3104 }, { "epoch": 0.3136, "grad_norm": 1.2758376598358154, "learning_rate": 9.407012017679393e-05, "loss": 0.7066, "step": 3136 }, { "epoch": 0.3168, "grad_norm": 1.3185311555862427, "learning_rate": 9.395084536758838e-05, "loss": 0.6785, "step": 3168 }, { "epoch": 0.32, "grad_norm": 1.550795078277588, "learning_rate": 9.383046009043134e-05, "loss": 0.7451, "step": 3200 }, { "epoch": 0.3232, "grad_norm": 1.1686354875564575, "learning_rate": 9.370896738699339e-05, "loss": 0.6652, "step": 3232 }, { "epoch": 0.3264, "grad_norm": 0.848976194858551, "learning_rate": 9.358637032692545e-05, "loss": 0.7705, "step": 3264 }, { "epoch": 0.3296, "grad_norm": 1.3812384605407715, "learning_rate": 9.346267200778126e-05, "loss": 0.7168, "step": 3296 }, { "epoch": 0.3328, "grad_norm": 1.008135199546814, "learning_rate": 9.333787555493914e-05, "loss": 0.7352, "step": 3328 }, { "epoch": 0.336, "grad_norm": 1.2273484468460083, "learning_rate": 9.321198412152301e-05, "loss": 0.7979, "step": 3360 }, { "epoch": 0.3392, "grad_norm": 0.8740741610527039, "learning_rate": 9.308500088832272e-05, "loss": 0.6846, "step": 3392 }, { "epoch": 0.3424, "grad_norm": 1.3684589862823486, "learning_rate": 9.295692906371363e-05, "loss": 0.7758, "step": 3424 }, { "epoch": 0.3456, "grad_norm": 1.2861257791519165, "learning_rate": 9.282777188357565e-05, "loss": 0.6581, "step": 3456 }, { "epoch": 0.3488, "grad_norm": 0.8915108442306519, "learning_rate": 9.269753261121138e-05, "loss": 0.7935, "step": 3488 }, { "epoch": 0.352, "grad_norm": 1.1308799982070923, "learning_rate": 9.256621453726379e-05, "loss": 0.7759, "step": 3520 }, { "epoch": 0.3552, "grad_norm": 1.0886152982711792, "learning_rate": 9.243382097963291e-05, "loss": 0.7207, "step": 3552 }, { "epoch": 0.3584, "grad_norm": 0.675757110118866, "learning_rate": 9.230035528339211e-05, 
"loss": 0.6876, "step": 3584 }, { "epoch": 0.3616, "grad_norm": 3.258622884750366, "learning_rate": 9.216582082070358e-05, "loss": 0.7498, "step": 3616 }, { "epoch": 0.3648, "grad_norm": 3.8826818466186523, "learning_rate": 9.203022099073309e-05, "loss": 0.7993, "step": 3648 }, { "epoch": 0.368, "grad_norm": 0.9782927632331848, "learning_rate": 9.189355921956412e-05, "loss": 0.7005, "step": 3680 }, { "epoch": 0.3712, "grad_norm": 1.1662654876708984, "learning_rate": 9.175583896011131e-05, "loss": 0.6732, "step": 3712 }, { "epoch": 0.3744, "grad_norm": 1.038501501083374, "learning_rate": 9.161706369203317e-05, "loss": 0.7414, "step": 3744 }, { "epoch": 0.3776, "grad_norm": 2.3218936920166016, "learning_rate": 9.147723692164427e-05, "loss": 0.8008, "step": 3776 }, { "epoch": 0.3808, "grad_norm": 2.1190292835235596, "learning_rate": 9.13363621818265e-05, "loss": 0.711, "step": 3808 }, { "epoch": 0.384, "grad_norm": 1.8653652667999268, "learning_rate": 9.119444303193996e-05, "loss": 0.7641, "step": 3840 }, { "epoch": 0.3872, "grad_norm": 1.760704517364502, "learning_rate": 9.10514830577329e-05, "loss": 0.7231, "step": 3872 }, { "epoch": 0.3904, "grad_norm": 0.8437080979347229, "learning_rate": 9.090748587125118e-05, "loss": 0.7089, "step": 3904 }, { "epoch": 0.3936, "grad_norm": 1.6417099237442017, "learning_rate": 9.076245511074703e-05, "loss": 0.7645, "step": 3936 }, { "epoch": 0.3968, "grad_norm": 1.0280499458312988, "learning_rate": 9.06163944405871e-05, "loss": 0.78, "step": 3968 }, { "epoch": 0.4, "grad_norm": 2.645205020904541, "learning_rate": 9.046930755115985e-05, "loss": 0.7443, "step": 4000 }, { "epoch": 0.4032, "grad_norm": 1.4332572221755981, "learning_rate": 9.032119815878236e-05, "loss": 0.7138, "step": 4032 }, { "epoch": 0.4064, "grad_norm": 1.3062529563903809, "learning_rate": 9.017207000560639e-05, "loss": 0.6866, "step": 4064 }, { "epoch": 0.4096, "grad_norm": 1.559920072555542, "learning_rate": 9.002192685952385e-05, "loss": 0.7289, "step": 4096 }, 
{ "epoch": 0.4128, "grad_norm": 2.111950635910034, "learning_rate": 8.987077251407158e-05, "loss": 0.7011, "step": 4128 }, { "epoch": 0.416, "grad_norm": 0.8812033534049988, "learning_rate": 8.971861078833557e-05, "loss": 0.7469, "step": 4160 }, { "epoch": 0.4192, "grad_norm": 0.8479238748550415, "learning_rate": 8.956544552685437e-05, "loss": 0.7263, "step": 4192 }, { "epoch": 0.4224, "grad_norm": 1.0125929117202759, "learning_rate": 8.941128059952201e-05, "loss": 0.6762, "step": 4224 }, { "epoch": 0.4256, "grad_norm": 0.9122424721717834, "learning_rate": 8.925611990149021e-05, "loss": 0.7076, "step": 4256 }, { "epoch": 0.4288, "grad_norm": 1.814253330230713, "learning_rate": 8.909996735306996e-05, "loss": 0.7143, "step": 4288 }, { "epoch": 0.432, "grad_norm": 1.4890289306640625, "learning_rate": 8.894282689963251e-05, "loss": 0.6931, "step": 4320 }, { "epoch": 0.4352, "grad_norm": 1.908116340637207, "learning_rate": 8.878470251150959e-05, "loss": 0.701, "step": 4352 }, { "epoch": 0.4384, "grad_norm": 1.2831019163131714, "learning_rate": 8.862559818389322e-05, "loss": 0.7625, "step": 4384 }, { "epoch": 0.4416, "grad_norm": 0.923768162727356, "learning_rate": 8.846551793673467e-05, "loss": 0.7902, "step": 4416 }, { "epoch": 0.4448, "grad_norm": 1.6989527940750122, "learning_rate": 8.83044658146429e-05, "loss": 0.7006, "step": 4448 }, { "epoch": 0.448, "grad_norm": 1.203029990196228, "learning_rate": 8.814244588678245e-05, "loss": 0.7588, "step": 4480 }, { "epoch": 0.4512, "grad_norm": 1.8377019166946411, "learning_rate": 8.797946224677052e-05, "loss": 0.6975, "step": 4512 }, { "epoch": 0.4544, "grad_norm": 1.4714457988739014, "learning_rate": 8.78155190125736e-05, "loss": 0.6502, "step": 4544 }, { "epoch": 0.4576, "grad_norm": 1.6311497688293457, "learning_rate": 8.765062032640346e-05, "loss": 0.7536, "step": 4576 }, { "epoch": 0.4608, "grad_norm": 1.8238953351974487, "learning_rate": 8.748477035461238e-05, "loss": 0.7899, "step": 4608 }, { "epoch": 0.464, 
"grad_norm": 1.5541362762451172, "learning_rate": 8.7317973287588e-05, "loss": 0.6904, "step": 4640 }, { "epoch": 0.4672, "grad_norm": 1.032272219657898, "learning_rate": 8.715023333964736e-05, "loss": 0.7395, "step": 4672 }, { "epoch": 0.4704, "grad_norm": 1.3095510005950928, "learning_rate": 8.69815547489305e-05, "loss": 0.6854, "step": 4704 }, { "epoch": 0.4736, "grad_norm": 1.5274263620376587, "learning_rate": 8.681194177729327e-05, "loss": 0.7498, "step": 4736 }, { "epoch": 0.4768, "grad_norm": 1.4236122369766235, "learning_rate": 8.66413987101998e-05, "loss": 0.7356, "step": 4768 }, { "epoch": 0.48, "grad_norm": 1.2118279933929443, "learning_rate": 8.646992985661404e-05, "loss": 0.7178, "step": 4800 }, { "epoch": 0.4832, "grad_norm": 3.3495805263519287, "learning_rate": 8.629753954889107e-05, "loss": 0.7326, "step": 4832 }, { "epoch": 0.4864, "grad_norm": 0.6829349398612976, "learning_rate": 8.612423214266749e-05, "loss": 0.7838, "step": 4864 }, { "epoch": 0.4896, "grad_norm": 0.8314148187637329, "learning_rate": 8.595001201675147e-05, "loss": 0.7007, "step": 4896 }, { "epoch": 0.4928, "grad_norm": 1.2672547101974487, "learning_rate": 8.577488357301209e-05, "loss": 0.7377, "step": 4928 }, { "epoch": 0.496, "grad_norm": 1.3968323469161987, "learning_rate": 8.559885123626807e-05, "loss": 0.6774, "step": 4960 }, { "epoch": 0.4992, "grad_norm": 1.2808008193969727, "learning_rate": 8.542191945417601e-05, "loss": 0.6807, "step": 4992 }, { "epoch": 0.5024, "grad_norm": 1.9290404319763184, "learning_rate": 8.524409269711807e-05, "loss": 0.7376, "step": 5024 }, { "epoch": 0.5056, "grad_norm": 1.3726913928985596, "learning_rate": 8.506537545808892e-05, "loss": 0.7402, "step": 5056 }, { "epoch": 0.5088, "grad_norm": 1.7894905805587769, "learning_rate": 8.48857722525823e-05, "loss": 0.6991, "step": 5088 }, { "epoch": 0.512, "grad_norm": 1.1462016105651855, "learning_rate": 8.470528761847684e-05, "loss": 0.7989, "step": 5120 }, { "epoch": 0.5152, "grad_norm": 
0.7457314729690552, "learning_rate": 8.452392611592153e-05, "loss": 0.7616, "step": 5152 }, { "epoch": 0.5184, "grad_norm": 1.728968858718872, "learning_rate": 8.434169232722043e-05, "loss": 0.6324, "step": 5184 }, { "epoch": 0.5216, "grad_norm": 0.9103218913078308, "learning_rate": 8.415859085671683e-05, "loss": 0.7222, "step": 5216 }, { "epoch": 0.5248, "grad_norm": 1.602072834968567, "learning_rate": 8.397462633067704e-05, "loss": 0.7265, "step": 5248 }, { "epoch": 0.528, "grad_norm": 0.9967379570007324, "learning_rate": 8.378980339717349e-05, "loss": 0.7042, "step": 5280 }, { "epoch": 0.5312, "grad_norm": 1.9905532598495483, "learning_rate": 8.360412672596712e-05, "loss": 0.8098, "step": 5312 }, { "epoch": 0.5344, "grad_norm": 1.1432929039001465, "learning_rate": 8.341760100838965e-05, "loss": 0.7591, "step": 5344 }, { "epoch": 0.5376, "grad_norm": 2.1654775142669678, "learning_rate": 8.323023095722486e-05, "loss": 0.8071, "step": 5376 }, { "epoch": 0.5408, "grad_norm": 1.2390097379684448, "learning_rate": 8.304202130658959e-05, "loss": 0.834, "step": 5408 }, { "epoch": 0.544, "grad_norm": 1.0290433168411255, "learning_rate": 8.285297681181408e-05, "loss": 0.8228, "step": 5440 }, { "epoch": 0.5472, "grad_norm": 1.299111008644104, "learning_rate": 8.26631022493219e-05, "loss": 0.7099, "step": 5472 }, { "epoch": 0.5504, "grad_norm": 0.8850242495536804, "learning_rate": 8.247240241650918e-05, "loss": 0.7646, "step": 5504 }, { "epoch": 0.5536, "grad_norm": 1.980812907218933, "learning_rate": 8.22808821316235e-05, "loss": 0.7312, "step": 5536 }, { "epoch": 0.5568, "grad_norm": 1.0378026962280273, "learning_rate": 8.208854623364202e-05, "loss": 0.7277, "step": 5568 }, { "epoch": 0.56, "grad_norm": 1.6820452213287354, "learning_rate": 8.189539958214935e-05, "loss": 0.7654, "step": 5600 }, { "epoch": 0.5632, "grad_norm": 1.494661808013916, "learning_rate": 8.170144705721465e-05, "loss": 0.7208, "step": 5632 }, { "epoch": 0.5664, "grad_norm": 0.9761049747467041, 
"learning_rate": 8.150669355926846e-05, "loss": 0.6898, "step": 5664 }, { "epoch": 0.5696, "grad_norm": 1.3057583570480347, "learning_rate": 8.131114400897874e-05, "loss": 0.7887, "step": 5696 }, { "epoch": 0.5728, "grad_norm": 1.0025156736373901, "learning_rate": 8.111480334712665e-05, "loss": 0.6483, "step": 5728 }, { "epoch": 0.576, "grad_norm": 0.9818746447563171, "learning_rate": 8.091767653448167e-05, "loss": 0.8385, "step": 5760 }, { "epoch": 0.5792, "grad_norm": 1.1921987533569336, "learning_rate": 8.071976855167629e-05, "loss": 0.6707, "step": 5792 }, { "epoch": 0.5824, "grad_norm": 1.5055749416351318, "learning_rate": 8.052108439908013e-05, "loss": 0.7086, "step": 5824 }, { "epoch": 0.5856, "grad_norm": 1.7581650018692017, "learning_rate": 8.032162909667362e-05, "loss": 0.6696, "step": 5856 }, { "epoch": 0.5888, "grad_norm": 1.8909873962402344, "learning_rate": 8.01214076839212e-05, "loss": 0.7471, "step": 5888 }, { "epoch": 0.592, "grad_norm": 1.3570644855499268, "learning_rate": 7.992042521964389e-05, "loss": 0.655, "step": 5920 }, { "epoch": 0.5952, "grad_norm": 0.6561287641525269, "learning_rate": 7.971868678189161e-05, "loss": 0.719, "step": 5952 }, { "epoch": 0.5984, "grad_norm": 1.3650476932525635, "learning_rate": 7.951619746781474e-05, "loss": 0.7405, "step": 5984 }, { "epoch": 0.6016, "grad_norm": 2.8344266414642334, "learning_rate": 7.931296239353544e-05, "loss": 0.7192, "step": 6016 }, { "epoch": 0.6048, "grad_norm": 1.6202623844146729, "learning_rate": 7.910898669401839e-05, "loss": 0.7671, "step": 6048 }, { "epoch": 0.608, "grad_norm": 1.1194038391113281, "learning_rate": 7.890427552294093e-05, "loss": 0.7915, "step": 6080 }, { "epoch": 0.6112, "grad_norm": 0.8267541527748108, "learning_rate": 7.869883405256295e-05, "loss": 0.7441, "step": 6112 }, { "epoch": 0.6144, "grad_norm": 1.229134202003479, "learning_rate": 7.849266747359619e-05, "loss": 0.6548, "step": 6144 }, { "epoch": 0.6176, "grad_norm": 1.151248812675476, "learning_rate": 
7.828578099507308e-05, "loss": 0.6795, "step": 6176 }, { "epoch": 0.6208, "grad_norm": 1.620975375175476, "learning_rate": 7.80781798442151e-05, "loss": 0.6352, "step": 6208 }, { "epoch": 0.624, "grad_norm": 0.9030219912528992, "learning_rate": 7.786986926630078e-05, "loss": 0.7185, "step": 6240 }, { "epoch": 0.6272, "grad_norm": 1.2997703552246094, "learning_rate": 7.766085452453312e-05, "loss": 0.6523, "step": 6272 }, { "epoch": 0.6304, "grad_norm": 1.208347201347351, "learning_rate": 7.74511408999066e-05, "loss": 0.6928, "step": 6304 }, { "epoch": 0.6336, "grad_norm": 0.723646879196167, "learning_rate": 7.724073369107376e-05, "loss": 0.6603, "step": 6336 }, { "epoch": 0.6368, "grad_norm": 1.125978946685791, "learning_rate": 7.702963821421133e-05, "loss": 0.7328, "step": 6368 }, { "epoch": 0.64, "grad_norm": 2.039461135864258, "learning_rate": 7.6817859802886e-05, "loss": 0.7545, "step": 6400 }, { "epoch": 0.6432, "grad_norm": 1.3743586540222168, "learning_rate": 7.660540380791942e-05, "loss": 0.67, "step": 6432 }, { "epoch": 0.6464, "grad_norm": 1.402256727218628, "learning_rate": 7.639227559725332e-05, "loss": 0.636, "step": 6464 }, { "epoch": 0.6496, "grad_norm": 1.0240074396133423, "learning_rate": 7.617848055581361e-05, "loss": 0.8179, "step": 6496 }, { "epoch": 0.6528, "grad_norm": 0.8905365467071533, "learning_rate": 7.596402408537443e-05, "loss": 0.7542, "step": 6528 }, { "epoch": 0.656, "grad_norm": 1.8598270416259766, "learning_rate": 7.574891160442179e-05, "loss": 0.7266, "step": 6560 }, { "epoch": 0.6592, "grad_norm": 0.9146720170974731, "learning_rate": 7.553314854801641e-05, "loss": 0.7861, "step": 6592 }, { "epoch": 0.6624, "grad_norm": 1.8956897258758545, "learning_rate": 7.531674036765662e-05, "loss": 0.7113, "step": 6624 }, { "epoch": 0.6656, "grad_norm": 1.0353283882141113, "learning_rate": 7.509969253114055e-05, "loss": 0.6984, "step": 6656 }, { "epoch": 0.6688, "grad_norm": 1.890493631362915, "learning_rate": 7.488201052242789e-05, "loss": 
0.6687, "step": 6688 }, { "epoch": 0.672, "grad_norm": 0.9367122054100037, "learning_rate": 7.46636998415015e-05, "loss": 0.719, "step": 6720 }, { "epoch": 0.6752, "grad_norm": 1.1989344358444214, "learning_rate": 7.444476600422828e-05, "loss": 0.775, "step": 6752 }, { "epoch": 0.6784, "grad_norm": 0.8481733202934265, "learning_rate": 7.42252145422199e-05, "loss": 0.7667, "step": 6784 }, { "epoch": 0.6816, "grad_norm": 1.0271095037460327, "learning_rate": 7.400505100269307e-05, "loss": 0.653, "step": 6816 }, { "epoch": 0.6848, "grad_norm": 1.3998816013336182, "learning_rate": 7.378428094832931e-05, "loss": 0.6651, "step": 6848 }, { "epoch": 0.688, "grad_norm": 1.3338642120361328, "learning_rate": 7.356290995713437e-05, "loss": 0.6266, "step": 6880 }, { "epoch": 0.6912, "grad_norm": 0.8170168995857239, "learning_rate": 7.334094362229739e-05, "loss": 0.765, "step": 6912 }, { "epoch": 0.6944, "grad_norm": 1.4982614517211914, "learning_rate": 7.311838755204959e-05, "loss": 0.641, "step": 6944 }, { "epoch": 0.6976, "grad_norm": 1.623159646987915, "learning_rate": 7.290222928580347e-05, "loss": 0.6462, "step": 6976 }, { "epoch": 0.7008, "grad_norm": 1.169145941734314, "learning_rate": 7.267852862072673e-05, "loss": 0.7506, "step": 7008 }, { "epoch": 0.704, "grad_norm": 1.011816382408142, "learning_rate": 7.245425495690538e-05, "loss": 0.7183, "step": 7040 }, { "epoch": 0.7072, "grad_norm": 3.0435078144073486, "learning_rate": 7.222941396086789e-05, "loss": 0.7948, "step": 7072 }, { "epoch": 0.7104, "grad_norm": 0.802679717540741, "learning_rate": 7.2004011313477e-05, "loss": 0.8216, "step": 7104 }, { "epoch": 0.7136, "grad_norm": 0.7551457285881042, "learning_rate": 7.17780527097862e-05, "loss": 0.7823, "step": 7136 }, { "epoch": 0.7168, "grad_norm": 1.3118380308151245, "learning_rate": 7.155154385889589e-05, "loss": 0.7803, "step": 7168 }, { "epoch": 0.72, "grad_norm": 1.1100643873214722, "learning_rate": 7.132449048380907e-05, "loss": 0.7425, "step": 7200 }, { "epoch": 
0.7232, "grad_norm": 0.8792561888694763, "learning_rate": 7.109689832128673e-05, "loss": 0.7515, "step": 7232 }, { "epoch": 0.7264, "grad_norm": 0.8382082581520081, "learning_rate": 7.0868773121703e-05, "loss": 0.8134, "step": 7264 }, { "epoch": 0.7296, "grad_norm": 1.7332772016525269, "learning_rate": 7.064012064889971e-05, "loss": 0.6971, "step": 7296 }, { "epoch": 0.7328, "grad_norm": 1.4402042627334595, "learning_rate": 7.041094668004093e-05, "loss": 0.6845, "step": 7328 }, { "epoch": 0.736, "grad_norm": 1.1810777187347412, "learning_rate": 7.018125700546683e-05, "loss": 0.7472, "step": 7360 }, { "epoch": 0.7392, "grad_norm": 0.9390580058097839, "learning_rate": 6.995105742854759e-05, "loss": 0.8127, "step": 7392 }, { "epoch": 0.7424, "grad_norm": 1.570432186126709, "learning_rate": 6.972035376553656e-05, "loss": 0.7071, "step": 7424 }, { "epoch": 0.7456, "grad_norm": 1.168547511100769, "learning_rate": 6.94891518454234e-05, "loss": 0.7017, "step": 7456 }, { "epoch": 0.7488, "grad_norm": 1.1337932348251343, "learning_rate": 6.925745750978686e-05, "loss": 0.6738, "step": 7488 }, { "epoch": 0.752, "grad_norm": 1.351352334022522, "learning_rate": 6.902527661264701e-05, "loss": 0.7548, "step": 7520 }, { "epoch": 0.7552, "grad_norm": 0.6679269671440125, "learning_rate": 6.87926150203176e-05, "loss": 0.7106, "step": 7552 }, { "epoch": 0.7584, "grad_norm": 1.3825992345809937, "learning_rate": 6.855947861125759e-05, "loss": 0.6443, "step": 7584 }, { "epoch": 0.7616, "grad_norm": 1.1650683879852295, "learning_rate": 6.832587327592275e-05, "loss": 0.7547, "step": 7616 }, { "epoch": 0.7648, "grad_norm": 1.5112355947494507, "learning_rate": 6.809180491661678e-05, "loss": 0.7076, "step": 7648 }, { "epoch": 0.768, "grad_norm": 0.8795199990272522, "learning_rate": 6.785727944734228e-05, "loss": 0.7345, "step": 7680 }, { "epoch": 0.7712, "grad_norm": 1.6340776681900024, "learning_rate": 6.762230279365114e-05, "loss": 0.7517, "step": 7712 }, { "epoch": 0.7744, "grad_norm": 
1.022924542427063, "learning_rate": 6.738688089249502e-05, "loss": 0.6874, "step": 7744 }, { "epoch": 0.7776, "grad_norm": 1.2930107116699219, "learning_rate": 6.715101969207525e-05, "loss": 0.7479, "step": 7776 }, { "epoch": 0.7808, "grad_norm": 1.9842311143875122, "learning_rate": 6.691472515169251e-05, "loss": 0.7479, "step": 7808 }, { "epoch": 0.784, "grad_norm": 1.5960675477981567, "learning_rate": 6.667800324159636e-05, "loss": 0.7928, "step": 7840 }, { "epoch": 0.7872, "grad_norm": 3.447913885116577, "learning_rate": 6.644085994283433e-05, "loss": 0.6924, "step": 7872 }, { "epoch": 0.7904, "grad_norm": 0.8809865713119507, "learning_rate": 6.620330124710077e-05, "loss": 0.7955, "step": 7904 }, { "epoch": 0.7936, "grad_norm": 1.3761461973190308, "learning_rate": 6.596533315658555e-05, "loss": 0.6842, "step": 7936 }, { "epoch": 0.7968, "grad_norm": 0.9557456374168396, "learning_rate": 6.572696168382235e-05, "loss": 0.7285, "step": 7968 }, { "epoch": 0.8, "grad_norm": 0.7569695115089417, "learning_rate": 6.548819285153676e-05, "loss": 0.6431, "step": 8000 }, { "epoch": 0.8032, "grad_norm": 1.2884209156036377, "learning_rate": 6.524903269249411e-05, "loss": 0.739, "step": 8032 }, { "epoch": 0.8064, "grad_norm": 1.033050775527954, "learning_rate": 6.500948724934703e-05, "loss": 0.6759, "step": 8064 }, { "epoch": 0.8096, "grad_norm": 0.9404661655426025, "learning_rate": 6.47695625744828e-05, "loss": 0.696, "step": 8096 }, { "epoch": 0.8128, "grad_norm": 0.8363805413246155, "learning_rate": 6.452926472987044e-05, "loss": 0.7273, "step": 8128 }, { "epoch": 0.816, "grad_norm": 0.7976164817810059, "learning_rate": 6.428859978690748e-05, "loss": 0.6671, "step": 8160 }, { "epoch": 0.8192, "grad_norm": 1.6969666481018066, "learning_rate": 6.404757382626669e-05, "loss": 0.6968, "step": 8192 }, { "epoch": 0.8224, "grad_norm": 1.061860203742981, "learning_rate": 6.380619293774223e-05, "loss": 0.7424, "step": 8224 }, { "epoch": 0.8256, "grad_norm": 1.2336043119430542, 
"learning_rate": 6.356446322009607e-05, "loss": 0.6786, "step": 8256 }, { "epoch": 0.8288, "grad_norm": 1.3530735969543457, "learning_rate": 6.332239078090358e-05, "loss": 0.7042, "step": 8288 }, { "epoch": 0.832, "grad_norm": 0.9186837673187256, "learning_rate": 6.307998173639954e-05, "loss": 0.7433, "step": 8320 }, { "epoch": 0.8352, "grad_norm": 1.0583479404449463, "learning_rate": 6.283724221132333e-05, "loss": 0.6515, "step": 8352 }, { "epoch": 0.8384, "grad_norm": 1.468887209892273, "learning_rate": 6.259417833876432e-05, "loss": 0.7033, "step": 8384 }, { "epoch": 0.8416, "grad_norm": 0.7726921439170837, "learning_rate": 6.235079626000694e-05, "loss": 0.721, "step": 8416 }, { "epoch": 0.8448, "grad_norm": 1.8641211986541748, "learning_rate": 6.21071021243754e-05, "loss": 0.626, "step": 8448 }, { "epoch": 0.848, "grad_norm": 1.9702180624008179, "learning_rate": 6.186310208907839e-05, "loss": 0.6017, "step": 8480 }, { "epoch": 0.8512, "grad_norm": 2.057535171508789, "learning_rate": 6.161880231905354e-05, "loss": 0.7612, "step": 8512 }, { "epoch": 0.8544, "grad_norm": 2.2840230464935303, "learning_rate": 6.137420898681158e-05, "loss": 0.6609, "step": 8544 }, { "epoch": 0.8576, "grad_norm": 1.7856135368347168, "learning_rate": 6.112932827228044e-05, "loss": 0.7015, "step": 8576 }, { "epoch": 0.8608, "grad_norm": 1.0354335308074951, "learning_rate": 6.0884166362649075e-05, "loss": 0.6714, "step": 8608 }, { "epoch": 0.864, "grad_norm": 1.054237961769104, "learning_rate": 6.063872945221118e-05, "loss": 0.6928, "step": 8640 }, { "epoch": 0.8672, "grad_norm": 1.004862904548645, "learning_rate": 6.039302374220861e-05, "loss": 0.7676, "step": 8672 }, { "epoch": 0.8704, "grad_norm": 0.8693735003471375, "learning_rate": 6.0147055440674795e-05, "loss": 0.7562, "step": 8704 }, { "epoch": 0.8736, "grad_norm": 1.6824612617492676, "learning_rate": 5.990083076227782e-05, "loss": 0.6509, "step": 8736 }, { "epoch": 0.8768, "grad_norm": 3.1215667724609375, "learning_rate": 
5.9654355928163416e-05, "loss": 0.7553, "step": 8768 }, { "epoch": 0.88, "grad_norm": 1.4479137659072876, "learning_rate": 5.9407637165797793e-05, "loss": 0.8046, "step": 8800 }, { "epoch": 0.8832, "grad_norm": 2.769347906112671, "learning_rate": 5.916068070881026e-05, "loss": 0.6869, "step": 8832 }, { "epoch": 0.8864, "grad_norm": 1.338932752609253, "learning_rate": 5.891349279683578e-05, "loss": 0.6742, "step": 8864 }, { "epoch": 0.8896, "grad_norm": 1.15195631980896, "learning_rate": 5.8666079675357285e-05, "loss": 0.6972, "step": 8896 }, { "epoch": 0.8928, "grad_norm": 1.0247623920440674, "learning_rate": 5.841844759554787e-05, "loss": 0.7107, "step": 8928 }, { "epoch": 0.896, "grad_norm": 1.4130921363830566, "learning_rate": 5.817060281411284e-05, "loss": 0.7327, "step": 8960 }, { "epoch": 0.8992, "grad_norm": 0.6436507701873779, "learning_rate": 5.792255159313169e-05, "loss": 0.6418, "step": 8992 }, { "epoch": 0.9024, "grad_norm": 0.9555985331535339, "learning_rate": 5.7674300199899834e-05, "loss": 0.7157, "step": 9024 }, { "epoch": 0.9056, "grad_norm": 0.8774769306182861, "learning_rate": 5.742585490677024e-05, "loss": 0.6197, "step": 9056 }, { "epoch": 0.9088, "grad_norm": 0.9347734451293945, "learning_rate": 5.7177221990995e-05, "loss": 0.6672, "step": 9088 }, { "epoch": 0.912, "grad_norm": 1.2730952501296997, "learning_rate": 5.692840773456669e-05, "loss": 0.7524, "step": 9120 }, { "epoch": 0.9152, "grad_norm": 1.3449304103851318, "learning_rate": 5.667941842405968e-05, "loss": 0.7106, "step": 9152 }, { "epoch": 0.9184, "grad_norm": 2.288444757461548, "learning_rate": 5.643026035047128e-05, "loss": 0.7239, "step": 9184 }, { "epoch": 0.9216, "grad_norm": 1.1817107200622559, "learning_rate": 5.618093980906276e-05, "loss": 0.7342, "step": 9216 }, { "epoch": 0.9248, "grad_norm": 1.4276821613311768, "learning_rate": 5.5931463099200355e-05, "loss": 0.6198, "step": 9248 }, { "epoch": 0.928, "grad_norm": 1.0878974199295044, "learning_rate": 5.568183652419606e-05, 
"loss": 0.7204, "step": 9280 }, { "epoch": 0.9312, "grad_norm": 1.5497533082962036, "learning_rate": 5.54320663911484e-05, "loss": 0.7218, "step": 9312 }, { "epoch": 0.9344, "grad_norm": 0.5286266207695007, "learning_rate": 5.518215901078302e-05, "loss": 0.8243, "step": 9344 }, { "epoch": 0.9376, "grad_norm": 1.9889594316482544, "learning_rate": 5.493212069729332e-05, "loss": 0.6849, "step": 9376 }, { "epoch": 0.9408, "grad_norm": 1.6639822721481323, "learning_rate": 5.468195776818084e-05, "loss": 0.682, "step": 9408 }, { "epoch": 0.944, "grad_norm": 3.0651698112487793, "learning_rate": 5.4431676544095676e-05, "loss": 0.8112, "step": 9440 }, { "epoch": 0.9472, "grad_norm": 1.0381174087524414, "learning_rate": 5.4181283348676806e-05, "loss": 0.6497, "step": 9472 }, { "epoch": 0.9504, "grad_norm": 1.0353689193725586, "learning_rate": 5.393078450839228e-05, "loss": 0.6654, "step": 9504 }, { "epoch": 0.9536, "grad_norm": 1.6130503416061401, "learning_rate": 5.368018635237936e-05, "loss": 0.7351, "step": 9536 }, { "epoch": 0.9568, "grad_norm": 1.171970248222351, "learning_rate": 5.3429495212284665e-05, "loss": 0.7099, "step": 9568 }, { "epoch": 0.96, "grad_norm": 1.937739610671997, "learning_rate": 5.3178717422104144e-05, "loss": 0.6366, "step": 9600 }, { "epoch": 0.9632, "grad_norm": 1.8911631107330322, "learning_rate": 5.2927859318023073e-05, "loss": 0.6813, "step": 9632 }, { "epoch": 0.9664, "grad_norm": 1.1599578857421875, "learning_rate": 5.2676927238255946e-05, "loss": 0.7155, "step": 9664 }, { "epoch": 0.9696, "grad_norm": 1.2809479236602783, "learning_rate": 5.242592752288632e-05, "loss": 0.7051, "step": 9696 }, { "epoch": 0.9728, "grad_norm": 2.0790278911590576, "learning_rate": 5.2174866513706646e-05, "loss": 0.7387, "step": 9728 }, { "epoch": 0.976, "grad_norm": 1.0074536800384521, "learning_rate": 5.1923750554058084e-05, "loss": 0.6751, "step": 9760 }, { "epoch": 0.9792, "grad_norm": 1.3937727212905884, "learning_rate": 5.16725859886701e-05, "loss": 0.6902, 
"step": 9792 }, { "epoch": 0.9824, "grad_norm": 0.8866567015647888, "learning_rate": 5.142137916350028e-05, "loss": 0.7443, "step": 9824 }, { "epoch": 0.9856, "grad_norm": 0.857765793800354, "learning_rate": 5.1170136425573956e-05, "loss": 0.7032, "step": 9856 }, { "epoch": 0.9888, "grad_norm": 0.6846195459365845, "learning_rate": 5.0918864122823816e-05, "loss": 0.6508, "step": 9888 }, { "epoch": 0.992, "grad_norm": 0.9779634475708008, "learning_rate": 5.066756860392956e-05, "loss": 0.7161, "step": 9920 }, { "epoch": 0.9952, "grad_norm": 1.3198580741882324, "learning_rate": 5.0416256218157476e-05, "loss": 0.6885, "step": 9952 }, { "epoch": 0.9984, "grad_norm": 0.8396392464637756, "learning_rate": 5.0164933315199955e-05, "loss": 0.7511, "step": 9984 } ], "logging_steps": 32, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0363220663791616e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }